1 /* { dg-do run { target mipsisa64*-*-* } } */
2 /* { dg-mips-options "-mips64 -O2 -mips3d -mhard-float -mgp64" } */
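4 /* Matrix multiplication: the 4-element vector a times the 4x4 matrix b, computed four ways (scalar float, then three paired-single variants) */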
/* GCC vector type: 8 bytes of float elements (the "paired-single" value
   the comments in this file refer to). */
8 typedef float v2sf __attribute__((vector_size(8)));
/* Left-hand operand: every version multiplies a[j] against b[j][i]. */
10 float a[4] = {1.1, 2.2, 3.3, 4.4};
/* Right-hand operand, a 4x4 matrix.  NOTE(review): only the first row of
   the initializer is visible in this listing. */
11 float b[4][4] = {{1, 2, 3, 4},
/* One output array per implementation. */
16 float c[4]; /* Result for matrix_multiply1() */
17 float d[4]; /* Result for matrix_multiply2() */
18 float e[4]; /* Result for matrix_multiply3() */
19 float f[4]; /* Result for matrix_multiply4() */
/* Forward declarations of the four implementations.  NOMIPS16 is not
   defined in the visible lines; presumably it is supplied by the test
   harness -- confirm. */
21 void matrix_multiply1();
22 NOMIPS16 void matrix_multiply2();
23 NOMIPS16 void matrix_multiply3();
24 NOMIPS16 void matrix_multiply4();
30 /* Version 1. Use float calculations */
33 /* Version 2. Use paired-single instructions inside the inner loop */
35 for (i = 0; i < 4; i++)
39 /* Version 3. Use paired-single instructions and unroll the inner loop */
41 for (i = 0; i < 4; i++)
45 /* Version 4. Use paired-single instructions and unroll all loops */
47 for (i = 0; i < 4; i++)
51 printf ("Test Passes\n");
/* Version 1 (scalar): for each i in 0..3, accumulates a[j] * b[j][i]
   over j in 0..3 into the global c[i].
   NOTE(review): this listing omits some of the function's lines (braces,
   local declarations, and possibly a reset of c[i]); confirm against
   the full file. */
55 void matrix_multiply1()
59 for (i = 0; i < 4; i++)
63 for (j = 0; j < 4; j ++)
64 c[i] += a[j] * b[j][i];
/* Version 2: for each output index i, walks a and column i of b two
   elements at a time as v2sf pairs, then reduces the pair held in
   result to the scalar stored in the global d[i].
   NOTE(review): this listing omits several of the function's lines
   (braces, local declarations, and the statement under "Multiply and
   add"); confirm details against the full file. */
68 NOMIPS16 void matrix_multiply2()
74 for (i = 0; i < 4; i++)
/* Start each output element from a zeroed pair. */
76 result = (v2sf) {0.0, 0.0};
78 for (j = 0; j < 4; j+=2)
80 /* Load a[j], a[j+1] into m1 and b[j][i], b[j+1][i] into m2 */
81 m1 = (v2sf) {a[j], a[j+1]};
82 m2 = (v2sf) {b[j][i], b[j+1][i]};
84 /* Multiply and add */
88 /* Reduction add at the end */
89 temp = __builtin_mips_addr_ps (result, result);
/* Both operands above were result, so reading the "pl" half of temp as
   a float yields the reduced value for this i. */
90 d[i] = __builtin_mips_cvt_s_pl (temp);
/* Version 3: the inner loop of version 2 is unrolled.  The four elements
   of a are packed into two pairs once, and each iteration over i packs
   column i of b into two pairs, combines them, and stores the reduced
   scalar in the global e[i].
   NOTE(review): this listing omits some of the function's lines (braces
   and local declarations); confirm against the full file. */
94 NOMIPS16 void matrix_multiply3()
/* Loop-invariant operands: a[0..1] and a[2..3]. */
100 m1 = (v2sf) {a[0], a[1]};
101 m2 = (v2sf) {a[2], a[3]};
103 for (i = 0; i < 4; i++)
/* Column i of b, split into rows 0..1 and rows 2..3. */
105 n1 = (v2sf) {b[0][i], b[1][i]};
106 n2 = (v2sf) {b[2][i], b[3][i]};
108 /* Multiply and add */
109 result = m1 * n1 + m2 * n2;
111 /* Reduction add at the end */
112 temp = __builtin_mips_addr_ps (result, result);
/* Both operands above were result, so reading the "pl" half of temp as
   a float yields the reduced value for this i. */
113 e[i] = __builtin_mips_cvt_s_pl (temp);
/* Version 4: fully unrolled.  Packs a into two pairs and each of the
   four columns of b into two pairs, forms one pair of partial products
   per column, reduces the columns two at a time, and stores the four
   scalars in the global f[0..3].
   NOTE(review): this listing omits some of the function's lines (braces
   and the declaration of m1/m2); confirm against the full file. */
117 NOMIPS16 void matrix_multiply4()
120 v2sf n1, n2, n3, n4, n5, n6, n7, n8;
121 v2sf temp1, temp2, temp3, temp4;
122 v2sf result1, result2;
124 /* Load a[0] a[1] values into m1
125 Load a[2] a[3] values into m2 */
126 m1 = (v2sf) {a[0], a[1]};
127 m2 = (v2sf) {a[2], a[3]};
129 /* Load b[0][0] b[1][0] values into n1
130 Load b[2][0] b[3][0] values into n2
131 Load b[0][1] b[1][1] values into n3
132 Load b[2][1] b[3][1] values into n4
133 Load b[0][2] b[1][2] values into n5
134 Load b[2][2] b[3][2] values into n6
135 Load b[0][3] b[1][3] values into n7
136 Load b[2][3] b[3][3] values into n8 */
137 n1 = (v2sf) {b[0][0], b[1][0]};
138 n2 = (v2sf) {b[2][0], b[3][0]};
139 n3 = (v2sf) {b[0][1], b[1][1]};
140 n4 = (v2sf) {b[2][1], b[3][1]};
141 n5 = (v2sf) {b[0][2], b[1][2]};
142 n6 = (v2sf) {b[2][2], b[3][2]};
143 n7 = (v2sf) {b[0][3], b[1][3]};
144 n8 = (v2sf) {b[2][3], b[3][3]};
/* tempK holds the pair of partial products for column K-1 of b. */
146 temp1 = m1 * n1 + m2 * n2;
147 temp2 = m1 * n3 + m2 * n4;
148 temp3 = m1 * n5 + m2 * n6;
149 temp4 = m1 * n7 + m2 * n8;
/* Each reduction add takes two different columns' pairs, so one result
   pair carries two finished outputs. */
151 result1 = __builtin_mips_addr_ps (temp1, temp2);
152 result2 = __builtin_mips_addr_ps (temp3, temp4);
/* The "pu" half is read for the first column of each pair of columns
   and the "pl" half for the second. */
154 f[0] = __builtin_mips_cvt_s_pu (result1);
155 f[1] = __builtin_mips_cvt_s_pl (result1);
156 f[2] = __builtin_mips_cvt_s_pu (result2);
157 f[3] = __builtin_mips_cvt_s_pl (result2);