gcc/testsuite/gcc.target/mips/mips-3d-9.c

   1 /* { dg-do run { target mipsisa64*-*-* } } */
   2 /* { dg-mips-options "-mips64 -O2 -mips3d -mhard-float -mgp64" } */
   3
   4 /* Matrix Multiplications */
   5 #include <stdlib.h>
   6 #include <stdio.h>
   7
   8 typedef float v2sf __attribute__((vector_size(8)));
   9
  10 float a[4] = {1.1, 2.2, 3.3, 4.4};
  11 float b[4][4] = {{1, 2, 3, 4},
  12                  {5, 6, 7, 8},
  13                  {9, 10, 11, 12},
  14                  {13, 14, 15, 16}};
  15
  16 float c[4]; /* Result for matrix_multiply1() */
  17 float d[4]; /* Result for matrix_multiply2() */
  18 float e[4]; /* Result for matrix_multiply3() */
  19 float f[4]; /* Result for matrix_multiply4() */
  20
  21 void matrix_multiply1();
  22 NOMIPS16 void matrix_multiply2();
  23 NOMIPS16 void matrix_multiply3();
  24 NOMIPS16 void matrix_multiply4();
  25
  26 int main ()
  27 {
  28   int i;
  29
  30   /* Version 1. Use float calculations */
  31   matrix_multiply1();
  32
  33   /* Version 2. Use paired-single instructions inside the inner loop*/
  34   matrix_multiply2();
  35   for (i = 0; i < 4; i++)
  36     if (d[i] != c[i])
  37       abort();
  38
  39   /* Version 3. Use paired-single instructions and unroll the inner loop */
  40   matrix_multiply3();
  41   for (i = 0; i < 4; i++)
  42     if (e[i] != c[i])
  43       abort();
  44
  45   /* Version 4. Use paired-single instructions and unroll all loops */
  46   matrix_multiply4();
  47   for (i = 0; i < 4; i++)
  48     if (f[i] != c[i])
  49       abort();
  50
  51   printf ("Test Passes\n");
  52   exit (0);
  53 }
  54
  55 void matrix_multiply1()
  56 {
  57   int i, j;
  58
  59   for (i = 0; i < 4; i++)
  60    {
  61      c[i] = 0.0;
  62
  63      for (j = 0; j < 4; j ++)
  64        c[i] += a[j] * b[j][i];
  65    }
  66 }
  67
  68 NOMIPS16 void matrix_multiply2()
  69 {
  70   int i, j;
  71   v2sf m1, m2;
  72   v2sf result, temp;
  73
  74   for (i = 0; i < 4; i++)
  75    {
  76      result = (v2sf) {0.0, 0.0};
  77
  78      for (j = 0; j < 4; j+=2)
  79      {
  80        /* Load two float values into m1 */
  81        m1 = (v2sf) {a[j], a[j+1]};
  82        m2 = (v2sf) {b[j][i], b[j+1][i]};
  83
  84        /* Multiply and add */
  85        result += m1 * m2;
  86      }
  87
  88      /* Reduction add at the end */
  89      temp = __builtin_mips_addr_ps (result, result);
  90      d[i] = __builtin_mips_cvt_s_pl (temp);
  91    }
  92 }
  93
  94 NOMIPS16 void matrix_multiply3()
  95 {
  96   int i;
  97   v2sf m1, m2, n1, n2;
  98   v2sf result, temp;
  99
 100   m1 = (v2sf) {a[0], a[1]};
 101   m2 = (v2sf) {a[2], a[3]};
 102
 103   for (i = 0; i < 4; i++)
 104    {
 105      n1 = (v2sf) {b[0][i], b[1][i]};
 106      n2 = (v2sf) {b[2][i], b[3][i]};
 107
 108      /* Multiply and add */
 109      result = m1 * n1 + m2 * n2;
 110
 111      /* Reduction add at the end */
 112      temp = __builtin_mips_addr_ps (result, result);
 113      e[i] = __builtin_mips_cvt_s_pl (temp);
 114    }
 115 }
 116
 117 NOMIPS16 void matrix_multiply4()
 118 {
 119   v2sf m1, m2;
 120   v2sf n1, n2, n3, n4, n5, n6, n7, n8;
 121   v2sf temp1, temp2, temp3, temp4;
 122   v2sf result1, result2;
 123
 124   /* Load a[0] a[1] values into m1
 125      Load a[2] a[3] values into m2 */
 126   m1 = (v2sf) {a[0], a[1]};
 127   m2 = (v2sf) {a[2], a[3]};
 128
 129   /* Load b[0][0] b[1][0] values into n1
 130      Load b[2][0] b[3][0] values into n2
 131      Load b[0][1] b[1][1] values into n3
 132      Load b[2][1] b[3][1] values into n4
 133      Load b[0][2] b[1][2] values into n5
 134      Load b[2][2] b[3][2] values into n6
 135      Load b[0][3] b[1][3] values into n7
 136      Load b[2][3] b[3][3] values into n8 */
 137   n1 = (v2sf) {b[0][0], b[1][0]};
 138   n2 = (v2sf) {b[2][0], b[3][0]};
 139   n3 = (v2sf) {b[0][1], b[1][1]};
 140   n4 = (v2sf) {b[2][1], b[3][1]};
 141   n5 = (v2sf) {b[0][2], b[1][2]};
 142   n6 = (v2sf) {b[2][2], b[3][2]};
 143   n7 = (v2sf) {b[0][3], b[1][3]};
 144   n8 = (v2sf) {b[2][3], b[3][3]};
 145
 146   temp1 = m1 * n1 + m2 * n2;
 147   temp2 = m1 * n3 + m2 * n4;
 148   temp3 = m1 * n5 + m2 * n6;
 149   temp4 = m1 * n7 + m2 * n8;
 150
 151   result1 = __builtin_mips_addr_ps (temp1, temp2);
 152   result2 = __builtin_mips_addr_ps (temp3, temp4);
 153
 154   f[0] = __builtin_mips_cvt_s_pu (result1);
 155   f[1] = __builtin_mips_cvt_s_pl (result1);
 156   f[2] = __builtin_mips_cvt_s_pu (result2);
 157   f[3] = __builtin_mips_cvt_s_pl (result2);
 158 }