gcc/config/rs6000/si2vmx.h

   1 /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
   2    Copyright (C) 2007 Free Software Foundation, Inc.
   3
   4    This file is free software; you can redistribute it and/or modify it under
   5    the terms of the GNU General Public License as published by the Free
   6    Software Foundation; either version 2 of the License, or (at your option)
   7    any later version.
   8
   9    This file is distributed in the hope that it will be useful, but WITHOUT
  10    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12    for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this file; see the file COPYING.  If not, write to the Free
  16    Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18
  19 /* As a special exception, if you include this header file into source files
  20    compiled by GCC, this header file does not by itself cause  the resulting
  21    executable to be covered by the GNU General Public License.  This exception
  22    does not however invalidate any other reasons why the executable file might be
  23    covered by the GNU General Public License.  */
  24
  25 #ifndef _SI2VMX_H_
  26 #define _SI2VMX_H_      1
  27
  28 #ifndef __SPU__
  29
  30 #include <stdlib.h>
  31 #include <vec_types.h>
  32
  33
  34 /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
  35  * Users can override the action by defining it prior to including this
  36  * header file.
  37  */
  38 #ifndef SPU_HALT_ACTION
  39 #define SPU_HALT_ACTION         abort()
  40 #endif
  41
  42 /* Specify a default stop action for the spu_stop intrinsic.
  43  * Users can override the action by defining it prior to including this
  44  * header file.
  45  */
  46 #ifndef SPU_STOP_ACTION
  47 #define SPU_STOP_ACTION         abort()
  48 #endif
  49
  50
  51 /* Specify a default action for unsupported intrinsic.
  52  * Users can override the action by defining it prior to including this
  53  * header file.
  54  */
  55 #ifndef SPU_UNSUPPORTED_ACTION
  56 #define SPU_UNSUPPORTED_ACTION  abort()
  57 #endif
  58
  59
  60 /* Casting intrinsics - from scalar to quadword
  61  */
  62
  63 static __inline qword si_from_uchar(unsigned char c) {
  64   union {
  65     qword q;
  66     unsigned char c[16];
  67   } x;
  68   x.c[3] = c;
  69   return (x.q);
  70 }
  71
  72 static __inline qword si_from_char(signed char c) {
  73   union {
  74     qword q;
  75     signed char c[16];
  76   } x;
  77   x.c[3] = c;
  78   return (x.q);
  79 }
  80
  81 static __inline qword si_from_ushort(unsigned short s) {
  82   union {
  83     qword q;
  84     unsigned short s[8];
  85   } x;
  86   x.s[1] = s;
  87   return (x.q);
  88 }
  89
  90 static __inline qword si_from_short(short s) {
  91   union {
  92     qword q;
  93     short s[8];
  94   } x;
  95   x.s[1] = s;
  96   return (x.q);
  97 }
  98
  99
 100 static __inline qword si_from_uint(unsigned int i) {
 101   union {
 102     qword q;
 103     unsigned int i[4];
 104   } x;
 105   x.i[0] = i;
 106   return (x.q);
 107 }
 108
 109 static __inline qword si_from_int(int i) {
 110   union {
 111     qword q;
 112     int i[4];
 113   } x;
 114   x.i[0] = i;
 115   return (x.q);
 116 }
 117
 118 static __inline qword si_from_ullong(unsigned long long l) {
 119   union {
 120     qword q;
 121     unsigned long long l[2];
 122   } x;
 123   x.l[0] = l;
 124   return (x.q);
 125 }
 126
 127 static __inline qword si_from_llong(long long l) {
 128   union {
 129     qword q;
 130     long long l[2];
 131   } x;
 132   x.l[0] = l;
 133   return (x.q);
 134 }
 135
 136 static __inline qword si_from_float(float f) {
 137   union {
 138     qword q;
 139     float f[4];
 140   } x;
 141   x.f[0] = f;
 142   return (x.q);
 143 }
 144
 145 static __inline qword si_from_double(double d) {
 146   union {
 147     qword q;
 148     double d[2];
 149   } x;
 150   x.d[0] = d;
 151   return (x.q);
 152 }
 153
 154 static __inline qword si_from_ptr(void *ptr) {
 155   union {
 156     qword q;
 157     void *p;
 158   } x;
 159   x.p = ptr;
 160   return (x.q);
 161 }
 162
 163
 164 /* Casting intrinsics - from quadword to scalar
 165  */
 166 static __inline unsigned char si_to_uchar(qword q) {
 167   union {
 168     qword q;
 169     unsigned char c[16];
 170   } x;
 171   x.q = q;
 172   return (x.c[3]);
 173 }
 174
 175 static __inline signed char si_to_char(qword q) {
 176   union {
 177     qword q;
 178     signed char c[16];
 179   } x;
 180   x.q = q;
 181   return (x.c[3]);
 182 }
 183
 184 static __inline unsigned short si_to_ushort(qword q) {
 185   union {
 186     qword q;
 187     unsigned short s[8];
 188   } x;
 189   x.q = q;
 190   return (x.s[1]);
 191 }
 192
 193 static __inline short si_to_short(qword q) {
 194   union {
 195     qword q;
 196     short s[8];
 197   } x;
 198   x.q = q;
 199   return (x.s[1]);
 200 }
 201
 202 static __inline unsigned int si_to_uint(qword q) {
 203   union {
 204     qword q;
 205     unsigned int i[4];
 206   } x;
 207   x.q = q;
 208   return (x.i[0]);
 209 }
 210
 211 static __inline int si_to_int(qword q) {
 212   union {
 213     qword q;
 214     int i[4];
 215   } x;
 216   x.q = q;
 217   return (x.i[0]);
 218 }
 219
 220 static __inline unsigned long long si_to_ullong(qword q) {
 221   union {
 222     qword q;
 223     unsigned long long l[2];
 224   } x;
 225   x.q = q;
 226   return (x.l[0]);
 227 }
 228
 229 static __inline long long si_to_llong(qword q) {
 230   union {
 231     qword q;
 232     long long l[2];
 233   } x;
 234   x.q = q;
 235   return (x.l[0]);
 236 }
 237
 238 static __inline float si_to_float(qword q) {
 239   union {
 240     qword q;
 241     float f[4];
 242   } x;
 243   x.q = q;
 244   return (x.f[0]);
 245 }
 246
 247 static __inline double si_to_double(qword q) {
 248   union {
 249     qword q;
 250     double d[2];
 251   } x;
 252   x.q = q;
 253   return (x.d[0]);
 254 }
 255
 256 static __inline void * si_to_ptr(qword q) {
 257   union {
 258     qword q;
 259     void *p;
 260   } x;
 261   x.q = q;
 262   return (x.p);
 263 }
 264
 265
 266 /* Absolute difference
 267  */
 268 static __inline qword si_absdb(qword a, qword b)
 269 {
 270   vec_uchar16 ac, bc, dc;
 271
 272   ac = (vec_uchar16)(a);
 273   bc = (vec_uchar16)(b);
 274   dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
 275
 276   return ((qword)(dc));
 277 }
 278
 279 /* Add intrinsics
 280  */
 281 #define si_a(_a, _b)            ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))
 282
 283 #define si_ah(_a, _b)           ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
 284
 285 static __inline qword si_ai(qword a, int b)
 286 {
 287   return ((qword)(vec_add((vec_int4)(a),
 288                           vec_splat((vec_int4)(si_from_int(b)), 0))));
 289 }
 290
 291
 292 static __inline qword si_ahi(qword a, short b)
 293 {
 294   return ((qword)(vec_add((vec_short8)(a),
 295                           vec_splat((vec_short8)(si_from_short(b)), 1))));
 296 }
 297
 298
 299 #define si_fa(_a, _b)   ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
 300
 301
 302 static __inline qword si_dfa(qword a, qword b)
 303 {
 304   union {
 305     vec_double2 v;
 306     double d[2];
 307   } ad, bd, dd;
 308
 309   ad.v = (vec_double2)(a);
 310   bd.v = (vec_double2)(b);
 311   dd.d[0] = ad.d[0] + bd.d[0];
 312   dd.d[1] = ad.d[1] + bd.d[1];
 313
 314   return ((qword)(dd.v));
 315 }
 316
 317 /* Add word extended
 318  */
 319 #define si_addx(_a, _b, _c)     ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),     \
 320                                                  vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
 321
 322
 323 /* Bit-wise AND
 324  */
 325 #define si_and(_a, _b)          ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
 326
 327
 328 static __inline qword si_andbi(qword a, signed char b)
 329 {
 330   return ((qword)(vec_and((vec_char16)(a),
 331                           vec_splat((vec_char16)(si_from_char(b)), 3))));
 332 }
 333
 334 static __inline qword si_andhi(qword a, signed short b)
 335 {
 336   return ((qword)(vec_and((vec_short8)(a),
 337                           vec_splat((vec_short8)(si_from_short(b)), 1))));
 338 }
 339
 340
 341 static __inline qword si_andi(qword a, signed int b)
 342 {
 343   return ((qword)(vec_and((vec_int4)(a),
 344                           vec_splat((vec_int4)(si_from_int(b)), 0))));
 345 }
 346
 347
 348 /* Bit-wise AND with complement
 349  */
 350 #define si_andc(_a, _b)         ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))
 351
 352
 353 /* Average byte vectors
 354  */
 355 #define si_avgb(_a, _b)         ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))
 356
 357
 358 /* Branch indirect and set link on external data
 359  */
 360 #define si_bisled(_func)        /* not mappable */
 361 #define si_bisledd(_func)       /* not mappable */
 362 #define si_bislede(_func)       /* not mappable */
 363
 364
 365 /* Borrow generate
 366  */
 367 #define si_bg(_a, _b)           ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))
 368
 369 #define si_bgx(_a, _b, _c)      ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),            \
 370                                                         vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)),    \
 371                                                                 (vec_uint4)(_c))), vec_splat_u32(1))))
 372
 373 /* Compare absolute equal
 374  */
 375 static __inline qword si_fcmeq(qword a, qword b)
 376 {
 377   vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
 378
 379   return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
 380                                   vec_andc((vec_float4)(b), msb))));
 381 }
 382
 383 static __inline qword si_dfcmeq(qword a, qword b)
 384 {
 385   vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
 386   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
 387   vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};
 388
 389   vec_uint4 biteq;
 390   vec_uint4 aabs;
 391   vec_uint4 babs;
 392   vec_uint4 a_gt;
 393   vec_uint4 ahi_inf;
 394   vec_uint4 anan;
 395   vec_uint4 result;
 396
 397   union {
 398     vec_uchar16 v;
 399     int i[4];
 400   } x;
 401
 402   /* Shift 4 bytes  */
 403   x.i[3] = 4 << 3;
 404
 405   /*  Mask out sign bits */
 406   aabs = vec_and((vec_uint4)a,sign_mask);
 407   babs = vec_and((vec_uint4)b,sign_mask);
 408
 409   /*  A)  Check for bit equality, store in high word */
 410   biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
 411   biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
 412
 413   /*
 414       B)  Check if a is NaN, store in high word
 415
 416       B1) If the high word is greater than max_exp (indicates a NaN)
 417       B2) If the low word is greater than 0
 418   */
 419   a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
 420
 421   /*  B3) Check if the high word is equal to the inf exponent */
 422   ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
 423
 424   /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
 425   anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
 426
 427   /*  result = A and not B  */
 428   result = vec_andc(biteq, anan);
 429
 430   /*  Promote high words to 64 bits and return  */
 431   return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
 432 }
 433
 434
 435 /* Compare absolute greater than
 436  */
 437 static __inline qword si_fcmgt(qword a, qword b)
 438 {
 439   vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
 440
 441   return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
 442                                   vec_andc((vec_float4)(b), msb))));
 443 }
 444
 445 static __inline qword si_dfcmgt(qword a, qword b)
 446 {
 447   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 448   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
 449   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
 450
 451   union {
 452     vec_uchar16 v;
 453     int i[4];
 454   } x;
 455
 456   /* Shift 4 bytes  */
 457   x.i[3] = 4 << 3;
 458
 459   // absolute value of a,b
 460   vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
 461   vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
 462
 463   // check if a is nan
 464   vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
 465   vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
 466   a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
 467   a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
 468
 469   // check if b is nan
 470   vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
 471   vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
 472   b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
 473   b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
 474
 475   // A) Check if the exponents are different
 476   vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);
 477
 478   // B) Check if high word equal, and low word greater
 479   vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
 480   vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
 481   vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
 482
 483   //  If either A or B is true, return true (unless NaNs detected)
 484   vec_uint4 r = vec_or(gt_hi, eqgt);
 485
 486   // splat the high words of the comparison step
 487   r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
 488
 489   // correct for NaNs in input
 490   return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
 491 }
 492
 493
 494 /* Compare equal
 495  */
 496 static __inline qword si_ceqb(qword a, qword b)
 497 {
 498   return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
 499 }
 500
 501 static __inline qword si_ceqh(qword a, qword b)
 502 {
 503   return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
 504 }
 505
 506 static __inline qword si_ceq(qword a, qword b)
 507 {
 508   return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
 509 }
 510
 511 static __inline qword si_fceq(qword a, qword b)
 512 {
 513   return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
 514 }
 515
 516 static __inline qword si_ceqbi(qword a, signed char b)
 517 {
 518   return ((qword)(vec_cmpeq((vec_char16)(a),
 519                             vec_splat((vec_char16)(si_from_char(b)), 3))));
 520 }
 521
 522 static __inline qword si_ceqhi(qword a, signed short b)
 523 {
 524   return ((qword)(vec_cmpeq((vec_short8)(a),
 525                           vec_splat((vec_short8)(si_from_short(b)), 1))));
 526 }
 527
 528 static __inline qword si_ceqi(qword a, signed int b)
 529 {
 530   return ((qword)(vec_cmpeq((vec_int4)(a),
 531                           vec_splat((vec_int4)(si_from_int(b)), 0))));
 532 }
 533
 534 static __inline qword si_dfceq(qword a, qword b)
 535 {
 536   vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
 537   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
 538   vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};
 539
 540   vec_uint4 biteq;
 541   vec_uint4 aabs;
 542   vec_uint4 babs;
 543   vec_uint4 a_gt;
 544   vec_uint4 ahi_inf;
 545   vec_uint4 anan;
 546   vec_uint4 iszero;
 547   vec_uint4 result;
 548
 549   union {
 550     vec_uchar16 v;
 551     int i[4];
 552   } x;
 553
 554   /* Shift 4 bytes  */
 555   x.i[3] = 4 << 3;
 556
 557   /*  A)  Check for bit equality, store in high word */
 558   biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
 559   biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
 560
 561   /*  Mask out sign bits */
 562   aabs = vec_and((vec_uint4)a,sign_mask);
 563   babs = vec_and((vec_uint4)b,sign_mask);
 564
 565   /*
 566       B)  Check if a is NaN, store in high word
 567
 568       B1) If the high word is greater than max_exp (indicates a NaN)
 569       B2) If the low word is greater than 0
 570   */
 571   a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
 572
 573   /*  B3) Check if the high word is equal to the inf exponent */
 574   ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
 575
 576   /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
 577   anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
 578
 579   /*  C)  Check for 0 = -0 special case */
 580   iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
 581   iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
 582
 583   /*  result = (A or C) and not B  */
 584   result = vec_or(biteq,iszero);
 585   result = vec_andc(result, anan);
 586
 587   /*  Promote high words to 64 bits and return  */
 588   return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
 589 }
 590
 591
 592 /* Compare greater than
 593  */
 594 static __inline qword si_cgtb(qword a, qword b)
 595 {
 596   return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
 597 }
 598
 599 static __inline qword si_cgth(qword a, qword b)
 600 {
 601   return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
 602 }
 603
 604 static __inline qword si_cgt(qword a, qword b)
 605 {
 606   return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
 607 }
 608
 609 static __inline qword si_clgtb(qword a, qword b)
 610 {
 611   return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
 612 }
 613
 614 static __inline qword si_clgth(qword a, qword b)
 615 {
 616   return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
 617 }
 618
 619 static __inline qword si_clgt(qword a, qword b)
 620 {
 621   return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
 622 }
 623
 624 static __inline qword si_fcgt(qword a, qword b)
 625 {
 626   return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
 627 }
 628
 629 static __inline qword si_dfcgt(qword a, qword b)
 630 {
 631   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 632   vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
 633   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
 634   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
 635
 636   union {
 637     vec_uchar16 v;
 638     int i[4];
 639   } x;
 640
 641   /* Shift 4 bytes  */
 642   x.i[3] = 4 << 3;
 643
 644   // absolute value of a,b
 645   vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
 646   vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
 647
 648   // check if a is nan
 649   vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
 650   vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
 651   a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
 652   a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
 653
 654   // check if b is nan
 655   vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
 656   vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
 657   b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
 658   b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
 659
 660   // sign of a
 661   vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
 662   asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);
 663
 664   // sign of b
 665   vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
 666   bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);
 667
 668   // negative a
 669   vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
 670   vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
 671   abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
 672   vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));
 673
 674   // pick the one we want
 675   vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);
 676
 677   // negative b
 678   vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
 679   bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
 680   vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));
 681
 682   // pick the one we want
 683   vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);
 684
 685   // A) Check if the exponents are different
 686   vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);
 687
 688   // B) Check if high word equal, and low word greater
 689   vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
 690   vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
 691   vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
 692
 693   //  If either A or B is true, return true (unless NaNs detected)
 694   vec_uint4 r = vec_or(gt_hi, eqgt);
 695
 696   // splat the high words of the comparison step
 697   r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
 698
 699   // correct for NaNs in input
 700   return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
 701 }
 702
 703 static __inline qword si_cgtbi(qword a, signed char b)
 704 {
 705   return ((qword)(vec_cmpgt((vec_char16)(a),
 706                             vec_splat((vec_char16)(si_from_char(b)), 3))));
 707 }
 708
 709 static __inline qword si_cgthi(qword a, signed short b)
 710 {
 711   return ((qword)(vec_cmpgt((vec_short8)(a),
 712                             vec_splat((vec_short8)(si_from_short(b)), 1))));
 713 }
 714
 715 static __inline qword si_cgti(qword a, signed int b)
 716 {
 717   return ((qword)(vec_cmpgt((vec_int4)(a),
 718                             vec_splat((vec_int4)(si_from_int(b)), 0))));
 719 }
 720
 721 static __inline qword si_clgtbi(qword a, unsigned char b)
 722 {
 723   return ((qword)(vec_cmpgt((vec_uchar16)(a),
 724                             vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
 725 }
 726
 727 static __inline qword si_clgthi(qword a, unsigned short b)
 728 {
 729   return ((qword)(vec_cmpgt((vec_ushort8)(a),
 730                             vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
 731 }
 732
 733 static __inline qword si_clgti(qword a, unsigned int b)
 734 {
 735   return ((qword)(vec_cmpgt((vec_uint4)(a),
 736                             vec_splat((vec_uint4)(si_from_uint(b)), 0))));
 737 }
 738
 739 static __inline qword si_dftsv(qword a, char b)
 740 {
 741   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 742   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
 743   vec_uint4 result = (vec_uint4){0};
 744   vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
 745   sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
 746   vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);
 747
 748   union {
 749     vec_uchar16 v;
 750     int i[4];
 751   } x;
 752
 753   /* Shift 4 bytes  */
 754   x.i[3] = 4 << 3;
 755
 756   /* Nan or +inf or -inf  */
 757   if (b & 0x70)
 758   {
 759     vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
 760     vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
 761      /* NaN  */
 762      if (b & 0x40)
 763      {
 764        vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
 765        a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
 766        a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
 767        result = vec_or(result, a_nan);
 768      }
 769      /* inf  */
 770      if (b & 0x30)
 771      {
 772        a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
 773        a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
 774         /* +inf  */
 775         if (b & 0x20)
 776           result = vec_or(vec_andc(a_inf, sign), result);
 777         /* -inf  */
 778         if (b & 0x10)
 779           result = vec_or(vec_and(a_inf, sign), result);
 780      }
 781   }
 782   /* 0 or denorm  */
 783   if (b & 0xF)
 784   {
 785     vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
 786     iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
 787     /* denorm  */
 788     if (b & 0x3)
 789     {
 790       vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
 791       vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
 792       isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
 793       /* +denorm  */
 794      if (b & 0x2)
 795         result = vec_or(vec_andc(isdenorm, sign), result);
 796       /* -denorm  */
 797      if (b & 0x1)
 798         result = vec_or(vec_and(isdenorm, sign), result);
 799     }
 800     /* 0  */
 801     if (b & 0xC)
 802     {
 803       iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
 804       /* +0  */
 805      if (b & 0x8)
 806         result = vec_or(vec_andc(iszero, sign), result);
 807       /* -0  */
 808      if (b & 0x4)
 809         result = vec_or(vec_and(iszero, sign), result);
 810     }
 811   }
 812   return ((qword)result);
 813 }
 814
 815
 816 /* Carry generate
 817  */
 818 #define si_cg(_a, _b)           ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))
 819
 820 #define si_cgx(_a, _b, _c)      ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)),             \
 821                                                 vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),     \
 822                                                          vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
 823
 824
 825 /* Count ones for bytes
 826  */
 827 static __inline qword si_cntb(qword a)
 828 {
 829   vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
 830   vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
 831   vec_uchar16 av;
 832
 833   av = (vec_uchar16)(a);
 834
 835   return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
 836                           vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
 837 }
 838
/* Count leading zeros for words
 840  */
 841 static __inline qword si_clz(qword a)
 842 {
 843   vec_uchar16 av;
 844   vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
 845   vec_uchar16 four    = vec_splat_u8(4);
 846   vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
 847   vec_uchar16 eight   = vec_splat_u8(8);
 848   vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
 849   vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
 850
 851   av = (vec_uchar16)(a);
 852
 853   cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
 854   cnt_lo = vec_perm(nib_cnt, nib_cnt, av);
 855
 856   cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));
 857
 858   tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
 859   tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
 860   tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));
 861
 862   cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
 863   cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
 864   cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));
 865
 866   return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
 867 }
 868
 869 /* Convert to float
 870  */
 871 #define si_cuflt(_a, _b)        ((qword)(vec_ctf((vec_uint4)(_a), _b)))
 872 #define si_csflt(_a, _b)        ((qword)(vec_ctf((vec_int4)(_a), _b)))
 873
 874 /* Convert to signed int
 875  */
 876 #define si_cflts(_a, _b)        ((qword)(vec_cts((vec_float4)(_a), _b)))
 877
 878 /* Convert to unsigned int
 879  */
 880 #define si_cfltu(_a, _b)        ((qword)(vec_ctu((vec_float4)(_a), _b)))
 881
 882 /* Synchronize
 883  */
 884 #define si_dsync()              /* do nothing */
 885 #define si_sync()               /* do nothing */
 886 #define si_syncc()              /* do nothing */
 887
 888
 889 /* Equivalence
 890  */
 891 static __inline qword si_eqv(qword a, qword b)
 892 {
 893   vec_uchar16 d;
 894
 895   d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
 896   return ((qword)(vec_nor(d, d)));
 897 }
 898
 899 /* Extend
 900  */
 901 static __inline qword si_xsbh(qword a)
 902 {
 903   vec_char16 av;
 904
 905   av = (vec_char16)(a);
 906   return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
 907                                                               0, 0, 0, 0, 0, 0, 0, 0})))));
 908 }
 909
 910 static __inline qword si_xshw(qword a)
 911 {
 912   vec_short8 av;
 913
 914   av = (vec_short8)(a);
 915   return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
 916                                                               10,11,14,15,
 917                                                               0, 0, 0, 0,
 918                                                               0, 0, 0, 0})))));
 919 }
 920
 921 static __inline qword si_xswd(qword a)
 922 {
 923   vec_int4 av;
 924
 925   av = (vec_int4)(a);
 926   return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
 927                            ((vec_uchar16){20, 21, 22, 23,
 928                                            4,  5,  6,  7,
 929                                           28, 29, 30, 31,
 930                                           12, 13, 14, 15}))));
 931 }
 932
 933 static __inline qword si_fesd(qword a)
 934 {
 935   union {
 936     double d[2];
 937     vec_double2 vd;
 938   } out;
 939   union {
 940     float f[4];
 941     vec_float4 vf;
 942   } in;
 943
 944   in.vf = (vec_float4)(a);
 945   out.d[0] = (double)(in.f[0]);
 946   out.d[1] = (double)(in.f[2]);
 947   return ((qword)(out.vd));
 948 }
 949
 950 /* Gather
 951  */
 952 static __inline qword si_gbb(qword a)
 953 {
 954   vec_uchar16 bits;
 955   vec_uint4   bytes;
 956
 957   bits  = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
 958                                                                             7, 6, 5, 4, 3, 2, 1, 0}));
 959   bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));
 960
 961   return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
 962                                                         0, 0, 0, 0, 0, 0, 0, 0}))));
 963 }
 964
 965
 966 static __inline qword si_gbh(qword a)
 967 {
 968   vec_ushort8 bits;
 969   vec_uint4   bytes;
 970
 971   bits  = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));
 972
 973   bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});
 974
 975   return ((qword)(vec_sld(bytes, bytes, 12)));
 976 }
 977
 978 static __inline qword si_gb(qword a)
 979 {
 980   vec_uint4 bits;
 981   vec_uint4 bytes;
 982
 983   bits  = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
 984   bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
 985   return ((qword)(vec_sld(bytes, bytes, 12)));
 986 }
 987
 988
 989 /* Compare and halt
 990  */
 991 static __inline void si_heq(qword a, qword b)
 992 {
 993   union {
 994     vector unsigned int v;
 995     unsigned int i[4];
 996   } aa, bb;
 997
 998   aa.v = (vector unsigned int)(a);
 999   bb.v = (vector unsigned int)(b);
1000
1001   if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
1002 }
1003
1004 static __inline void si_heqi(qword a, unsigned int b)
1005 {
1006   union {
1007     vector unsigned int v;
1008     unsigned int i[4];
1009   } aa;
1010
1011   aa.v = (vector unsigned int)(a);
1012
1013   if (aa.i[0] == b) { SPU_HALT_ACTION; };
1014 }
1015
1016 static __inline void si_hgt(qword a, qword b)
1017 {
1018   union {
1019     vector signed int v;
1020     signed int i[4];
1021   } aa, bb;
1022
1023   aa.v = (vector signed int)(a);
1024   bb.v = (vector signed int)(b);
1025
1026   if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1027 }
1028
1029 static __inline void si_hgti(qword a, signed int b)
1030 {
1031   union {
1032     vector signed int v;
1033     signed int i[4];
1034   } aa;
1035
1036   aa.v = (vector signed int)(a);
1037
1038   if (aa.i[0] > b) { SPU_HALT_ACTION; };
1039 }
1040
1041 static __inline void si_hlgt(qword a, qword b)
1042 {
1043   union {
1044     vector unsigned int v;
1045     unsigned int i[4];
1046   } aa, bb;
1047
1048   aa.v = (vector unsigned int)(a);
1049   bb.v = (vector unsigned int)(b);
1050
1051   if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1052 }
1053
1054 static __inline void si_hlgti(qword a, unsigned int b)
1055 {
1056   union {
1057     vector unsigned int v;
1058     unsigned int i[4];
1059   } aa;
1060
1061   aa.v = (vector unsigned int)(a);
1062
1063   if (aa.i[0] > b) { SPU_HALT_ACTION; };
1064 }
1065
1066
1067 /* Multiply and Add
1068  */
1069 static __inline qword si_mpya(qword a, qword b, qword c)
1070 {
1071   return ((qword)(vec_msum(vec_and((vec_short8)(a),
1072                                    ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
1073                            (vec_short8)(b), (vec_int4)(c))));
1074 }
1075
1076 static __inline qword si_fma(qword a, qword b, qword c)
1077 {
1078   return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1079 }
1080
1081 static __inline qword si_dfma(qword a, qword b, qword c)
1082 {
1083   union {
1084     vec_double2 v;
1085     double d[2];
1086   } aa, bb, cc, dd;
1087
1088   aa.v = (vec_double2)(a);
1089   bb.v = (vec_double2)(b);
1090   cc.v = (vec_double2)(c);
1091   dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
1092   dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
1093   return ((qword)(dd.v));
1094 }
1095
1096 /* Form Mask
1097  */
/* Immediate form of si_fsmb: wrap the integer _a in a quadword with
   si_from_int and expand it with si_fsmb.  */
#define si_fsmbi(_a) si_fsmb(si_from_int(_a))
1099
1100 static __inline qword si_fsmb(qword a)
1101 {
1102   vec_char16 mask;
1103   vec_ushort8 in;
1104
1105   in = (vec_ushort8)(a);
1106   mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
1107                                                       3, 3, 3, 3, 3, 3, 3, 3})));
1108   return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
1109                                                       0, 1, 2, 3, 4, 5, 6, 7})),
1110                           vec_splat_u8(7))));
1111 }
1112
1113
1114 static __inline qword si_fsmh(qword a)
1115 {
1116   vec_uchar16 in;
1117   vec_short8 mask;
1118
1119   in = (vec_uchar16)(a);
1120   mask = (vec_short8)(vec_splat(in, 3));
1121   return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
1122                           vec_splat_u16(15))));
1123 }
1124
1125 static __inline qword si_fsm(qword a)
1126 {
1127   vec_uchar16 in;
1128   vec_int4 mask;
1129
1130   in = (vec_uchar16)(a);
1131   mask = (vec_int4)(vec_splat(in, 3));
1132   return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
1133                           ((vec_uint4){31,31,31,31}))));
1134 }
1135
1136 /* Move from/to registers
1137  */
/* Register reads expand to an all-zero quadword; register writes expand
   to nothing.  */
#define si_fscrrd() ((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg) ((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
1143
1144 /* Multiply High High Add
1145  */
1146 static __inline qword si_mpyhha(qword a, qword b, qword c)
1147 {
1148   return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
1149 }
1150
1151 static __inline qword si_mpyhhau(qword a, qword b, qword c)
1152 {
1153   return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
1154 }
1155
1156 /* Multiply Subtract
1157  */
1158 static __inline qword si_fms(qword a, qword b, qword c)
1159 {
1160   return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
1161                            vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
1162 }
1163
1164 static __inline qword si_dfms(qword a, qword b, qword c)
1165 {
1166   union {
1167     vec_double2 v;
1168     double d[2];
1169   } aa, bb, cc, dd;
1170
1171   aa.v = (vec_double2)(a);
1172   bb.v = (vec_double2)(b);
1173   cc.v = (vec_double2)(c);
1174   dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
1175   dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
1176   return ((qword)(dd.v));
1177 }
1178
1179 /* Multiply
1180  */
1181 static __inline qword si_fm(qword a, qword b)
1182 {
1183   return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
1184 }
1185
1186 static __inline qword si_dfm(qword a, qword b)
1187 {
1188   union {
1189     vec_double2 v;
1190     double d[2];
1191   } aa, bb, dd;
1192
1193   aa.v = (vec_double2)(a);
1194   bb.v = (vec_double2)(b);
1195   dd.d[0] = aa.d[0] * bb.d[0];
1196   dd.d[1] = aa.d[1] * bb.d[1];
1197   return ((qword)(dd.v));
1198 }
1199
1200 /* Multiply High
1201  */
1202 static __inline qword si_mpyh(qword a, qword b)
1203 {
1204   vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};
1205
1206   return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
1207 }
1208
1209
1210 /* Multiply High High
1211  */
1212 static __inline qword si_mpyhh(qword a, qword b)
1213 {
1214   return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
1215 }
1216
1217 static __inline qword si_mpyhhu(qword a, qword b)
1218 {
1219   return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
1220 }
1221
1222 /* Multiply Odd
1223  */
1224 static __inline qword si_mpy(qword a, qword b)
1225 {
1226   return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
1227 }
1228
1229 static __inline qword si_mpyu(qword a, qword b)
1230 {
1231   return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
1232 }
1233
1234 static __inline qword si_mpyi(qword a, short b)
1235 {
1236   return ((qword)(vec_mulo((vec_short8)(a),
1237                            vec_splat((vec_short8)(si_from_short(b)), 1))));
1238 }
1239
1240 static __inline qword si_mpyui(qword a, unsigned short b)
1241 {
1242   return ((qword)(vec_mulo((vec_ushort8)(a),
1243                            vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1244 }
1245
1246 /* Multiply and Shift Right
1247  */
1248 static __inline qword si_mpys(qword a, qword b)
1249 {
1250   return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
1251 }
1252
1253 /* Nand
1254  */
1255 static __inline qword si_nand(qword a, qword b)
1256 {
1257   vec_uchar16 d;
1258
1259   d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
1260   return ((qword)(vec_nor(d, d)));
1261 }
1262
1263 /* Negative Multiply Add
1264  */
1265 static __inline qword si_dfnma(qword a, qword b, qword c)
1266 {
1267   union {
1268     vec_double2 v;
1269     double d[2];
1270   } aa, bb, cc, dd;
1271
1272   aa.v = (vec_double2)(a);
1273   bb.v = (vec_double2)(b);
1274   cc.v = (vec_double2)(c);
1275   dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
1276   dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
1277   return ((qword)(dd.v));
1278 }
1279
1280 /* Negative Multiply and Subtract
1281  */
1282 static __inline qword si_fnms(qword a, qword b, qword c)
1283 {
1284   return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1285 }
1286
1287 static __inline qword si_dfnms(qword a, qword b, qword c)
1288 {
1289   union {
1290     vec_double2 v;
1291     double d[2];
1292   } aa, bb, cc, dd;
1293
1294   aa.v = (vec_double2)(a);
1295   bb.v = (vec_double2)(b);
1296   cc.v = (vec_double2)(c);
1297   dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
1298   dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
1299   return ((qword)(dd.v));
1300 }
1301
1302 /* Nor
1303  */
1304 static __inline qword si_nor(qword a, qword b)
1305 {
1306   return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
1307 }
1308
1309 /* Or
1310  */
1311 static __inline qword si_or(qword a, qword b)
1312 {
1313   return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
1314 }
1315
1316 static __inline qword si_orbi(qword a, unsigned char b)
1317 {
1318   return ((qword)(vec_or((vec_uchar16)(a),
1319                          vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1320 }
1321
1322 static __inline qword si_orhi(qword a, unsigned short b)
1323 {
1324   return ((qword)(vec_or((vec_ushort8)(a),
1325                           vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1326 }
1327
1328 static __inline qword si_ori(qword a, unsigned int b)
1329 {
1330   return ((qword)(vec_or((vec_uint4)(a),
1331                           vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1332 }
1333
1334 /* Or Complement
1335  */
1336 static __inline qword si_orc(qword a, qword b)
1337 {
1338   return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
1339 }
1340
1341
1342 /* Or Across
1343  */
1344 static __inline qword si_orx(qword a)
1345 {
1346   vec_uchar16 tmp;
1347   tmp = (vec_uchar16)(a);
1348   tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
1349   tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
1350   return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
1351                                               0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
1352 }
1353
1354
1355 /* Estimates
1356  */
1357 static __inline qword si_frest(qword a)
1358 {
1359   return ((qword)(vec_re((vec_float4)(a))));
1360 }
1361
1362 static __inline qword si_frsqest(qword a)
1363 {
1364   return ((qword)(vec_rsqrte((vec_float4)(a))));
1365 }
1366
/* Expands to its second argument unchanged; the first argument is
   ignored.  */
#define si_fi(_a, _d) (_d)
1368
1369 /* Channel Read and Write
1370  */
/* Channel access is not mappable: reads and channel counts expand to an
   all-zero quadword, writes expand to nothing.  */
#define si_rdch(_channel) ((qword)(vec_splat_u8(0)))
#define si_rchcnt(_channel) ((qword)(vec_splat_u8(0)))
#define si_wrch(_channel, _a)
1374
1375 /* Rotate Left
1376  */
1377 static __inline qword si_roth(qword a, qword b)
1378 {
1379   return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
1380 }
1381
1382 static __inline qword si_rot(qword a, qword b)
1383 {
1384   return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
1385 }
1386
1387 static __inline qword si_rothi(qword a, int b)
1388 {
1389   return ((qword)(vec_rl((vec_ushort8)(a),
1390                          vec_splat((vec_ushort8)(si_from_int(b)), 1))));
1391 }
1392
1393 static __inline qword si_roti(qword a, int b)
1394 {
1395   return ((qword)(vec_rl((vec_uint4)(a),
1396                          vec_splat((vec_uint4)(si_from_int(b)), 0))));
1397 }
1398
1399 /* Rotate Left with Mask
1400  */
1401 static __inline qword si_rothm(qword a, qword b)
1402 {
1403   vec_ushort8 neg_b;
1404   vec_ushort8 mask;
1405
1406   neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1407   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1408   return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1409 }
1410
1411 static __inline qword si_rotm(qword a, qword b)
1412 {
1413   vec_uint4 neg_b;
1414   vec_uint4 mask;
1415
1416   neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1417   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1418   return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1419 }
1420
1421 static __inline qword si_rothmi(qword a, int b)
1422 {
1423   vec_ushort8 neg_b;
1424   vec_ushort8 mask;
1425
1426   neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1427   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1428   return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1429 }
1430
1431 static __inline qword si_rotmi(qword a, int b)
1432 {
1433   vec_uint4 neg_b;
1434   vec_uint4 mask;
1435
1436   neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1437   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1438   return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1439 }
1440
1441
1442 /* Rotate Left Algebraic with Mask
1443  */
1444 static __inline qword si_rotmah(qword a, qword b)
1445 {
1446   vec_ushort8 neg_b;
1447   vec_ushort8 mask;
1448
1449   neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1450   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1451   return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1452 }
1453
1454 static __inline qword si_rotma(qword a, qword b)
1455 {
1456   vec_uint4 neg_b;
1457   vec_uint4 mask;
1458
1459   neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1460   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1461   return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1462 }
1463
1464
1465 static __inline qword si_rotmahi(qword a, int b)
1466 {
1467   vec_ushort8 neg_b;
1468   vec_ushort8 mask;
1469
1470   neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1471   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1472   return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1473 }
1474
1475 static __inline qword si_rotmai(qword a, int b)
1476 {
1477   vec_uint4 neg_b;
1478   vec_uint4 mask;
1479
1480   neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1481   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1482   return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1483 }
1484
1485
1486 /* Rotate Left Quadword by Bytes with Mask
1487  */
1488 static __inline qword si_rotqmbyi(qword a, int count)
1489 {
1490   union {
1491     vec_uchar16 v;
1492     int i[4];
1493   } x;
1494   vec_uchar16 mask;
1495
1496   count = 0 - count;
1497   x.i[3] = count << 3;
1498   mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1499
1500   return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1501 }
1502
1503
1504 static __inline qword si_rotqmby(qword a, qword count)
1505 {
1506   union {
1507     vec_uchar16 v;
1508     int i[4];
1509   } x;
1510   int cnt;
1511   vec_uchar16 mask;
1512
1513   x.v = (vec_uchar16)(count);
1514   x.i[0] = cnt = (0 - x.i[0]) << 3;
1515
1516   x.v = vec_splat(x.v, 3);
1517   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1518
1519   return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1520 }
1521
1522
1523 /* Rotate Left Quadword by Bytes
1524  */
1525 static __inline qword si_rotqbyi(qword a, int count)
1526 {
1527   union {
1528     vec_uchar16 v;
1529     int i[4];
1530   } left, right;
1531
1532   count <<= 3;
1533   left.i[3] = count;
1534   right.i[3] = 0 - count;
1535   return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
1536 }
1537
1538 static __inline qword si_rotqby(qword a, qword count)
1539 {
1540   vec_uchar16 left, right;
1541
1542   left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1543   right = vec_sub(vec_splat_u8(0), left);
1544   return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
1545 }
1546
1547 /* Rotate Left Quadword by Bytes Bit Count
1548  */
1549 static __inline qword si_rotqbybi(qword a, qword count)
1550 {
1551   vec_uchar16 left, right;
1552
1553   left = vec_splat((vec_uchar16)(count), 3);
1554   right = vec_sub(vec_splat_u8(7), left);
1555   return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
1556 }
1557
1558
1559 /* Rotate Left Quadword by Bytes Bit Count
1560  */
1561 static __inline qword si_rotqbii(qword a, int count)
1562 {
1563   vec_uchar16 x, y;
1564   vec_uchar16 result;
1565
1566   x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
1567   y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1568                            (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1569   result = vec_or(vec_sll((qword)(a), x), y);
1570   return ((qword)(result));
1571 }
1572
1573 static __inline qword si_rotqbi(qword a, qword count)
1574 {
1575   vec_uchar16 x, y;
1576   vec_uchar16 result;
1577
1578   x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
1579   y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1580                            (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1581
1582   result = vec_or(vec_sll((qword)(a), x), y);
1583   return ((qword)(result));
1584 }
1585
1586
1587 /* Rotate Left Quadword and Mask by Bits
1588  */
1589 static __inline qword si_rotqmbii(qword a, int count)
1590 {
1591   return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
1592 }
1593
1594 static __inline qword si_rotqmbi(qword a, qword count)
1595 {
1596   return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
1597 }
1598
1599
1600 /* Rotate Left Quadword and Mask by Bytes with Bit Count
1601  */
1602 static __inline qword si_rotqmbybi(qword a, qword count)
1603 {
1604   union {
1605     vec_uchar16 v;
1606     int i[4];
1607   } x;
1608   int cnt;
1609   vec_uchar16 mask;
1610
1611   x.v = (vec_uchar16)(count);
1612   x.i[0] = cnt = 0 - (x.i[0] & ~7);
1613   x.v = vec_splat(x.v, 3);
1614   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1615
1616   return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1617 }
1618
1619
1620
1621
1622 /* Round Double to Float
1623  */
1624 static __inline qword si_frds(qword a)
1625 {
1626   union {
1627     vec_float4 v;
1628     float f[4];
1629   } d;
1630   union {
1631     vec_double2 v;
1632     double d[2];
1633   } in;
1634
1635   in.v = (vec_double2)(a);
1636   d.v = (vec_float4){0.0f};
1637   d.f[0] = (float)in.d[0];
1638   d.f[2] = (float)in.d[1];
1639
1640   return ((qword)(d.v));
1641 }
1642
1643 /* Select Bits
1644  */
1645 static __inline qword si_selb(qword a, qword b, qword c)
1646 {
1647   return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
1648 }
1649
1650
1651 /* Shuffle Bytes
1652  */
1653 static __inline qword si_shufb(qword a, qword b, qword pattern)
1654 {
1655   vec_uchar16 pat;
1656
1657   pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
1658                 vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
1659                 vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
1660   return ((qword)(vec_perm(vec_perm(a, b, pattern),
1661                            ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
1662                                           0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
1663                            pat)));
1664 }
1665
1666
1667 /* Shift Left
1668  */
1669 static __inline qword si_shlh(qword a, qword b)
1670 {
1671   vec_ushort8 mask;
1672
1673   mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
1674   return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
1675 }
1676
1677 static __inline qword si_shl(qword a, qword b)
1678 {
1679   vec_uint4 mask;
1680
1681   mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1682   return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
1683 }
1684
1685
1686 static __inline qword si_shlhi(qword a, unsigned int b)
1687 {
1688   vec_ushort8 mask;
1689   vec_ushort8 bv;
1690
1691   bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
1692   mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
1693   return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
1694 }
1695
1696 static __inline qword si_shli(qword a, unsigned int b)
1697 {
1698   vec_uint4 bv;
1699   vec_uint4 mask;
1700
1701   bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
1702   mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1703   return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
1704 }
1705
1706
1707 /* Shift Left Quadword
1708  */
1709 static __inline qword si_shlqbii(qword a, unsigned int count)
1710 {
1711   vec_uchar16 x;
1712
1713   x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
1714   return ((qword)(vec_sll((vec_uchar16)(a), x)));
1715 }
1716
1717 static __inline qword si_shlqbi(qword a, qword count)
1718 {
1719   vec_uchar16 x;
1720
1721   x = vec_splat((vec_uchar16)(count), 3);
1722   return ((qword)(vec_sll((vec_uchar16)(a), x)));
1723 }
1724
1725
1726 /* Shift Left Quadword by Bytes
1727  */
1728 static __inline qword si_shlqbyi(qword a, unsigned int count)
1729 {
1730   union {
1731     vec_uchar16 v;
1732     int i[4];
1733   } x;
1734   vec_uchar16 mask;
1735
1736   x.i[3] = count << 3;
1737   mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1738   return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1739 }
1740
1741 static __inline qword si_shlqby(qword a, qword count)
1742 {
1743   union {
1744     vec_uchar16 v;
1745     unsigned int i[4];
1746   } x;
1747   unsigned int cnt;
1748   vec_uchar16 mask;
1749
1750   x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1751   cnt = x.i[0];
1752   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1753   return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1754 }
1755
1756 /* Shift Left Quadword by Bytes with Bit Count
1757  */
1758 static __inline qword si_shlqbybi(qword a, qword count)
1759 {
1760   union {
1761     vec_uchar16 v;
1762     int i[4];
1763   } x;
1764   unsigned int cnt;
1765   vec_uchar16 mask;
1766
1767   x.v = vec_splat((vec_uchar16)(count), 3);
1768   cnt = x.i[0];
1769   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1770   return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1771 }
1772
1773
1774 /* Stop and Signal
1775  */
/* Both stop forms ignore their arguments and expand to
   SPU_STOP_ACTION.  */
#define si_stop(_type) SPU_STOP_ACTION
#define si_stopd(a, b, c) SPU_STOP_ACTION
1778
1779
1780 /* Subtract
1781  */
1782 static __inline qword si_sfh(qword a, qword b)
1783 {
1784   return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
1785 }
1786
1787 static __inline qword si_sf(qword a, qword b)
1788 {
1789   return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
1790 }
1791
1792 static __inline qword si_fs(qword a, qword b)
1793 {
1794   return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
1795 }
1796
1797 static __inline qword si_dfs(qword a, qword b)
1798 {
1799   union {
1800     vec_double2 v;
1801     double d[2];
1802   } aa, bb, dd;
1803
1804   aa.v = (vec_double2)(a);
1805   bb.v = (vec_double2)(b);
1806   dd.d[0] = aa.d[0] - bb.d[0];
1807   dd.d[1] = aa.d[1] - bb.d[1];
1808   return ((qword)(dd.v));
1809 }
1810
1811 static __inline qword si_sfhi(qword a, short b)
1812 {
1813   return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
1814                           (vec_short8)(a))));
1815 }
1816
1817 static __inline qword si_sfi(qword a, int b)
1818 {
1819   return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
1820                           (vec_int4)(a))));
1821 }
1822
1823 /* Subtract word extended
1824  */
/* Per word: _b + ~_a + (_c & 1).  The complement of _a is formed as
   the NOR of _a with itself.  Note that _a is evaluated twice.  */
#define si_sfx(_a, _b, _c)                                              \
  ((qword)(vec_add(vec_add((vec_uint4)(_b),                             \
                           vec_nor((vec_uint4)(_a), (vec_uint4)(_a))),  \
                   vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
1828
1829
1830 /* Sum Bytes into Shorts
1831  */
1832 static __inline qword si_sumb(qword a, qword b)
1833 {
1834   vec_uint4 zero = (vec_uint4){0};
1835   vec_ushort8 sum_a, sum_b;
1836
1837   sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
1838   sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);
1839
1840   return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
1841                                                         26, 27, 10, 11, 30, 31, 14, 15}))));
1842 }
1843
1844 /* Exclusive OR
1845  */
1846 static __inline qword si_xor(qword a, qword b)
1847 {
1848   return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
1849 }
1850
1851 static __inline qword si_xorbi(qword a, unsigned char b)
1852 {
1853   return ((qword)(vec_xor((vec_uchar16)(a),
1854                           vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1855 }
1856
1857 static __inline qword si_xorhi(qword a, unsigned short b)
1858 {
1859   return ((qword)(vec_xor((vec_ushort8)(a),
1860                           vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1861 }
1862
1863 static __inline qword si_xori(qword a, unsigned int b)
1864 {
1865   return ((qword)(vec_xor((vec_uint4)(a),
1866                           vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1867 }
1868
1869
1870 /* Generate Controls for Sub-Quadword Insertion
1871  */
1872 static __inline qword si_cbd(qword a, int imm)
1873 {
1874   union {
1875     vec_uint4 v;
1876     unsigned char c[16];
1877   } shmask;
1878
1879   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1880   shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
1881   return ((qword)(shmask.v));
1882 }
1883
1884 static __inline qword si_cdd(qword a, int imm)
1885 {
1886   union {
1887     vec_uint4 v;
1888     unsigned long long ll[2];
1889   } shmask;
1890
1891   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1892   shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
1893   return ((qword)(shmask.v));
1894 }
1895
1896 static __inline qword si_chd(qword a, int imm)
1897 {
1898   union {
1899     vec_uint4 v;
1900     unsigned short s[8];
1901   } shmask;
1902
1903   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1904   shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
1905   return ((qword)(shmask.v));
1906 }
1907
1908 static __inline qword si_cwd(qword a, int imm)
1909 {
1910   union {
1911     vec_uint4 v;
1912     unsigned int i[4];
1913   } shmask;
1914
1915   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1916   shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
1917   return ((qword)(shmask.v));
1918 }
1919
1920 static __inline qword si_cbx(qword a, qword b)
1921 {
1922   union {
1923     vec_uint4 v;
1924     unsigned char c[16];
1925   } shmask;
1926
1927   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1928   shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
1929   return ((qword)(shmask.v));
1930 }
1931
1932
1933 static __inline qword si_cdx(qword a, qword b)
1934 {
1935   union {
1936     vec_uint4 v;
1937     unsigned long long ll[2];
1938   } shmask;
1939
1940   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1941   shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
1942   return ((qword)(shmask.v));
1943 }
1944
1945 static __inline qword si_chx(qword a, qword b)
1946 {
1947   union {
1948     vec_uint4 v;
1949     unsigned short s[8];
1950   } shmask;
1951
1952   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1953   shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
1954   return ((qword)(shmask.v));
1955 }
1956
1957 static __inline qword si_cwx(qword a, qword b)
1958 {
1959   union {
1960     vec_uint4 v;
1961     unsigned int i[4];
1962   } shmask;
1963
1964   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1965   shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
1966   return ((qword)(shmask.v));
1967 }
1968
1969
1970 /* Constant Formation
1971  */
1972 static __inline qword si_il(signed short imm)
1973 {
1974   return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
1975 }
1976
1977
1978 static __inline qword si_ila(unsigned int imm)
1979 {
1980   return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
1981 }
1982
1983 static __inline qword si_ilh(signed short imm)
1984 {
1985   return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
1986 }
1987
1988 static __inline qword si_ilhu(signed short imm)
1989 {
1990   return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
1991 }
1992
1993 static __inline qword si_iohl(qword a, unsigned short imm)
1994 {
1995   return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
1996 }
1997
1998 /* No Operation
1999  */
/* Both no-operation forms expand to nothing.  */
#define si_lnop()
#define si_nop()
2002
2003
2004 /* Memory Load and Store
2005  */
2006 static __inline qword si_lqa(unsigned int imm)
2007 {
2008   return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2009 }
2010
2011 static __inline qword si_lqd(qword a, unsigned int imm)
2012 {
2013   return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
2014 }
2015
2016 static __inline qword si_lqr(unsigned int imm)
2017 {
2018   return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2019 }
2020
2021 static __inline qword si_lqx(qword a, qword b)
2022 {
2023   return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
2024 }
2025
2026 static __inline void si_stqa(qword a, unsigned int imm)
2027 {
2028   vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2029 }
2030
2031 static __inline void si_stqd(qword a, qword b, unsigned int imm)
2032 {
2033   vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
2034 }
2035
2036 static __inline void si_stqr(qword a, unsigned int imm)
2037 {
2038   vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2039 }
2040
2041 static __inline void si_stqx(qword a, qword b, qword c)
2042 {
2043   vec_st((vec_uchar16)(a),
2044          si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
2045          (vector unsigned char *)(0));
2046 }
2047
2048 #endif /* !__SPU__ */
2049 #endif /* !_SI2VMX_H_ */
2050