gcc/config/spu/vmx2spu.h

   1 /* Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.
   2
   3    This file is free software; you can redistribute it and/or modify it under
   4    the terms of the GNU General Public License as published by the Free
   5    Software Foundation; either version 3 of the License, or (at your option)
   6    any later version.
   7
   8    This file is distributed in the hope that it will be useful, but WITHOUT
   9    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  11    for more details.
  12
  13    Under Section 7 of GPL version 3, you are granted additional
  14    permissions described in the GCC Runtime Library Exception, version
  15    3.1, as published by the Free Software Foundation.
  16
  17    You should have received a copy of the GNU General Public License and
  18    a copy of the GCC Runtime Library Exception along with this program;
  19    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  20    <http://www.gnu.org/licenses/>.  */
  21
  22 #ifndef _VMX2SPU_H_
  23 #define _VMX2SPU_H_     1
  24
  25 #ifdef __cplusplus
  26
  27 #ifdef __SPU__
  28
  29 #include <spu_intrinsics.h>
  30 #include <vec_types.h>
  31
  32 /* This file maps generic VMX intrinsics and predicates to the SPU using
  33  * overloaded C++ functions.
  34  */
  35
  36 /************************************************************************
  37  *                        INTRINSICS
  38  ************************************************************************/
  39
  40 /* vec_abs (vector absolute value)
  41  * =======
  42  */
  43 static inline vec_char16 vec_abs(vec_char16 a)
  44 {
  45   vec_char16 minus_a;
  46
  47   minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  48   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
  49 }
  50
  51 static inline vec_short8 vec_abs(vec_short8 a)
  52 {
  53   return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
  54 }
  55
  56 static inline vec_int4 vec_abs(vec_int4 a)
  57 {
  58   return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
  59 }
  60
  61 static inline vec_float4 vec_abs(vec_float4 a)
  62 {
  63   return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
  64 }
  65
  66 /* vec_abss (vector absolute value saturate)
  67  * ========
  68  */
  69 static inline vec_char16 vec_abss(vec_char16 a)
  70 {
  71   vec_char16 minus_a;
  72
  73   minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
  74                                 (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  75   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
  76 }
  77
  78 static inline vec_short8 vec_abss(vec_short8 a)
  79 {
  80   vec_short8 minus_a;
  81
  82   minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  83   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
  84 }
  85
  86 static inline vec_int4 vec_abss(vec_int4 a)
  87 {
  88   vec_int4 minus_a;
  89
  90   minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  91   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
  92 }
  93
  94
  95 /* vec_add (vector add)
  96  * =======
  97  */
  98 static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
  99 {
 100   return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
 101                                 spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
 102                                 spu_splats((unsigned short)(0xFF00)))));
 103 }
 104
 105 static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
 106 {
 107   return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
 108 }
 109
 110 static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
 111 {
 112   return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
 113 }
 114
 115 static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
 116 {
 117   return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
 118 }
 119
 120 static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
 121 {
 122   return (spu_add(a, b));
 123 }
 124
 125 static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
 126 {
 127   return (spu_add(a, b));
 128 }
 129
 130 static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
 131 {
 132   return (spu_add((vec_short8)(a), b));
 133 }
 134
 135 static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
 136 {
 137   return (spu_add(a, (vec_short8)(b)));
 138 }
 139
 140 static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
 141 {
 142   return (spu_add(a, b));
 143 }
 144
 145 static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
 146 {
 147   return (spu_add(a, b));
 148 }
 149
 150 static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
 151 {
 152   return (spu_add((vec_int4)(a), b));
 153 }
 154
 155 static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
 156 {
 157   return (spu_add(a, (vec_int4)(b)));
 158 }
 159
 160 static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
 161 {
 162   return (spu_add(a, b));
 163 }
 164
 165 /* vec_addc (vector add carryout unsigned word)
 166  * ========
 167  */
 168 #define vec_addc(_a, _b)        spu_genc(_a, _b)
 169
 170 /* vec_adds (vector add saturated)
 171  * ========
 172  */
 173 static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
 174 {
 175   vec_uchar16 s1, s2, s, d;
 176
 177   s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
 178   s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
 179   s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
 180                                           8, 24, 10, 26, 12, 28, 14, 30}));
 181   d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
 182                                           9, 25, 11, 27, 13, 29, 15, 31}));
 183   return (spu_or(d, spu_cmpeq(s, 1)));
 184 }
 185
 186 static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
 187 {
 188   vec_uchar16 s1, s2, s, d;
 189
 190   s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
 191   s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
 192   s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
 193                                           9, 25, 11, 27, 13, 29, 15, 31}));
 194   d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
 195   d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
 196   return ((vec_char16)(d));
 197 }
 198
 199 static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
 200 {
 201   return (vec_adds((vec_char16)(a), b));
 202 }
 203
 204 static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
 205 {
 206   return (vec_adds(a, (vec_char16)(b)));
 207 }
 208
 209 static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
 210 {
 211   vec_ushort8 s, d;
 212
 213   s = spu_add(a, b);
 214   d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
 215   return (d);
 216 }
 217
 218 static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
 219 {
 220   vec_short8 s, d;
 221
 222   s = spu_add(a, b);
 223   d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
 224   d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
 225   return (d);
 226 }
 227
 228 static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
 229 {
 230   return (vec_adds((vec_short8)(a), b));
 231 }
 232
 233 static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
 234 {
 235   return (vec_adds(a, (vec_short8)(b)));
 236 }
 237
 238 static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
 239 {
 240   return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
 241 }
 242
 243 static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
 244 {
 245   vec_int4 s, d;
 246
 247   s = spu_add(a, b);
 248   d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
 249   d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
 250   return (d);
 251 }
 252
 253 static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
 254 {
 255   return (vec_adds((vec_int4)(a), b));
 256 }
 257
 258 static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
 259 {
 260   return (vec_adds(a, (vec_int4)(b)));
 261 }
 262
 263 /* vec_and (vector logical and)
 264  * =======
 265  */
 266 static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
 267 {
 268   return (spu_and(a, b));
 269 }
 270
 271 static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
 272 {
 273   return (spu_and(a, b));
 274 }
 275
 276 static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
 277 {
 278   return (spu_and((vec_char16)(a), b));
 279 }
 280
 281 static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
 282 {
 283   return (spu_and(a, (vec_char16)(b)));
 284 }
 285
 286 static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
 287 {
 288   return (spu_and(a, b));
 289 }
 290
 291 static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
 292 {
 293   return (spu_and(a, b));
 294 }
 295
 296 static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
 297 {
 298   return (spu_and((vec_short8)(a), b));
 299 }
 300
 301 static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
 302 {
 303   return (spu_and(a, (vec_short8)(b)));
 304 }
 305
 306 static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
 307 {
 308   return (spu_and(a, b));
 309 }
 310
 311 static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
 312 {
 313   return (spu_and(a, b));
 314 }
 315
 316 static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
 317 {
 318   return (spu_and((vec_int4)(a), b));
 319 }
 320
 321 static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
 322 {
 323   return (spu_and(a, (vec_int4)(b)));
 324 }
 325
 326 static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
 327 {
 328   return (spu_and(a, b));
 329 }
 330
 331 static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
 332 {
 333   return (spu_and((vec_float4)(a),b));
 334 }
 335
 336 static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
 337 {
 338   return (spu_and(a, (vec_float4)(b)));
 339 }
 340
 341
 342 /* vec_andc (vector logical and with complement)
 343  * ========
 344  */
 345 static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
 346 {
 347   return (spu_andc(a, b));
 348 }
 349
 350 static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
 351 {
 352   return (spu_andc(a, b));
 353 }
 354
 355 static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
 356 {
 357   return (spu_andc((vec_char16)(a), b));
 358 }
 359
 360 static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
 361 {
 362   return (spu_andc(a, (vec_char16)(b)));
 363 }
 364
 365 static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
 366 {
 367   return (spu_andc(a, b));
 368 }
 369
 370 static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
 371 {
 372   return (spu_andc(a, b));
 373 }
 374
 375 static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
 376 {
 377   return (spu_andc((vec_short8)(a), b));
 378 }
 379
 380 static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
 381 {
 382   return (spu_andc(a, (vec_short8)(b)));
 383 }
 384
 385 static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
 386 {
 387   return (spu_andc(a, b));
 388 }
 389
 390 static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
 391 {
 392   return (spu_andc(a, b));
 393 }
 394
 395 static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
 396 {
 397   return (spu_andc((vec_int4)(a), b));
 398 }
 399
 400 static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
 401 {
 402   return (spu_andc(a, (vec_int4)(b)));
 403 }
 404
 405 static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
 406 {
 407   return (spu_andc(a,b));
 408 }
 409
 410 static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
 411 {
 412   return (spu_andc((vec_float4)(a),b));
 413 }
 414
 415 static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
 416 {
 417   return (spu_andc(a, (vec_float4)(b)));
 418 }
 419
 420 /* vec_avg (vector average)
 421  * =======
 422  */
 423 static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
 424 {
 425   return (spu_avg(a, b));
 426 }
 427
 428 static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
 429 {
 430   return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
 431                                (vec_uchar16)(spu_and(spu_xor(a,b), 0x80)))));
 432 }
 433
 434 static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
 435 {
 436   return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
 437                   spu_and(spu_or(a, b), 1)));
 438 }
 439
 440 static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
 441 {
 442   return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
 443                   spu_and(spu_or(a, b), 1)));
 444 }
 445
 446 static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
 447 {
 448   return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
 449                   spu_and(spu_or(a, b), 1)));
 450 }
 451
 452 static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
 453 {
 454   return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
 455                   spu_and(spu_or(a, b), 1)));
 456 }
 457
 458
 459 /* vec_ceil (vector ceiling)
 460  * ========
 461  */
 462 static inline vec_float4 vec_ceil(vec_float4 a)
 463 {
 464   vec_int4  exp;
 465   vec_uint4 mask;
 466
 467   a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
 468   exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
 469   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
 470   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
 471   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
 472
 473   return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
 474 }
 475
 476
 477 /* vec_cmpb (vector compare bounds floating-point)
 478  * ========
 479  */
 480 static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
 481 {
 482   vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
 483   vec_int4 b1 = (vec_int4)spu_splats(0x40000000);
 484
 485   return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
 486                  spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
 487 }
 488
 489 /* vec_cmpeq (vector compare equal)
 490  * =========
 491  */
 492 #define vec_cmpeq(_a, _b)       spu_cmpeq(_a, _b)
 493
 494
 495 /* vec_cmpge (vector compare greater than or equal)
 496  * =========
 497  */
 498 static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
 499 {
 500   return (spu_xor(spu_cmpgt(b, a), -1));
 501 }
 502
 503
 504 /* vec_cmpgt (vector compare greater than)
 505  * =========
 506  */
 507 #define vec_cmpgt(_a, _b)       spu_cmpgt(_a, _b)
 508
 509
 510 /* vec_cmple (vector compare less than or equal)
 511  * =========
 512  */
 513 static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
 514 {
 515   return (spu_xor(spu_cmpgt(a, b), -1));
 516 }
 517
 518
 519 /* vec_cmplt (vector compare less than)
 520  * =========
 521  */
 522 #define vec_cmplt(_a, _b)       spu_cmpgt(_b, _a)
 523
 524
 525 /* vec_ctf (vector convert from fixed-point word)
 526  * =======
 527  */
 528 #define vec_ctf(_a, _b)         spu_convtf(_a, _b)
 529
 530
 531 /* vec_cts (vector convert to signed fixed-point word saturate)
 532  * =======
 533  */
 534 #define vec_cts(_a, _b)         spu_convts(_a, _b)
 535
 536
 537 /* vec_ctu (vector convert to unsigned fixed-point word saturate)
 538  * =======
 539  */
 540 #define vec_ctu(_a, _b)         spu_convtu(_a, _b)
 541
 542
 543 /* vec_dss (vector data stream stop)
 544  * =======
 545  */
 546 #define vec_dss(_a)
 547
 548
 549 /* vec_dssall (vector data stream stop all)
 550  * ==========
 551  */
/* Expands to nothing. */
#define vec_dssall()
 553
 554
 555 /* vec_dst (vector data stream touch)
 556  * =======
 557  */
 558 #define vec_dst(_a, _b, _c)
 559
 560
 561 /* vec_dstst (vector data stream touch for store)
 562  * =========
 563  */
 564 #define vec_dstst(_a, _b, _c)
 565
 566
 567 /* vec_dststt (vector data stream touch for store transient)
 568  * ==========
 569  */
 570 #define vec_dststt(_a, _b, _c)
 571
 572
 573 /* vec_dstt (vector data stream touch transient)
 574  * ========
 575  */
 576 #define vec_dstt(_a, _b, _c)
 577
 578
/* vec_expte (vector is 2 raised to the exponent estimate floating-point)
 580  * =========
 581  */
 582 static inline vec_float4 vec_expte(vec_float4 a)
 583 {
 584   vec_float4 bias, frac, exp;
 585   vec_int4 ia;
 586
 587   bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
 588   ia   = spu_convts(spu_add(a, bias), 0);
 589   frac = spu_sub(spu_convtf(ia, 0), a);
 590   exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));
 591
 592   return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
 593                            frac, spu_splats(1.0f)), exp));
 594 }
 595
 596
 597 /* vec_floor (vector floor)
 598  * =========
 599  */
 600 static inline vec_float4 vec_floor(vec_float4 a)
 601 {
 602   vec_int4  exp;
 603   vec_uint4 mask;
 604
 605   a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
 606   exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
 607   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
 608   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
 609   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
 610
 611   return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
 612 }
 613
 614
 615 /* vec_ld (vector load indexed)
 616  * ======
 617  */
 618 static inline vec_uchar16 vec_ld(int a, unsigned char *b)
 619 {
 620   return (*((vec_uchar16 *)(b+a)));
 621 }
 622
 623 static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
 624 {
 625   return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
 626 }
 627
 628 static inline vec_char16 vec_ld(int a, signed char *b)
 629 {
 630   return (*((vec_char16 *)(b+a)));
 631 }
 632
 633 static inline vec_char16 vec_ld(int a, vec_char16 *b)
 634 {
 635   return (*((vec_char16 *)((signed char *)(b)+a)));
 636 }
 637
 638 static inline vec_ushort8 vec_ld(int a, unsigned short *b)
 639 {
 640   return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
 641 }
 642
 643 static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
 644 {
 645   return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
 646 }
 647
 648 static inline vec_short8 vec_ld(int a, signed short *b)
 649 {
 650   return (*((vec_short8 *)((unsigned char *)(b)+a)));
 651 }
 652
 653 static inline vec_short8 vec_ld(int a, vec_short8 *b)
 654 {
 655   return (*((vec_short8 *)((signed char *)(b)+a)));
 656 }
 657
 658 static inline vec_uint4 vec_ld(int a, unsigned int *b)
 659 {
 660   return (*((vec_uint4 *)((unsigned char *)(b)+a)));
 661 }
 662
 663 static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
 664 {
 665   return (*((vec_uint4 *)((unsigned char *)(b)+a)));
 666 }
 667
 668 static inline vec_int4 vec_ld(int a, signed int *b)
 669 {
 670   return (*((vec_int4 *)((unsigned char *)(b)+a)));
 671 }
 672
 673 static inline vec_int4 vec_ld(int a, vec_int4 *b)
 674 {
 675   return (*((vec_int4 *)((signed char *)(b)+a)));
 676 }
 677
 678 static inline vec_float4 vec_ld(int a, float *b)
 679 {
 680   return (*((vec_float4 *)((unsigned char *)(b)+a)));
 681 }
 682
 683 static inline vec_float4 vec_ld(int a, vec_float4 *b)
 684 {
 685   return (*((vec_float4 *)((unsigned char *)(b)+a)));
 686 }
 687
 688 /* vec_lde (vector load element indexed)
 689  * =======
 690  */
 691 static inline vec_uchar16 vec_lde(int a, unsigned char *b)
 692 {
 693   return (*((vec_uchar16 *)(b+a)));
 694 }
 695
 696 static inline vec_char16 vec_lde(int a, signed char *b)
 697 {
 698   return (*((vec_char16 *)(b+a)));
 699 }
 700
 701 static inline vec_ushort8 vec_lde(int a, unsigned short *b)
 702 {
 703   return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
 704 }
 705
 706 static inline vec_short8 vec_lde(int a, signed short *b)
 707 {
 708   return (*((vec_short8 *)((unsigned char *)(b)+a)));
 709 }
 710
 711
 712 static inline vec_uint4 vec_lde(int a, unsigned int *b)
 713 {
 714   return (*((vec_uint4 *)((unsigned char *)(b)+a)));
 715 }
 716
 717 static inline vec_int4 vec_lde(int a, signed int *b)
 718 {
 719   return (*((vec_int4 *)((unsigned char *)(b)+a)));
 720 }
 721
 722
 723 static inline vec_float4 vec_lde(int a, float *b)
 724 {
 725   return (*((vec_float4 *)((unsigned char *)(b)+a)));
 726 }
 727
 728 /* vec_ldl (vector load indexed LRU)
 729  * =======
 730  */
 731 #define vec_ldl(_a, _b)         vec_ld(_a, _b)
 732
 733
 734 /* vec_loge (vector log2 estimate floating-point)
 735  * ========
 736  */
 737 static inline vec_float4 vec_loge(vec_float4 a)
 738 {
 739   vec_int4 exp;
 740   vec_float4 frac;
 741
 742   exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
 743   frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));
 744
 745   return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
 746                    frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
 747 }
 748
 749
 750 /* vec_lvsl (vector load for shift left)
 751  * ========
 752  */
 753 static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
 754 {
 755   return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
 756                                ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
 757                                               0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
 758 }
 759
 760 static inline vec_uchar16 vec_lvsl(int a, signed char *b)
 761 {
 762   return (vec_lvsl(a, (unsigned char *)b));
 763 }
 764
 765 static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
 766 {
 767   return (vec_lvsl(a, (unsigned char *)b));
 768 }
 769
 770 static inline vec_uchar16 vec_lvsl(int a, short *b)
 771 {
 772   return (vec_lvsl(a, (unsigned char *)b));
 773 }
 774
 775 static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
 776 {
 777   return (vec_lvsl(a, (unsigned char *)b));
 778 }
 779
 780 static inline vec_uchar16 vec_lvsl(int a, int *b)
 781 {
 782   return (vec_lvsl(a, (unsigned char *)b));
 783 }
 784
 785 static inline vec_uchar16 vec_lvsl(int a, float *b)
 786 {
 787   return (vec_lvsl(a, (unsigned char *)b));
 788 }
 789
 790
 791 /* vec_lvsr (vector load for shift right)
 792  * ========
 793  */
 794 static  inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
 795 {
 796   return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
 797                                                0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
 798                                 (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
 799 }
 800
 801 static inline vec_uchar16 vec_lvsr(int a, signed char *b)
 802 {
 803   return (vec_lvsr(a, (unsigned char *)b));
 804 }
 805
 806 static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
 807 {
 808   return (vec_lvsr(a, (unsigned char *)b));
 809 }
 810
 811 static inline vec_uchar16 vec_lvsr(int a, short *b)
 812 {
 813   return (vec_lvsr(a, (unsigned char *)b));
 814 }
 815
 816 static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
 817 {
 818   return (vec_lvsr(a, (unsigned char *)b));
 819 }
 820
 821 static inline vec_uchar16 vec_lvsr(int a, int *b)
 822 {
 823   return (vec_lvsr(a, (unsigned char *)b));
 824 }
 825
 826 static inline vec_uchar16 vec_lvsr(int a, float *b)
 827 {
 828   return (vec_lvsr(a, (unsigned char *)b));
 829 }
 830
 831 /* vec_madd (vector multiply add)
 832  * ========
 833  */
 834 #define vec_madd(_a, _b, _c)    spu_madd(_a, _b, _c)
 835
 836
 837
 838 /* vec_madds (vector multiply add saturate)
 839  * =========
 840  */
 841 static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
 842 {
 843   return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
 844                               (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
 845                               ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
 846 }
 847
 848 /* vec_max (vector maximum)
 849  * =======
 850  */
 851 static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
 852 {
 853   return (spu_sel(b, a, spu_cmpgt(a, b)));
 854 }
 855
 856 static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
 857 {
 858   return (spu_sel(b, a, spu_cmpgt(a, b)));
 859 }
 860
 861 static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
 862 {
 863   return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
 864 }
 865
 866 static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
 867 {
 868   return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
 869 }
 870
 871 static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
 872 {
 873   return (spu_sel(b, a, spu_cmpgt(a, b)));
 874 }
 875
 876 static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
 877 {
 878   return (spu_sel(b, a, spu_cmpgt(a, b)));
 879 }
 880
 881 static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
 882 {
 883   return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
 884 }
 885
 886 static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
 887 {
 888   return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
 889 }
 890
 891 static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
 892 {
 893   return (spu_sel(b, a, spu_cmpgt(a, b)));
 894 }
 895
 896 static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
 897 {
 898   return (spu_sel(b, a, spu_cmpgt(a, b)));
 899 }
 900
 901 static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
 902 {
 903   return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
 904 }
 905
 906 static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
 907 {
 908   return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
 909 }
 910
 911 static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
 912 {
 913   return (spu_sel(b, a, spu_cmpgt(a, b)));
 914 }
 915
 916
 917 /* vec_mergeh (vector merge high)
 918  * ==========
 919  */
 920 static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
 921 {
 922   return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
 923                                            4, 20, 5, 21, 6, 22, 7, 23})));
 924 }
 925
 926 static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
 927 {
 928   return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
 929                                            4, 20, 5, 21, 6, 22, 7, 23})));
 930 }
 931
 932 static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
 933 {
 934   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
 935                                            4, 5, 20, 21, 6, 7, 22, 23})));
 936 }
 937
 938 static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
 939 {
 940   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
 941                                            4, 5, 20, 21, 6, 7, 22, 23})));
 942 }
 943
 944 static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
 945 {
 946   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
 947                                            4, 5, 6, 7, 20, 21, 22, 23})));
 948 }
 949
 950 static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
 951 {
 952   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
 953                                            4, 5, 6, 7, 20, 21, 22, 23})));
 954 }
 955
 956 static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
 957 {
 958   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
 959                                            4, 5, 6, 7, 20, 21, 22, 23})));
 960 }
 961
 962 /* vec_mergel (vector merge low)
 963  * ==========
 964  */
 965 static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
 966 {
 967   return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
 968                                            12, 28, 13, 29, 14, 30, 15, 31})));
 969 }
 970
 971 static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
 972 {
 973   return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
 974                                            12, 28, 13, 29, 14, 30, 15, 31})));
 975 }
 976
 977 static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
 978 {
 979   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
 980                                            12, 13, 28, 29, 14, 15, 30, 31})));
 981 }
 982
 983 static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
 984 {
 985   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
 986                                            12, 13, 28, 29, 14, 15, 30, 31})));
 987 }
 988
 989 static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
 990 {
 991   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
 992                                            12, 13, 14, 15, 28, 29, 30, 31})));
 993 }
 994
 995 static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
 996 {
 997   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
 998                                            12, 13, 14, 15, 28, 29, 30, 31})));
 999 }
1000
1001 static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
1002 {
1003   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
1004                                            12, 13, 14, 15, 28, 29, 30, 31})));
1005 }
1006
1007 /* vec_mfvscr (vector move from vector status and control register)
1008  * ==========
1009  */
1010 static inline vec_ushort8 vec_mfvscr()
1011 {
1012   return ((vec_ushort8)spu_splats(0));          /* not supported */
1013 }
1014
1015
1016 /* vec_min (vector minimum)
1017  * =======
1018  */
1019 static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
1020 {
1021   return (spu_sel(a, b, spu_cmpgt(a, b)));
1022 }
1023
1024 static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
1025 {
1026   return (spu_sel(a, b, spu_cmpgt(a, b)));
1027 }
1028
1029 static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
1030 {
1031   return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
1032 }
1033
1034 static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
1035 {
1036   return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
1037 }
1038
1039 static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
1040 {
1041   return (spu_sel(a, b, spu_cmpgt(a, b)));
1042 }
1043
1044 static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
1045 {
1046   return (spu_sel(a, b, spu_cmpgt(a, b)));
1047 }
1048
1049 static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
1050 {
1051   return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
1052 }
1053
1054 static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
1055 {
1056   return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
1057 }
1058
1059 static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
1060 {
1061   return (spu_sel(a, b, spu_cmpgt(a, b)));
1062 }
1063
1064 static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
1065 {
1066   return (spu_sel(a, b, spu_cmpgt(a, b)));
1067 }
1068
1069 static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
1070 {
1071   return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
1072 }
1073
1074 static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
1075 {
1076   return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
1077 }
1078
1079 static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
1080 {
1081   return (spu_sel(a, b, spu_cmpgt(a, b)));
1082 }
1083
1084 /* vec_mladd (vector multiply low and add unsigned half word)
1085  * =========
1086  */
1087 static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
1088 {
1089   return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
1090                                             (vec_short8)(spu_rl((vec_uint4)(b), -16)),
1091                                             (vec_int4)(spu_rl((vec_uint4)(c), -16))),
1092                                    spu_madd(a, b, spu_extend(c)),
1093                                    ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1094                                                   10, 11, 26, 27, 14, 15, 30, 31}))));
1095 }
1096
1097
1098 static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
1099 {
1100   return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
1101 }
1102
1103 static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
1104 {
1105   return (vec_mladd((vec_short8)(a), b, c));
1106 }
1107
1108 static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
1109 {
1110   return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
1111 }
1112
1113
1114 /* vec_mradds (vector multiply round and add saturate)
1115  * ==========
1116  */
1117 static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
1118 {
1119   vec_int4 round = (vec_int4)spu_splats(0x4000);
1120   vec_short8 hi, lo;
1121
1122   hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
1123   lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));
1124
1125   return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
1126 }
1127
1128
1129 /* vec_msum (vector multiply sum)
1130  * ========
1131  */
1132 static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
1133 {
1134   vec_ushort8 a1, a2, b1, b2;
1135   vec_uint4 p1, p2;
1136
1137   a1 = spu_and((vec_ushort8)(a), 0xFF);
1138   a2 = spu_rlmask((vec_ushort8)(a), -8);
1139   b1 = spu_and((vec_ushort8)(b), 0xFF);
1140   b2 = spu_rlmask((vec_ushort8)(b), -8);
1141
1142   p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1143   p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1144   return (spu_add(p2, spu_add(p1, c)));
1145 }
1146
1147 static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
1148 {
1149   vec_short8 a1, a2, b1, b2;
1150   vec_int4 p1, p2;
1151
1152   a1 = (vec_short8)(spu_extend(a));
1153   a2 = spu_rlmaska((vec_short8)(a), -8);
1154   b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
1155   b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);
1156
1157   p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1158   p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1159   return (spu_add(p2, spu_add(p1, c)));
1160 }
1161
1162 static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1163 {
1164   return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1165 }
1166
1167 static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
1168 {
1169   return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1170 }
1171
1172
1173 /* vec_msums (vector multiply sum saturate)
1174  * ========
1175  */
1176 static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1177 {
1178   vec_uint4 p1, p2;
1179
1180   p1 = spu_mulo(a, b);
1181   p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));
1182
1183   return (vec_adds(p2, vec_adds(p1, c)));
1184 }
1185
1186 static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
1187 {
1188   return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1189 }
1190
1191 /* vec_mtvscr (vector move to vector status and control register)
1192  * ==========
1193  */
/* Expands to nothing; the argument is discarded without being evaluated. */
#define vec_mtvscr(_a)  /* not supported */
1195
1196
1197 /* vec_mule (vector multiply even)
1198  * ========
1199  */
1200 static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
1201 {
1202   vec_ushort8 hi, lo;
1203
1204   hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
1205                              (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
1206   lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
1207                              (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));
1208
1209   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1210                                              10, 11, 26, 27, 14, 15, 30, 31})));
1211 }
1212
1213 static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
1214 {
1215   vec_short8 hi, lo;
1216
1217   hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
1218                             (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
1219   lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
1220                             (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));
1221
1222   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1223                                              10, 11, 26, 27, 14, 15, 30, 31})));
1224 }
1225
1226 static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
1227 {
1228  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
1229                   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
1230 }
1231
1232
1233 static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
1234 {
1235  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
1236                   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
1237 }
1238
1239
1240 /* vec_mulo (vector multiply odd)
1241  * ========
1242  */
1243 static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
1244 {
1245   vec_ushort8 hi, lo;
1246
1247   hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
1248                              (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
1249   lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
1250
1251   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1252                                              10, 11, 26, 27, 14, 15, 30, 31})));
1253 }
1254
1255 static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
1256 {
1257   vec_short8 aa, bb, hi, lo;
1258
1259   aa = spu_extend(a);
1260   bb = spu_extend(b);
1261
1262   hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
1263                 (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
1264   lo = (vec_short8)spu_mulo(aa, bb);
1265   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1266                                              10, 11, 26, 27, 14, 15, 30, 31})));
1267 }
1268
1269 static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
1270 {
1271   return (spu_mulo(a, b));
1272 }
1273
1274
1275 static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
1276 {
1277   return (spu_mulo(a, b));
1278 }
1279
1280
1281 /* vec_nmsub (vector negative multiply subtract)
1282  * =========
1283  */
/* Forwards all three operands unchanged to spu_nmsub. */
#define vec_nmsub(_a, _b, _c)   spu_nmsub((_a), (_b), (_c))
1285
1286
1287 /* vec_nor (vector logical nor)
1288  * =======
1289  */
/* Forwards both operands unchanged to spu_nor. */
#define vec_nor(_a, _b)         spu_nor((_a), (_b))
1291
1292
1293 /* vec_or (vector logical or)
1294  * ======
1295  */
1296 static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
1297 {
1298   return (spu_or(a, b));
1299 }
1300
1301 static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
1302 {
1303   return (spu_or(a, b));
1304 }
1305
1306 static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
1307 {
1308   return (spu_or((vec_char16)(a), b));
1309 }
1310
1311 static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
1312 {
1313   return (spu_or(a, (vec_char16)(b)));
1314 }
1315
1316 static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
1317 {
1318   return (spu_or(a, b));
1319 }
1320
1321 static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
1322 {
1323   return (spu_or(a, b));
1324 }
1325
1326 static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
1327 {
1328   return (spu_or((vec_short8)(a), b));
1329 }
1330
1331 static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
1332 {
1333   return (spu_or(a, (vec_short8)(b)));
1334 }
1335
1336 static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
1337 {
1338   return (spu_or(a, b));
1339 }
1340
1341 static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
1342 {
1343   return (spu_or(a, b));
1344 }
1345
1346 static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
1347 {
1348   return (spu_or((vec_int4)(a), b));
1349 }
1350
1351 static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
1352 {
1353   return (spu_or(a, (vec_int4)(b)));
1354 }
1355
1356 static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
1357 {
1358   return (spu_or(a, b));
1359 }
1360
1361 static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
1362 {
1363   return (spu_or((vec_float4)(a),b));
1364 }
1365
1366 static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
1367 {
1368   return (spu_or(a, (vec_float4)(b)));
1369 }
1370
1371
1372 /* vec_pack (vector pack)
1373  * ========
1374  */
1375 static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
1376 {
1377   return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1378                                                         17, 19, 21, 23, 25, 27, 29, 31})));
1379 }
1380
1381 static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
1382 {
1383   return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1384                                                        17, 19, 21, 23, 25, 27, 29, 31})));
1385 }
1386
1387 static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
1388 {
1389   return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1390                                                         18, 19, 22, 23, 26, 27, 30, 31})));
1391 }
1392
1393 static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
1394 {
1395   return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1396                                                        18, 19, 22, 23, 26, 27, 30, 31})));
1397 }
1398
1399
1400 /* vec_packpx (vector pack pixel)
1401  * ==========
1402  */
1403 static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
1404 {
1405   vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
1406   vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));
1407
1408   return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
1409                                            spu_sl(a, 13), x001F),
1410                                    spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
1411                                            spu_sl(b, 13), x001F),
1412                                    ((vec_uchar16){ 0,  1,  4,  5,   8,  9, 12, 13,
1413                                                   16, 17, 20, 21, 24, 25, 28, 29}))));
1414 }
1415
1416
1417 /* vec_packs (vector pack saturate)
1418  * =========
1419  */
1420 static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
1421 {
1422   vec_ushort8 max = spu_splats((unsigned short)0x00FF);
1423
1424   return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
1425                                     spu_sel(b, max, spu_cmpgt(b, 255)),
1426                                     ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1427                                                    17, 19, 21, 23, 25, 27, 29, 31}))));
1428 }
1429
1430 static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
1431 {
1432   vec_short8 max = spu_splats((signed short)0x007F);
1433   vec_short8 min = spu_splats((signed short)0xFF80);
1434
1435   return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
1436                                     spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
1437                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1438                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
1439 }
1440
1441 static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
1442 {
1443   vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);
1444
1445   return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
1446                                     spu_sel(b, max, spu_cmpgt(b, max)),
1447                                     ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1448                                                    18, 19, 22, 23, 26, 27, 30, 31}))));
1449 }
1450
1451 static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
1452 {
1453   vec_int4 max = spu_splats((signed int)0x00007FFF);
1454   vec_int4 min = spu_splats((signed int)0xFFFF8000);
1455
1456   return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1457                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1458                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1459                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
1460 }
1461
1462
1463 /* vec_packsu (vector pack saturate unsigned)
1464  * ==========
1465  */
1466 static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
1467 {
1468   return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
1469                                    spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
1470                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1471                                                   17, 19, 21, 23, 25, 27, 29, 31})));
1472 }
1473
1474 static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
1475 {
1476   vec_short8 max = spu_splats((signed short)0x00FF);
1477   vec_short8 min = spu_splats((signed short)0x0000);
1478
1479   return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
1480                                     spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
1481                                     ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1482                                                    17, 19, 21, 23, 25, 27, 29, 31}))));
1483
1484   return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b)));
1485 }
1486
1487 static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
1488 {
1489   vec_uint4 max = spu_splats((unsigned int)0xFFFF);
1490
1491   return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
1492                                    spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
1493                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1494                                                   18, 19, 22, 23, 26, 27, 30, 31})));
1495 }
1496
1497 static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
1498 {
1499   vec_int4 max = spu_splats((signed int)0x0000FFFF);
1500   vec_int4 min = spu_splats((signed int)0x00000000);
1501
1502   return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1503                                     spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1504                                     ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1505                                                    18, 19, 22, 23, 26, 27, 30, 31}))));
1506 }
1507
1508
1509 /* vec_perm (vector permute)
1510  * ========
1511  */
1512 static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
1513 {
1514   return (spu_shuffle(a, b, spu_and(c, 0x1F)));
1515 }
1516
1517 static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
1518 {
1519   return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1520 }
1521
1522 static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
1523 {
1524   return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1525 }
1526
1527 static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
1528 {
1529   return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1530 }
1531
1532 static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
1533 {
1534   return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1535 }
1536
1537 static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
1538 {
1539   return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1540 }
1541
1542 static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
1543 {
1544   return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1545 }
1546
1547
1548 /* vec_re (vector reciprocal estimate)
1549  * ======
1550  */
/* Forwards the operand unchanged to spu_re. */
#define vec_re(_a)      spu_re((_a))
1552
1553
1554 /* vec_rl (vector rotate left)
1555  * ======
1556  */
1557 static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1558 {
1559   vec_ushort8 r1, r2;
1560
1561   r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562   r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563   return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1564 }
1565
1566 static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1567 {
1568   return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1569 }
1570
1571 static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1572 {
1573   return (spu_rl(a, (vec_short8)(b)));
1574 }
1575
1576 static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1577 {
1578   return (spu_rl(a, (vec_short8)(b)));
1579 }
1580
1581 static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1582 {
1583   return (spu_rl(a, (vec_int4)(b)));
1584 }
1585
1586 static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1587 {
1588   return (spu_rl(a, (vec_int4)(b)));
1589 }
1590
1591
1592 /* vec_round (vector round)
1593  * =========
1594  */
1595 static inline vec_float4 vec_round(vec_float4 a)
1596 {
1597   vec_float4 s_half, s_one, d;
1598   vec_uint4 odd;
1599   vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600   vec_float4 half = spu_splats(0.5f);
1601   vec_int4 exp;
1602   vec_uint4 mask;
1603
1604   s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605   a = spu_add(a, s_half);
1606   s_one = spu_add(s_half, s_half);
1607   exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1611
1612   odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613   s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614   s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615                                  (vec_float4)spu_cmpeq(odd, 1)));
1616   d = spu_andc(a, (vec_float4)(mask));
1617   d = spu_sub(d, s_one);
1618   return (d);
1619 }
1620
1621 /* vec_rsqrte (vector reciprocal square root estimate)
1622  * ==========
1623  */
/* Forwards the operand unchanged to spu_rsqrte. */
#define vec_rsqrte(_a)  spu_rsqrte((_a))
1625
1626
1627 /* vec_sel (vector select)
1628  * =======
1629  */
/* Forwards all three operands unchanged to spu_sel. */
#define vec_sel(_a, _b, _c)     spu_sel((_a), (_b), (_c))
1631
1632
1633 /* vec_sl (vector shift left)
1634  * ======
1635  */
1636 static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1637 {
1638   vec_ushort8 hi, lo;
1639
1640   lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641   hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1642
1643   return ((vec_uchar16)(spu_or(hi, lo)));
1644 }
1645
1646 static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1647 {
1648   return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1649 }
1650
1651 static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1652 {
1653   return (spu_sl(a, spu_and(b, 15)));
1654 }
1655
1656 static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1657 {
1658   return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1659 }
1660
1661 static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1662 {
1663   return (spu_sl(a, spu_and(b, 31)));
1664 }
1665
1666 static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1667 {
1668   return (spu_sl(a, spu_and(b, 31)));
1669 }
1670
1671
1672 /* vec_sld (vector shift left double)
1673  * =======
1674  */
/* Selects 16 consecutive bytes of the _a:_b pair starting at byte offset _c.
 * _c appears sixteen times in the expansion, so it should be free of side
 * effects.
 */
#define vec_sld(_a, _b, _c)                                              \
  spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c), \
                                      4+(_c),  5+(_c),  6+(_c),  7+(_c), \
                                      8+(_c),  9+(_c), 10+(_c), 11+(_c), \
                                     12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1679
1680
1681 /* vec_sll (vector shift left long)
1682  * =======
1683  */
/* Quadword bit shift left; the count is read from word 0 of _b. */
#define vec_sll(_a, _b)         spu_slqw((_a), spu_extract((vec_uint4)(_b), 0))
1685
1686
1687 /* vec_slo (vector shift left by octet)
1688  * =======
1689  */
/* Quadword byte shift left; the count is the low 7 bits of word 3 of _b,
 * passed to the bit-count form of the byte shift.
 */
#define vec_slo(_a, _b)         spu_slqwbytebc((_a), spu_extract((vec_uint4)(_b), 3) & 0x7F)
1691
1692
1693 /* vec_splat (vector splat)
1694  * =========
1695  */
/* Replicate element _b of vector _a across a whole vector. */
#define vec_splat(_a, _b)       spu_splats(spu_extract((_a), (_b)))
1697
1698
1699 /* vec_splat_s8 (vector splat signed byte)
1700  * ============
1701  */
/* Replicate _a, converted to signed char, across a vector. */
#define vec_splat_s8(_a)        (spu_splats((signed char)(_a)))
1703
1704
1705 /* vec_splat_s16 (vector splat signed half-word)
1706  * =============
1707  */
/* Replicate _a, converted to signed short, across a vector. */
#define vec_splat_s16(_a)       (spu_splats((signed short)(_a)))
1709
1710
1711 /* vec_splat_s32 (vector splat signed word)
1712  * =============
1713  */
/* Replicate _a, converted to signed int, across a vector. */
#define vec_splat_s32(_a)       (spu_splats((signed int)(_a)))
1715
1716
1717 /* vec_splat_u8 (vector splat unsigned byte)
1718  * ============
1719  */
/* Replicate _a, converted to unsigned char, across a vector. */
#define vec_splat_u8(_a)        (spu_splats((unsigned char)(_a)))
1721
1722
1723 /* vec_splat_u16 (vector splat unsigned half-word)
1724  * =============
1725  */
/* Replicate _a, converted to unsigned short, across a vector. */
#define vec_splat_u16(_a)       (spu_splats((unsigned short)(_a)))
1727
1728
1729 /* vec_splat_u32 (vector splat unsigned word)
1730  * =============
1731  */
/* Replicate _a, converted to unsigned int, across a vector. */
#define vec_splat_u32(_a)       (spu_splats((unsigned int)(_a)))
1733
1734
1735 /* vec_sr (vector shift right)
1736  * ======
1737  */
1738 static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1739 {
1740   vec_ushort8 hi, lo;
1741
1742   lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743   hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1744
1745   return ((vec_uchar16)(spu_or(hi, lo)));
1746 }
1747
1748 static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1749 {
1750   return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1751 }
1752
1753 static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1754 {
1755   return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1756 }
1757
1758 static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1759 {
1760   return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1761 }
1762
1763 static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1764 {
1765   return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1766 }
1767
1768 static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1769 {
1770   return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1771 }
1772
1773
1774 /* vec_sra (vector shift right algebraic)
1775  * =======
1776  */
1777 static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1778 {
1779   vec_short8 hi, lo;
1780
1781   lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782   hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1783
1784   return ((vec_char16)(spu_or(hi, lo)));
1785 }
1786
1787 static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1788 {
1789   return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1790 }
1791
1792 static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1793 {
1794   return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1795 }
1796
1797 static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1798 {
1799   return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1800 }
1801
1802 static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1803 {
1804   return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1805 }
1806
1807 static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1808 {
1809   return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1810 }
1811
1812
1813 /* vec_srl (vector shift right long)
1814  * =======
1815  */
/* Quadword bit shift right; the negated word 3 of _b is passed as the count. */
#define vec_srl(_a, _b)         spu_rlmaskqw((_a), 0-spu_extract((vec_int4)(_b), 3))
1817
1818
1819 /* vec_sro (vector shift right by octet)
1820  * =======
1821  */
/* Quadword byte shift right; the byte count is bits 3..6 of word 3 of _b, negated. */
#define vec_sro(_a, _b)         spu_rlmaskqwbyte((_a), 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1823
1824 /* vec_st (vector store indexed)
1825  * ======
1826  */
1827 static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1828 {
1829   *((vec_uchar16 *)(c+b)) = a;
1830 }
1831
1832 static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1833 {
1834   *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1835 }
1836
1837 static inline void vec_st(vec_char16 a, int b, signed char *c)
1838 {
1839   *((vec_char16 *)(c+b)) = a;
1840 }
1841
1842 static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1843 {
1844   *((vec_char16 *)((signed char *)(c)+b)) = a;
1845 }
1846
1847 static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1848 {
1849   *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1850 }
1851
1852 static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1853 {
1854   *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1855 }
1856
1857 static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1858 {
1859   *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1860 }
1861
1862 static inline void vec_st(vec_short8 a, int b, signed short *c)
1863 {
1864   *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1865 }
1866
1867 static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1868 {
1869   *((vec_short8 *)((signed char *)(c)+b)) = a;
1870 }
1871
1872 static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1873 {
1874   *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1875 }
1876
1877 static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1878 {
1879   *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1880 }
1881
1882 static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1883 {
1884   *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1885 }
1886
1887 static inline void vec_st(vec_int4 a, int b, signed int *c)
1888 {
1889   *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1890 }
1891
1892 static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1893 {
1894   *((vec_int4 *)((signed char *)(c)+b)) = a;
1895 }
1896
1897 static inline void vec_st(vec_bint4 a, int b, signed int *c)
1898 {
1899   *((vec_bint4 *)((signed char *)(c)+b)) = a;
1900 }
1901
1902 static inline void vec_st(vec_float4 a, int b, float *c)
1903 {
1904   *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1905 }
1906
1907 static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1908 {
1909   *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1910 }
1911
1912
1913 /* vec_ste (vector store element indexed)
1914  * =======
1915  */
1916 static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1917 {
1918   unsigned char *ptr;
1919
1920   ptr = c + b;
1921   *ptr = spu_extract(a, (int)(ptr) & 15);
1922 }
1923
1924 static inline void vec_ste(vec_char16 a, int b, signed char *c)
1925 {
1926   vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1927 }
1928
1929 static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1930 {
1931   vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1932 }
1933
1934 static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1935 {
1936   unsigned short *ptr;
1937
1938   ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939   *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1940 }
1941
1942 static inline void vec_ste(vec_short8 a, int b, signed short *c)
1943 {
1944   vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1945 }
1946
1947 static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1948 {
1949   vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1950 }
1951
1952 static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1953 {
1954   unsigned int *ptr;
1955
1956   ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957   *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1958 }
1959
1960 static inline void vec_ste(vec_int4 a, int b, signed int *c)
1961 {
1962   vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1963 }
1964
1965 static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1966 {
1967   vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1968 }
1969
1970 static inline void vec_ste(vec_float4 a, int b, float *c)
1971 {
1972   vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1973 }
1974
1975
1976 /* vec_stl (vector store indexed LRU)
1977  * =======
1978  */
/* Implemented as a plain vec_st with the same three operands. */
#define vec_stl(_a, _b, _c)             vec_st((_a), (_b), (_c))
1980
1981
1982 /* vec_sub (vector subtract)
1983  * =======
1984  */
1985 static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1986 {
1987   return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988                                 spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989                                 spu_splats((unsigned short)0xFF00))));
1990 }
1991
1992 static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1993 {
1994   return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1995 }
1996
1997 static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1998 {
1999   return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2000 }
2001
2002 static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2003 {
2004   return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2005 }
2006
2007 static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2008 {
2009   return (spu_sub(a, b));
2010 }
2011
2012 static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2013 {
2014   return (spu_sub(a, b));
2015 }
2016
2017 static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2018 {
2019   return (spu_sub((vec_short8)(a), b));
2020 }
2021
2022 static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2023 {
2024   return (spu_sub(a, (vec_short8)(b)));
2025 }
2026
2027 static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2028 {
2029   return (spu_sub(a, b));
2030 }
2031
2032 static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2033 {
2034   return (spu_sub(a, b));
2035 }
2036
2037 static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2038 {
2039   return (spu_sub((vec_int4)(a), b));
2040 }
2041
2042 static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2043 {
2044   return (spu_sub(a, (vec_int4)(b)));
2045 }
2046
2047 static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2048 {
2049   return (spu_sub(a, b));
2050 }
2051
2052
/* vec_subc (vector subtract carryout)
 * ========
 * Expands to spu_genb with the same two arguments.
 */
#define vec_subc(_a, _b)        spu_genb((_a), (_b))
2057
2058
2059 /* vec_subs (vector subtract saturate)
2060  * ========
2061  */
2062 static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2063 {
2064   vec_ushort8 s1, s2;
2065   vec_uchar16 s, d;
2066
2067   s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068   s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069   s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
2070                                                         8, 24, 10, 26, 12, 28, 14, 30})));
2071   d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2072                                                         9, 25, 11, 27, 13, 29, 15, 31})));
2073   return (spu_andc(d, s));
2074 }
2075
2076 static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2077 {
2078   vec_ushort8 s1, s2;
2079   vec_uchar16 s, d;
2080
2081   s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082   s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083   s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2084                                                         9, 25, 11, 27, 13, 29, 15, 31})));
2085   d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086   d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2087
2088   return ((vec_char16)(d));
2089 }
2090
2091 static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2092 {
2093   return (vec_subs((vec_char16)(a), b));
2094 }
2095
2096 static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2097 {
2098   return (vec_subs(a, (vec_char16)(b)));
2099 }
2100
2101 static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2102 {
2103   return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2104 }
2105
2106 static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2107 {
2108   vec_short8 s;
2109   vec_short8 d;
2110
2111   s = spu_sub(a, b);
2112   d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113   d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2114
2115   return (d);
2116 }
2117
2118 static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2119 {
2120   return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2121 }
2122
2123 static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2124 {
2125   return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2126 }
2127
2128 static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2129 {
2130   return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2131 }
2132
2133 static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2134 {
2135   vec_int4 s;
2136   vec_int4 d;
2137
2138   s = spu_sub(a, b);
2139   d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140   d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2141
2142   return (d);
2143 }
2144
2145 static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2146 {
2147   return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2148 }
2149
2150 static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2151 {
2152   return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2153 }
2154
2155
2156 /* vec_sum4s (vector sum across partial (1/4) saturated)
2157  * =========
2158  */
2159 static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2160 {
2161   vec_uint4 a01_23, a0123;
2162
2163   a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164                                spu_and((vec_ushort8)(a), 0xFF)));
2165   a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166   return (vec_adds(a0123, b));
2167 }
2168
2169 static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2170 {
2171   vec_int4 a01_23, a0123;
2172
2173   a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174                               spu_extend(a)));
2175   a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176   return (vec_adds(a0123, b));
2177 }
2178
2179 static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2180 {
2181   vec_int4 a0123;
2182
2183   a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184   return (vec_adds(a0123, b));
2185 }
2186
2187
2188 /* vec_sum2s (vector sum across partial (1/2) saturated)
2189  * =========
2190  */
2191 static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2192 {
2193   vec_int4 c, d;
2194   vec_int4 sign1, sign2, sign3;
2195   vec_int4 carry, sum_l, sum_h, sat, sat_val;
2196
2197   sign1 = spu_rlmaska(a, -31);
2198   sign2 = spu_rlmaska(b, -31);
2199
2200   c = spu_rlqwbyte(a, -4);
2201   sign3 = spu_rlqwbyte(sign1, -4);
2202
2203   carry = spu_genc(a, b);
2204   sum_l = spu_add(a, b);
2205   sum_h = spu_addx(sign1, sign2, carry);
2206
2207   carry = spu_genc(sum_l, c);
2208   sum_l = spu_add(sum_l, c);
2209   sum_h = spu_addx(sum_h, sign3, carry);
2210
2211   sign1 = spu_rlmaska(sum_l, -31);
2212   sign2 = spu_rlmaska(sum_h, -31);
2213
2214   sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2215
2216   sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2217
2218   d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2219
2220   return (d);
2221 }
2222
2223
2224 /* vec_sums (vector sum saturated)
2225  * ========
2226  */
2227 static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2228 {
2229   vec_int4 a0, a1, a2, c0, c1, c2, d;
2230   vec_int4 sign_a, sign_b, sign_l, sign_h;
2231   vec_int4 sum_l, sum_h, sat, sat_val;
2232
2233   sign_a = spu_rlmaska(a, -31);
2234   sign_b = spu_rlmaska(b, -31);
2235
2236   a0 = spu_rlqwbyte(a, -12);
2237   a1 = spu_rlqwbyte(a, -8);
2238   a2 = spu_rlqwbyte(a, -4);
2239
2240   sum_l = spu_add(a, b);
2241   sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2242
2243   c2 = spu_genc(sum_l, a2);
2244   sum_l = spu_add(sum_l, a2);
2245   sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2246
2247   c1 = spu_genc(sum_l, a1);
2248   sum_l = spu_add(sum_l, a1);
2249   sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2250
2251   c0 = spu_genc(sum_l, a0);
2252   sum_l = spu_add(sum_l, a0);
2253   sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2254
2255   sign_l = spu_rlmaska(sum_l, -31);
2256   sign_h = spu_rlmaska(sum_h, -31);
2257
2258   sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2259
2260   sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2261
2262   d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2263
2264   return (d);
2265 }
2266
2267
2268 /* vec_trunc (vector truncate)
2269  * =========
2270  */
2271 static inline vec_float4 vec_trunc(vec_float4 a)
2272 {
2273   vec_int4 exp;
2274   vec_uint4 mask;
2275
2276   exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280   return (spu_andc(a, (vec_float4)(mask)));
2281 }
2282
2283 /* vec_unpackh (vector unpack high element)
2284  * ===========
2285  */
2286 static inline vec_short8 vec_unpackh(vec_char16 a)
2287 {
2288   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289                                                       4, 4, 5, 5, 6, 6, 7, 7}))));
2290 }
2291
2292 static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2293 {
2294   return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2295 }
2296
2297 static inline vec_int4 vec_unpackh(vec_short8 a)
2298 {
2299   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300                                                       0, 0, 4, 5, 0, 0, 6, 7}))));
2301 }
2302
2303 #ifdef SUPPORT_UNPACK_PIXEL
2304 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2305  * can not simultaneously be supported. By default, the boolean short is
2306  * supported.
2307  */
2308 static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2309 {
2310   vec_ushort8 p1, p2;
2311
2312   p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313                    spu_and((vec_ushort8)(a.p), 0x1F),
2314                    ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
2315                                    4, 128, 128, 21,  6, 128, 128, 23}));
2316   p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317                    spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318                    ((vec_uchar16){ 128,  17, 1, 128, 128,  19, 3, 128,
2319                                    128,  21, 5, 128, 128,  23, 7, 128}));
2320   return ((vec_uint4)(spu_or(p1, p2)));
2321 }
2322
2323 #else
2324
2325 static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2326 {
2327   return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2328 }
2329 #endif
2330
2331
2332
2333
2334
2335 /* vec_unpackl (vector unpack low element)
2336  * ===========
2337  */
2338 static inline vec_short8 vec_unpackl(vec_char16 a)
2339 {
2340   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341                                                       12, 12, 13, 13, 14, 14, 15, 15}))));
2342 }
2343
2344 static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2345 {
2346   return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2347 }
2348
2349
2350 static inline vec_int4 vec_unpackl(vec_short8 a)
2351 {
2352   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353                                                       0, 0,12,13, 0, 0, 14, 15}))));
2354 }
2355
2356
2357 #ifdef SUPPORT_UNPACK_PIXEL
2358 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2359  * can not simultaneously be supported. By default, the boolean short is
2360  * supported.
2361  */
2362 static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2363 {
2364   vec_ushort8 p1, p2;
2365
2366   p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a), -7)),
2367                    spu_and((vec_ushort8)(a), 0x1F),
2368                    ((vec_uchar16){ 8, 128, 128, 25,  10, 128, 128, 27,
2369                                   12, 128, 128, 29,  14, 128, 128, 31}));
2370   p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a), -5), 0x1F),
2371                    spu_and(spu_rlmask((vec_ushort8)(a), -10), 0x1F),
2372                    ((vec_uchar16){ 128, 25,  9, 128, 128, 27, 11, 128,
2373                                    128, 29, 13, 128, 128, 31, 15, 128}));
2374   return ((vec_uint4)(spu_or(p1, p2)));
2375 }
2376
2377 #else
2378
2379 static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2380 {
2381   return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2382
2383 }
2384 #endif
2385
2386
2387
2388 /* vec_xor (vector logical xor)
2389  * ======
2390  */
2391 static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2392 {
2393   return (spu_xor(a, b));
2394 }
2395
2396 static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2397 {
2398   return (spu_xor(a, b));
2399 }
2400
2401 static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2402 {
2403   return (spu_xor((vec_char16)(a), b));
2404 }
2405
2406 static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2407 {
2408   return (spu_xor(a, (vec_char16)(b)));
2409 }
2410
2411 static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2412 {
2413   return (spu_xor(a, b));
2414 }
2415
2416 static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2417 {
2418   return (spu_xor(a, b));
2419 }
2420
2421 static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2422 {
2423   return (spu_xor((vec_short8)(a), b));
2424 }
2425
2426 static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2427 {
2428   return (spu_xor(a, (vec_short8)(b)));
2429 }
2430
2431 static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2432 {
2433   return (spu_xor(a, b));
2434 }
2435
2436 static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2437 {
2438   return (spu_xor(a, b));
2439 }
2440
2441 static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2442 {
2443   return (spu_xor((vec_int4)(a), b));
2444 }
2445
2446 static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2447 {
2448   return (spu_xor(a, (vec_int4)(b)));
2449 }
2450
2451 static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2452 {
2453   return (spu_xor(a, b));
2454 }
2455
2456 static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2457 {
2458   return (spu_xor((vec_float4)(a),b));
2459 }
2460
2461 static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2462 {
2463   return (spu_xor(a, (vec_float4)(b)));
2464 }
2465
2466 /************************************************************************
2467  *                        PREDICATES
2468  ************************************************************************/
2469
2470 /* vec_all_eq (all elements equal)
2471  * ==========
2472  */
2473 static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2474 {
2475   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2476 }
2477
2478 static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2479 {
2480   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2481 }
2482
2483 static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2484 {
2485   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2486 }
2487
2488 static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2489 {
2490   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2491 }
2492
2493 static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2494 {
2495   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2496 }
2497
2498 static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2499 {
2500   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2501 }
2502
2503 static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2504 {
2505   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2506 }
2507
2508 static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2509 {
2510   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2511 }
2512
2513 static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2514 {
2515   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2516 }
2517
2518 static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2519 {
2520   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2521 }
2522
2523 static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2524 {
2525   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2526 }
2527
2528 static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2529 {
2530   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2531 }
2532
2533 static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2534 {
2535   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2536 }
2537
2538
2539 /* vec_all_ge (all elements greater than or equal)
2540  * ==========
2541  */
2542 static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2543 {
2544   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2545 }
2546
2547 static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2548 {
2549   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2550 }
2551
2552 static inline  int vec_all_ge(vec_bchar16 a, vec_char16 b)
2553 {
2554   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2555 }
2556
2557 static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2558 {
2559   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2560 }
2561
2562 static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2563 {
2564   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2565 }
2566
2567 static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2568 {
2569   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2570 }
2571
2572 static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2573 {
2574   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2575 }
2576
2577 static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2578 {
2579   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2580 }
2581
2582 static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2583 {
2584   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2585 }
2586
2587 static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2588 {
2589   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2590 }
2591
2592 static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2593 {
2594   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2595 }
2596
2597 static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2598 {
2599   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2600 }
2601
2602 static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2603 {
2604   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2605 }
2606
2607
2608 /* vec_all_gt (all elements greater than)
2609  * ==========
2610  */
2611 static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2612 {
2613   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2614 }
2615
2616 static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2617 {
2618   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2619 }
2620
2621 static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2622 {
2623   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2624 }
2625
2626 static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2627 {
2628   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2629 }
2630
2631 static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2632 {
2633   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2634 }
2635
2636 static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2637 {
2638   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2639 }
2640
2641 static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2642 {
2643   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2644 }
2645
2646 static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2647 {
2648   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2649 }
2650
2651 static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2652 {
2653   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2654 }
2655
2656 static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2657 {
2658   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2659 }
2660
2661 static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2662 {
2663   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2664 }
2665
2666 static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2667 {
2668   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2669 }
2670
2671 static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2672 {
2673   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2674 }
2675
2676
2677 /* vec_all_in (all elements in bounds)
2678  * ==========
2679  */
2680 static inline int vec_all_in(vec_float4 a, vec_float4 b)
2681 {
2682   return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
2683 }
2684
2685
2686 /* vec_all_le (all elements less than or equal)
2687  * ==========
2688  */
2689 static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2690 {
2691   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2692 }
2693
2694 static inline int vec_all_le(vec_char16 a, vec_char16 b)
2695 {
2696   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2697 }
2698
2699 static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2700 {
2701   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2702 }
2703
2704 static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2705 {
2706   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2707 }
2708
2709 static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2710 {
2711   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2712 }
2713
2714 static inline int vec_all_le(vec_short8 a, vec_short8 b)
2715 {
2716   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2717 }
2718
2719 static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2720 {
2721   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2722 }
2723
2724 static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2725 {
2726   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2727 }
2728
2729 static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2730 {
2731   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2732 }
2733
2734 static inline int vec_all_le(vec_int4 a, vec_int4 b)
2735 {
2736   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2737 }
2738
2739 static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2740 {
2741   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2742 }
2743
2744 static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2745 {
2746   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2747 }
2748
2749 static inline int vec_all_le(vec_float4 a, vec_float4 b)
2750 {
2751   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2752 }
2753
2754
2755 /* vec_all_lt (all elements less than)
2756  * ==========
2757  */
2758 static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2759 {
2760   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2761 }
2762
2763 static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2764 {
2765   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2766 }
2767
2768 static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2769 {
2770   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2771 }
2772
2773 static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2774 {
2775   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2776 }
2777
2778 static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2779 {
2780   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2781 }
2782
2783 static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2784 {
2785   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2786 }
2787
2788 static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2789 {
2790   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2791 }
2792
2793 static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2794 {
2795   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2796 }
2797
2798 static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2799 {
2800   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2801 }
2802
2803 static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2804 {
2805   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2806 }
2807
2808 static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2809 {
2810   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2811 }
2812
2813 static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2814 {
2815   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2816 }
2817
2818 static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2819 {
2820   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2821 }
2822
2823
2824 /* vec_all_nan (all elements not a number)
2825  * ===========
2826  */
2827 static inline int vec_all_nan(vec_float4 a)
2828 {
2829   vec_uint4 exp, man;
2830   vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2831
2832   exp = spu_and((vec_uint4)(a), exp_mask);
2833   man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834   return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835                                                 spu_cmpeq(man, 0))), 0) == 0xF));
2836 }
2837
2838 #define vec_all_nan(_a)         (0)
2839
2840
2841 /* vec_all_ne (all elements not equal)
2842  * ==========
2843  */
2844 static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2845 {
2846   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2847 }
2848
2849 static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2850 {
2851   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2852 }
2853
2854 static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2855 {
2856   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2857 }
2858
2859 static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2860 {
2861   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2862 }
2863
2864 static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2865 {
2866   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2867 }
2868
2869 static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2870 {
2871   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2872 }
2873
2874 static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2875 {
2876   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2877 }
2878
2879 static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2880 {
2881   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2882 }
2883
2884 static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2885 {
2886   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2887 }
2888
2889 static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2890 {
2891   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2892 }
2893
2894 static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2895 {
2896   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2897 }
2898
2899 static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2900 {
2901   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2902 }
2903
2904 static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2905 {
2906   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2907 }
2908
2909
2910 /* vec_all_nge (all elements not greater than or equal)
2911  * ===========
2912  */
2913 static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2914 {
2915   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2916 }
2917
2918
2919 /* vec_all_ngt (all elements not greater than)
2920  * ===========
2921  */
2922 static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2923 {
2924   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2925 }
2926
2927
2928 /* vec_all_nle (all elements not less than or equal)
2929  * ===========
2930  */
2931 static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2932 {
2933   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2934 }
2935
2936
2937 /* vec_all_nlt (all elements not less than)
2938  * ===========
2939  */
2940 static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2941 {
2942   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2943 }
2944
2945
2946 /* vec_all_numeric (all elements numeric)
2947  * ===========
2948  */
2949 static inline int vec_all_numeric(vec_float4 a)
2950 {
2951   vec_uint4 exp;
2952
2953   exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954   return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2955 }
2956
2957
2958
2959 /* vec_any_eq (any elements equal)
2960  * ==========
2961  */
2962 static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2963 {
2964   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2965 }
2966
2967 static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2968 {
2969   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2970 }
2971
2972 static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2973 {
2974   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2975 }
2976
2977 static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2978 {
2979   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2980 }
2981
2982 static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2983 {
2984   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2985 }
2986
2987 static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2988 {
2989   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2990 }
2991
2992 static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2993 {
2994   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2995 }
2996
2997 static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2998 {
2999   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3000 }
3001
3002 static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3003 {
3004   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3005 }
3006
3007 static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3008 {
3009   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3010 }
3011
3012 static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3013 {
3014   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3015 }
3016
3017 static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3018 {
3019   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3020 }
3021
3022 static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3023 {
3024   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3025 }
3026
3027 /* vec_any_ge (any elements greater than or equal)
3028  * ==========
3029  */
3030 static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3031 {
3032   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3033 }
3034
3035 static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3036 {
3037   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3038 }
3039
3040 static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3041 {
3042   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3043 }
3044
3045 static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3046 {
3047   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3048 }
3049
3050 static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3051 {
3052   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3053 }
3054
3055 static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3056 {
3057   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3058 }
3059
3060 static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3061 {
3062   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3063 }
3064
3065 static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3066 {
3067   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3068 }
3069
3070 static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3071 {
3072   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3073 }
3074
3075 static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3076 {
3077   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3078 }
3079
3080 static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3081 {
3082   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3083 }
3084
3085 static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3086 {
3087   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3088 }
3089
3090 static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3091 {
3092   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3093 }
3094
3095
3096 /* vec_any_gt (any elements greater than)
3097  * ==========
3098  */
3099 static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3100 {
3101   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3102 }
3103
3104 static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3105 {
3106   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3107 }
3108
3109 static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3110 {
3111   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3112 }
3113
3114 static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3115 {
3116   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3117 }
3118
3119 static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3120 {
3121   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3122 }
3123
3124 static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3125 {
3126   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3127 }
3128
3129 static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3130 {
3131   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3132 }
3133
3134 static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3135 {
3136   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3137 }
3138
3139
3140 static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3141 {
3142   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3143 }
3144
3145 static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3146 {
3147   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3148 }
3149
3150 static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3151 {
3152   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3153 }
3154
3155 static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3156 {
3157   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3158 }
3159
3160 static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3161 {
3162   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3163 }
3164
3165 /* vec_any_le (any elements less than or equal)
3166  * ==========
3167  */
3168 static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
3169 {
3170   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3171 }
3172
3173 static inline int vec_any_le(vec_char16 a, vec_char16 b)
3174 {
3175   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3176 }
3177
3178 static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
3179 {
3180   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
3181 }
3182
3183 static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
3184 {
3185   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
3186 }
3187
3188 static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
3189 {
3190   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3191 }
3192
3193 static inline int vec_any_le(vec_short8 a, vec_short8 b)
3194 {
3195   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3196 }
3197
3198 static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
3199 {
3200   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
3201 }
3202
3203 static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
3204 {
3205   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
3206 }
3207
3208 static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
3209 {
3210   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3211 }
3212
3213 static inline int vec_any_le(vec_int4 a, vec_int4 b)
3214 {
3215   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3216 }
3217
3218 static inline int vec_any_le(vec_bint4 a, vec_int4 b)
3219 {
3220   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
3221 }
3222
3223 static inline int vec_any_le(vec_int4 a, vec_bint4 b)
3224 {
3225   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
3226 }
3227
3228 static inline int vec_any_le(vec_float4 a, vec_float4 b)
3229 {
3230   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3231 }
3232
3233
3234 /* vec_any_lt (any elements less than)
3235  * ==========
3236  */
3237 static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
3238 {
3239   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3240 }
3241
3242 static inline int vec_any_lt(vec_char16 a, vec_char16 b)
3243 {
3244   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3245 }
3246
3247 static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
3248 {
3249   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
3250 }
3251
3252 static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
3253 {
3254   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
3255 }
3256
3257 static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
3258 {
3259   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3260 }
3261
3262 static inline int vec_any_lt(vec_short8 a, vec_short8 b)
3263 {
3264   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3265 }
3266
3267 static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
3268 {
3269   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
3270 }
3271
3272 static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
3273 {
3274   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
3275 }
3276
3277 static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
3278 {
3279   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3280 }
3281
3282 static inline int vec_any_lt(vec_int4 a, vec_int4 b)
3283 {
3284   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3285 }
3286
3287 static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
3288 {
3289   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
3290 }
3291
3292 static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
3293 {
3294   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
3295 }
3296
3297 static inline int vec_any_lt(vec_float4 a, vec_float4 b)
3298 {
3299   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3300 }
3301
3302 /* vec_any_nan (any elements not a number)
3303  * ===========
3304  */
3305 static inline int vec_any_nan(vec_float4 a)
3306 {
3307   vec_uint4 exp, man;
3308   vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
3309
3310   exp = spu_and((vec_uint4)(a), exp_mask);
3311   man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
3312   return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
3313                                                 spu_cmpeq(man, 0))), 0) != 0));
3314 }
3315
3316
3317 /* vec_any_ne (any elements not equal)
3318  * ==========
3319  */
3320 static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
3321 {
3322   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3323 }
3324
3325 static inline int vec_any_ne(vec_char16 a, vec_char16 b)
3326 {
3327   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3328 }
3329
3330 static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
3331 {
3332   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
3333 }
3334
3335 static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
3336 {
3337   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
3338 }
3339
3340 static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
3341 {
3342   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3343 }
3344
3345 static inline int vec_any_ne(vec_short8 a, vec_short8 b)
3346 {
3347   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3348 }
3349
3350 static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
3351 {
3352   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
3353 }
3354
3355 static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
3356 {
3357   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
3358 }
3359
3360 static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
3361 {
3362   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3363 }
3364
3365 static inline int vec_any_ne(vec_int4 a, vec_int4 b)
3366 {
3367   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3368 }
3369
3370 static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
3371 {
3372   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
3373 }
3374
3375 static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
3376 {
3377   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
3378 }
3379
3380 static inline int vec_any_ne(vec_float4 a, vec_float4 b)
3381 {
3382   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3383 }
3384
3385
3386 /* vec_any_nge (any elements not greater than or equal)
3387  * ===========
3388  */
3389 static inline int vec_any_nge(vec_float4 a, vec_float4 b)
3390 {
3391   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3392 }
3393
3394 /* vec_any_ngt (any elements not greater than)
3395  * ===========
3396  */
3397 static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
3398 {
3399   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3400 }
3401
3402
3403 /* vec_any_nle (any elements not less than or equal)
3404  * ===========
3405  */
3406 static inline int vec_any_nle(vec_float4 a, vec_float4 b)
3407 {
3408   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3409 }
3410
3411
3412 /* vec_any_nlt (any elements not less than)
3413  * ===========
3414  */
3415 static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
3416 {
3417   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3418 }
3419
3420
3421 /* vec_any_numeric (any elements numeric)
3422  * ===============
3423  */
3424 static inline int vec_any_numeric(vec_float4 a)
3425 {
3426   vec_uint4 exp;
3427
3428   exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
3429   return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
3430 }
3431
3432
3433 /* vec_any_out (any elements out of bounds)
3434  * ===========
3435  */
3436 static inline int vec_any_out(vec_float4 a, vec_float4 b)
3437 {
3438   return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
3439 }
3440
3441
3442 /* CBE Language Extension Intrinsics
3443  */
3444
/* vec_extract (extract element from vector)
 * ===========
 * Forwards both arguments to spu_extract.
 */
#define vec_extract(_a, _element)       spu_extract((_a), (_element))
3449
3450
/* vec_insert (insert scalar into specified vector element)
 * ==========
 * Forwards all three arguments to spu_insert.
 */
#define vec_insert(_a, _b, _element)    spu_insert((_a), (_b), (_element))
3455
3456 /* vec_lvlx (load vector left indexed)
3457  * ========
3458  */
3459 static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
3460 {
3461   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3462   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3463 }
3464
3465 static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
3466 {
3467   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3468   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3469 }
3470
3471 static inline vec_char16 vec_lvlx(int a, signed char *b)
3472 {
3473   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3474   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3475 }
3476
3477 static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
3478 {
3479   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3480   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3481 }
3482
3483 static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
3484 {
3485   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3486   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3487 }
3488
3489 static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
3490 {
3491   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3492   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3493 }
3494
3495 static inline vec_short8 vec_lvlx(int a, signed short *b)
3496 {
3497   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3498   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3499 }
3500
3501 static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
3502 {
3503   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3504   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3505 }
3506
3507 static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
3508 {
3509   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3510   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3511 }
3512
3513 static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
3514 {
3515   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3516   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3517 }
3518
3519 static inline vec_int4 vec_lvlx(int a, signed int *b)
3520 {
3521   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3522   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3523 }
3524
3525 static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
3526 {
3527   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3528   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3529 }
3530
3531 static inline vec_float4 vec_lvlx(int a, float *b)
3532 {
3533   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3534   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3535 }
3536
3537 static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
3538 {
3539   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3540   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3541 }
3542
3543
/* vec_lvlxl (load vector left indexed last)
 * =========
 * Expands to vec_lvlx with the same arguments.
 */
#define vec_lvlxl(_a, _b)       vec_lvlx((_a), (_b))
3548
3549
3550 /* vec_lvrx (load vector right indexed)
3551  * ========
3552  */
3553 static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
3554 {
3555   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3556   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3557 }
3558
3559 static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
3560 {
3561   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3562   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3563 }
3564
3565 static inline vec_char16 vec_lvrx(int a, signed char *b)
3566 {
3567   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3568   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3569 }
3570
3571 static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
3572 {
3573   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3574   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3575 }
3576
3577 static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
3578 {
3579   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3580   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3581 }
3582
3583 static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
3584 {
3585   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3586   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3587 }
3588
3589 static inline vec_short8 vec_lvrx(int a, signed short *b)
3590 {
3591   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3592   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3593 }
3594
3595 static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
3596 {
3597   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3598   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3599 }
3600
3601 static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
3602 {
3603   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3604   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3605 }
3606
3607 static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
3608 {
3609   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3610   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3611 }
3612
3613 static inline vec_int4 vec_lvrx(int a, signed int *b)
3614 {
3615   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3616   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3617 }
3618
3619 static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
3620 {
3621   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3622   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3623 }
3624
3625 static inline vec_float4 vec_lvrx(int a, float *b)
3626 {
3627   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3628   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3629 }
3630
3631 static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
3632 {
3633   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3634   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3635 }
3636
3637
3638
/* vec_lvrxl (load vector right indexed last)
 * =========
 * Expands to vec_lvrx with the same arguments.
 */
#define vec_lvrxl(_a, _b)       vec_lvrx((_a), (_b))
3643
3644
/* vec_promote (promote scalar to a vector)
 * ===========
 * Forwards both arguments to spu_promote.
 */
#define vec_promote(_a, _element)       spu_promote((_a), (_element))
3649
3650
/* vec_splats (splat scalar to a vector)
 * ==========
 * Forwards its argument to spu_splats.
 */
#define vec_splats(_a)  spu_splats((_a))
3655
3656
3657 /* vec_stvlx (store vector left indexed)
3658  * =========
3659  */
3660 static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
3661 {
3662   int shift;
3663   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3664
3665   shift = -((int)p & 0xF);
3666   *p = spu_sel(*p,
3667                spu_rlmaskqwbyte(a, shift),
3668                spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3669 }
3670
3671 static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
3672 {
3673   int shift;
3674   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3675
3676   shift = -((int)p & 0xF);
3677   *p = spu_sel(*p,
3678                spu_rlmaskqwbyte(a, shift),
3679                spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3680 }
3681
3682 static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
3683 {
3684   int shift;
3685   vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3686
3687   shift = -((int)p & 0xF);
3688   *p = spu_sel(*p,
3689                spu_rlmaskqwbyte(a, shift),
3690                spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3691 }
3692
3693 static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
3694 {
3695   int shift;
3696   vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3697
3698   shift = -((int)p & 0xF);
3699   *p = spu_sel(*p,
3700                spu_rlmaskqwbyte(a, shift),
3701                spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3702 }
3703
3704 static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
3705 {
3706   int shift;
3707   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3708
3709   shift = -((int)p & 0xF);
3710   *p = spu_sel(*p,
3711                spu_rlmaskqwbyte(a, shift),
3712                spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3713 }
3714
3715 static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
3716 {
3717   int shift;
3718   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3719
3720   shift = -((int)p & 0xF);
3721   *p = spu_sel(*p,
3722                spu_rlmaskqwbyte(a, shift),
3723                spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3724 }
3725
3726 static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
3727 {
3728   int shift;
3729   vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3730
3731   shift = -((int)p & 0xF);
3732   *p = spu_sel(*p,
3733                spu_rlmaskqwbyte(a, shift),
3734                spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3735 }
3736
3737 static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
3738 {
3739   int shift;
3740   vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3741
3742   shift = -((int)p & 0xF);
3743   *p = spu_sel(*p,
3744                spu_rlmaskqwbyte(a, shift),
3745                spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3746 }
3747
3748 static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
3749 {
3750   int shift;
3751   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3752
3753   shift = -((int)p & 0xF);
3754   *p = spu_sel(*p,
3755                spu_rlmaskqwbyte(a, shift),
3756                spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3757 }
3758
3759 static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
3760 {
3761   int shift;
3762   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3763
3764   shift = -((int)p & 0xF);
3765   *p = spu_sel(*p,
3766                spu_rlmaskqwbyte(a, shift),
3767                spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3768 }
3769
3770 static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
3771 {
3772   int shift;
3773   vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3774
3775   shift = -((int)p & 0xF);
3776   *p = spu_sel(*p,
3777                spu_rlmaskqwbyte(a, shift),
3778                spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3779 }
3780
3781 static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
3782 {
3783   int shift;
3784   vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3785
3786   shift = -((int)p & 0xF);
3787   *p = spu_sel(*p,
3788                spu_rlmaskqwbyte(a, shift),
3789                spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3790 }
3791
3792 static inline void vec_stvlx(vec_float4 a, int b, float *c)
3793 {
3794   int shift;
3795   vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3796
3797   shift = -((int)p & 0xF);
3798   *p = spu_sel(*p,
3799                spu_rlmaskqwbyte(a, shift),
3800                spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3801 }
3802
3803 static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
3804 {
3805   int shift;
3806   vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3807
3808   shift = -((int)p & 0xF);
3809   *p = spu_sel(*p,
3810                spu_rlmaskqwbyte(a, shift),
3811                spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3812 }
3813
/* vec_stvlxl (store vector left indexed last)
 * ==========
 * Expands to vec_stvlx with the same arguments.
 */
#define vec_stvlxl(_a, _b, _c)  vec_stvlx((_a), (_b), (_c))
3818
3819
3820 /* vec_stvrx (store vector right indexed)
3821  * =========
3822  */
3823 static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
3824 {
3825   int shift;
3826   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3827
3828   shift = 16-((int)p & 0xF);
3829   *p = spu_sel(*p,
3830                spu_slqwbyte(a, shift),
3831                spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3832 }
3833
3834 static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
3835 {
3836   int shift;
3837   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3838
3839   shift = 16-((int)p & 0xF);
3840   *p = spu_sel(*p,
3841                spu_slqwbyte(a, shift),
3842                spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3843 }
3844
3845 static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
3846 {
3847   int shift;
3848   vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3849
3850   shift = 16-((int)p & 0xF);
3851   *p = spu_sel(*p,
3852                spu_slqwbyte(a, shift),
3853                spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3854 }
3855
3856 static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
3857 {
3858   int shift;
3859   vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3860
3861   shift = 16-((int)p & 0xF);
3862   *p = spu_sel(*p,
3863                spu_slqwbyte(a, shift),
3864                spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3865 }
3866
3867 static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
3868 {
3869   int shift;
3870   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3871
3872   shift = 16-((int)p & 0xF);
3873   *p = spu_sel(*p,
3874                spu_slqwbyte(a, shift),
3875                spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3876 }
3877
3878 static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
3879 {
3880   int shift;
3881   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3882
3883   shift = 16-((int)p & 0xF);
3884   *p = spu_sel(*p,
3885                spu_slqwbyte(a, shift),
3886                spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3887 }
3888
3889 static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
3890 {
3891   int shift;
3892   vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3893
3894   shift = 16-((int)p & 0xF);
3895   *p = spu_sel(*p,
3896                spu_slqwbyte(a, shift),
3897                spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3898 }
3899
3900 static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
3901 {
3902   int shift;
3903   vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3904
3905   shift = 16-((int)p & 0xF);
3906   *p = spu_sel(*p,
3907                spu_slqwbyte(a, shift),
3908                spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3909 }
3910
3911 static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
3912 {
3913   int shift;
3914   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3915
3916   shift = 16-((int)p & 0xF);
3917   *p = spu_sel(*p,
3918                spu_slqwbyte(a, shift),
3919                spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3920 }
3921
3922 static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
3923 {
3924   int shift;
3925   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3926
3927   shift = 16-((int)p & 0xF);
3928   *p = spu_sel(*p,
3929                spu_slqwbyte(a, shift),
3930                spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3931 }
3932
3933 static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
3934 {
3935   int shift;
3936   vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3937
3938   shift = 16-((int)p & 0xF);
3939   *p = spu_sel(*p,
3940                spu_slqwbyte(a, shift),
3941                spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3942 }
3943
3944 static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
3945 {
3946   int shift;
3947   vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3948
3949   shift = 16-((int)p & 0xF);
3950   *p = spu_sel(*p,
3951                spu_slqwbyte(a, shift),
3952                spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3953 }
3954
3955 static inline void vec_stvrx(vec_float4 a, int b, float *c)
3956 {
3957   int shift;
3958   vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3959
3960   shift = 16-((int)p & 0xF);
3961   *p = spu_sel(*p,
3962                spu_slqwbyte(a, shift),
3963                spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3964 }
3965
3966 static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
3967 {
3968   int shift;
3969   vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3970
3971   shift = 16-((int)p & 0xF);
3972   *p = spu_sel(*p,
3973                spu_slqwbyte(a, shift),
3974                spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3975 }
3976
/* vec_stvrxl (store vector right indexed last)
 * ==========
 * Expands to vec_stvrx with the same arguments.
 */
#define vec_stvrxl(_a, _b, _c)  vec_stvrx((_a), (_b), (_c))
3981
3982
3983 #endif /* __SPU__ */
3984 #endif /* __cplusplus */
3985 #endif /* !_VMX2SPU_H_ */