gcc/config/i386/mmintrin.h

   1 /* Copyright (C) 2002 Free Software Foundation, Inc.
   2
   3    This file is part of GNU CC.
   4
   5    GNU CC is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    GNU CC is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with GNU CC; see the file COPYING.  If not, write to
  17    the Free Software Foundation, 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 /* As a special exception, if you include this header file into source
  21    files compiled by GCC, this header file does not by itself cause
  22    the resulting executable to be covered by the GNU General Public
  23    License.  This exception does not however invalidate any other
  24    reasons why the executable file might be covered by the GNU General
  25    Public License.  */
  26
  27 /* Implemented from the specification included in the Intel C++ Compiler
  28    User Guide and Reference, version 5.0.  */
  29
  30 #ifndef _MMINTRIN_H_INCLUDED
  31 #define _MMINTRIN_H_INCLUDED
  32
  33 #ifndef __MMX__
  34 # error "MMX instruction set not enabled"
  35 #else
  36 /* The data type intended for user use.  */
  37 typedef unsigned long long __m64 __attribute__ ((__aligned__ (8)));
  38
  39 /* Internal data types for implementing the intrinsics.  */
  40 typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
  41 typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
  42 typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
  43
  44 /* Empty the multimedia state.  */
  45 static __inline void
  46 _mm_empty (void)
  47 {
  48   __builtin_ia32_emms ();
  49 }
  50
  51 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
  52 static __inline __m64
  53 _mm_cvtsi32_si64 (int __i)
  54 {
  55   return (unsigned int) __i;
  56 }
  57
  58 /* Convert the lower 32 bits of the __m64 object into an integer.  */
  59 static __inline int
  60 _mm_cvtsi64_si32 (__m64 __i)
  61 {
  62   return __i;
  63 }
  64
  65 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
  66    the result, and the four 16-bit values from M2 into the upper four 8-bit
  67    values of the result, all with signed saturation.  */
  68 static __inline __m64
  69 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
  70 {
  71   return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
  72 }
  73
  74 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
  75    the result, and the two 32-bit values from M2 into the upper two 16-bit
  76    values of the result, all with signed saturation.  */
  77 static __inline __m64
  78 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
  79 {
  80   return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
  81 }
  82
  83 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
  84    the result, and the four 16-bit values from M2 into the upper four 8-bit
  85    values of the result, all with unsigned saturation.  */
  86 static __inline __m64
  87 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
  88 {
  89   return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
  90 }
  91
  92 /* Interleave the four 8-bit values from the high half of M1 with the four
  93    8-bit values from the high half of M2.  */
  94 static __inline __m64
  95 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
  96 {
  97   return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
  98 }
  99
 100 /* Interleave the two 16-bit values from the high half of M1 with the two
 101    16-bit values from the high half of M2.  */
 102 static __inline __m64
 103 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
 104 {
 105   return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
 106 }
 107
 108 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
 109    value from the high half of M2.  */
 110 static __inline __m64
 111 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
 112 {
 113   return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
 114 }
 115
 116 /* Interleave the four 8-bit values from the low half of M1 with the four
 117    8-bit values from the low half of M2.  */
 118 static __inline __m64
 119 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
 120 {
 121   return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
 122 }
 123
 124 /* Interleave the two 16-bit values from the low half of M1 with the two
 125    16-bit values from the low half of M2.  */
 126 static __inline __m64
 127 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
 128 {
 129   return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
 130 }
 131
 132 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
 133    value from the low half of M2.  */
 134 static __inline __m64
 135 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
 136 {
 137   return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
 138 }
 139
 140 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
 141 static __inline __m64
 142 _mm_add_pi8 (__m64 __m1, __m64 __m2)
 143 {
 144   return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
 145 }
 146
 147 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
 148 static __inline __m64
 149 _mm_add_pi16 (__m64 __m1, __m64 __m2)
 150 {
 151   return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
 152 }
 153
 154 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
 155 static __inline __m64
 156 _mm_add_pi32 (__m64 __m1, __m64 __m2)
 157 {
 158   return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
 159 }
 160
 161 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
 162    saturated arithmetic.  */
 163 static __inline __m64
 164 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
 165 {
 166   return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
 167 }
 168
 169 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
 170    saturated arithmetic.  */
 171 static __inline __m64
 172 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
 173 {
 174   return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
 175 }
 176
 177 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 178    saturated arithmetic.  */
 179 static __inline __m64
 180 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
 181 {
 182   return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
 183 }
 184
 185 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 186    saturated arithmetic.  */
 187 static __inline __m64
 188 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
 189 {
 190   return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
 191 }
 192
 193 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
 194 static __inline __m64
 195 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
 196 {
 197   return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
 198 }
 199
 200 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
 201 static __inline __m64
 202 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
 203 {
 204   return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
 205 }
 206
 207 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
 208 static __inline __m64
 209 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
 210 {
 211   return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
 212 }
 213
 214 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
 215    saturating arithmetic.  */
 216 static __inline __m64
 217 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
 218 {
 219   return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
 220 }
 221
 222 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 223    signed saturating arithmetic.  */
 224 static __inline __m64
 225 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
 226 {
 227   return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
 228 }
 229
 230 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
 231    unsigned saturating arithmetic.  */
 232 static __inline __m64
 233 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
 234 {
 235   return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
 236 }
 237
 238 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 239    unsigned saturating arithmetic.  */
 240 static __inline __m64
 241 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
 242 {
 243   return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
 244 }
 245
 246 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
 247    four 32-bit intermediate results, which are then summed by pairs to
 248    produce two 32-bit results.  */
 249 static __inline __m64
 250 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
 251 {
 252   return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
 253 }
 254
 255 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
 256    M2 and produce the high 16 bits of the 32-bit results.  */
 257 static __inline __m64
 258 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
 259 {
 260   return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
 261 }
 262
 263 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
 264    the low 16 bits of the results.  */
 265 static __inline __m64
 266 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
 267 {
 268   return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
 269 }
 270
 271 /* Shift four 16-bit values in M left by COUNT.  */
 272 static __inline __m64
 273 _mm_sll_pi16 (__m64 __m, __m64 __count)
 274 {
 275   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
 276 }
 277
 278 static __inline __m64
 279 _mm_slli_pi16 (__m64 __m, int __count)
 280 {
 281   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
 282 }
 283
 284 /* Shift two 32-bit values in M left by COUNT.  */
 285 static __inline __m64
 286 _mm_sll_pi32 (__m64 __m, __m64 __count)
 287 {
 288   return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
 289 }
 290
 291 static __inline __m64
 292 _mm_slli_pi32 (__m64 __m, int __count)
 293 {
 294   return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
 295 }
 296
 297 /* Shift the 64-bit value in M left by COUNT.  */
 298 static __inline __m64
 299 _mm_sll_pi64 (__m64 __m, __m64 __count)
 300 {
 301   return (__m64) __builtin_ia32_psllq (__m, __count);
 302 }
 303
 304 static __inline __m64
 305 _mm_slli_pi64 (__m64 __m, int __count)
 306 {
 307   return (__m64) __builtin_ia32_psllq (__m, __count);
 308 }
 309
 310 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
 311 static __inline __m64
 312 _mm_sra_pi16 (__m64 __m, __m64 __count)
 313 {
 314   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
 315 }
 316
 317 static __inline __m64
 318 _mm_srai_pi16 (__m64 __m, int __count)
 319 {
 320   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
 321 }
 322
 323 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
 324 static __inline __m64
 325 _mm_sra_pi32 (__m64 __m, __m64 __count)
 326 {
 327   return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
 328 }
 329
 330 static __inline __m64
 331 _mm_srai_pi32 (__m64 __m, int __count)
 332 {
 333   return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
 334 }
 335
 336 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
 337 static __inline __m64
 338 _mm_srl_pi16 (__m64 __m, __m64 __count)
 339 {
 340   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
 341 }
 342
 343 static __inline __m64
 344 _mm_srli_pi16 (__m64 __m, int __count)
 345 {
 346   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
 347 }
 348
 349 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
 350 static __inline __m64
 351 _mm_srl_pi32 (__m64 __m, __m64 __count)
 352 {
 353   return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
 354 }
 355
 356 static __inline __m64
 357 _mm_srli_pi32 (__m64 __m, int __count)
 358 {
 359   return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
 360 }
 361
 362 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
 363 static __inline __m64
 364 _mm_srl_pi64 (__m64 __m, __m64 __count)
 365 {
 366   return (__m64) __builtin_ia32_psrlq (__m, __count);
 367 }
 368
 369 static __inline __m64
 370 _mm_srli_pi64 (__m64 __m, int __count)
 371 {
 372   return (__m64) __builtin_ia32_psrlq (__m, __count);
 373 }
 374
 375 /* Bit-wise AND the 64-bit values in M1 and M2.  */
 376 static __inline __m64
 377 _mm_and_si64 (__m64 __m1, __m64 __m2)
 378 {
 379   return __builtin_ia32_pand (__m1, __m2);
 380 }
 381
 382 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
 383    64-bit value in M2.  */
 384 static __inline __m64
 385 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 386 {
 387   return __builtin_ia32_pandn (__m1, __m2);
 388 }
 389
 390 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
 391 static __inline __m64
 392 _mm_or_si64 (__m64 __m1, __m64 __m2)
 393 {
 394   return __builtin_ia32_por (__m1, __m2);
 395 }
 396
 397 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
 398 static __inline __m64
 399 _mm_xor_si64 (__m64 __m1, __m64 __m2)
 400 {
 401   return __builtin_ia32_pxor (__m1, __m2);
 402 }
 403
 404 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
 405    test is true and zero if false.  */
 406 static __inline __m64
 407 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
 408 {
 409   return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
 410 }
 411
 412 static __inline __m64
 413 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
 414 {
 415   return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
 416 }
 417
 418 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
 419    the test is true and zero if false.  */
 420 static __inline __m64
 421 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
 422 {
 423   return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
 424 }
 425
 426 static __inline __m64
 427 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
 428 {
 429   return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
 430 }
 431
 432 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
 433    the test is true and zero if false.  */
 434 static __inline __m64
 435 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 436 {
 437   return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
 438 }
 439
 440 static __inline __m64
 441 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
 442 {
 443   return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
 444 }
 445
 446 /* Creates a 64-bit zero.  */
 447 static __inline __m64
 448 _mm_setzero_si64 (void)
 449 {
 450   return __builtin_ia32_mmx_zero ();
 451 }
 452
 453 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 454 static __inline __m64
 455 _mm_set_pi32 (int __i1, int __i0)
 456 {
 457   union {
 458     __m64 __q;
 459     struct {
 460       unsigned int __i0;
 461       unsigned int __i1;
 462     } __s;
 463   } __u;
 464
 465   __u.__s.__i0 = __i0;
 466   __u.__s.__i1 = __i1;
 467
 468   return __u.__q;
 469 }
 470
 471 /* Creates a vector of four 16-bit values; W0 is least significant.  */
 472 static __inline __m64
 473 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
 474 {
 475   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
 476   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
 477   return _mm_set_pi32 (__i1, __i0);
 478
 479 }
 480
 481 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
 482 static __inline __m64
 483 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
 484              char __b3, char __b2, char __b1, char __b0)
 485 {
 486   unsigned int __i1, __i0;
 487
 488   __i1 = (unsigned char)__b7;
 489   __i1 = __i1 << 8 | (unsigned char)__b6;
 490   __i1 = __i1 << 8 | (unsigned char)__b5;
 491   __i1 = __i1 << 8 | (unsigned char)__b4;
 492
 493   __i0 = (unsigned char)__b3;
 494   __i0 = __i0 << 8 | (unsigned char)__b2;
 495   __i0 = __i0 << 8 | (unsigned char)__b1;
 496   __i0 = __i0 << 8 | (unsigned char)__b0;
 497
 498   return _mm_set_pi32 (__i1, __i0);
 499 }
 500
 501 /* Similar, but with the arguments in reverse order.  */
 502 static __inline __m64
 503 _mm_setr_pi32 (int __i0, int __i1)
 504 {
 505   return _mm_set_pi32 (__i1, __i0);
 506 }
 507
 508 static __inline __m64
 509 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
 510 {
 511   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
 512 }
 513
 514 static __inline __m64
 515 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
 516               char __b4, char __b5, char __b6, char __b7)
 517 {
 518   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 519 }
 520
 521 /* Creates a vector of two 32-bit values, both elements containing I.  */
 522 static __inline __m64
 523 _mm_set1_pi32 (int __i)
 524 {
 525   return _mm_set_pi32 (__i, __i);
 526 }
 527
 528 /* Creates a vector of four 16-bit values, all elements containing W.  */
 529 static __inline __m64
 530 _mm_set1_pi16 (short __w)
 531 {
 532   unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
 533   return _mm_set1_pi32 (__i);
 534 }
 535
 536 /* Creates a vector of four 16-bit values, all elements containing B.  */
 537 static __inline __m64
 538 _mm_set1_pi8 (char __b)
 539 {
 540   unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
 541   unsigned int __i = __w << 16 | __w;
 542   return _mm_set1_pi32 (__i);
 543 }
 544
 545 #endif /* __MMX__ */
 546 #endif /* _MMINTRIN_H_INCLUDED */