/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.

   This file is part of GNU CC.

   GNU CC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GNU CC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GNU CC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */
/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */
30 #ifndef _MMINTRIN_H_INCLUDED
31 #define _MMINTRIN_H_INCLUDED
34 # error "MMX instruction set not enabled"
36 /* The data type intended for user use. */
37 typedef int __m64 __attribute__ ((__mode__ (__V2SI__)));
39 /* Internal data types for implementing the intrinsics. */
40 typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
41 typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
42 typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
/* Empty the multimedia state (executes the EMMS instruction), making the
   x87 FPU usable again after MMX code.  */
static __inline void
_mm_empty (void)
{
  __builtin_ia32_emms ();
}
51 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
53 _mm_cvtsi32_si64 (int __i)
55 long long __tmp = (unsigned int)__i;
60 /* Convert I to a __m64 object. */
62 _mm_cvtsi64x_si64 (long long __i)
67 /* Convert I to a __m64 object. */
69 _mm_set_pi64x (long long __i)
75 /* Convert the lower 32 bits of the __m64 object into an integer. */
77 _mm_cvtsi64_si32 (__m64 __i)
79 long long __tmp = (long long)__i;
84 /* Convert the lower 32 bits of the __m64 object into an integer. */
85 static __inline long long
86 _mm_cvtsi64_si64x (__m64 __i)
88 return (long long)__i;
92 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
93 the result, and the four 16-bit values from M2 into the upper four 8-bit
94 values of the result, all with signed saturation. */
96 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
98 return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
101 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
102 the result, and the two 32-bit values from M2 into the upper two 16-bit
103 values of the result, all with signed saturation. */
104 static __inline __m64
105 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
107 return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
110 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
111 the result, and the four 16-bit values from M2 into the upper four 8-bit
112 values of the result, all with unsigned saturation. */
113 static __inline __m64
114 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
116 return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
119 /* Interleave the four 8-bit values from the high half of M1 with the four
120 8-bit values from the high half of M2. */
121 static __inline __m64
122 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
124 return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
127 /* Interleave the two 16-bit values from the high half of M1 with the two
128 16-bit values from the high half of M2. */
129 static __inline __m64
130 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
132 return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
135 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
136 value from the high half of M2. */
137 static __inline __m64
138 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
140 return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
143 /* Interleave the four 8-bit values from the low half of M1 with the four
144 8-bit values from the low half of M2. */
145 static __inline __m64
146 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
148 return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
151 /* Interleave the two 16-bit values from the low half of M1 with the two
152 16-bit values from the low half of M2. */
153 static __inline __m64
154 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
156 return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
159 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
160 value from the low half of M2. */
161 static __inline __m64
162 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
164 return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
167 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
168 static __inline __m64
169 _mm_add_pi8 (__m64 __m1, __m64 __m2)
171 return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
174 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
175 static __inline __m64
176 _mm_add_pi16 (__m64 __m1, __m64 __m2)
178 return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
181 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
182 static __inline __m64
183 _mm_add_pi32 (__m64 __m1, __m64 __m2)
185 return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
188 /* Add the 64-bit values in M1 to the 64-bit values in M2. */
189 static __inline __m64
190 _mm_add_si64 (__m64 __m1, __m64 __m2)
192 return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
195 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
196 saturated arithmetic. */
197 static __inline __m64
198 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
200 return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
203 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
204 saturated arithmetic. */
205 static __inline __m64
206 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
208 return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
211 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
212 saturated arithmetic. */
213 static __inline __m64
214 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
216 return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
219 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
220 saturated arithmetic. */
221 static __inline __m64
222 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
224 return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
227 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
228 static __inline __m64
229 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
231 return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
234 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
235 static __inline __m64
236 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
238 return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
241 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
242 static __inline __m64
243 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
245 return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
248 /* Add the 64-bit values in M1 to the 64-bit values in M2. */
249 static __inline __m64
250 _mm_sub_si64 (__m64 __m1, __m64 __m2)
252 return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
255 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
256 saturating arithmetic. */
257 static __inline __m64
258 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
260 return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
263 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
264 signed saturating arithmetic. */
265 static __inline __m64
266 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
268 return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
271 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
272 unsigned saturating arithmetic. */
273 static __inline __m64
274 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
276 return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
279 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
280 unsigned saturating arithmetic. */
281 static __inline __m64
282 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
284 return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
287 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
288 four 32-bit intermediate results, which are then summed by pairs to
289 produce two 32-bit results. */
290 static __inline __m64
291 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
293 return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
296 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
297 M2 and produce the high 16 bits of the 32-bit results. */
298 static __inline __m64
299 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
301 return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
304 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
305 the low 16 bits of the results. */
306 static __inline __m64
307 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
309 return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
312 /* Shift four 16-bit values in M left by COUNT. */
313 static __inline __m64
314 _mm_sll_pi16 (__m64 __m, __m64 __count)
316 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
319 static __inline __m64
320 _mm_slli_pi16 (__m64 __m, int __count)
322 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
325 /* Shift two 32-bit values in M left by COUNT. */
326 static __inline __m64
327 _mm_sll_pi32 (__m64 __m, __m64 __count)
329 return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
332 static __inline __m64
333 _mm_slli_pi32 (__m64 __m, int __count)
335 return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
338 /* Shift the 64-bit value in M left by COUNT. */
339 static __inline __m64
340 _mm_sll_si64 (__m64 __m, __m64 __count)
342 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
345 static __inline __m64
346 _mm_slli_si64 (__m64 __m, int __count)
348 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
351 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
352 static __inline __m64
353 _mm_sra_pi16 (__m64 __m, __m64 __count)
355 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
358 static __inline __m64
359 _mm_srai_pi16 (__m64 __m, int __count)
361 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
364 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
365 static __inline __m64
366 _mm_sra_pi32 (__m64 __m, __m64 __count)
368 return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
371 static __inline __m64
372 _mm_srai_pi32 (__m64 __m, int __count)
374 return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
377 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
378 static __inline __m64
379 _mm_srl_pi16 (__m64 __m, __m64 __count)
381 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
384 static __inline __m64
385 _mm_srli_pi16 (__m64 __m, int __count)
387 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
390 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
391 static __inline __m64
392 _mm_srl_pi32 (__m64 __m, __m64 __count)
394 return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
397 static __inline __m64
398 _mm_srli_pi32 (__m64 __m, int __count)
400 return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
403 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
404 static __inline __m64
405 _mm_srl_si64 (__m64 __m, __m64 __count)
407 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
410 static __inline __m64
411 _mm_srli_si64 (__m64 __m, int __count)
413 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
416 /* Bit-wise AND the 64-bit values in M1 and M2. */
417 static __inline __m64
418 _mm_and_si64 (__m64 __m1, __m64 __m2)
420 return (__m64) __builtin_ia32_pand ((long long)__m1, (long long)__m2);
423 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
424 64-bit value in M2. */
425 static __inline __m64
426 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
428 return (__m64) __builtin_ia32_pandn ((long long)__m1, (long long)__m2);
431 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
432 static __inline __m64
433 _mm_or_si64 (__m64 __m1, __m64 __m2)
435 return (__m64)__builtin_ia32_por ((long long)__m1, (long long)__m2);
438 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
439 static __inline __m64
440 _mm_xor_si64 (__m64 __m1, __m64 __m2)
442 return (__m64)__builtin_ia32_pxor ((long long)__m1, (long long)__m2);
445 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
446 test is true and zero if false. */
447 static __inline __m64
448 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
450 return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
453 static __inline __m64
454 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
456 return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
459 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
460 the test is true and zero if false. */
461 static __inline __m64
462 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
464 return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
467 static __inline __m64
468 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
470 return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
473 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
474 the test is true and zero if false. */
475 static __inline __m64
476 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
478 return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
481 static __inline __m64
482 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
484 return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
487 /* Creates a 64-bit zero. */
488 static __inline __m64
489 _mm_setzero_si64 (void)
491 return (__m64)__builtin_ia32_mmx_zero ();
494 /* Creates a vector of two 32-bit values; I0 is least significant. */
495 static __inline __m64
496 _mm_set_pi32 (int __i1, int __i0)
512 /* Creates a vector of four 16-bit values; W0 is least significant. */
513 static __inline __m64
514 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
516 unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
517 unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
518 return _mm_set_pi32 (__i1, __i0);
522 /* Creates a vector of eight 8-bit values; B0 is least significant. */
523 static __inline __m64
524 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
525 char __b3, char __b2, char __b1, char __b0)
527 unsigned int __i1, __i0;
529 __i1 = (unsigned char)__b7;
530 __i1 = __i1 << 8 | (unsigned char)__b6;
531 __i1 = __i1 << 8 | (unsigned char)__b5;
532 __i1 = __i1 << 8 | (unsigned char)__b4;
534 __i0 = (unsigned char)__b3;
535 __i0 = __i0 << 8 | (unsigned char)__b2;
536 __i0 = __i0 << 8 | (unsigned char)__b1;
537 __i0 = __i0 << 8 | (unsigned char)__b0;
539 return _mm_set_pi32 (__i1, __i0);
542 /* Similar, but with the arguments in reverse order. */
543 static __inline __m64
544 _mm_setr_pi32 (int __i0, int __i1)
546 return _mm_set_pi32 (__i1, __i0);
549 static __inline __m64
550 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
552 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
555 static __inline __m64
556 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
557 char __b4, char __b5, char __b6, char __b7)
559 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
562 /* Creates a vector of two 32-bit values, both elements containing I. */
563 static __inline __m64
564 _mm_set1_pi32 (int __i)
566 return _mm_set_pi32 (__i, __i);
569 /* Creates a vector of four 16-bit values, all elements containing W. */
570 static __inline __m64
571 _mm_set1_pi16 (short __w)
573 unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
574 return _mm_set1_pi32 (__i);
577 /* Creates a vector of four 16-bit values, all elements containing B. */
578 static __inline __m64
579 _mm_set1_pi8 (char __b)
581 unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
582 unsigned int __i = __w << 16 | __w;
583 return _mm_set1_pi32 (__i);
/* Alternate intrinsic name definitions (the _m_* instruction-mnemonic
   aliases used by some vendor headers).  */
#define _m_empty _mm_empty
#define _m_from_int _mm_cvtsi32_si64
#define _m_to_int _mm_cvtsi64_si32
/* Pack/unpack.  */
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
/* Arithmetic.  */
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmullw _mm_mullo_pi16
/* Shifts.  */
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
/* Logical.  */
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
/* Comparisons.  */
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtd _mm_cmpgt_pi32
644 #endif /* _MMINTRIN_H_INCLUDED */