1 /* Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of GNU CC.
5 GNU CC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) any later version.
10 GNU CC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GNU CC; see the file COPYING. If not, write to
17 the Free Software Foundation, 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 /* As a special exception, if you include this header file into source
21 files compiled by GCC, this header file does not by itself cause
22 the resulting executable to be covered by the GNU General Public
23 License. This exception does not however invalidate any other
24 reasons why the executable file might be covered by the GNU General Public License.  */
27 /* Implemented from the specification included in the Intel C++ Compiler
28 User Guide and Reference, version 5.0. */
30 #ifndef _MMINTRIN_H_INCLUDED
31 #define _MMINTRIN_H_INCLUDED
34 # error "MMX instruction set not enabled"
36 /* The data type intended for user use.  It is a plain unsigned long long
   carrying an explicit 8-byte alignment attribute; every intrinsic in this
   file takes and returns values of this type.  */
37 typedef unsigned long long __m64 __attribute__ ((__aligned__ (8)));
39 /* Internal data types for implementing the intrinsics.  Each one is
   declared through a machine-mode attribute (V2SI, V4HI, V8QI).  The
   intrinsics cast __m64 operands to these types before handing them to the
   builtins: __v2si for the 32-bit element operations, __v4hi for the 16-bit
   ones and __v8qi for the 8-bit ones.  */
40 typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
41 typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
42 typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
44 /* Empty the multimedia state.  The statement below invokes the emms
   builtin with no arguments and discards any result.
   NOTE(review): the function header that encloses this statement is not
   visible in this listing -- confirm it against the complete header.  */
48 __builtin_ia32_emms ();
51 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
53 _mm_cvtsi32_si64 (int __i)
55 return (unsigned int) __i;
58 /* Convert the lower 32 bits of the __m64 object into an integer.
   NOTE(review): only the declarator is visible in this listing; the
   return type and the body are absent -- confirm them against the
   complete header.  */
60 _mm_cvtsi64_si32 (__m64 __i)
65 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
66 the result, and the four 16-bit values from M2 into the upper four 8-bit
67 values of the result, all with signed saturation. */
69 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
71 return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
74 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
75 the result, and the two 32-bit values from M2 into the upper two 16-bit
76 values of the result, all with signed saturation. */
78 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
80 return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
83 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
84 the result, and the four 16-bit values from M2 into the upper four 8-bit
85 values of the result, all with unsigned saturation. */
87 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
89 return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
92 /* Interleave the four 8-bit values from the high half of M1 with the four
93 8-bit values from the high half of M2. */
95 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
97 return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
100 /* Interleave the two 16-bit values from the high half of M1 with the two
101 16-bit values from the high half of M2. */
102 static __inline __m64
103 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
105 return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
108 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
109 value from the high half of M2. */
110 static __inline __m64
111 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
113 return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
116 /* Interleave the four 8-bit values from the low half of M1 with the four
117 8-bit values from the low half of M2. */
118 static __inline __m64
119 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
121 return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
124 /* Interleave the two 16-bit values from the low half of M1 with the two
125 16-bit values from the low half of M2. */
126 static __inline __m64
127 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
129 return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
132 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
133 value from the low half of M2. */
134 static __inline __m64
135 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
137 return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
140 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
141 static __inline __m64
142 _mm_add_pi8 (__m64 __m1, __m64 __m2)
144 return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
147 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
148 static __inline __m64
149 _mm_add_pi16 (__m64 __m1, __m64 __m2)
151 return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
154 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
155 static __inline __m64
156 _mm_add_pi32 (__m64 __m1, __m64 __m2)
158 return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
161 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
162 saturated arithmetic. */
163 static __inline __m64
164 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
166 return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
169 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
170 saturated arithmetic. */
171 static __inline __m64
172 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
174 return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
177 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
178 saturated arithmetic. */
179 static __inline __m64
180 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
182 return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
185 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
186 saturated arithmetic. */
187 static __inline __m64
188 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
190 return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
193 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
194 static __inline __m64
195 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
197 return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
200 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
201 static __inline __m64
202 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
204 return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
207 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
208 static __inline __m64
209 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
211 return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
214 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
215 saturating arithmetic. */
216 static __inline __m64
217 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
219 return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
222 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
223 signed saturating arithmetic. */
224 static __inline __m64
225 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
227 return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
230 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
231 unsigned saturating arithmetic. */
232 static __inline __m64
233 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
235 return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
238 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
239 unsigned saturating arithmetic. */
240 static __inline __m64
241 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
243 return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
246 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
247 four 32-bit intermediate results, which are then summed by pairs to
248 produce two 32-bit results. */
249 static __inline __m64
250 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
252 return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
255 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
256 M2 and produce the high 16 bits of the 32-bit results. */
257 static __inline __m64
258 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
260 return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
263 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
264 the low 16 bits of the results. */
265 static __inline __m64
266 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
268 return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
271 /* Shift four 16-bit values in M left by COUNT. */
272 static __inline __m64
273 _mm_sll_pi16 (__m64 __m, __m64 __count)
275 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
278 static __inline __m64
279 _mm_slli_pi16 (__m64 __m, int __count)
281 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
284 /* Shift two 32-bit values in M left by COUNT. */
285 static __inline __m64
286 _mm_sll_pi32 (__m64 __m, __m64 __count)
288 return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
291 static __inline __m64
292 _mm_slli_pi32 (__m64 __m, int __count)
294 return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
297 /* Shift the 64-bit value in M left by COUNT. */
298 static __inline __m64
299 _mm_sll_pi64 (__m64 __m, __m64 __count)
301 return (__m64) __builtin_ia32_psllq (__m, __count);
304 static __inline __m64
305 _mm_slli_pi64 (__m64 __m, int __count)
307 return (__m64) __builtin_ia32_psllq (__m, __count);
310 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
311 static __inline __m64
312 _mm_sra_pi16 (__m64 __m, __m64 __count)
314 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
317 static __inline __m64
318 _mm_srai_pi16 (__m64 __m, int __count)
320 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
323 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
324 static __inline __m64
325 _mm_sra_pi32 (__m64 __m, __m64 __count)
327 return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
330 static __inline __m64
331 _mm_srai_pi32 (__m64 __m, int __count)
333 return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
336 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
337 static __inline __m64
338 _mm_srl_pi16 (__m64 __m, __m64 __count)
340 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
343 static __inline __m64
344 _mm_srli_pi16 (__m64 __m, int __count)
346 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
349 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
350 static __inline __m64
351 _mm_srl_pi32 (__m64 __m, __m64 __count)
353 return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
356 static __inline __m64
357 _mm_srli_pi32 (__m64 __m, int __count)
359 return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
362 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
363 static __inline __m64
364 _mm_srl_pi64 (__m64 __m, __m64 __count)
366 return (__m64) __builtin_ia32_psrlq (__m, __count);
369 static __inline __m64
370 _mm_srli_pi64 (__m64 __m, int __count)
372 return (__m64) __builtin_ia32_psrlq (__m, __count);
375 /* Bit-wise AND the 64-bit values in M1 and M2. */
376 static __inline __m64
377 _mm_and_si64 (__m64 __m1, __m64 __m2)
379 return __builtin_ia32_pand (__m1, __m2);
382 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
383 64-bit value in M2. */
384 static __inline __m64
385 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
387 return __builtin_ia32_pandn (__m1, __m2);
390 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
391 static __inline __m64
392 _mm_or_si64 (__m64 __m1, __m64 __m2)
394 return __builtin_ia32_por (__m1, __m2);
397 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
398 static __inline __m64
399 _mm_xor_si64 (__m64 __m1, __m64 __m2)
401 return __builtin_ia32_pxor (__m1, __m2);
404 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
405 test is true and zero if false. */
406 static __inline __m64
407 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
409 return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
412 static __inline __m64
413 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
415 return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
418 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
419 the test is true and zero if false. */
420 static __inline __m64
421 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
423 return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
426 static __inline __m64
427 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
429 return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
432 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
433 the test is true and zero if false. */
434 static __inline __m64
435 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
437 return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
440 static __inline __m64
441 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
443 return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
446 /* Creates a 64-bit zero. */
447 static __inline __m64
448 _mm_setzero_si64 (void)
450 return __builtin_ia32_mmx_zero ();
453 /* Creates a vector of two 32-bit values; I0 is least significant.
   The other _mm_set*, _mm_setr* and _mm_set1* constructors in this file
   all build their result by calling this function.
   NOTE(review): the body of this function is not visible in this listing
   -- confirm how I1 and I0 are combined against the complete header.  */
454 static __inline __m64
455 _mm_set_pi32 (int __i1, int __i0)
471 /* Creates a vector of four 16-bit values; W0 is least significant. */
472 static __inline __m64
473 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
475 unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
476 unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
477 return _mm_set_pi32 (__i1, __i0);
481 /* Creates a vector of eight 8-bit values; B0 is least significant. */
482 static __inline __m64
483 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
484 char __b3, char __b2, char __b1, char __b0)
486 unsigned int __i1, __i0;
488 __i1 = (unsigned char)__b7;
489 __i1 = __i1 << 8 | (unsigned char)__b6;
490 __i1 = __i1 << 8 | (unsigned char)__b5;
491 __i1 = __i1 << 8 | (unsigned char)__b4;
493 __i0 = (unsigned char)__b3;
494 __i0 = __i0 << 8 | (unsigned char)__b2;
495 __i0 = __i0 << 8 | (unsigned char)__b1;
496 __i0 = __i0 << 8 | (unsigned char)__b0;
498 return _mm_set_pi32 (__i1, __i0);
501 /* Similar, but with the arguments in reverse order. */
502 static __inline __m64
503 _mm_setr_pi32 (int __i0, int __i1)
505 return _mm_set_pi32 (__i1, __i0);
508 static __inline __m64
509 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
511 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
514 static __inline __m64
515 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
516 char __b4, char __b5, char __b6, char __b7)
518 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
521 /* Creates a vector of two 32-bit values, both elements containing I. */
522 static __inline __m64
523 _mm_set1_pi32 (int __i)
525 return _mm_set_pi32 (__i, __i);
528 /* Creates a vector of four 16-bit values, all elements containing W. */
529 static __inline __m64
530 _mm_set1_pi16 (short __w)
532 unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
533 return _mm_set1_pi32 (__i);
536 /* Creates a vector of four 16-bit values, all elements containing B. */
537 static __inline __m64
538 _mm_set1_pi8 (char __b)
540 unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
541 unsigned int __i = __w << 16 | __w;
542 return _mm_set1_pi32 (__i);
546 #endif /* _MMINTRIN_H_INCLUDED */