2 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
25 #ifndef _IMMINTRIN_H_INCLUDED
26 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
29 /* Sum absolute 8-bit integer difference of adjacent groups of 4
30 byte integers in the first 2 operands. Starting offsets within
31 operands are determined by the 3rd mask operand. */
/* Return, as __m256i, the result of __builtin_ia32_mpsadbw256, whose first
   operand is __X viewed as __v32qi.
   NOTE(review): the call is cut off after its first argument here; the
   remaining operands (presumably __Y and __M) are not shown -- confirm.  */
33 extern __inline __m256i
34 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
35 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
37 return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
/* Macro form of _mm256_mpsadbw_epu8: X and Y are cast through __m256i to
   __v32qi and M to int, then passed to __builtin_ia32_mpsadbw256; the
   result is cast to __m256i.  */
41 #define _mm256_mpsadbw_epu8(X, Y, M) \
42 ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
43 (__v32qi)(__m256i)(Y), (int)(M)))
/* Pass __A, viewed as __v32qi, to __builtin_ia32_pabsb256 and return the
   result as __m256i.  Presumably a per-byte absolute value (VPABSB) --
   confirm.  */
46 extern __inline __m256i
47 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
48 _mm256_abs_epi8 (__m256i __A)
50 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
/* Pass __A, viewed as __v16hi, to __builtin_ia32_pabsw256 and return the
   result as __m256i.  Presumably a per-16-bit-element absolute value
   (VPABSW) -- confirm.  */
53 extern __inline __m256i
54 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
55 _mm256_abs_epi16 (__m256i __A)
57 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
/* Pass __A, viewed as __v8si, to __builtin_ia32_pabsd256 and return the
   result as __m256i.  Presumably a per-32-bit-element absolute value
   (VPABSD) -- confirm.  */
60 extern __inline __m256i
61 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
62 _mm256_abs_epi32 (__m256i __A)
64 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_packssdw256
   and return the result as __m256i.  Presumably a 32-bit to 16-bit pack
   with signed saturation (VPACKSSDW) -- confirm.  */
67 extern __inline __m256i
68 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
69 _mm256_packs_epi32 (__m256i __A, __m256i __B)
71 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_packsswb256
   and return the result as __m256i.  Presumably a 16-bit to 8-bit pack
   with signed saturation (VPACKSSWB) -- confirm.  */
74 extern __inline __m256i
75 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
76 _mm256_packs_epi16 (__m256i __A, __m256i __B)
78 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_packusdw256
   and return the result as __m256i.  Presumably a 32-bit to 16-bit pack
   with unsigned saturation (VPACKUSDW) -- confirm.  */
81 extern __inline __m256i
82 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
83 _mm256_packus_epi32 (__m256i __A, __m256i __B)
85 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_packuswb256
   and return the result as __m256i.  Presumably a 16-bit to 8-bit pack
   with unsigned saturation (VPACKUSWB) -- confirm.  */
88 extern __inline __m256i
89 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
90 _mm256_packus_epi16 (__m256i __A, __m256i __B)
92 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_paddb256 and
   return the result as __m256i.  Presumably a per-byte add (VPADDB) --
   confirm.  */
95 extern __inline __m256i
96 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
97 _mm256_add_epi8 (__m256i __A, __m256i __B)
99 return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_paddw256 and
   return the result as __m256i.  Presumably a per-16-bit-element add
   (VPADDW) -- confirm.  */
102 extern __inline __m256i
103 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
104 _mm256_add_epi16 (__m256i __A, __m256i __B)
106 return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_paddd256 and
   return the result as __m256i.  Presumably a per-32-bit-element add
   (VPADDD) -- confirm.  */
109 extern __inline __m256i
110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
111 _mm256_add_epi32 (__m256i __A, __m256i __B)
113 return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_paddq256 and
   return the result as __m256i.  Presumably a per-64-bit-element add
   (VPADDQ) -- confirm.  */
116 extern __inline __m256i
117 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
118 _mm256_add_epi64 (__m256i __A, __m256i __B)
120 return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_paddsb256
   and return the result as __m256i.  Presumably a per-byte add with
   signed saturation (VPADDSB) -- confirm.  */
123 extern __inline __m256i
124 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
125 _mm256_adds_epi8 (__m256i __A, __m256i __B)
127 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_paddsw256
   and return the result as __m256i.  Presumably a per-16-bit-element add
   with signed saturation (VPADDSW) -- confirm.  */
130 extern __inline __m256i
131 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
132 _mm256_adds_epi16 (__m256i __A, __m256i __B)
134 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_paddusb256
   and return the result as __m256i.  Presumably a per-byte add with
   unsigned saturation (VPADDUSB) -- confirm.  */
137 extern __inline __m256i
138 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
139 _mm256_adds_epu8 (__m256i __A, __m256i __B)
141 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_paddusw256
   and return the result as __m256i.  Presumably a per-16-bit-element add
   with unsigned saturation (VPADDUSW) -- confirm.  */
144 extern __inline __m256i
145 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
146 _mm256_adds_epu16 (__m256i __A, __m256i __B)
148 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
/* Return, as __m256i, the result of __builtin_ia32_palignr256, whose first
   operand is __A viewed as __v4di.
   NOTE(review): the call is cut off after its first argument here; how
   __B and __N are passed is not shown -- confirm.  */
152 extern __inline __m256i
153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
154 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
156 return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
161 /* In that case (__N*8) will be in vreg, and insn will not be matched. */
162 /* Use a #define instead.  */
/* Macro form of _mm256_alignr_epi8: A and B are cast through __m256i to
   __v4di and passed to __builtin_ia32_palignr256.
   NOTE(review): the definition ends on a line continuation here; the
   operand derived from N is not shown -- confirm.  */
163 #define _mm256_alignr_epi8(A, B, N) \
164 ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
165 (__v4di)(__m256i)(B), \
169 extern __inline __m256i
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_andsi256 and
   return the result as __m256i.  Presumably a 256-bit bitwise AND (VPAND)
   -- confirm.  */
170 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
171 _mm256_and_si256 (__m256i __A, __m256i __B)
173 return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_andnotsi256
   and return the result as __m256i.  Presumably (~__A) & __B over 256 bits
   (VPANDN); which operand is complemented is not visible here --
   confirm.  */
176 extern __inline __m256i
177 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
178 _mm256_andnot_si256 (__m256i __A, __m256i __B)
180 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_pavgb256 and
   return the result as __m256i.  Presumably a per-byte unsigned average
   (VPAVGB) -- confirm.  */
183 extern __inline __m256i
184 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
185 _mm256_avg_epu8 (__m256i __A, __m256i __B)
187 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pavgw256 and
   return the result as __m256i.  Presumably a per-16-bit-element unsigned
   average (VPAVGW) -- confirm.  */
190 extern __inline __m256i
191 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
192 _mm256_avg_epu16 (__m256i __A, __m256i __B)
194 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
/* Return, as __m256i, the result of __builtin_ia32_pblendvb256, whose first
   operand is __X viewed as __v32qi.
   NOTE(review): the call is cut off after its first argument here; the
   remaining operands (presumably __Y and the selector __M) are not shown
   -- confirm.  */
197 extern __inline __m256i
198 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
199 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
201 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
/* Return, as __m256i, the result of __builtin_ia32_pblendw256, whose first
   operand is __X viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   remaining operands (presumably __Y and __M) are not shown -- confirm.  */
207 extern __inline __m256i
208 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
209 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
211 return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
/* Macro form of _mm256_blend_epi16: X and Y are cast through __m256i to
   __v16hi and M to int, then passed to __builtin_ia32_pblendw256; the
   result is cast to __m256i.  */
216 #define _mm256_blend_epi16(X, Y, M) \
217 ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
218 (__v16hi)(__m256i)(Y), (int)(M)))
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_pcmpeqb256
   and return the result as __m256i.  Presumably a per-byte equality
   compare producing a mask (VPCMPEQB) -- confirm.  */
221 extern __inline __m256i
222 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
223 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
225 return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pcmpeqw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   equality compare (VPCMPEQW) -- confirm.  */
228 extern __inline __m256i
229 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
230 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
232 return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pcmpeqd256
   and return the result as __m256i.  Presumably a per-32-bit-element
   equality compare (VPCMPEQD) -- confirm.  */
235 extern __inline __m256i
236 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
237 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
239 return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_pcmpeqq256
   and return the result as __m256i.  Presumably a per-64-bit-element
   equality compare (VPCMPEQQ) -- confirm.  */
242 extern __inline __m256i
243 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
244 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
246 return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
/* Return, as __m256i, the result of __builtin_ia32_pcmpgtb256, whose first
   operand is __A viewed as __v32qi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __B) is not shown -- confirm.  */
249 extern __inline __m256i
250 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
251 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
253 return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
/* Return, as __m256i, the result of __builtin_ia32_pcmpgtw256, whose first
   operand is __A viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __B) is not shown -- confirm.  */
257 extern __inline __m256i
258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
259 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
261 return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
/* Return, as __m256i, the result of __builtin_ia32_pcmpgtd256, whose first
   operand is __A viewed as __v8si.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __B) is not shown -- confirm.  */
265 extern __inline __m256i
266 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
267 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
269 return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_pcmpgtq256
   and return the result as __m256i.  Presumably a per-64-bit-element
   signed greater-than compare (VPCMPGTQ) -- confirm.  */
273 extern __inline __m256i
274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
275 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
277 return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
/* Return, as __m256i, the result of __builtin_ia32_phaddw256, whose first
   operand is __X viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __Y) is not shown -- confirm.  */
280 extern __inline __m256i
281 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
282 _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
284 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
/* Pass __X and __Y, both viewed as __v8si, to __builtin_ia32_phaddd256 and
   return the result as __m256i.  Presumably a horizontal add of adjacent
   32-bit elements (VPHADDD) -- confirm.  */
288 extern __inline __m256i
289 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
290 _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
292 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
/* Return, as __m256i, the result of __builtin_ia32_phaddsw256, whose first
   operand is __X viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __Y) is not shown -- confirm.  */
295 extern __inline __m256i
296 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
297 _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
299 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
/* Return, as __m256i, the result of __builtin_ia32_phsubw256, whose first
   operand is __X viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __Y) is not shown -- confirm.  */
303 extern __inline __m256i
304 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
305 _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
307 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
/* Pass __X and __Y, both viewed as __v8si, to __builtin_ia32_phsubd256 and
   return the result as __m256i.  Presumably a horizontal subtract of
   adjacent 32-bit elements (VPHSUBD) -- confirm.  */
311 extern __inline __m256i
312 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
313 _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
315 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
/* Return, as __m256i, the result of __builtin_ia32_phsubsw256, whose first
   operand is __X viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __Y) is not shown -- confirm.  */
318 extern __inline __m256i
319 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
320 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
322 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
/* Return, as __m256i, the result of __builtin_ia32_pmaddubsw256, whose
   first operand is __X viewed as __v32qi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __Y) is not shown -- confirm.  */
326 extern __inline __m256i
327 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
328 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
330 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
/* Return, as __m256i, the result of __builtin_ia32_pmaddwd256, whose first
   operand is __A viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __B) is not shown -- confirm.  */
334 extern __inline __m256i
335 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
336 _mm256_madd_epi16 (__m256i __A, __m256i __B)
338 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_pmaxsb256
   and return the result as __m256i.  Presumably a per-byte signed maximum
   (VPMAXSB) -- confirm.  */
342 extern __inline __m256i
343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
344 _mm256_max_epi8 (__m256i __A, __m256i __B)
346 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pmaxsw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   signed maximum (VPMAXSW) -- confirm.  */
349 extern __inline __m256i
350 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
351 _mm256_max_epi16 (__m256i __A, __m256i __B)
353 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pmaxsd256 and
   return the result as __m256i.  Presumably a per-32-bit-element signed
   maximum (VPMAXSD) -- confirm.  */
356 extern __inline __m256i
357 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
358 _mm256_max_epi32 (__m256i __A, __m256i __B)
360 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_pmaxub256
   and return the result as __m256i.  Presumably a per-byte unsigned
   maximum (VPMAXUB) -- confirm.  */
363 extern __inline __m256i
364 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
365 _mm256_max_epu8 (__m256i __A, __m256i __B)
367 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pmaxuw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   unsigned maximum (VPMAXUW) -- confirm.  */
370 extern __inline __m256i
371 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
372 _mm256_max_epu16 (__m256i __A, __m256i __B)
374 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pmaxud256 and
   return the result as __m256i.  Presumably a per-32-bit-element unsigned
   maximum (VPMAXUD) -- confirm.  */
377 extern __inline __m256i
378 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
379 _mm256_max_epu32 (__m256i __A, __m256i __B)
381 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_pminsb256
   and return the result as __m256i.  Presumably a per-byte signed minimum
   (VPMINSB) -- confirm.  */
384 extern __inline __m256i
385 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
386 _mm256_min_epi8 (__m256i __A, __m256i __B)
388 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pminsw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   signed minimum (VPMINSW) -- confirm.  */
391 extern __inline __m256i
392 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
393 _mm256_min_epi16 (__m256i __A, __m256i __B)
395 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pminsd256 and
   return the result as __m256i.  Presumably a per-32-bit-element signed
   minimum (VPMINSD) -- confirm.  */
398 extern __inline __m256i
399 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
400 _mm256_min_epi32 (__m256i __A, __m256i __B)
402 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_pminub256
   and return the result as __m256i.  Presumably a per-byte unsigned
   minimum (VPMINUB) -- confirm.  */
405 extern __inline __m256i
406 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
407 _mm256_min_epu8 (__m256i __A, __m256i __B)
409 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pminuw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   unsigned minimum (VPMINUW) -- confirm.  */
412 extern __inline __m256i
413 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
414 _mm256_min_epu16 (__m256i __A, __m256i __B)
416 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pminud256 and
   return the result as __m256i.  Presumably a per-32-bit-element unsigned
   minimum (VPMINUD) -- confirm.  */
419 extern __inline __m256i
420 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
421 _mm256_min_epu32 (__m256i __A, __m256i __B)
423 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
/* Return the value of __builtin_ia32_pmovmskb256 applied to __A viewed as
   __v32qi; the result is returned without a cast.  Presumably a bit mask
   built from the top bit of each byte (VPMOVMSKB) -- confirm.
   NOTE(review): unlike the neighbouring definitions, no
   "extern __inline <type>" line precedes the attribute here, so the
   declared return type is not shown.  */
427 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
428 _mm256_movemask_epi8 (__m256i __A)
430 return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
/* Pass the 128-bit __X, viewed as __v16qi, to __builtin_ia32_pmovsxbw256
   and return the result as __m256i.  Presumably sign-extends 8-bit
   elements to 16 bits (VPMOVSXBW) -- confirm.  */
433 extern __inline __m256i
434 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
435 _mm256_cvtepi8_epi16 (__m128i __X)
437 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v16qi, to __builtin_ia32_pmovsxbd256
   and return the result as __m256i.  Presumably sign-extends 8-bit
   elements to 32 bits (VPMOVSXBD) -- confirm.  */
440 extern __inline __m256i
441 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
442 _mm256_cvtepi8_epi32 (__m128i __X)
444 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v16qi, to __builtin_ia32_pmovsxbq256
   and return the result as __m256i.  Presumably sign-extends 8-bit
   elements to 64 bits (VPMOVSXBQ) -- confirm.  */
447 extern __inline __m256i
448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
449 _mm256_cvtepi8_epi64 (__m128i __X)
451 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v8hi, to __builtin_ia32_pmovsxwd256
   and return the result as __m256i.  Presumably sign-extends 16-bit
   elements to 32 bits (VPMOVSXWD) -- confirm.  */
454 extern __inline __m256i
455 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
456 _mm256_cvtepi16_epi32 (__m128i __X)
458 return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
/* Pass the 128-bit __X, viewed as __v8hi, to __builtin_ia32_pmovsxwq256
   and return the result as __m256i.  Presumably sign-extends 16-bit
   elements to 64 bits (VPMOVSXWQ) -- confirm.  */
461 extern __inline __m256i
462 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
463 _mm256_cvtepi16_epi64 (__m128i __X)
465 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
/* Pass the 128-bit __X, viewed as __v4si, to __builtin_ia32_pmovsxdq256
   and return the result as __m256i.  Presumably sign-extends 32-bit
   elements to 64 bits (VPMOVSXDQ) -- confirm.  */
468 extern __inline __m256i
469 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
470 _mm256_cvtepi32_epi64 (__m128i __X)
472 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
/* Pass the 128-bit __X, viewed as __v16qi, to __builtin_ia32_pmovzxbw256
   and return the result as __m256i.  Presumably zero-extends 8-bit
   elements to 16 bits (VPMOVZXBW) -- confirm.  */
475 extern __inline __m256i
476 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
477 _mm256_cvtepu8_epi16 (__m128i __X)
479 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v16qi, to __builtin_ia32_pmovzxbd256
   and return the result as __m256i.  Presumably zero-extends 8-bit
   elements to 32 bits (VPMOVZXBD) -- confirm.  */
482 extern __inline __m256i
483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
484 _mm256_cvtepu8_epi32 (__m128i __X)
486 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v16qi, to __builtin_ia32_pmovzxbq256
   and return the result as __m256i.  Presumably zero-extends 8-bit
   elements to 64 bits (VPMOVZXBQ) -- confirm.  */
489 extern __inline __m256i
490 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
491 _mm256_cvtepu8_epi64 (__m128i __X)
493 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v8hi, to __builtin_ia32_pmovzxwd256
   and return the result as __m256i.  Presumably zero-extends 16-bit
   elements to 32 bits (VPMOVZXWD) -- confirm.  */
496 extern __inline __m256i
497 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
498 _mm256_cvtepu16_epi32 (__m128i __X)
500 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
/* Pass the 128-bit __X, viewed as __v8hi, to __builtin_ia32_pmovzxwq256
   and return the result as __m256i.  Presumably zero-extends 16-bit
   elements to 64 bits (VPMOVZXWQ) -- confirm.  */
503 extern __inline __m256i
504 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
505 _mm256_cvtepu16_epi64 (__m128i __X)
507 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
/* Pass the 128-bit __X, viewed as __v4si, to __builtin_ia32_pmovzxdq256
   and return the result as __m256i.  Presumably zero-extends 32-bit
   elements to 64 bits (VPMOVZXDQ) -- confirm.  */
510 extern __inline __m256i
511 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
512 _mm256_cvtepu32_epi64 (__m128i __X)
514 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
/* Pass __X and __Y, both viewed as __v8si, to __builtin_ia32_pmuldq256 and
   return the result as __m256i.  Presumably a signed 32x32 -> 64-bit
   multiply of selected elements (VPMULDQ) -- confirm.  */
517 extern __inline __m256i
518 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
519 _mm256_mul_epi32 (__m256i __X, __m256i __Y)
521 return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
/* Return, as __m256i, the result of __builtin_ia32_pmulhrsw256, whose
   first operand is __X viewed as __v16hi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably __Y) is not shown -- confirm.  */
524 extern __inline __m256i
525 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
526 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
528 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pmulhuw256
   and return the result as __m256i.  Presumably the high 16 bits of each
   unsigned 16x16 product (VPMULHUW) -- confirm.  */
532 extern __inline __m256i
533 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
534 _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
536 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pmulhw256
   and return the result as __m256i.  Presumably the high 16 bits of each
   signed 16x16 product (VPMULHW) -- confirm.  */
539 extern __inline __m256i
540 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
541 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
543 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_pmullw256
   and return the result as __m256i.  Presumably the low 16 bits of each
   16x16 product (VPMULLW) -- confirm.  */
546 extern __inline __m256i
547 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
548 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
550 return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pmulld256 and
   return the result as __m256i.  Presumably the low 32 bits of each 32x32
   product (VPMULLD) -- confirm.  */
553 extern __inline __m256i
554 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
555 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
557 return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_pmuludq256
   and return the result as __m256i.  Presumably an unsigned 32x32 ->
   64-bit multiply of selected elements (VPMULUDQ) -- confirm.  */
560 extern __inline __m256i
561 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
562 _mm256_mul_epu32 (__m256i __A, __m256i __B)
564 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_por256 and
   return the result as __m256i.  Presumably a 256-bit bitwise OR (VPOR)
   -- confirm.  */
567 extern __inline __m256i
568 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
569 _mm256_or_si256 (__m256i __A, __m256i __B)
571 return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_psadbw256
   and return the result as __m256i.  Presumably sums of absolute
   differences of unsigned bytes (VPSADBW) -- confirm.  */
574 extern __inline __m256i
575 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
576 _mm256_sad_epu8 (__m256i __A, __m256i __B)
578 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
/* Return, as __m256i, the result of __builtin_ia32_pshufb256, whose first
   operand is __X viewed as __v32qi.
   NOTE(review): the call is cut off after its first argument here; the
   second operand (presumably the control vector __Y) is not shown --
   confirm.  */
581 extern __inline __m256i
582 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
583 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
585 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
/* Pass __A, viewed as __v8si, together with the selector __mask to
   __builtin_ia32_pshufd256 and return the result as __m256i.  Presumably
   a 32-bit element shuffle controlled by __mask (VPSHUFD) -- confirm.  */
590 extern __inline __m256i
591 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
592 _mm256_shuffle_epi32 (__m256i __A, const int __mask)
594 return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
/* Pass __A, viewed as __v16hi, together with the selector __mask to
   __builtin_ia32_pshufhw256 and return the result as __m256i.  Presumably
   shuffles the high 16-bit elements of each half (VPSHUFHW) --
   confirm.  */
597 extern __inline __m256i
598 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
599 _mm256_shufflehi_epi16 (__m256i __A, const int __mask)
601 return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
/* Pass __A, viewed as __v16hi, together with the selector __mask to
   __builtin_ia32_pshuflw256 and return the result as __m256i.  Presumably
   shuffles the low 16-bit elements of each half (VPSHUFLW) -- confirm.  */
604 extern __inline __m256i
605 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
606 _mm256_shufflelo_epi16 (__m256i __A, const int __mask)
608 return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
/* Macro form of _mm256_shuffle_epi32: A is cast through __m256i to __v8si
   and N to int, then passed to __builtin_ia32_pshufd256; the result is
   cast to __m256i.  */
611 #define _mm256_shuffle_epi32(A, N) \
612 ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
/* Macro form of _mm256_shufflehi_epi16: A is cast through __m256i to
   __v16hi and N to int, then passed to __builtin_ia32_pshufhw256; the
   result is cast to __m256i.  */
613 #define _mm256_shufflehi_epi16(A, N) \
614 ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
/* Macro form of _mm256_shufflelo_epi16: A is cast through __m256i to
   __v16hi and N to int, then passed to __builtin_ia32_pshuflw256; the
   result is cast to __m256i.  */
615 #define _mm256_shufflelo_epi16(A, N) \
616 ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
/* Pass __X and __Y, both viewed as __v32qi, to __builtin_ia32_psignb256
   and return the result as __m256i.  Presumably applies the sign of each
   byte of __Y to the matching byte of __X (VPSIGNB) -- confirm.  */
619 extern __inline __m256i
620 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
621 _mm256_sign_epi8 (__m256i __X, __m256i __Y)
623 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
/* Pass __X and __Y, both viewed as __v16hi, to __builtin_ia32_psignw256
   and return the result as __m256i.  Presumably applies the sign of each
   16-bit element of __Y to the matching element of __X (VPSIGNW) --
   confirm.  */
626 extern __inline __m256i
627 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
628 _mm256_sign_epi16 (__m256i __X, __m256i __Y)
630 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
/* Pass __X and __Y, both viewed as __v8si, to __builtin_ia32_psignd256 and
   return the result as __m256i.  Presumably applies the sign of each
   32-bit element of __Y to the matching element of __X (VPSIGND) --
   confirm.  */
633 extern __inline __m256i
634 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
635 _mm256_sign_epi32 (__m256i __X, __m256i __Y)
637 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
/* Pass __A (uncast) and __N * 8 to __builtin_ia32_pslldqi256 and return
   the result as __m256i.  The multiplication by 8 presumably converts a
   byte count into the bit count the builtin expects (VPSLLDQ) --
   confirm.  */
641 extern __inline __m256i
642 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
643 _mm256_slli_si256 (__m256i __A, const int __N)
645 return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
/* Macro form of _mm256_slli_si256: A is cast to __m256i and N to int, and
   (int)(N) * 8 is passed as the second operand of
   __builtin_ia32_pslldqi256; the result is cast to __m256i.  */
648 #define _mm256_slli_si256(A, N) \
649 ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
/* Pass __A, viewed as __v16hi, and the integer count __B to
   __builtin_ia32_psllwi256 and return the result as __m256i.  Presumably
   shifts each 16-bit element left by __B bits (VPSLLW, immediate form)
   -- confirm.  */
652 extern __inline __m256i
653 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
654 _mm256_slli_epi16 (__m256i __A, int __B)
656 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
/* Pass __A, viewed as __v16hi, and the 128-bit __B, viewed as __v8hi, to
   __builtin_ia32_psllw256 and return the result as __m256i.  Presumably
   shifts each 16-bit element left by a count held in __B (VPSLLW) --
   confirm.  */
659 extern __inline __m256i
660 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
661 _mm256_sll_epi16 (__m256i __A, __m128i __B)
663 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
/* Pass __A, viewed as __v8si, and the integer count __B to
   __builtin_ia32_pslldi256 and return the result as __m256i.  Presumably
   shifts each 32-bit element left by __B bits (VPSLLD, immediate form)
   -- confirm.  */
666 extern __inline __m256i
667 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668 _mm256_slli_epi32 (__m256i __A, int __B)
670 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
/* Pass __A, viewed as __v8si, and the 128-bit __B, viewed as __v4si, to
   __builtin_ia32_pslld256 and return the result as __m256i.  Presumably
   shifts each 32-bit element left by a count held in __B (VPSLLD) --
   confirm.  */
673 extern __inline __m256i
674 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
675 _mm256_sll_epi32 (__m256i __A, __m128i __B)
677 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
/* Pass __A, viewed as __v4di, and the integer count __B to
   __builtin_ia32_psllqi256 and return the result as __m256i.  Presumably
   shifts each 64-bit element left by __B bits (VPSLLQ, immediate form)
   -- confirm.  */
680 extern __inline __m256i
681 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
682 _mm256_slli_epi64 (__m256i __A, int __B)
684 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
/* Pass __A, viewed as __v4di, and the 128-bit __B, viewed as __v2di, to
   __builtin_ia32_psllq256 and return the result as __m256i.  Presumably
   shifts each 64-bit element left by a count held in __B (VPSLLQ) --
   confirm.  */
687 extern __inline __m256i
688 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
689 _mm256_sll_epi64 (__m256i __A, __m128i __B)
691 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
/* Pass __A, viewed as __v16hi, and the integer count __B to
   __builtin_ia32_psrawi256 and return the result as __m256i.  Presumably
   an arithmetic right shift of each 16-bit element by __B bits (VPSRAW,
   immediate form) -- confirm.  */
694 extern __inline __m256i
695 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
696 _mm256_srai_epi16 (__m256i __A, int __B)
698 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
/* Pass __A, viewed as __v16hi, and the 128-bit __B, viewed as __v8hi, to
   __builtin_ia32_psraw256 and return the result as __m256i.  Presumably
   an arithmetic right shift of each 16-bit element by a count held in
   __B (VPSRAW) -- confirm.  */
701 extern __inline __m256i
702 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
703 _mm256_sra_epi16 (__m256i __A, __m128i __B)
705 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
/* Pass __A, viewed as __v8si, and the integer count __B to
   __builtin_ia32_psradi256 and return the result as __m256i.  Presumably
   an arithmetic right shift of each 32-bit element by __B bits (VPSRAD,
   immediate form) -- confirm.  */
708 extern __inline __m256i
709 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710 _mm256_srai_epi32 (__m256i __A, int __B)
712 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
/* Pass __A, viewed as __v8si, and the 128-bit __B, viewed as __v4si, to
   __builtin_ia32_psrad256 and return the result as __m256i.  Presumably
   an arithmetic right shift of each 32-bit element by a count held in
   __B (VPSRAD) -- confirm.  */
715 extern __inline __m256i
716 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
717 _mm256_sra_epi32 (__m256i __A, __m128i __B)
719 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
/* Pass __A (uncast) and __N * 8 to __builtin_ia32_psrldqi256 and return
   the result as __m256i.  The multiplication by 8 presumably converts a
   byte count into the bit count the builtin expects (VPSRLDQ) --
   confirm.  */
723 extern __inline __m256i
724 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
725 _mm256_srli_si256 (__m256i __A, const int __N)
727 return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
/* Macro form of _mm256_srli_si256: A is cast to __m256i and N to int, and
   (int)(N) * 8 is passed as the second operand of
   __builtin_ia32_psrldqi256; the result is cast to __m256i.  */
730 #define _mm256_srli_si256(A, N) \
731 ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
/* Pass __A, viewed as __v16hi, and the integer count __B to
   __builtin_ia32_psrlwi256 and return the result as __m256i.  Presumably
   a logical right shift of each 16-bit element by __B bits (VPSRLW,
   immediate form) -- confirm.  */
734 extern __inline __m256i
735 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
736 _mm256_srli_epi16 (__m256i __A, int __B)
738 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
/* Pass __A, viewed as __v16hi, and the 128-bit __B, viewed as __v8hi, to
   __builtin_ia32_psrlw256 and return the result as __m256i.  Presumably a
   logical right shift of each 16-bit element by a count held in __B
   (VPSRLW) -- confirm.  */
741 extern __inline __m256i
742 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
743 _mm256_srl_epi16 (__m256i __A, __m128i __B)
745 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
/* Pass __A, viewed as __v8si, and the integer count __B to
   __builtin_ia32_psrldi256 and return the result as __m256i.  Presumably
   a logical right shift of each 32-bit element by __B bits (VPSRLD,
   immediate form) -- confirm.  */
748 extern __inline __m256i
749 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
750 _mm256_srli_epi32 (__m256i __A, int __B)
752 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
/* Pass __A, viewed as __v8si, and the 128-bit __B, viewed as __v4si, to
   __builtin_ia32_psrld256 and return the result as __m256i.  Presumably a
   logical right shift of each 32-bit element by a count held in __B
   (VPSRLD) -- confirm.  */
755 extern __inline __m256i
756 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
757 _mm256_srl_epi32 (__m256i __A, __m128i __B)
759 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
/* Pass __A, viewed as __v4di, and the integer count __B to
   __builtin_ia32_psrlqi256 and return the result as __m256i.  Presumably
   a logical right shift of each 64-bit element by __B bits (VPSRLQ,
   immediate form) -- confirm.  */
762 extern __inline __m256i
763 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
764 _mm256_srli_epi64 (__m256i __A, int __B)
766 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
/* Pass __A, viewed as __v4di, and the 128-bit __B, viewed as __v2di, to
   __builtin_ia32_psrlq256 and return the result as __m256i.  Presumably a
   logical right shift of each 64-bit element by a count held in __B
   (VPSRLQ) -- confirm.  */
769 extern __inline __m256i
770 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
771 _mm256_srl_epi64 (__m256i __A, __m128i __B)
773 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_psubb256 and
   return the result as __m256i.  Presumably a per-byte subtract (VPSUBB)
   -- confirm.  */
776 extern __inline __m256i
777 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
778 _mm256_sub_epi8 (__m256i __A, __m256i __B)
780 return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_psubw256 and
   return the result as __m256i.  Presumably a per-16-bit-element subtract
   (VPSUBW) -- confirm.  */
783 extern __inline __m256i
784 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
785 _mm256_sub_epi16 (__m256i __A, __m256i __B)
787 return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_psubd256 and
   return the result as __m256i.  Presumably a per-32-bit-element subtract
   (VPSUBD) -- confirm.  */
790 extern __inline __m256i
791 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
792 _mm256_sub_epi32 (__m256i __A, __m256i __B)
794 return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_psubq256 and
   return the result as __m256i.  Presumably a per-64-bit-element subtract
   (VPSUBQ) -- confirm.  */
797 extern __inline __m256i
798 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
799 _mm256_sub_epi64 (__m256i __A, __m256i __B)
801 return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_psubsb256
   and return the result as __m256i.  Presumably a per-byte subtract with
   signed saturation (VPSUBSB) -- confirm.  */
804 extern __inline __m256i
805 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
806 _mm256_subs_epi8 (__m256i __A, __m256i __B)
808 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_psubsw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   subtract with signed saturation (VPSUBSW) -- confirm.  */
811 extern __inline __m256i
812 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
813 _mm256_subs_epi16 (__m256i __A, __m256i __B)
815 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_psubusb256
   and return the result as __m256i.  Presumably a per-byte subtract with
   unsigned saturation (VPSUBUSB) -- confirm.  */
818 extern __inline __m256i
819 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
820 _mm256_subs_epu8 (__m256i __A, __m256i __B)
822 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_psubusw256
   and return the result as __m256i.  Presumably a per-16-bit-element
   subtract with unsigned saturation (VPSUBUSW) -- confirm.  */
825 extern __inline __m256i
826 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
827 _mm256_subs_epu16 (__m256i __A, __m256i __B)
829 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_punpckhbw256
   and return the result as __m256i.  Presumably interleaves the high
   bytes of each half of the operands (VPUNPCKHBW) -- confirm.  */
832 extern __inline __m256i
833 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
834 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
836 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_punpckhwd256
   and return the result as __m256i.  Presumably interleaves the high
   16-bit elements of each half of the operands (VPUNPCKHWD) --
   confirm.  */
839 extern __inline __m256i
840 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
841 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
843 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_punpckhdq256
   and return the result as __m256i.  Presumably interleaves the high
   32-bit elements of each half of the operands (VPUNPCKHDQ) --
   confirm.  */
846 extern __inline __m256i
847 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
848 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
850 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_punpckhqdq256
   and return the result as __m256i.  Presumably interleaves the high
   64-bit element of each half of the operands (VPUNPCKHQDQ) --
   confirm.  */
853 extern __inline __m256i
854 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
855 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
857 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v32qi, to __builtin_ia32_punpcklbw256
   and return the result as __m256i.  Presumably interleaves the low bytes
   of each half of the operands (VPUNPCKLBW) -- confirm.  */
860 extern __inline __m256i
861 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
862 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
864 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
/* Pass __A and __B, both viewed as __v16hi, to __builtin_ia32_punpcklwd256
   and return the result as __m256i.  Presumably interleaves the low
   16-bit elements of each half of the operands (VPUNPCKLWD) --
   confirm.  */
867 extern __inline __m256i
868 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
869 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
871 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
/* Pass __A and __B, both viewed as __v8si, to __builtin_ia32_punpckldq256
   and return the result as __m256i.  Presumably interleaves the low
   32-bit elements of each half of the operands (VPUNPCKLDQ) --
   confirm.  */
874 extern __inline __m256i
875 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
876 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
878 return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_punpcklqdq256
   and return the result as __m256i.  Presumably interleaves the low
   64-bit element of each half of the operands (VPUNPCKLQDQ) --
   confirm.  */
881 extern __inline __m256i
882 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
883 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
885 return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
/* Pass __A and __B, both viewed as __v4di, to __builtin_ia32_pxor256 and
   return the result as __m256i.  Presumably a 256-bit bitwise XOR (VPXOR)
   -- confirm.  */
888 extern __inline __m256i
889 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
890 _mm256_xor_si256 (__m256i __A, __m256i __B)
892 return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
/* Pass the pointer __X, cast to __v4di * (which drops the const
   qualifier), to __builtin_ia32_movntdqa256 and return the result as
   __m256i.  Presumably a non-temporal 256-bit load (VMOVNTDQA); any
   alignment requirement on __X is not visible here -- confirm.  */
895 extern __inline __m256i
896 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
897 _mm256_stream_load_si256 (__m256i const *__X)
899 return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
/* Pass __X, viewed as __v4sf, to __builtin_ia32_vbroadcastss_ps and return
   the result as __m128.  Presumably replicates one float of __X across
   the 128-bit result (VBROADCASTSS) -- confirm.  */
902 extern __inline __m128
903 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_broadcastss_ps (__m128 __X)
906 return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
/* Pass the 128-bit __X, viewed as __v4sf, to
   __builtin_ia32_vbroadcastss_ps256 and return the result as __m256.
   Presumably replicates one float of __X across the 256-bit result
   (VBROADCASTSS) -- confirm.  */
909 extern __inline __m256
910 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
911 _mm256_broadcastss_ps (__m128 __X)
913 return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
/* Pass the 128-bit __X, viewed as __v2df, to
   __builtin_ia32_vbroadcastsd_pd256 and return the result as __m256d.
   Presumably replicates one double of __X across the 256-bit result
   (VBROADCASTSD) -- confirm.  */
916 extern __inline __m256d
917 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
918 _mm256_broadcastsd_pd (__m128d __X)
920 return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
/* Pass the 128-bit __X, viewed as __v2di, to
   __builtin_ia32_vbroadcastsi256 and return the result as __m256i.
   Presumably copies the 128-bit value into both halves of the result
   (VBROADCASTI128) -- confirm.  */
923 extern __inline __m256i
924 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
925 _mm_broadcastsi128_si256 (__m128i __X)
927 return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
/* Return, as __m128i, the result of __builtin_ia32_pblendd128, whose first
   operand is __X viewed as __v4si.
   NOTE(review): the call is cut off after its first argument here; the
   remaining operands (presumably __Y and __M) are not shown -- confirm.  */
931 extern __inline __m128i
932 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
933 _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
935 return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
/* Macro form of _mm_blend_epi32: X and Y are cast through __m128i to
   __v4si and M to int, then passed to __builtin_ia32_pblendd128; the
   result is cast to __m128i.  */
940 #define _mm_blend_epi32(X, Y, M) \
941 ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
942 (__v4si)(__m128i)(Y), (int)(M)))
/* Return, as __m256i, the result of __builtin_ia32_pblendd256, whose first
   operand is __X viewed as __v8si.
   NOTE(review): the call is cut off after its first argument here; the
   remaining operands (presumably __Y and __M) are not shown -- confirm.  */
946 extern __inline __m256i
947 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
948 _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
950 return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
/* Macro form of _mm256_blend_epi32: X and Y are cast through __m256i to
   __v8si and M to int, then passed to __builtin_ia32_pblendd256; the
   result is cast to __m256i.  */
955 #define _mm256_blend_epi32(X, Y, M) \
956 ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
957 (__v8si)(__m256i)(Y), (int)(M)))
/* Pass the 128-bit __X, viewed as __v16qi, to
   __builtin_ia32_pbroadcastb256 and return the result as __m256i.
   Presumably replicates one byte of __X across the 256-bit result
   (VPBROADCASTB) -- confirm.  */
960 extern __inline __m256i
961 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
962 _mm256_broadcastb_epi8 (__m128i __X)
964 return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
/* Pass the 128-bit __X, viewed as __v8hi, to
   __builtin_ia32_pbroadcastw256 and return the result as __m256i.
   Presumably replicates one 16-bit element of __X across the 256-bit
   result (VPBROADCASTW) -- confirm.  */
967 extern __inline __m256i
968 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
969 _mm256_broadcastw_epi16 (__m128i __X)
971 return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
/* Pass the 128-bit __X, viewed as __v4si, to
   __builtin_ia32_pbroadcastd256 and return the result as __m256i.
   Presumably replicates one 32-bit element of __X across the 256-bit
   result (VPBROADCASTD) -- confirm.  */
974 extern __inline __m256i
975 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
976 _mm256_broadcastd_epi32 (__m128i __X)
978 return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
/* Pass the 128-bit __X, viewed as __v2di, to
   __builtin_ia32_pbroadcastq256 and return the result as __m256i.
   Presumably replicates one 64-bit element of __X across the 256-bit
   result (VPBROADCASTQ) -- confirm.  */
981 extern __inline __m256i
982 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
983 _mm256_broadcastq_epi64 (__m128i __X)
985 return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
/* Pass __X, viewed as __v16qi, to __builtin_ia32_pbroadcastb128 and return
   the result as __m128i.  Presumably replicates one byte of __X across
   the 128-bit result (VPBROADCASTB) -- confirm.  */
988 extern __inline __m128i
989 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
990 _mm_broadcastb_epi8 (__m128i __X)
992 return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
/* Pass __X, viewed as __v8hi, to __builtin_ia32_pbroadcastw128 and return
   the result as __m128i.  Presumably replicates one 16-bit element of __X
   across the 128-bit result (VPBROADCASTW) -- confirm.  */
995 extern __inline __m128i
996 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
997 _mm_broadcastw_epi16 (__m128i __X)
999 return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
/* Pass __X, viewed as __v4si, to __builtin_ia32_pbroadcastd128 and return
   the result as __m128i.  Presumably replicates one 32-bit element of __X
   across the 128-bit result (VPBROADCASTD) -- confirm.  */
1002 extern __inline __m128i
1003 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1004 _mm_broadcastd_epi32 (__m128i __X)
1006 return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
/* Pass __X, viewed as __v2di, to __builtin_ia32_pbroadcastq128 and return
   the result as __m128i.  Presumably replicates one 64-bit element of __X
   across the 128-bit result (VPBROADCASTQ) -- confirm.  */
1009 extern __inline __m128i
1010 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1011 _mm_broadcastq_epi64 (__m128i __X)
1013 return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
/* Pass __X and __Y, both viewed as __v8si, to __builtin_ia32_permvarsi256
   and return the result as __m256i.  Presumably permutes the 32-bit
   elements of __X using indices from __Y (VPERMD) -- confirm.  */
1016 extern __inline __m256i
1017 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1018 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1020 return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
/* Pass __X, viewed as __v4df, and the selector __M to
   __builtin_ia32_permdf256 and return the result as __m256d.  Presumably
   permutes the four doubles of __X as selected by __M (VPERMPD) --
   confirm.  */
1024 extern __inline __m256d
1025 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm256_permute4x64_pd (__m256d __X, const int __M)
1028 return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
/* Macro form of _mm256_permute4x64_pd: X is cast through __m256d to __v4df
   and M to int, then passed to __builtin_ia32_permdf256; the result is
   cast to __m256d.  */
1031 #define _mm256_permute4x64_pd(X, M) \
1032 ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
/* Pass __X and __Y, both viewed as __v8sf, to __builtin_ia32_permvarsf256
   and return the result as __m256.  Presumably permutes the floats of __X
   using indices carried in __Y (VPERMPS) -- confirm.  */
1035 extern __inline __m256
1036 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1037 _mm256_permutevar8x32_ps (__m256 __X, __m256 __Y)
1039 return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y);
/* Pass __X, viewed as __v4di, and the selector __M to
   __builtin_ia32_permdi256 and return the result as __m256i.  Presumably
   permutes the four 64-bit elements of __X as selected by __M (VPERMQ)
   -- confirm.  */
1043 extern __inline __m256i
1044 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1045 _mm256_permute4x64_epi64 (__m256i __X, const int __M)
1047 return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
/* Macro form of _mm256_permute4x64_epi64: X is cast through __m256i to
   __v4di and M to int, then passed to __builtin_ia32_permdi256; the
   result is cast to __m256i.  */
1050 #define _mm256_permute4x64_epi64(X, M) \
1051 ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
/* Pass __X and __Y, both viewed as __v4di, and the selector __M to
   __builtin_ia32_permti256 and return the result as __m256i.  Presumably
   builds the result from 128-bit halves of __X and __Y as selected by __M
   (VPERM2I128) -- confirm.  */
1056 extern __inline __m256i
1057 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1058 _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
1060 return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
1063 #define _mm256_permute2x128_si256(X, Y, M) \
1064 ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
/* Hand __X (as 4 x 64-bit lanes) and the selector __M to the
   extract128i256 builtin; the result is a 128-bit vector.  The inline
   function is used when optimizing; otherwise the macro passes M to
   the builtin directly.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif
/* Hand __X (as 4 x 64-bit lanes), the 128-bit vector __Y (as 2 x 64-bit
   lanes) and the selector __M to the insert128i256 builtin.  The inline
   function is used when optimizing; otherwise the macro passes M to
   the builtin directly.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y,
                                                 __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
                                           (__v2di)(__m128i)(Y), \
                                           (int)(M)))
#endif
1093 extern __inline __m256i
1094 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1095 _mm256_maskload_epi32 (int const *__X, __m256i __M )
1097 return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1101 extern __inline __m256i
1102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1103 _mm256_maskload_epi64 (long long const *__X, __m256i __M )
1105 return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1109 extern __inline __m128i
1110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1111 _mm_maskload_epi32 (int const *__X, __m128i __M )
1113 return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1117 extern __inline __m128i
1118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1119 _mm_maskload_epi64 (long long const *__X, __m128i __M )
1121 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1125 extern __inline void
1126 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1127 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1129 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1132 extern __inline void
1133 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1134 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1136 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1139 extern __inline void
1140 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1141 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1143 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1146 extern __inline void
1147 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1148 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1150 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1153 extern __inline __m256i
1154 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1155 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1157 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1160 extern __inline __m128i
1161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1162 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1164 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1167 extern __inline __m256i
1168 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1169 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1171 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1174 extern __inline __m128i
1175 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1176 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1178 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1181 extern __inline __m256i
1182 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1183 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1185 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1188 extern __inline __m128i
1189 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1190 _mm_srav_epi32 (__m128i __X, __m128i __Y)
1192 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1195 extern __inline __m256i
1196 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1199 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1202 extern __inline __m128i
1203 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1204 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1206 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1209 extern __inline __m256i
1210 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1211 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1213 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1216 extern __inline __m128i
1217 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1218 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1220 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1224 extern __inline __m128d
1225 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1226 _mm_i32gather_pd (double const *base, __m128i index, const int scale)
1228 __v2df src = _mm_setzero_pd ();
1229 __v2df mask = _mm_cmpeq_pd (src, src);
1231 return (__m128d) __builtin_ia32_gathersiv2df (src,
1238 extern __inline __m128d
1239 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1240 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
1241 __m128d mask, const int scale)
1243 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
1250 extern __inline __m256d
1251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1252 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
1254 __v4df src = _mm256_setzero_pd ();
1255 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1257 return (__m256d) __builtin_ia32_gathersiv4df (src,
1264 extern __inline __m256d
1265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1266 _mm256_mask_i32gather_pd (__m256d src, double const *base,
1267 __m128i index, __m256d mask, const int scale)
1269 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
1276 extern __inline __m128d
1277 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1278 _mm_i64gather_pd (double const *base, __m128i index, const int scale)
1280 __v2df src = _mm_setzero_pd ();
1281 __v2df mask = _mm_cmpeq_pd (src, src);
1283 return (__m128d) __builtin_ia32_gatherdiv2df (src,
1290 extern __inline __m128d
1291 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1292 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
1293 __m128d mask, const int scale)
1295 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
1302 extern __inline __m256d
1303 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
1306 __v4df src = _mm256_setzero_pd ();
1307 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1309 return (__m256d) __builtin_ia32_gatherdiv4df (src,
1316 extern __inline __m256d
1317 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm256_mask_i64gather_pd (__m256d src, double const *base,
1319 __m256i index, __m256d mask, const int scale)
1321 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
1328 extern __inline __m128
1329 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1330 _mm_i32gather_ps (float const *base, __m128i index, const int scale)
1332 __v4sf src = _mm_setzero_ps ();
1333 __v4sf mask = _mm_cmpeq_ps (src, src);
1335 return (__m128) __builtin_ia32_gathersiv4sf (src,
1342 extern __inline __m128
1343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1344 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
1345 __m128 mask, const int scale)
1347 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
1354 extern __inline __m256
1355 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1356 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
1358 __v8sf src = _mm256_setzero_ps ();
1359 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
1361 return (__m256) __builtin_ia32_gathersiv8sf (src,
1368 extern __inline __m256
1369 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm256_mask_i32gather_ps (__m256 src, float const *base,
1371 __m256i index, __m256 mask, const int scale)
1373 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
1380 extern __inline __m128
1381 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm_i64gather_ps (float const *base, __m128i index, const int scale)
1384 __v4sf src = _mm_setzero_ps ();
1385 __v4sf mask = _mm_cmpeq_ps (src, src);
1387 return (__m128) __builtin_ia32_gatherdiv4sf (src,
1394 extern __inline __m128
1395 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1396 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
1397 __m128 mask, const int scale)
1399 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
1406 extern __inline __m128
1407 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1408 _mm256_i64gather_ps (float const *base, __m256i index, const int scale)
1410 __v4sf src = _mm_setzero_ps ();
1411 __v4sf mask = _mm_cmpeq_ps (src, src);
1413 return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
1420 extern __inline __m128
1421 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1422 _mm256_mask_i64gather_ps (__m128 src, float const *base,
1423 __m256i index, __m128 mask, const int scale)
1425 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
1432 extern __inline __m128i
1433 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1434 _mm_i32gather_epi64 (long long int const *base,
1435 __m128i index, const int scale)
1437 __v2di src = __extension__ (__v2di){ 0, 0 };
1438 __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1440 return (__m128i) __builtin_ia32_gathersiv2di (src,
1447 extern __inline __m128i
1448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1449 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
1450 __m128i index, __m128i mask, const int scale)
1452 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
1459 extern __inline __m256i
1460 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1461 _mm256_i32gather_epi64 (long long int const *base,
1462 __m128i index, const int scale)
1464 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1465 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1467 return (__m256i) __builtin_ia32_gathersiv4di (src,
1474 extern __inline __m256i
1475 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1476 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
1477 __m128i index, __m256i mask, const int scale)
1479 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
1486 extern __inline __m128i
1487 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1488 _mm_i64gather_epi64 (long long int const *base,
1489 __m128i index, const int scale)
1491 __v2di src = __extension__ (__v2di){ 0, 0 };
1492 __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1494 return (__m128i) __builtin_ia32_gatherdiv2di (src,
1501 extern __inline __m128i
1502 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1503 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
1504 __m128i mask, const int scale)
1506 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
1513 extern __inline __m256i
1514 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1515 _mm256_i64gather_epi64 (long long int const *base,
1516 __m256i index, const int scale)
1518 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1519 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1521 return (__m256i) __builtin_ia32_gatherdiv4di (src,
1528 extern __inline __m256i
1529 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1530 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
1531 __m256i index, __m256i mask, const int scale)
1533 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
1540 extern __inline __m128i
1541 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1542 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
1544 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1545 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1547 return (__m128i) __builtin_ia32_gathersiv4si (src,
1554 extern __inline __m128i
1555 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1556 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
1557 __m128i mask, const int scale)
1559 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
1566 extern __inline __m256i
1567 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1568 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
1570 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1571 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1573 return (__m256i) __builtin_ia32_gathersiv8si (src,
1580 extern __inline __m256i
1581 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1582 _mm256_mask_i32gather_epi32 (__m256i src, int const *base,
1583 __m256i index, __m256i mask, const int scale)
1585 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
1592 extern __inline __m128i
1593 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1594 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
1596 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1597 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1599 return (__m128i) __builtin_ia32_gatherdiv4si (src,
1606 extern __inline __m128i
1607 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1608 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
1609 __m128i mask, const int scale)
1611 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
1618 extern __inline __m128i
1619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1620 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
1622 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1623 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1625 return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
1632 extern __inline __m128i
1633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm256_mask_i64gather_epi32 (__m128i src, int const *base,
1635 __m256i index, __m128i mask, const int scale)
1637 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
1643 #else /* __OPTIMIZE__ */
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0 in every lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  ((__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2df) _mm_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gathersiv2df builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2df)(__m128d) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0 in every lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  ((__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4df) _mm256_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gathersiv4df builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4df)(__m256d) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0 in every lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
  ((__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
      (double const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2df) _mm_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv2df builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \
      (double const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2df)(__m128d) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0 in every lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
  ((__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
      (double const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4df) _mm256_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv4df builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \
      (double const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4df)(__m256d) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0f in every lane (now cast to __v4sf explicitly).  Arguments
   and the whole expansion are parenthesized so compound expressions
   expand as one unit.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
      (float const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4sf) _mm_set1_ps ((float)(int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather.  SRC and MASK are float vectors, so
   they go through __m128 (not __m128d) before the __v4sf cast.
   Arguments and the whole expansion are parenthesized so compound
   expressions expand as one unit.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \
      (float const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4sf)(__m128) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0f in every lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  ((__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
      (float const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8sf) _mm256_set1_ps ((float)(int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather.  MASK is a float vector, so it goes
   through __m256 (not __m256d) before the __v8sf cast.  Arguments and
   the whole expansion are parenthesized so compound expressions expand
   as one unit.  */
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \
      (float const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8sf)(__m256) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero float source
   (_mm_setzero_ps, matching the __v4sf cast) and a mask of -1.0f in
   every lane.  Arguments and the whole expansion are parenthesized so
   compound expressions expand as one unit.  */
#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
      (float const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4sf) _mm_set1_ps ((float)(int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather.  MASK is a float vector, so it goes
   through __m128 (not __m128d) before the __v4sf cast.  Arguments and
   the whole expansion are parenthesized so compound expressions expand
   as one unit.  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \
      (float const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4sf)(__m128) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1.0f in every lane; the result is a 128-bit vector.  Arguments
   and the whole expansion are parenthesized so compound expressions
   expand as one unit.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
      (float const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4sf) _mm_set1_ps ((float)(int) -1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv4sf256 builtin is given.  Arguments and the whole
   expansion are parenthesized so compound expressions expand as one
   unit.  */
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \
      (float const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4sf)(__m128) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 64-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2di) _mm_set1_epi64x (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gathersiv2di builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2di)(__m128i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 64-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4di) _mm256_set1_epi64x (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gathersiv4di builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4di)(__m256i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 64-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
      (long long const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2di) _mm_set1_epi64x (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv2di builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \
      (long long const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2di)(__m128i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 64-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
      (long long const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4di) _mm256_set1_epi64x (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv4di builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \
      (long long const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4di)(__m256i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 32-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
      (int const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4si) _mm_set1_epi32 (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gathersiv4si builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
      (int const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4si)(__m128i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 32-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
      (int const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8si) _mm256_set1_epi32 (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gathersiv8si builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \
      (int const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8si)(__m256i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 32-bit lane.  Arguments and the whole expansion are
   parenthesized so compound expressions expand as one unit.  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
      (int const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4si) _mm_set1_epi32 (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv4si builtin is given.  Arguments and the whole expansion
   are parenthesized so compound expressions expand as one unit.  */
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
      (int const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4si)(__m128i) (MASK), \
      (int) (SCALE)))
/* Macro form of the unmasked gather: supplies a zero source and a mask
   of -1 in every 32-bit lane; the result is a 128-bit vector.
   Arguments and the whole expansion are parenthesized so compound
   expressions expand as one unit.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
      (int const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4si) _mm_set1_epi32 (-1), \
      (int) (SCALE)))
/* Macro form of the masked gather; every argument is cast to the type
   the gatherdiv4si256 builtin is given.  Arguments and the whole
   expansion are parenthesized so compound expressions expand as one
   unit.  */
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \
      (int const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4si)(__m128i) (MASK), \
      (int) (SCALE)))
1874 #endif /* __OPTIMIZE__ */