1 /* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING. If not, write to
17 the Free Software Foundation, 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */
27 /* Implemented from the specification included in the Intel C++ Compiler
28 User Guide and Reference, version 11.0. */
30 #ifndef _IMMINTRIN_H_INCLUDED
31 # error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
/* Internal data types for implementing the intrinsics.  Each is a
   32-byte (256-bit) GCC vector type; the element type fixes the lane
   count (4 doubles, 8 floats, 4/8/16/32 integers).  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  __may_alias__ exempts these
   public types from strict-aliasing analysis.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));
/* Compare predicates for scalar and packed compare intrinsics.
   Five-bit immediate operands for VCMPPD/VCMPPS/VCMPSD/VCMPSS; the
   suffix encodes ordered (O) vs. unordered (U) NaN handling and
   signaling (S) vs. quiet/non-signaling (Q) behavior.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (nonsignaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
118 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm256_add_pd (__m256d __A, __m256d __B)
121 return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
124 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 _mm256_add_ps (__m256 __A, __m256 __B)
127 return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
130 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
131 _mm256_addsub_pd (__m256d __A, __m256d __B)
133 return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
136 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
137 _mm256_addsub_ps (__m256 __A, __m256 __B)
139 return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
143 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
144 _mm256_and_pd (__m256d __A, __m256d __B)
146 return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
149 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
150 _mm256_and_ps (__m256 __A, __m256 __B)
152 return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
155 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156 _mm256_andnot_pd (__m256d __A, __m256d __B)
158 return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
161 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm256_andnot_ps (__m256 __A, __m256 __B)
164 return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
167 /* Double/single precision floating point blend instructions - select
168 data from 2 sources using constant/variable mask. */
171 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
172 _mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
174 return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
179 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
180 _mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
182 return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
187 #define _mm256_blend_pd(X, Y, M) \
188 ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
189 (__v4df)(__m256d)(Y), (int)(M)))
191 #define _mm256_blend_ps(X, Y, M) \
192 ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
193 (__v8sf)(__m256)(Y), (int)(M)))
196 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
197 _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
199 return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
204 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
207 return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
212 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
213 _mm256_div_pd (__m256d __A, __m256d __B)
215 return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
218 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _mm256_div_ps (__m256 __A, __m256 __B)
221 return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
224 /* Dot product instructions with mask-defined summing and zeroing parts
228 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
231 return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
236 #define _mm256_dp_ps(X, Y, M) \
237 ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
238 (__v8sf)(__m256)(Y), (int)(M)))
241 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
242 _mm256_hadd_pd (__m256d __X, __m256d __Y)
244 return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
247 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm256_hadd_ps (__m256 __X, __m256 __Y)
250 return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
253 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
254 _mm256_hsub_pd (__m256d __X, __m256d __Y)
256 return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
259 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
260 _mm256_hsub_ps (__m256 __X, __m256 __Y)
262 return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
265 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
266 _mm256_max_pd (__m256d __A, __m256d __B)
268 return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
271 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
272 _mm256_max_ps (__m256 __A, __m256 __B)
274 return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
277 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278 _mm256_min_pd (__m256d __A, __m256d __B)
280 return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
283 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _mm256_min_ps (__m256 __A, __m256 __B)
286 return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
289 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290 _mm256_mul_pd (__m256d __A, __m256d __B)
292 return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
295 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296 _mm256_mul_ps (__m256 __A, __m256 __B)
298 return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
301 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302 _mm256_or_pd (__m256d __A, __m256d __B)
304 return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
307 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
308 _mm256_or_ps (__m256 __A, __m256 __B)
310 return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
314 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
315 _mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
317 return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
321 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
322 _mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
324 return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
328 #define _mm256_shuffle_pd(A, B, N) \
329 ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
330 (__v4df)(__m256d)(B), (int)(N)))
332 #define _mm256_shuffle_ps(A, B, N) \
333 ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
334 (__v8sf)(__m256)(B), (int)(N)))
337 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _mm256_sub_pd (__m256d __A, __m256d __B)
340 return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
343 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
344 _mm256_sub_ps (__m256 __A, __m256 __B)
346 return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
349 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm256_xor_pd (__m256d __A, __m256d __B)
352 return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
355 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm256_xor_ps (__m256 __A, __m256 __B)
358 return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
#ifdef __OPTIMIZE__
/* Predicate compares: __P is one of the _CMP_* immediates defined above.
   Macro forms below when not optimizing.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

/* Scalar variants: compare only the low element.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif
425 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
426 _mm256_cvtepi32_pd (__m128i __A)
428 return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
431 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
432 _mm256_cvtepi32_ps (__m256i __A)
434 return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
437 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
438 _mm256_cvtpd_ps (__m256d __A)
440 return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
443 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
444 _mm256_cvtps_epi32 (__m256 __A)
446 return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
449 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
450 _mm256_cvtps_pd (__m128 __A)
452 return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
455 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
456 _mm256_cvttpd_epi32 (__m256d __A)
458 return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
461 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462 _mm256_cvtpd_epi32 (__m256d __A)
464 return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
467 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
468 _mm256_cvttps_epi32 (__m256 __A)
470 return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
#ifdef __OPTIMIZE__
/* Extract the selected 128-bit lane (__N selects low/high half).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

/* Scalar element extraction: pick the containing 128-bit lane, then
   extract within it via the SSE4.1 intrinsic.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
/* 64-bit element extraction is only available on x86-64.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
565 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566 _mm256_zeroall (void)
568 __builtin_ia32_vzeroall ();
571 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
572 _mm256_zeroupper (void)
574 __builtin_ia32_vzeroupper ();
577 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
578 _mm_permutevar_pd (__m128d __A, __m128i __C)
580 return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
584 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585 _mm256_permutevar_pd (__m256d __A, __m256i __C)
587 return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
591 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_permutevar_ps (__m128 __A, __m128i __C)
594 return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
598 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _mm256_permutevar_ps (__m256 __A, __m256i __C)
601 return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
#ifdef __OPTIMIZE__
/* Immediate permutes; macro forms below when not optimizing.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
#ifdef __OPTIMIZE__
/* Permute 128-bit lanes from two sources under immediate control __C;
   macro forms below when not optimizing.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
684 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 _mm_broadcast_ss (float const *__X)
687 return (__m128) __builtin_ia32_vbroadcastss (__X);
690 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
691 _mm256_broadcast_sd (double const *__X)
693 return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
696 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697 _mm256_broadcast_ss (float const *__X)
699 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
702 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703 _mm256_broadcast_pd (__m128d const *__X)
705 return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
708 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 _mm256_broadcast_ps (__m128 const *__X)
711 return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
#ifdef __OPTIMIZE__
/* Insert a 128-bit value into the lane selected by __O.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

/* Scalar element insertion: extract the containing 128-bit lane,
   insert within it, then write the lane back.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  /* Must use the 32-bit insert, not _mm_insert_epi16, or only half of
     the element would be written.  */
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
/* 64-bit element insertion is only available on x86-64.  __D must be
   long long (a full 64-bit datum) and the 64-bit insert must be used,
   matching the macro variant below.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
823 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm256_load_pd (double const *__P)
826 return *(__m256d *)__P;
829 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830 _mm256_store_pd (double *__P, __m256d __A)
832 *(__m256d *)__P = __A;
835 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836 _mm256_load_ps (float const *__P)
838 return *(__m256 *)__P;
841 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm256_store_ps (float *__P, __m256 __A)
844 *(__m256 *)__P = __A;
847 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848 _mm256_loadu_pd (double const *__P)
850 return (__m256d) __builtin_ia32_loadupd256 (__P);
853 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
854 _mm256_storeu_pd (double *__P, __m256d __A)
856 __builtin_ia32_storeupd256 (__P, (__v4df)__A);
859 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
860 _mm256_loadu_ps (float const *__P)
862 return (__m256) __builtin_ia32_loadups256 (__P);
865 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
866 _mm256_storeu_ps (float *__P, __m256 __A)
868 __builtin_ia32_storeups256 (__P, (__v8sf)__A);
871 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872 _mm256_load_si256 (__m256i const *__P)
877 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878 _mm256_store_si256 (__m256i *__P, __m256i __A)
883 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884 _mm256_loadu_si256 (__m256i const *__P)
886 return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
889 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
890 _mm256_storeu_si256 (__m256i *__P, __m256i __A)
892 __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
895 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896 _mm_maskload_pd (double const *__P, __m128d __M)
898 return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
902 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
903 _mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
905 __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
908 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
909 _mm256_maskload_pd (double const *__P, __m256d __M)
911 return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
915 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916 _mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
918 __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
921 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
922 _mm_maskload_ps (float const *__P, __m128 __M)
924 return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
928 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
929 _mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
931 __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
934 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
935 _mm256_maskload_ps (float const *__P, __m256 __M)
937 return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
941 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
942 _mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
944 __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
947 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
948 _mm256_movehdup_ps (__m256 __X)
950 return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
953 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
954 _mm256_moveldup_ps (__m256 __X)
956 return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
959 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960 _mm256_movedup_pd (__m256d __X)
962 return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
965 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966 _mm256_lddqu_si256 (__m256i const *__P)
968 return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
971 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
972 _mm256_rcp_ps (__m256 __A)
974 return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
977 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
978 _mm256_rsqrt_ps (__m256 __A)
980 return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
983 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984 _mm256_sqrt_pd (__m256d __A)
986 return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
989 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 _mm256_sqrt_ps (__m256 __A)
992 return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
/* Round each lane of __V according to the rounding-control bits __M
   (VROUNDPD/VROUNDPS).  __M must be a compile-time constant, so the
   inline-function forms are only usable when the compiler can fold
   the argument; without optimization the macro forms are used
   instead.  The extraction of this file lost the __OPTIMIZE__
   conditional, which made the macros unconditionally shadow the
   functions; restored here.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif
/* Convenience wrappers: round toward +inf / -inf via _mm256_round_*
   with the corresponding _MM_FROUND control word.  */
#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
/* Interleave the high doubles of __A and __B within each 128-bit
   lane (VUNPCKHPD).  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

/* Interleave the low doubles of __A and __B within each 128-bit
   lane (VUNPCKLPD).  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

/* Interleave the high floats of __A and __B within each 128-bit
   lane (VUNPCKHPS).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Interleave the low floats of __A and __B within each 128-bit
   lane (VUNPCKLPS).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
/* 128-bit sign-bit tests (VTESTPD/VTESTPS): testz returns the ZF
   result, testc the CF result, and testnzc returns 1 when neither
   flag would be set.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}
/* 256-bit sign-bit tests (VTESTPD/VTESTPS on ymm): same ZF/CF/
   neither predicates as the 128-bit variants above.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}
/* 256-bit whole-register bit tests (VPTEST): testz is ZF
   (__M AND __V all zero), testc is CF (__M ANDN __V all zero),
   testnzc is 1 when neither holds.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
/* Collect the sign bits of the four doubles of __A into the low
   four bits of the result (VMOVMSKPD).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

/* Collect the sign bits of the eight floats of __A into the low
   eight bits of the result (VMOVMSKPS).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}
/* Return an all-zero vector of four doubles.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

/* Return an all-zero vector of eight floats.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

/* Return an all-zero 256-bit integer vector.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
/* Create the vector [A B C D].  Note the initializer lists elements
   lowest-lane first, so the arguments appear reversed.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
/* Create the vector [A B C D E F G H] (arguments reversed in the
   initializer: element 0 is __H).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}
/* Create the vector [A B C D E F G H] of 32-bit integers (arguments
   reversed in the initializer: element 0 is __H).  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}
/* Create a vector of 16 shorts; __q00 is element 0 (lowest lane),
   __q15 element 15.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}
/* Create a vector of 32 chars; __q00 is element 0 (lowest lane),
   __q31 element 31.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}
1220 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _mm256_set_epi64x (long long __A, long long __B, long long __C,
1224 return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}
/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

/* Create a vector with all 16 short elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}
/* Create a vector with all 32 char elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector with all four 64-bit elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}
/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}
/* Reversed-order counterpart of _mm256_set_epi16: the first argument
   becomes element 0.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}
/* Reversed-order counterpart of _mm256_set_epi8: the first argument
   becomes element 0.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}
1327 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm256_setr_epi64x (long long __A, long long __B, long long __C,
1331 return _mm256_set_epi64x (__D, __C, __B, __A);
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type (no instructions are
   generated; the bit pattern is reinterpreted).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}
/* Narrowing casts: return the low 128 bits of a 256-bit vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}
/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}