2 TiMidity++ -- MIDI to WAVE converter and player
3 Copyright (C) 1999-2002 Masanao Izumo <mo@goice.co.jp>
4 Copyright (C) 1995 Tuukka Toivonen <tt@cgs.fi>
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 #ifndef OPTCODE_H_INCLUDED
22 #define OPTCODE_H_INCLUDED 1
25 #pragma clang diagnostic push
26 #pragma clang diagnostic ignored "-Wmacro-redefined"
29 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_X86_) || defined(__X86__) || defined(__I86__)
33 #if defined(_M_X64) || defined(_AMD64_) || defined(_X64_) || defined(__X64__) || defined(__x86_64__)
39 #if defined(_IA64_) || defined(__IA64__) || defined(__I64__)
56 #if OPT_MODE == 1 && !defined(IX86CPU)
62 #if OPT_MODE == * && !defined(AMD64CPU)
69 #if OPT_MODE == * && !defined(IA64CPU)
76 #if OPT_MODE == * && !defined(ARMCPU)
83 #if OPT_MODE == * && !defined(ARM64CPU)
92 /*****************************************************************************/
94 A test of using intrinsics; apparently they can be used with gcc as well.
95 CPUs differ in which extensions they support, and build environments differ in asm/intrin support,
96 so arch_ext_asm/intrin can each be specified individually.
97 When both asm and intrin are available, asm is used preferentially.
98 When both x86_ext and x86_AMD_ext are available, x86_AMD_ext is used preferentially.
99 Intrinsics are, with a few exceptions, common to x86 and x64, so USE_X86_EXT_INTRIN is also enabled on x64.
100 x86-/x64-only instructions are distinguished by USE_X64_EXT_INTRIN, IX64CPU, etc. (gather and the like).
102 The branches are checked in the following order (an illustrative sketch follows this list):
103 1 OPT_MODE or USE_X86_AMD_EXT_ASM or USE_X64_AMD_EXT_ASM
104 2 OPT_MODE or USE_X86_EXT_ASM or USE_X64_EXT_ASM
105 3 USE_X64_AMD_EXT_INTRIN
107 5 USE_X86_AMD_EXT_INTRIN
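 A hypothetical sketch of that priority order (illustration only, not from the
 original source):
   #if OPT_MODE == 1 && (USE_X86_AMD_EXT_ASM || USE_X64_AMD_EXT_ASM)
       ... hand-written AMD asm path ...
   #elif OPT_MODE == 1 && (USE_X86_EXT_ASM || USE_X64_EXT_ASM)
       ... hand-written x86/x64 asm path ...
   #elif USE_X64_AMD_EXT_INTRIN
       ... x64 AMD intrinsic path ...
   #elif USE_X86_AMD_EXT_INTRIN
       ... x86 AMD intrinsic path ...
   #else
       ... remaining intrinsic / portable C paths ...
   #endif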
112 AMD: not sure... probably wrong, so this needs fixing (not needed at the moment, and there is no plan to use it either).
113 The feature-support checks are somewhat dubious in places (optcode.c is_x86ext_available() is unused).
114 How this should relate to OPT_MODE is still undecided... (for now OPT_MODE takes priority).
115 If consolidating: make 1 = x86 asm / no intrin and shift the rest down, change the conditions under which intrin is disabled, rename _EXT to _OPT, and so on.
116 No build environment with AVX2 or newer is available, so behavior there is unverified (VC2013? and later).
119 #define USE_PENTIUM_4 // for pentium 4 (northwood steppingA) float/double denormal fix
121 #if !defined(IX86CPU)
125 //#define USE_SSE // for testing
126 //#define USE_SSE2 // for testing
127 //#define USE_SSE3 // for testing
128 //#define USE_SSSE3 // for testing
129 //#define USE_SSE41 // for testing
130 //#define USE_SSE42 // for testing
131 //#define USE_AVX // for testing
132 //#define USE_AVX2 // for testing
134 /* x86 extension define */
136 Specify which extension to use (lower-level extensions are included)
139 USE_SSE // include MMX2
144 USE_SSE42 (SSE4.2 // include POPCNT
145 USE_SSE4 (SSE4.1 +SSE4.2
146 USE_AVX // include PCLMULQDQ
147 USE_AVX2 // include FMA BMI1 BMI2 F16C RDRAND
149 /* x86 AMD extension define */
151 Specify which extension to use (lower-level extensions are included)
152 Specify this together with the x86 extension defines
154 USE_3DNOW_ENH (3DNow!+
155 USE_3DNOW_PRO (3DNow! Professional
160 // x86 extension number
173 //x86 AMD extension number
175 X86_AMD_EXT_NONE = 0,
185 #if defined(USE_AVX2) // _MSC_VER >= 1700 VC2013?
186 #define USE_X86_EXT_INTRIN 9
187 #elif defined(USE_AVX) // _MSC_VER >= 1600 VC2010?
188 #define USE_X86_EXT_INTRIN 8
189 #elif defined(USE_SSE42) || defined(USE_SSE4)
190 #define USE_X86_EXT_INTRIN 7
191 #elif defined(USE_SSE41) // _MSC_VER >= 1500 VC2008?
192 #define USE_X86_EXT_INTRIN 6
193 #elif defined(USE_SSSE3)
194 #define USE_X86_EXT_INTRIN 5
195 #elif defined(USE_SSE3) // _MSC_VER >= 1400?? VC2005?
196 #define USE_X86_EXT_INTRIN 4
197 #elif defined(USE_SSE2)
198 #define USE_X86_EXT_INTRIN 3
199 #elif defined(USE_SSE) || defined(USE_MMX2)
200 #define USE_X86_EXT_INTRIN 2 // include MMX2
201 #elif defined(USE_MMX) // _MSC_VER >= 1310 VC2003?
202 #define USE_X86_EXT_INTRIN 1
204 #define USE_X86_EXT_INTRIN 0
207 #if (USE_X86_EXT_INTRIN >= 4)
211 #if defined(USE_AVX2) // _MSC_VER >= 1700 VC2013?
212 #define USE_X64_EXT_INTRIN 9
213 #elif defined(USE_AVX) // _MSC_VER >= 1600 VC2010?
214 #define USE_X64_EXT_INTRIN 8
215 #elif defined(USE_SSE42) || defined(USE_SSE4)
216 #define USE_X64_EXT_INTRIN 7
217 #elif defined(USE_SSE41) // _MSC_VER >= 1500 VC2008?
218 #define USE_X64_EXT_INTRIN 6
219 #elif defined(USE_SSSE3)
220 #define USE_X64_EXT_INTRIN 5
221 #elif defined(USE_SSE3) // _MSC_VER >= 1400?? VC2005?
222 #define USE_X64_EXT_INTRIN 4
223 #elif defined(USE_SSE2)
224 #define USE_X64_EXT_INTRIN 3
225 #elif defined(USE_SSE) || defined(USE_MMX2)
226 #define USE_X64_EXT_INTRIN 2 // include MMX2
227 #elif defined(USE_MMX) // _MSC_VER >= 1310 VC2003?
228 #define USE_X64_EXT_INTRIN 1
230 #define USE_X64_EXT_INTRIN 0
233 #if defined(USE_SSE5) // _MSC_VER >= 1700 VC2012?
234 #define USE_X86_AMD_EXT_INTRIN 6
235 #elif defined(USE_SSE4A) // _MSC_VER >= 1600 VC2010?
236 #define USE_X86_AMD_EXT_INTRIN 5
237 #elif defined(USE_3DNOW_PRO)
238 #define USE_X86_AMD_EXT_INTRIN 4
239 #elif defined(USE_3DNOW_ENH)
240 #define USE_X86_AMD_EXT_INTRIN 3
241 #elif defined(USE_3DNOW)
242 #define USE_X86_AMD_EXT_INTRIN 2
243 #elif defined(USE_MMX_EXT)
244 #define USE_X86_AMD_EXT_INTRIN 1
246 #define USE_X86_AMD_EXT_INTRIN 0
249 #if defined(USE_AVX2)
250 #define USE_X86_EXT_ASM 9
251 #elif defined(USE_AVX)
252 #define USE_X86_EXT_ASM 8
253 #elif defined(USE_SSE42) || defined(USE_SSE4)
254 #define USE_X86_EXT_ASM 7
255 #elif defined(USE_SSE41)
256 #define USE_X86_EXT_ASM 6
257 #elif defined(USE_SSSE3)
258 #define USE_X86_EXT_ASM 5
259 #elif defined(USE_SSE3)
260 #define USE_X86_EXT_ASM 4
261 #elif defined(USE_SSE2)
262 #define USE_X86_EXT_ASM 3
263 #elif defined(USE_SSE) || defined(USE_MMX2)
264 #define USE_X86_EXT_ASM 2 // include MMX2
265 #elif defined(USE_MMX)
266 #define USE_X86_EXT_ASM 1
268 #define USE_X86_EXT_ASM 0
271 #if defined(USE_AVX2)
272 #define USE_X64_EXT_ASM 9
273 #elif defined(USE_AVX)
274 #define USE_X64_EXT_ASM 8
275 #elif defined(USE_SSE42) || defined(USE_SSE4)
276 #define USE_X64_EXT_ASM 7
277 #elif defined(USE_SSE41)
278 #define USE_X64_EXT_ASM 6
279 #elif defined(USE_SSSE3)
280 #define USE_X64_EXT_ASM 5
281 #elif defined(USE_SSE3)
282 #define USE_X64_EXT_ASM 4
283 #elif defined(USE_SSE2)
284 #define USE_X64_EXT_ASM 3
285 #elif defined(USE_SSE) || defined(USE_MMX2)
286 #define USE_X64_EXT_ASM 2 // include MMX2
287 #elif defined(USE_MMX)
288 #define USE_X64_EXT_ASM 1
290 #define USE_X64_EXT_ASM 0
293 #if defined(USE_SSE4A)
294 #define USE_X86_AMD_EXT_ASM 5
295 #elif defined(USE_3DNOW_PRO)
296 #define USE_X86_AMD_EXT_ASM 4
297 #elif defined(USE_3DNOW_ENH)
298 #define USE_X86_AMD_EXT_ASM 3
299 #elif defined(USE_3DNOW)
300 #define USE_X86_AMD_EXT_ASM 2
301 #elif defined(USE_MMX_EXT)
302 #define USE_X86_AMD_EXT_ASM 1
304 #define USE_X86_AMD_EXT_ASM 0
307 /* Conditions under which asm/intrin cannot be used; add more here if any */
308 #if !defined(IX64CPU)
309 #undef USE_X64_EXT_INTRIN
310 #define USE_X64_EXT_INTRIN 0
311 #undef USE_X64_AMD_EXT_INTRIN
312 #define USE_X64_AMD_EXT_INTRIN 0
314 #if !defined(IX86CPU) && !defined(IX64CPU)
315 #undef USE_X86_EXT_INTRIN
316 #define USE_X86_EXT_INTRIN 0
317 #undef USE_X86_AMD_EXT_INTRIN
318 #define USE_X86_AMD_EXT_INTRIN 0
321 /* Always disable inline asm */
322 #undef USE_X86_EXT_ASM
323 #define USE_X86_EXT_ASM 0
324 #undef USE_X86_AMD_EXT_ASM
325 #define USE_X86_AMD_EXT_ASM 0
326 #undef USE_X64_EXT_ASM
327 #define USE_X64_EXT_ASM 0
328 #undef USE_X64_AMD_EXT_ASM
329 #define USE_X64_AMD_EXT_ASM 0
331 #undef SUPPORT_ASM_INTEL
333 /*****************************************************************************/
334 /* PowerPC's AltiVec enhancement */
337 /* (need -faltivec option) */
339 #define USE_ALTIVEC 0
344 /*****************************************************************************/
345 /*****************************************************************************/
347 #ifdef HAVE_SYS_PARAM_H
348 #include <sys/param.h>
349 #endif/* <sys/param.h> */
350 #ifdef HAVE_SYS_SYSCTL_H
351 #include <sys/sysctl.h>
352 #endif/* <sys/sysctl.h> */
355 #elif defined(HAVE_STRINGS_H)
357 #endif/* <string.h> */
359 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
360 # include <stdbool.h>
363 /*****************************************************************************/
364 #if OPT_MODE == 1 && USE_X86_EXT_ASM > 0
371 #define _double2fixmagic 68719476736.0 * 1.5
373 #if defined(__BORLANDC__) && (__BORLANDC__ >= 1380)
374 extern int32 imuldiv8(int32 a, int32 b);
375 extern int32 imuldiv16(int32 a, int32 b);
376 extern int32 imuldiv24(int32 a, int32 b);
377 extern int32 imuldiv28(int32 a, int32 b);
379 #elif defined(SUPPORT_ASM_AT_AND_T) && defined(__ppc__)
380 static inline int32 imuldiv8(int32 a, int32 b)
382 register int32 ret, rah, ral, rlh, rll;
383 __asm__("mulhw %0,%7,%8\n\t"
385 "rlwinm %2,%0,24,0,7\n\t"
386 "rlwinm %3,%1,24,8,31\n\t"
388 :"=r"(rah),"=r"(ral),
396 static inline int32 imuldiv16(int32 a, int32 b)
398 register int32 ret, rah, ral, rlh, rll;
399 __asm__("mulhw %0,%7,%8\n\t"
401 "rlwinm %2,%0,16,0,15\n\t"
402 "rlwinm %3,%1,16,16,31\n\t"
404 :"=r"(rah),"=r"(ral),
412 static inline int32 imuldiv24(int32 a, int32 b)
414 register int32 ret, rah, ral, rlh, rll;
415 __asm__("mulhw %0,%7,%8\n\t"
417 "rlwinm %2,%0,8,0,23\n\t"
418 "rlwinm %3,%1,8,24,31\n\t"
420 :"=r"(rah),"=r"(ral),
428 static inline int32 imuldiv28(int32 a, int32 b)
430 register int32 ret, rah, ral, rlh, rll;
431 __asm__("mulhw %0,%7,%8\n\t"
433 "rlwinm %2,%0,4,0,27\n\t"
434 "rlwinm %3,%1,4,28,31\n\t"
436 :"=r"(rah),"=r"(ral),
444 #elif defined(SUPPORT_ASM_AT_AND_T)
445 static inline int32 imuldiv8(int32 a, int32 b)
448 __asm__("movl %1, %%eax\n\t"
453 "or %%edx, %%eax\n\t"
461 static inline int32 imuldiv16(int32 a, int32 b)
464 __asm__("movl %1, %%eax\n\t"
469 "or %%edx, %%eax\n\t"
477 static inline int32 imuldiv24(int32 a, int32 b)
480 __asm__("movl %1, %%eax\n\t"
485 "or %%edx, %%eax\n\t"
493 static inline int32 imuldiv28(int32 a, int32 b)
496 __asm__("movl %1, %%eax\n\t"
501 "or %%edx, %%eax\n\t"
509 #elif defined(SUPPORT_ASM_INTEL)
510 inline int32 imuldiv8(int32 a, int32 b) {
521 inline int32 imuldiv16(int32 a, int32 b) {
532 inline int32 imuldiv24(int32 a, int32 b) {
543 inline int32 imuldiv28(int32 a, int32 b) {
554 inline int64 imuldiv24_64bit(int64 a, int64 b) {
555 return ((int64)(a) * (int64)(b)) >> 24;
558 inline int64 int64_imuldiv24(int64 a, int64 b)
560 return ((int64)(a) * (int64)(b)) >> 24;
564 /* Generic version of imuldiv. */
565 #define imuldiv8(a, b) \
566 (int32)(((int64)(a) * (int64)(b)) >> 8)
568 #define imuldiv16(a, b) \
569 (int32)(((int64)(a) * (int64)(b)) >> 16)
571 #define imuldiv24(a, b) \
572 (int32)(((int64)(a) * (int64)(b)) >> 24)
574 #define imuldiv28(a, b) \
575 (int32)(((int64)(a) * (int64)(b)) >> 28)
577 #endif /* architectures */
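/* In every variant above, imuldivN(a, b) computes (a * b) >> N using a 64-bit
   intermediate, i.e. a fixed-point multiply that keeps the operands' scale.
   Worked example (illustration only, not from the original source):
   imuldiv16(3 << 16, 1 << 15) == 3 << 15, i.e. 3.0 * 0.5 == 1.5 in 16.16 fixed point. */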
579 #define ifloor_internal(a, b) \
580 ((a) & ~((1L << (b)) - 1))
583 ifloor_internal(a, 8)
585 #define ifloor16(a) \
586 ifloor_internal(a, 16)
588 #define ifloor24(a) \
589 ifloor_internal(a, 24)
591 #define ifloor28(a) \
592 ifloor_internal(a, 28)
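/* Returns 1 for non-negative a and -1 for negative a (relies on arithmetic right shift). */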
594 static inline int32 signlong(int32 a)
596 return ((a | 0x7fffffff) >> 30);
600 /* Generic version of imuldiv. */
601 #define imuldiv8(a, b) \
602 (int32)(((int64)(a) * (int64)(b)) >> 8)
604 #define imuldiv16(a, b) \
605 (int32)(((int64)(a) * (int64)(b)) >> 16)
607 #define imuldiv24(a, b) \
608 (int32)(((int64)(a) * (int64)(b)) >> 24)
610 #define imuldiv28(a, b) \
611 (int32)(((int64)(a) * (int64)(b)) >> 28)
613 #endif /* OPT_MODE != 0 */
617 /*****************************************************************************/
618 #if (USE_X86_EXT_ASM || USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_ASM || USE_X86_AMD_EXT_INTRIN)
620 #if (USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_INTRIN)
622 #include <x86intrin.h>
623 #elif (_MSC_VER >= 1600) // VC2010(VC10)
625 #else // VC2003(VC7) VC2005(VC8) VC2008(VC9)
626 #include <emmintrin.h>
627 #if defined(USE_X86_AMD_EXT_INTRIN) && (USE_X86_AMD_EXT_INTRIN >= 2)
636 #if ((USE_X86_EXT_ASM >= 8) || (USE_X86_EXT_INTRIN >= 8)) // AVX 32byte
637 #define ALIGN_SIZE 32
638 #define ALIGN __attribute__((aligned(ALIGN_SIZE)))
639 #define ALIGN32 __attribute__((aligned(32)))
640 #define ALIGN16 __attribute__((aligned(16)))
641 #define ALIGN8 __attribute__((aligned(8)))
642 #elif ((USE_X86_EXT_ASM >= 2) || (USE_X86_EXT_INTRIN >= 2)) // SSE 16byte // AMD??
643 #define ALIGN_SIZE 16
644 #define ALIGN __attribute__((aligned(ALIGN_SIZE)))
645 #define ALIGN32 __attribute__((aligned(32)))
646 #define ALIGN16 __attribute__((aligned(16)))
647 #define ALIGN8 __attribute__((aligned(8)))
648 #elif ((USE_X86_EXT_ASM >= 1) || (USE_X86_EXT_INTRIN >= 1)) // MMX 8byte // AMD??
650 #define ALIGN __attribute__((aligned(ALIGN_SIZE)))
651 #define ALIGN32 __attribute__((aligned(32)))
652 #define ALIGN16 __attribute__((aligned(16)))
653 #define ALIGN8 __attribute__((aligned(8)))
656 #elif defined(_MSC_VER) || defined(MSC_VER)
658 #if ((USE_X86_EXT_ASM >= 8) || (USE_X86_EXT_INTRIN >= 8)) // AVX 32byte
659 #define ALIGN_SIZE 32
660 #define ALIGN _declspec(align(ALIGN_SIZE))
661 #define ALIGN32 _declspec(align(32))
662 #define ALIGN16 _declspec(align(16))
663 #define ALIGN8 _declspec(align(8))
664 #elif ((USE_X86_EXT_ASM >= 2) || (USE_X86_EXT_INTRIN >= 2)) // SSE 16byte // AMD??
665 #define ALIGN_SIZE 16
666 #define ALIGN _declspec(align(ALIGN_SIZE))
667 #define ALIGN32 _declspec(align(32))
668 #define ALIGN16 _declspec(align(16))
669 #define ALIGN8 _declspec(align(8))
670 #elif ((USE_X86_EXT_ASM >= 1) || (USE_X86_EXT_INTRIN >= 1)) // MMX 8byte // AMD??
672 #define ALIGN _declspec(align(ALIGN_SIZE))
673 #define ALIGN32 _declspec(align(32))
674 #define ALIGN16 _declspec(align(16))
675 #define ALIGN8 _declspec(align(8))
678 #endif /* __GNUC__, MSC_VER */
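/* Usage illustration (assumption, not from the original source): buffers meant for the
   aligned SIMD load/store macros below can be declared as, e.g.
       static ALIGN sample_t mix_buf[512];   aligned to ALIGN_SIZE
       static ALIGN16 int32 tmp[4];          always 16-byte aligned
   where sample_t and the variable names are placeholders for this example. */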
681 The FMA macros below fall back to a separate multiply and add when the CPU has no FMA support (the precision difference due to intermediate rounding is not accounted for; my mistake...)
682 FMA(vec_a, vec_b, vec_c) : vec_a * vec_b + vec_c
683 FMA2(vec_a, vec_b, vec_c, vec_d) : vec_a * vec_b + vec_c * vec_d
684 LS_FMA(ptr, vec_a, vec_b) : store(ptr, load(ptr) + vec_a * vec_b) // *ptr += vec_a * vec_b
685 LS_ADD(ptr, vec_a) : store(ptr, load(ptr) + vec_a) // *ptr += vec_a
686 LS_MUL(ptr, vec_a) : store(ptr, load(ptr) * vec_a) // *ptr *= vec_a
687 LSU : unaligned access (uses loadu/storeu
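 Example (an illustrative sketch, not from the original source; out/vsample/vgain are
 placeholder names): MM256_LS_FMA_PD(out, vsample, vgain) performs out[0..3] += vsample * vgain,
 expanding to _mm256_fmadd_pd on AVX2 builds (USE_X86_EXT_INTRIN >= 9) and to
 _mm256_mul_pd + _mm256_add_pd otherwise.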
690 #if (USE_X86_EXT_INTRIN >= 9)
691 #define MM256_SET2X_SI256(vec_a, vec_b) \
692 _mm256_inserti128_si256(_mm256_inserti128_si256(_mm256_setzero_si256(), vec_a, 0x0), vec_b, 0x1)
695 #if (USE_X86_EXT_INTRIN >= 8)
696 #if (USE_X86_EXT_INTRIN >= 9)
697 #define MM256_FMA_PD(vec_a, vec_b, vec_c) _mm256_fmadd_pd(vec_a, vec_b, vec_c)
698 #define MM256_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm256_fmadd_pd(vec_a, vec_b, _mm256_mul_pd(vec_c, vec_d))
699 #define MM256_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm256_fmadd_pd(v20, v21, _mm256_fmadd_pd(v10, v11, _mm256_mul_pd(v00, v01)))
700 #define MM256_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm256_add_pd(\
701 _mm256_fmadd_pd(v30, v31, _mm256_mul_pd(v20, v21)), _mm256_fmadd_pd(v10, v11, _mm256_mul_pd(v00, v01)) )
702 #define MM256_LS_FMA_PD(ptr, vec_a, vec_b) _mm256_store_pd(ptr, _mm256_fmadd_pd(vec_a, vec_b, _mm256_load_pd(ptr)))
703 #define MM256_LSU_FMA_PD(ptr, vec_a, vec_b) _mm256_storeu_pd(ptr, _mm256_fmadd_pd(vec_a, vec_b, _mm256_loadu_pd(ptr)))
704 #define MM256_MSUB_PD(vec_a, vec_b, vec_c) _mm256_fmsub_pd(vec_a, vec_b, vec_c)
705 #define MM256_FMA_PS(vec_a, vec_b, vec_c) _mm256_fmadd_ps(vec_a, vec_b, vec_c)
706 #define MM256_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm256_fmadd_ps(vec_a, vec_b, _mm256_mul_ps(vec_c, vec_d))
707 #define MM256_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm256_fmadd_ps(v20, v21, _mm256_fmadd_ps(v10, v11, _mm256_mul_ps(v00, v01)))
708 #define MM256_LS_FMA_PS(ptr, vec_a, vec_b) _mm256_store_ps(ptr, _mm256_fmadd_ps(vec_a, vec_b, _mm256_load_ps(ptr)))
709 #define MM256_LSU_FMA_PS(ptr, vec_a, vec_b) _mm256_storeu_ps(ptr, _mm256_fmadd_ps(vec_a, vec_b, _mm256_loadu_ps(ptr)))
710 #define MM256_MSUB_PS(vec_a, vec_b, vec_c) _mm256_fmsub_ps(vec_a, vec_b, vec_c)
711 #else // ! (USE_X86_EXT_INTRIN >= 9)
712 #define MM256_FMA_PD(vec_a, vec_b, vec_c) _mm256_add_pd(_mm256_mul_pd(vec_a, vec_b), vec_c)
713 #define MM256_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm256_add_pd(_mm256_mul_pd(vec_a, vec_b), _mm256_mul_pd(vec_c, vec_d))
714 #define MM256_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm256_add_pd(\
715 _mm256_add_pd(_mm256_mul_pd(v00, v01),_mm256_mul_pd(v10, v11)), _mm256_mul_pd(v20, v21))
716 #define MM256_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm256_add_pd(\
717 _mm256_add_pd(_mm256_mul_pd(v00, v01),_mm256_mul_pd(v10, v11)), _mm256_add_pd(_mm256_mul_pd(v20, v21),_mm256_mul_pd(v30, v31)))
718 #define MM256_LS_FMA_PD(ptr, vec_a, vec_b) _mm256_store_pd(ptr, _mm256_add_pd(_mm256_load_pd(ptr), _mm256_mul_pd(vec_a, vec_b)))
719 #define MM256_LSU_FMA_PD(ptr, vec_a, vec_b) _mm256_storeu_pd(ptr, _mm256_add_pd(_mm256_loadu_pd(ptr), _mm256_mul_pd(vec_a, vec_b)))
720 #define MM256_MSUB_PD(vec_a, vec_b, vec_c) _mm256_sub_pd(_mm256_mul_pd(vec_a, vec_b), vec_c)
721 #define MM256_FMA_PS(vec_a, vec_b, vec_c) _mm256_add_ps(_mm256_mul_ps(vec_a, vec_b), vec_c)
722 #define MM256_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm256_add_ps(_mm256_mul_ps(vec_a, vec_b), _mm256_mul_ps(vec_c, vec_d))
723 #define MM256_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm256_add_ps(\
724 _mm256_add_ps(_mm256_mul_ps(v00, v01),_mm256_mul_ps(v10, v11)), _mm256_mul_ps(v20, v21))
725 #define MM256_LS_FMA_PS(ptr, vec_a, vec_b) _mm256_store_ps(ptr, _mm256_add_ps(_mm256_load_ps(ptr), _mm256_mul_ps(vec_a, vec_b)))
726 #define MM256_LSU_FMA_PS(ptr, vec_a, vec_b) _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_loadu_ps(ptr), _mm256_mul_ps(vec_a, vec_b)))
727 #define MM256_MSUB_PS(vec_a, vec_b, vec_c) _mm256_sub_ps(_mm256_mul_ps(vec_a, vec_b), vec_c)
728 #endif // (USE_X86_EXT_INTRIN >= 9)
729 #define MM256_LS_ADD_PD(ptr, vec_a) _mm256_store_pd(ptr, _mm256_add_pd(_mm256_load_pd(ptr), vec_a))
730 #define MM256_LSU_ADD_PD(ptr, vec_a) _mm256_storeu_pd(ptr, _mm256_add_pd(_mm256_loadu_pd(ptr), vec_a))
731 #define MM256_LS_MUL_PD(ptr, vec_a) _mm256_store_pd(ptr, _mm256_mul_pd(_mm256_load_pd(ptr), vec_a))
732 #define MM256_LSU_MUL_PD(ptr, vec_a) _mm256_storeu_pd(ptr, _mm256_mul_pd(_mm256_loadu_pd(ptr), vec_a))
733 #define MM256_LS_ADD_PS(ptr, vec_a) _mm256_store_ps(ptr, _mm256_add_ps(_mm256_load_ps(ptr), vec_a))
734 #define MM256_LSU_ADD_PS(ptr, vec_a) _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_loadu_ps(ptr), vec_a))
735 #define MM256_LS_MUL_PS(ptr, vec_a) _mm256_store_ps(ptr, _mm256_mul_ps(_mm256_load_ps(ptr), vec_a))
736 #define MM256_LSU_MUL_PS(ptr, vec_a) _mm256_storeu_ps(ptr, _mm256_mul_ps(_mm256_loadu_ps(ptr), vec_a))
737 #define MM256_SET2X_PS(vec_a, vec_b) \
738 _mm256_insertf128_ps(_mm256_insertf128_ps(_mm256_setzero_ps(), vec_a, 0x0), vec_b, 0x1)
739 #define MM256_SET2X_PD(vec_a, vec_b) \
740 _mm256_insertf128_pd(_mm256_insertf128_pd(_mm256_setzero_pd(), vec_a, 0x0), vec_b, 0x1)
741 #endif // (USE_X86_EXT_INTRIN >= 8)
743 #if (USE_X86_EXT_INTRIN >= 3)
744 #if (USE_X86_EXT_INTRIN >= 9)
745 #define MM_FMA_PD(vec_a, vec_b, vec_c) _mm_fmadd_pd(vec_a, vec_b, vec_c)
746 #define MM_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm_fmadd_pd(vec_a, vec_b, _mm_mul_pd(vec_c, vec_d))
747 #define MM_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm_fmadd_pd(v20, v21, _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01)) )
748 #define MM_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_pd(\
749 _mm_fmadd_pd(v30, v31, _mm_mul_pd(v20, v21)), _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01)) )
750 #define MM_FMA5_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_pd(_mm_fmadd_pd(v40, v41, \
751 _mm_fmadd_pd(v30, v31, _mm_mul_pd(v20, v21))), _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01)) )
752 #define MM_FMA6_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_pd(\
753 _mm_fmadd_pd(v50, v51, _mm_fmadd_pd(v40, v41, _mm_mul_pd(v30, v31))), \
754 _mm_fmadd_pd(v20, v21, _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01))) )
755 #define MM_MSUB_PD(vec_a, vec_b, vec_c) _mm_fmsub_pd(vec_a, vec_b, vec_c)
756 #define MM_LS_FMA_PD(ptr, vec_a, vec_b) _mm_store_pd(ptr, _mm_fmadd_pd(vec_a, vec_b, _mm_load_pd(ptr)))
757 #define MM_LSU_FMA_PD(ptr, vec_a, vec_b) _mm_storeu_pd(ptr, _mm_fmadd_pd(vec_a, vec_b, _mm_loadu_pd(ptr)))
758 #define MM_MSUB_PD(vec_a, vec_b, vec_c) _mm_fmsub_pd(vec_a, vec_b, vec_c)
759 #else // !(USE_X86_EXT_INTRIN >= 9)
760 #define MM_FMA_PD(vec_a, vec_b, vec_c) _mm_add_pd(_mm_mul_pd(vec_a, vec_b), vec_c)
761 #define MM_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm_add_pd(_mm_mul_pd(vec_a, vec_b), _mm_mul_pd(vec_c, vec_d))
762 #define MM_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm_add_pd(\
763 _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_mul_pd(v20, v21) )
764 #define MM_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_pd(\
765 _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_add_pd(_mm_mul_pd(v20, v21),_mm_mul_pd(v30, v31)))
766 #define MM_FMA5_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_pd(_mm_add_pd(\
767 _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_add_pd(_mm_mul_pd(v20, v21),_mm_mul_pd(v30, v31)))\
768 , _mm_mul_pd(v40, v41))
769 #define MM_FMA6_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_pd(_mm_add_pd(\
770 _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_add_pd(_mm_mul_pd(v20, v21),_mm_mul_pd(v30, v31)))\
771 , _mm_add_pd(_mm_mul_pd(v40, v41),_mm_mul_pd(v50, v51)))
772 #define MM_LS_FMA_PD(ptr, vec_a, vec_b) _mm_store_pd(ptr, _mm_add_pd(_mm_load_pd(ptr), _mm_mul_pd(vec_a, vec_b)))
773 #define MM_LSU_FMA_PD(ptr, vec_a, vec_b) _mm_storeu_pd(ptr, _mm_add_pd(_mm_loadu_pd(ptr), _mm_mul_pd(vec_a, vec_b)))
774 #define MM_MSUB_PD(vec_a, vec_b, vec_c) _mm_sub_pd(_mm_mul_pd(vec_a, vec_b), vec_c)
775 #endif // (USE_X86_EXT_INTRIN >= 9)
777 #define MM_LS_ADD_PD(ptr, vec_a) _mm_store_pd(ptr, _mm_add_pd(_mm_load_pd(ptr), vec_a))
778 #define MM_LSU_ADD_PD(ptr, vec_a) _mm_storeu_pd(ptr, _mm_add_pd(_mm_loadu_pd(ptr), vec_a))
779 #define MM_LS_MUL_PD(ptr, vec_a) _mm_store_pd(ptr, _mm_mul_pd(_mm_load_pd(ptr), vec_a))
780 #define MM_LSU_MUL_PD(ptr, vec_a) _mm_storeu_pd(ptr, _mm_mul_pd(_mm_loadu_pd(ptr), vec_a))
782 #if 0//(USE_X86_EXT_INTRIN >= 4) // sse3
783 #define MM_LOAD1_PD(ptr) _mm_loaddup_pd(ptr) // slow!
784 #else // !(USE_X86_EXT_INTRIN >= 4)
785 #define MM_LOAD1_PD(ptr) _mm_load1_pd(ptr)
786 #endif // (USE_X86_EXT_INTRIN >= 4)
788 #if (USE_X86_EXT_INTRIN >= 6) // sse4.1
789 #define MM_EXTRACT_EPI32(vec,num) _mm_extract_epi32(vec,num) // num:0~3
790 #else // ! (USE_X86_EXT_INTRIN >= 6)
791 #define MM_EXTRACT_EPI32(vec,num) _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, num)) // num:0~3
792 #endif // (USE_X86_EXT_INTRIN >= 6)
794 #endif // (USE_X86_EXT_INTRIN >= 3)
796 #if (USE_X86_EXT_INTRIN >= 2)
797 #if (USE_X86_EXT_INTRIN >= 9)
798 #define MM_FMA_PS(vec_a, vec_b, vec_c) _mm_fmadd_ps(vec_a, vec_b, vec_c)
799 #define MM_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm_fmadd_ps(vec_a, vec_b, _mm_mul_ps(vec_c, vec_d))
800 #define MM_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm_fmadd_ps(v20, v21, _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01)))
801 #define MM_FMA4_PS(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_ps(\
802 _mm_fmadd_ps(v30, v31, _mm_mul_ps(v20, v21)), _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01)) )
803 #define MM_FMA5_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_ps(_mm_fmadd_ps(v40, v41, \
804 _mm_fmadd_ps(v30, v31, _mm_mul_ps(v20, v21))), _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01)) )
805 #define MM_FMA6_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_ps(\
806 _mm_fmadd_ps(v50, v51, _mm_fmadd_ps(v40, v41, _mm_mul_ps(v30, v31))), \
807 _mm_fmadd_ps(v20, v21, _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01))) )
808 #define MM_LS_FMA_PS(ptr, vec_a, vec_b) _mm_store_ps(ptr, _mm_fmadd_ps(vec_a, vec_b, _mm_load_ps(ptr)))
809 #define MM_LSU_FMA_PS(ptr, vec_a, vec_b) _mm_storeu_ps(ptr, _mm_fmadd_ps(vec_a, vec_b, _mm_loadu_ps(ptr)))
810 #define MM_MSUB_PS(vec_a, vec_b, vec_c) _mm_fmsub_ps(vec_a, vec_b, vec_c)
812 #define MM_FMA_PS(vec_a, vec_b, vec_c) _mm_add_ps(_mm_mul_ps(vec_a, vec_b), vec_c)
813 #define MM_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm_add_ps(_mm_mul_ps(vec_a, vec_b), _mm_mul_ps(vec_c, vec_d))
814 #define MM_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm_add_ps(\
815 _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_mul_ps(v20, v21))
816 #define MM_FMA4_PS(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_ps(\
817 _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_add_ps(_mm_mul_ps(v20, v21),_mm_mul_ps(v30, v31)))
818 #define MM_FMA5_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_ps(_mm_add_ps(\
819 _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_add_ps(_mm_mul_ps(v20, v21),_mm_mul_ps(v30, v31)))\
820 , _mm_mul_ps(v40, v41))
821 #define MM_FMA6_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_ps(_mm_add_ps(\
822 _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_add_ps(_mm_mul_ps(v20, v21),_mm_mul_ps(v30, v31)))\
823 , _mm_add_ps(_mm_mul_ps(v40, v41),_mm_mul_ps(v50, v51)))
824 #define MM_LS_FMA_PS(ptr, vec_a, vec_b) _mm_store_ps(ptr, _mm_add_ps(_mm_load_ps(ptr), _mm_mul_ps(vec_a, vec_b)))
825 #define MM_LSU_FMA_PS(ptr, vec_a, vec_b) _mm_storeu_ps(ptr, _mm_add_ps(_mm_loadu_ps(ptr), _mm_mul_ps(vec_a, vec_b)))
826 #define MM_MSUB_PS(vec_a, vec_b, vec_c) _mm_sub_ps(_mm_mul_ps(vec_a, vec_b), vec_c)
828 #define MM_LS_ADD_PS(ptr, vec_a) _mm_store_ps(ptr, _mm_add_ps(_mm_load_ps(ptr), vec_a))
829 #define MM_LSU_ADD_PS(ptr, vec_a) _mm_storeu_ps(ptr, _mm_add_ps(_mm_loadu_ps(ptr), vec_a))
830 #define MM_LS_MUL_PS(ptr, vec_a) _mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), vec_a))
831 #define MM_LSU_MUL_PS(ptr, vec_a) _mm_storeu_ps(ptr, _mm_mul_ps(_mm_loadu_ps(ptr), vec_a))
834 #if (USE_X86_EXT_INTRIN >= 1)
835 #if !defined(_MSC_VER) || defined(__clang__)
836 #define MM_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_shuffle_ps(reg,reg,idx))
837 #define MM_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_shuffle_pd(reg,reg,idx))
838 #define MM_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(reg,idx))
839 #if (USE_X86_EXT_INTRIN >= 9)
840 #define MM256_EXTRACT_F32(reg,idx) _mm256_cvtss_f32(_mm256_permutevar8x32_ps(reg, _mm256_set1_epi32(idx)))
841 #define MM256_EXTRACT_F64(reg,idx) _mm256_cvtsd_f64(_mm256_permute4x64_pd(reg,idx))
843 #define MM256_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_permute_ps(_mm256_extractf128_ps(reg, idx >= 4), idx % 4))
844 #define MM256_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_permute_pd(_mm256_extractf128_pd(reg, idx >= 2), idx % 2))
846 #define MM256_EXTRACT_I32(reg,idx) _mm256_extract_epi32(reg,idx)
848 #define MM_EXTRACT_F32(reg,idx) reg.m128_f32[idx]
849 #define MM_EXTRACT_F64(reg,idx) reg.m128d_f64[idx]
850 #define MM_EXTRACT_I32(reg,idx) reg.m128i_i32[idx]
851 #define MM256_EXTRACT_F32(reg,idx) reg.m256_f32[idx]
852 #define MM256_EXTRACT_F64(reg,idx) reg.m256d_f64[idx]
853 #define MM256_EXTRACT_I32(reg,idx) reg.m256i_i32[idx]
855 #endif // (USE_X86_EXT_INTRIN >= 1)
861 #if (USE_X86_EXT_INTRIN >= 9)
862 #if (USE_X86_EXT_INTRIN >= 9)
863 #define MM256_I32GATHER_I32(base, offset, scale) _mm256_i32gather_epi32(base, offset, scale)
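/* Fallback when the hardware gather instruction is unavailable: the eight element
   addresses base + offset[i] * scale are formed with vector arithmetic, spilled to
   aligned pointer buffers, and each element is then fetched with a scalar load
   before the result is reassembled into a __m256i. */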
866 static TIMIDITY_FORCEINLINE __m256i mm256_i32gather_i32_impl(const int *base, __m256i offset, int scale)
868 ALIGN32 int32 buf[8];
869 __m256i byte_offset = _mm256_mullo_epi32(offset, _mm256_set1_epi32(scale));
871 __m256i vbase = _mm256_set1_epi64x((int64)base);
872 __m256i vnegative = _mm256_cmpgt_epi32(_mm256_setzero_si256(), byte_offset);
873 __m256i vptr0145 = _mm256_add_epi64(vbase, _mm256_unpacklo_epi32(byte_offset, vnegative));
874 __m256i vptr2367 = _mm256_add_epi64(vbase, _mm256_unpackhi_epi32(byte_offset, vnegative));
875 ALIGN32 const int32 *ptr0145[8];
876 ALIGN32 const int32 *ptr2367[8];
877 _mm256_store_si256((__m256i *)ptr0145, vptr0145);
878 _mm256_store_si256((__m256i *)ptr2367, vptr2367);
880 buf[0] = *ptr0145[0];
881 buf[1] = *ptr0145[1];
882 buf[2] = *ptr2367[0];
883 buf[3] = *ptr2367[1];
884 buf[4] = *ptr0145[2];
885 buf[5] = *ptr0145[3];
886 buf[6] = *ptr2367[2];
887 buf[7] = *ptr2367[3];
889 __m256i pointers = _mm256_add_epi32(_mm256_set1_epi32((int32)base), byte_offset);
890 _mm256_store_si256((__m256i *)buf, pointers);
892 for (int i = 0; i < 8; i++) {
893 buf[i] = *(const int *)buf[i];
897 return _mm256_load_si256((const __m256i *)buf);
900 #define MM256_I32GATHER_I32(base, offset, scale) mm256_i32gather_i32_impl(base, offset, scale)
901 #endif // (USE_X86_EXT_INTRIN >= 9)
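/* Scatter counterpart of the gather emulation above: the lanes of val are written to
   base + offset[i] * scale with scalar stores (AVX2 provides no scatter instruction,
   so this path is always emulated). */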
903 static TIMIDITY_FORCEINLINE void mm256_i32scatter_i32_impl(void *base, __m256i offset, __m256i val, int scale)
905 ALIGN32 int32 buf[8];
906 _mm256_store_si256((__m256i *)buf, val);
908 __m256i byte_offset = _mm256_mullo_epi32(offset, _mm256_set1_epi32(scale));
910 __m256i vbase = _mm256_set1_epi64x((int64)base);
911 __m256i vnegative = _mm256_cmpgt_epi32(_mm256_setzero_si256(), byte_offset);
912 __m256i vptr0145 = _mm256_add_epi64(vbase, _mm256_unpacklo_epi32(byte_offset, vnegative));
913 __m256i vptr2367 = _mm256_add_epi64(vbase, _mm256_unpackhi_epi32(byte_offset, vnegative));
914 ALIGN32 int32 *ptr0145[4];
915 ALIGN32 int32 *ptr2367[4];
916 _mm256_store_si256((__m256i *)ptr0145, vptr0145);
917 _mm256_store_si256((__m256i *)ptr2367, vptr2367);
919 *ptr0145[0] = buf[0];
920 *ptr0145[1] = buf[1];
921 *ptr2367[0] = buf[2];
922 *ptr2367[1] = buf[3];
923 *ptr0145[2] = buf[4];
924 *ptr0145[3] = buf[5];
925 *ptr2367[2] = buf[6];
926 *ptr2367[3] = buf[7];
928 __m256i vptr = _mm256_add_epi32(_mm256_set1_epi32((int32)base), byte_offset);
929 ALIGN32 int32 *ptr[8];
930 _mm256_store_si256((__m256i *)ptr, vptr);
932 for (int i = 0; i < 8; i++) {
938 #define MM256_I32SCATTER_I32(base, offset, val, scale) mm256_i32scatter_i32_impl(base, offset, val, scale)
940 #endif // (USE_X86_EXT_INTRIN >= 9)
942 #if (USE_X86_EXT_INTRIN >= 1)
943 #if (USE_X86_EXT_INTRIN >= 9)
944 #define MM_I32GATHER_I32(base, offset, scale) _mm_i32gather_epi32(base, offset, scale)
945 #elif (USE_X86_EXT_INTRIN >= 6)
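/* 128-bit counterpart of the gather emulation above, for SSE4.1-level builds. */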
947 static TIMIDITY_FORCEINLINE __m128i mm_i32gather_i32_impl(const int *base, __m128i offset, int scale)
949 ALIGN16 int32 buf[4];
950 __m128i byte_offset = _mm_mullo_epi32(offset, _mm_set1_epi32(scale));
952 __m128i vbase = _mm_set1_epi64x((int64)base);
953 __m128i vnegative = _mm_cmpgt_epi32(_mm_setzero_si128(), byte_offset);
954 __m128i vptr01 = _mm_add_epi64(vbase, _mm_unpacklo_epi32(byte_offset, vnegative));
955 __m128i vptr23 = _mm_add_epi64(vbase, _mm_unpackhi_epi32(byte_offset, vnegative));
956 ALIGN16 const int32 *ptr01[2];
957 ALIGN16 const int32 *ptr23[2];
958 _mm_store_si128((__m128i *)ptr01, vptr01);
959 _mm_store_si128((__m128i *)ptr23, vptr23);
966 __m128i pointers = _mm_add_epi32(_mm_set1_epi32((int32)base), byte_offset);
967 _mm_store_si128((__m128i *)buf, pointers);
969 for (int i = 0; i < 4; i++) {
970 buf[i] = *(const int *)buf[i];
974 return _mm_load_si128((const __m128i *)buf);
977 #define MM_I32GATHER_I32(base, offset, scale) mm_i32gather_i32_impl(base, offset, scale)
978 #endif // (USE_X86_EXT_INTRIN >= 6)
979 #endif // (USE_X86_EXT_INTRIN >= 1)
981 #if (USE_X86_EXT_INTRIN >= 6)
983 static TIMIDITY_FORCEINLINE void mm_i32scatter_i32_impl(void *base, __m128i offset, __m128i val, int scale)
985 ALIGN16 int32 buf[4];
986 _mm_store_si128((__m128i *)buf, val);
988 __m128i byte_offset = _mm_mullo_epi32(offset, _mm_set1_epi32(scale));
990 __m128i vbase = _mm_set1_epi64x((int64)base);
991 __m128i vnegative = _mm_cmpgt_epi32(_mm_setzero_si128(), byte_offset);
992 __m128i vptr01 = _mm_add_epi64(vbase, _mm_unpacklo_epi32(byte_offset, vnegative));
993 __m128i vptr23 = _mm_add_epi64(vbase, _mm_unpackhi_epi32(byte_offset, vnegative));
994 ALIGN16 int32 *ptr01[2];
995 ALIGN16 int32 *ptr23[2];
996 _mm_store_si128((__m128i *)ptr01, vptr01);
997 _mm_store_si128((__m128i *)ptr23, vptr23);
1004 __m128i vptr = _mm_add_epi32(_mm_set1_epi32((int32)base), byte_offset);
1005 ALIGN16 int32 *ptr[4];
1006 _mm_store_si128((__m128i *)ptr, vptr);
1008 for (int i = 0; i < 4; i++) {
1015 #define MM_I32SCATTER_I32(base, offset, val, scale) mm_i32scatter_i32_impl(base, offset, val, scale)
1017 #endif // (USE_X86_EXT_INTRIN >= 1)
1019 #define IS_ALIGN(ptr) (!((int32)ptr & (ALIGN_SIZE - 1)))
1020 extern int is_x86ext_available(void);
1028 #define ALIGNED_MALLOC(size) malloc(size)
1029 #define ALIGNED_FREE(ptr) free(ptr)
1031 #ifndef aligned_malloc
1032 #define aligned_malloc(size_byte, align_size) malloc(size_byte)
1034 #ifndef aligned_free
1035 #define aligned_free(ptr) free(ptr)
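/* Usage illustration (assumption, not from the original source):
       p = aligned_malloc(num_samples * sizeof(double), ALIGN_SIZE);
       ...
       aligned_free(p);
   keeps a buffer compatible with the aligned load/store macros above;
   p and num_samples are placeholder names for this example. */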
1041 /*****************************************************************************/
1044 #ifndef __bool_true_false_are_defined
1046 typedef enum { false = 0, true = 1 } bool;
1047 #endif /* C99 Hack */
1050 typedef vector signed int vint32;
1051 typedef vector signed char vint8;
1054 void v_memset(void *dest, int c, size_t len);
1055 void v_memzero(void *dest, size_t len);
1056 void v_set_dry_signal(void *dest, const int32 *buf, int32 n);
1058 /* inline functions */
1059 extern inline bool is_altivec_available(void)
1061 int sel[2] = { CTL_HW, HW_VECTORUNIT };
1062 int has_altivec = false;
1063 size_t len = sizeof(has_altivec);
1064 int error = sysctl(sel, 2, &has_altivec, &len, NULL, 0);
1066 return (bool)!!has_altivec;
1072 extern inline void libc_memset(void *destp, int c, size_t len)
1074 memset(destp, c, len);
1077 static inline void *switch_memset(void *destp, int c, size_t len)
1079 void *keepdestp = destp;
1080 if (!is_altivec_available()) {
1081 libc_memset(destp, c, len);
1083 v_memset(destp, c, len);
1085 v_memzero(destp, len);
1090 #define memset switch_memset
1091 #endif /* altivec */
1094 #pragma clang diagnostic pop
1097 #endif /* OPTCODE_H_INCLUDED */