#define fract_t int64
#endif
+#define USE_PERMUTEX2
static inline int32 imuldiv_fraction(int32 a, int32 b) {
#if (OPT_MODE == 1) && defined(SUPPORT_ASM_INTEL) /* fixed-point implementation */
__m512i vofsib = _mm512_broadcastd_epi32(_mm512_castsi512_si128(vofsi1));
__m512i vofsub1 = _mm512_sub_epi32(vofsi1, vofsib);
__m512i vofsub2 = _mm512_sub_epi32(vofsi2, vofsib);
-#ifdef USE_PERMUTEX2
__m512 vvf1 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vin1));
+#ifdef USE_PERMUTEX2
__m512 vvf2 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vin2));
__m512 vv1 = _mm512_permutex2var_ps(vvf1, vofsub1, vvf2); // v1 ofsi
__m512 vv2 = _mm512_permutex2var_ps(vvf1, vofsub2, vvf2); // v2 ofsi+1
#else
- __m512 vvf1 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vin1));
__m512 vv1 = _mm512_permutexvar_ps(vofsub1, vvf1); // v1 ofsi
__m512 vv2 = _mm512_permutexvar_ps(vofsub2, vvf1); // v2 ofsi+1
#endif
const __m512 vec_divo = _mm512_set1_ps(DIV_15BIT);
#ifdef LAO_OPTIMIZE_INCREMENT
// Optimized rate = (loaded data count - max fractional part of the initial offset (less than 1) - interpolation point count (3 for lagrange)) / offset data count
+#ifdef USE_PERMUTEX2
+ const int32 opt_inc1 = (1 << FRACTION_BITS) * (32 - 1 - 3) / 16; // (float*16) * 1 set
+#else
const int32 opt_inc1 = (1 << FRACTION_BITS) * (16 - 1 - 3) / 16; // (float*16) * 1 set
+#endif
if(inc < opt_inc1){ // 1 set
const __m512i vvar1n = _mm512_set1_epi32(-1);
const __m512i vvar1 = _mm512_set1_epi32(1);
__m512i vofsi4 = _mm512_add_epi32(vofsi2, vvar2); // ofsi+2
int32 ofs0 = _mm_cvtsi128_si32(_mm512_castsi512_si128(vofsi1));
__m256i vin1 = _mm256_loadu_si256((__m256i *)&src[ofs0]); // int16*16
+#ifdef USE_PERMUTEX2
+ __m256i vin2 = _mm256_loadu_si256((__m256i *)&src[ofs0 + 16]); // int16*16
+#endif
__m512i vofsib = _mm512_broadcastd_epi32(_mm512_castsi512_si128(vofsi1));
__m512i vofsub1 = _mm512_sub_epi32(vofsi1, vofsib);
__m512i vofsub2 = _mm512_sub_epi32(vofsi2, vofsib);
__m512i vofsub3 = _mm512_sub_epi32(vofsi3, vofsib);
__m512i vofsub4 = _mm512_sub_epi32(vofsi4, vofsib);
__m512 vvf1 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vin1)); // int16 to float (i16*16->i32*16->f32*16
+#ifdef USE_PERMUTEX2
+ __m512 vvf2 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(vin2));
+ __m512 vv0 = _mm512_permutex2var_ps(vvf1, vofsub1, vvf2);
+ __m512 vv1 = _mm512_permutex2var_ps(vvf1, vofsub2, vvf2);
+ __m512 vv2 = _mm512_permutex2var_ps(vvf1, vofsub3, vvf2);
+ __m512 vv3 = _mm512_permutex2var_ps(vvf1, vofsub4, vvf2);
+#else
__m512 vv0 = _mm512_permutexvar_ps(vofsub1, vvf1); // v1 ofsi-1
__m512 vv1 = _mm512_permutexvar_ps(vofsub2, vvf1); // v2 ofsi
__m512 vv2 = _mm512_permutexvar_ps(vofsub3, vvf1); // v2 ofsi+1
__m512 vv3 = _mm512_permutexvar_ps(vofsub4, vvf1); // v2 ofsi+2
+#endif
// the rest is the same as the normal path
__m512i vofsf = _mm512_add_epi32(_mm512_and_epi32(vofs, vfmask), vfrac); // ofsf = (ofs & FRACTION_MASK) + mlt_fraction;
__m512 vtmp = _mm512_sub_ps(vv1, vv0); // tmp = v[1] - v[0];