if (i >= batch_size)
break;
- __m256i vcounts = _mm256_set_epi32(
- i + 7 < batch_size ? counts[i + 7] : 0,
- i + 6 < batch_size ? counts[i + 6] : 0,
- i + 5 < batch_size ? counts[i + 5] : 0,
- i + 4 < batch_size ? counts[i + 4] : 0,
- i + 3 < batch_size ? counts[i + 3] : 0,
- i + 2 < batch_size ? counts[i + 2] : 0,
- i + 1 < batch_size ? counts[i + 1] : 0,
- counts[i]
- );
+ __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
__m256d vdb0123_0 = _mm256_loadu_pd(&dbs[i][0]);
__m256d vdb0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dbs[i + 1][0]) : _mm256_setzero_pd();
uint8 imask = _kor_mask8(
_kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
_kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
- ) & ((1 << (batch_size - i)) - 1);
+ );
+
+ if (batch_size - i < 8)
+ imask &= (1 << (batch_size - i)) - 1;
if (imask) {
__m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
/*
 * sample_filter_LPF12_2_batch:
 * Run the two-state LPF12 filter over a batch of voices with AVX-512,
 * 8 voices per group.  For each group the per-voice state pairs (dbs) and
 * coefficient pairs (dcs) are transposed from AoS to SoA so one __m512d
 * holds the same element of all 8 voices; samples are then filtered 8 at a
 * time per voice with an 8x8 double transpose before and after the kernel.
 *
 * batch_size - number of active voices (may exceed 8; processed in groups of 8)
 * dcs        - per-voice filter coefficients, dcs[v][0..1]
 * dbs        - per-voice filter state, dbs[v][0..1] (updated in place)
 * sps        - per-voice sample buffers (filtered in place)
 * counts     - per-voice sample counts
 *
 * NOTE(review): this block previously contained unresolved diff markers;
 * it has been resolved to the new (looped) version of the patch.
 */
static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
{
	for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
		if (i >= batch_size)
			break;

		/* Sample counts of voices i..i+7; lanes past batch_size read as 0. */
		__m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);

		/* Load db[0..1] of up to 8 voices and transpose to SoA:
		   vdb0 = db[0] of voices i..i+7, vdb1 = db[1] of voices i..i+7.
		   Missing voices contribute zero state (their lanes stay masked off). */
		__m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
		__m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
		__m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
		__m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
		__m128d vdb01_4 = i + 4 < batch_size ? _mm_loadu_pd(dbs[i + 4]) : _mm_setzero_pd();
		__m128d vdb01_5 = i + 5 < batch_size ? _mm_loadu_pd(dbs[i + 5]) : _mm_setzero_pd();
		__m128d vdb01_6 = i + 6 < batch_size ? _mm_loadu_pd(dbs[i + 6]) : _mm_setzero_pd();
		__m128d vdb01_7 = i + 7 < batch_size ? _mm_loadu_pd(dbs[i + 7]) : _mm_setzero_pd();

		__m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
		__m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
		__m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
		__m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);

		__m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
		__m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);

		__m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
		__m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);

		/* Same AoS-to-SoA transpose for the coefficients dc[0..1]. */
		__m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
		__m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
		__m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
		__m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
		__m128d vdc01_4 = i + 4 < batch_size ? _mm_loadu_pd(dcs[i + 4]) : _mm_setzero_pd();
		__m128d vdc01_5 = i + 5 < batch_size ? _mm_loadu_pd(dcs[i + 5]) : _mm_setzero_pd();
		__m128d vdc01_6 = i + 6 < batch_size ? _mm_loadu_pd(dcs[i + 6]) : _mm_setzero_pd();
		__m128d vdc01_7 = i + 7 < batch_size ? _mm_loadu_pd(dcs[i + 7]) : _mm_setzero_pd();

		__m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
		__m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
		__m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
		__m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);

		__m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
		__m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);

		__m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
		__m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);

		/* Horizontal max of the 8 counts: the sample loop must run far
		   enough for the longest voice in this group. */
		__m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
		vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
		int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));

		for (int32 j = 0; j < count_max; j += 8) {
			/* Load 8 samples from each voice; masked loads zero both the
			   per-voice tail (past counts[]) and absent voices. */
			__m512d vin[8];
			vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);

			for (int k = 1; k < 8; k++)
				vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, &sps[i + k][j]);

			/* 8x8 double transpose: vsps[k] = sample j+k of voices i..i+7. */
			__m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
			__m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
			__m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
			__m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
			__m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
			__m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
			__m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
			__m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);

			__m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			__m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			__m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			__m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);

			__m512d vsps[8];
			vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);

			/* Filter kernel.  Lanes whose voice is already exhausted
			   (j + k >= count) keep their previous state via kmask. */
			for (int k = 0; k < 8; k++) {
				__mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);

				vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
				vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
				vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
				vsps[k] = vdb0;
			}

			/* Transpose back to per-voice sample order. */
			__m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
			__m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
			__m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
			__m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
			__m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
			__m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
			__m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
			__m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);

			__m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			__m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			__m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			__m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			__m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);

			__m512d vout[8];
			vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
			vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
			vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);

			/* BUGFIX: the resolved patch kept "k < batch_size" from the
			   unbatched version; with batch_size > 8 that reads vout[]
			   (8 entries) out of bounds.  Clamp to this group of 8. */
			for (int k = 0; k < 8 && i + k < batch_size; k++)
				_mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
		}

		/* Transpose the filter state back to AoS and persist per voice. */
		vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
		vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);

		_mm_storeu_pd(dbs[i], _mm512_castpd512_pd128(vdb01_0246));

		if (i + 1 < batch_size)
			_mm_storeu_pd(dbs[i + 1], _mm512_castpd512_pd128(vdb01_1357));

		if (i + 2 < batch_size)
			_mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));

		if (i + 3 < batch_size)
			_mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));

		if (i + 4 < batch_size)
			_mm_storeu_pd(dbs[i + 4], _mm512_extractf64x2_pd(vdb01_0246, 2));

		if (i + 5 < batch_size)
			_mm_storeu_pd(dbs[i + 5], _mm512_extractf64x2_pd(vdb01_1357, 2));

		if (i + 6 < batch_size)
			_mm_storeu_pd(dbs[i + 6], _mm512_extractf64x2_pd(vdb01_0246, 3));

		if (i + 7 < batch_size)
			_mm_storeu_pd(dbs[i + 7], _mm512_extractf64x2_pd(vdb01_1357, 3));
	}
}
#elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients **fcs)
{
- __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[0]->range);
- __m256d vfcrange0123_1 = 1 < batch_size ? _mm256_loadu_pd(fcs[1]->range) : _mm256_setzero_pd();
- __m256d vfcrange0123_2 = 2 < batch_size ? _mm256_loadu_pd(fcs[2]->range) : _mm256_setzero_pd();
- __m256d vfcrange0123_3 = 3 < batch_size ? _mm256_loadu_pd(fcs[3]->range) : _mm256_setzero_pd();
- __m256d vfcrange0123_4 = 4 < batch_size ? _mm256_loadu_pd(fcs[4]->range) : _mm256_setzero_pd();
- __m256d vfcrange0123_5 = 5 < batch_size ? _mm256_loadu_pd(fcs[5]->range) : _mm256_setzero_pd();
- __m256d vfcrange0123_6 = 6 < batch_size ? _mm256_loadu_pd(fcs[6]->range) : _mm256_setzero_pd();
- __m256d vfcrange0123_7 = 7 < batch_size ? _mm256_loadu_pd(fcs[7]->range) : _mm256_setzero_pd();
-
- __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_0), vfcrange0123_2, 1);
- __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_1), vfcrange0123_3, 1);
- __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_4), vfcrange0123_6, 1);
- __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_5), vfcrange0123_7, 1);
-
- __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
- __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
- __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
- __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
- __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
-
- __m512d vfcfreq = _mm512_set_pd(
- 7 < batch_size ? fcs[7]->freq : 0.0,
- 6 < batch_size ? fcs[6]->freq : 0.0,
- 5 < batch_size ? fcs[5]->freq : 0.0,
- 4 < batch_size ? fcs[4]->freq : 0.0,
- 3 < batch_size ? fcs[3]->freq : 0.0,
- 2 < batch_size ? fcs[2]->freq : 0.0,
- 1 < batch_size ? fcs[1]->freq : 0.0,
- fcs[0]->freq
- );
-
- __m512d vfcreso_DB = _mm512_set_pd(
- 7 < batch_size ? fcs[7]->reso_dB : 0.0,
- 6 < batch_size ? fcs[6]->reso_dB : 0.0,
- 5 < batch_size ? fcs[5]->reso_dB : 0.0,
- 4 < batch_size ? fcs[4]->reso_dB : 0.0,
- 3 < batch_size ? fcs[3]->reso_dB : 0.0,
- 2 < batch_size ? fcs[2]->reso_dB : 0.0,
- 1 < batch_size ? fcs[1]->reso_dB : 0.0,
- fcs[0]->reso_dB
- );
-
- uint8 imask = _kor_mask8(
- _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
- _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
- ) & ((1 << batch_size) - 1);
-
- if (imask) {
- __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
- __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
-
- vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
- vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
- vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
- vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
-
- vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
- vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
- vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
- vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
+ for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
+ if (i >= batch_size)
+ break;
+
+ __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
+ __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
+ __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
+ __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
+ __m256d vfcrange0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(fcs[i + 4]->range) : _mm256_setzero_pd();
+ __m256d vfcrange0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(fcs[i + 5]->range) : _mm256_setzero_pd();
+ __m256d vfcrange0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(fcs[i + 6]->range) : _mm256_setzero_pd();
+ __m256d vfcrange0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(fcs[i + 7]->range) : _mm256_setzero_pd();
+
+ __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_0), vfcrange0123_2, 1);
+ __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_1), vfcrange0123_3, 1);
+ __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_4), vfcrange0123_6, 1);
+ __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_5), vfcrange0123_7, 1);
+
+ __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+ __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
+ __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
+ __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
+ __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
+
+ __m512d vfcfreq = _mm512_set_pd(
+ i + 7 < batch_size ? fcs[i + 7]->freq : 0.0,
+ i + 6 < batch_size ? fcs[i + 6]->freq : 0.0,
+ i + 5 < batch_size ? fcs[i + 5]->freq : 0.0,
+ i + 4 < batch_size ? fcs[i + 4]->freq : 0.0,
+ i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
+ i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
+ i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
+ fcs[i]->freq
+ );
+
+ __m512d vfcreso_DB = _mm512_set_pd(
+ i + 7 < batch_size ? fcs[i + 7]->reso_dB : 0.0,
+ i + 6 < batch_size ? fcs[i + 6]->reso_dB : 0.0,
+ i + 5 < batch_size ? fcs[i + 5]->reso_dB : 0.0,
+ i + 4 < batch_size ? fcs[i + 4]->reso_dB : 0.0,
+ i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
+ i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
+ i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
+ fcs[i]->reso_dB
+ );
+
+ uint8 imask = _kor_mask8(
+ _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
+ _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
+ );
+
+ if (batch_size - i < 8)
+ imask &= (1 << (batch_size - i)) - 1;
+
+ if (imask) {
+ __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
+ __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
+
+ vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
+ vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
+ vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
+ vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
+
+ vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
+ vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
+ vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
+ vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
#if 1
- __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
- __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
- __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
- __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
+ __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
+ __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
+ __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
+ __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
#else
- __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
- __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
- __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
- __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
+ __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
+ __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
+ __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
+ __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
#endif
- if (imask & 1)
- _mm256_storeu_pd(fcs[0]->range, _mm512_castpd512_pd256(vfcrange0123_04));
+ if (imask & 1)
+ _mm256_storeu_pd(fcs[i]->range, _mm512_castpd512_pd256(vfcrange0123_04));
- if (imask & (1 << 1))
- _mm256_storeu_pd(fcs[1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
+ if (imask & (1 << 1))
+ _mm256_storeu_pd(fcs[i + 1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
- if (imask & (1 << 2))
- _mm256_storeu_pd(fcs[2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
+ if (imask & (1 << 2))
+ _mm256_storeu_pd(fcs[i + 2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
- if (imask & (1 << 3))
- _mm256_storeu_pd(fcs[3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
+ if (imask & (1 << 3))
+ _mm256_storeu_pd(fcs[i + 3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
- if (imask & (1 << 4))
- _mm256_storeu_pd(fcs[4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
+ if (imask & (1 << 4))
+ _mm256_storeu_pd(fcs[i + 4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
- if (imask & (1 << 5))
- _mm256_storeu_pd(fcs[5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
+ if (imask & (1 << 5))
+ _mm256_storeu_pd(fcs[i + 5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
- if (imask & (1 << 6))
- _mm256_storeu_pd(fcs[6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
+ if (imask & (1 << 6))
+ _mm256_storeu_pd(fcs[i + 6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
- if (imask & (1 << 7))
- _mm256_storeu_pd(fcs[7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
+ if (imask & (1 << 7))
+ _mm256_storeu_pd(fcs[i + 7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
- __m512d vfcdiv_flt_rate = _mm512_set_pd(
- 7 < batch_size ? fcs[7]->div_flt_rate : fcs[0]->div_flt_rate,
- 6 < batch_size ? fcs[6]->div_flt_rate : fcs[0]->div_flt_rate,
- 5 < batch_size ? fcs[5]->div_flt_rate : fcs[0]->div_flt_rate,
- 4 < batch_size ? fcs[4]->div_flt_rate : fcs[0]->div_flt_rate,
- 3 < batch_size ? fcs[3]->div_flt_rate : fcs[0]->div_flt_rate,
- 2 < batch_size ? fcs[2]->div_flt_rate : fcs[0]->div_flt_rate,
- 1 < batch_size ? fcs[1]->div_flt_rate : fcs[0]->div_flt_rate,
- fcs[0]->div_flt_rate
- );
+ __m512d vfcdiv_flt_rate = _mm512_set_pd(
+ i + 7 < batch_size ? fcs[i + 7]->div_flt_rate : fcs[i]->div_flt_rate,
+ i + 6 < batch_size ? fcs[i + 6]->div_flt_rate : fcs[i]->div_flt_rate,
+ i + 5 < batch_size ? fcs[i + 5]->div_flt_rate : fcs[i]->div_flt_rate,
+ i + 4 < batch_size ? fcs[i + 4]->div_flt_rate : fcs[i]->div_flt_rate,
+ i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
+ i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
+ i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
+ fcs[i]->div_flt_rate
+ );
- __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
+ __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
- FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[0]->reso_dB);
+ FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
- __m512d vreso_db_cf_p = _mm512_set_pd(
- 7 < batch_size ? RESO_DB_CF_P(fcs[7]->reso_dB) : reso_db_cf_p,
- 6 < batch_size ? RESO_DB_CF_P(fcs[6]->reso_dB) : reso_db_cf_p,
- 5 < batch_size ? RESO_DB_CF_P(fcs[5]->reso_dB) : reso_db_cf_p,
- 4 < batch_size ? RESO_DB_CF_P(fcs[4]->reso_dB) : reso_db_cf_p,
- 3 < batch_size ? RESO_DB_CF_P(fcs[3]->reso_dB) : reso_db_cf_p,
- 2 < batch_size ? RESO_DB_CF_P(fcs[2]->reso_dB) : reso_db_cf_p,
- 1 < batch_size ? RESO_DB_CF_P(fcs[1]->reso_dB) : reso_db_cf_p,
- reso_db_cf_p
- );
+ __m512d vreso_db_cf_p = _mm512_set_pd(
+ i + 7 < batch_size ? RESO_DB_CF_P(fcs[i + 7]->reso_dB) : reso_db_cf_p,
+ i + 6 < batch_size ? RESO_DB_CF_P(fcs[i + 6]->reso_dB) : reso_db_cf_p,
+ i + 5 < batch_size ? RESO_DB_CF_P(fcs[i + 5]->reso_dB) : reso_db_cf_p,
+ i + 4 < batch_size ? RESO_DB_CF_P(fcs[i + 4]->reso_dB) : reso_db_cf_p,
+ i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
+ i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
+ i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
+ reso_db_cf_p
+ );
- __m512d v1 = _mm512_set1_pd(1.0);
- __m512d v2 = _mm512_set1_pd(2.0);
- __m512d v0_5 = _mm512_set1_pd(0.5);
+ __m512d v1 = _mm512_set1_pd(1.0);
+ __m512d v2 = _mm512_set1_pd(2.0);
+ __m512d v0_5 = _mm512_set1_pd(0.5);
- __m512d vq = _mm512_sub_pd(v1, _mm512_div_pd(vf, _mm512_fmadd_pd(v2, _mm512_add_pd(vreso_db_cf_p, _mm512_div_pd(v0_5, _mm512_add_pd(v1, vf))), _mm512_sub_pd(vf, v2))));
- __m512d vc0 = _mm512_mul_pd(vq, vq);
+ __m512d vq = _mm512_sub_pd(v1, _mm512_div_pd(vf, _mm512_fmadd_pd(v2, _mm512_add_pd(vreso_db_cf_p, _mm512_div_pd(v0_5, _mm512_add_pd(v1, vf))), _mm512_sub_pd(vf, v2))));
+ __m512d vc0 = _mm512_mul_pd(vq, vq);
#ifdef USE_SVML
- __m512d vcosf = _mm512_cos_pd(vf);
+ __m512d vcosf = _mm512_cos_pd(vf);
#else
- ALIGN FLOAT_T af[8];
- _mm512_storeu_pd(af, vf);
- __m512d vcosf = _mm512_set_pd(cos(af[7]), cos(af[6]), cos(af[5]), cos(af[4]), cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
+ ALIGN FLOAT_T af[8];
+ _mm512_storeu_pd(af, vf);
+ __m512d vcosf = _mm512_set_pd(cos(af[7]), cos(af[6]), cos(af[5]), cos(af[4]), cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
#endif
- __m512d vc1 = _mm512_sub_pd(_mm512_add_pd(vc0, v1), _mm512_mul_pd(_mm512_mul_pd(v2, vcosf), vq));
-
- __m512d vdc0246 = _mm512_unpacklo_pd(vc0, vc1);
- __m512d vdc1357 = _mm512_unpackhi_pd(vc0, vc1);
-
- if (imask & 1)
- _mm_storeu_pd(fcs[0]->dc, _mm512_castpd512_pd128(vdc0246));
- if (imask & (1 << 1))
- _mm_storeu_pd(fcs[1]->dc, _mm512_castpd512_pd128(vdc1357));
- if (imask & (1 << 2))
- _mm_storeu_pd(fcs[2]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc0246), 1));
- if (imask & (1 << 3))
- _mm_storeu_pd(fcs[3]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc1357), 1));
- if (imask & (1 << 4))
- _mm_storeu_pd(fcs[4]->dc, _mm512_extractf64x2_pd(vdc0246, 2));
- if (imask & (1 << 5))
- _mm_storeu_pd(fcs[5]->dc, _mm512_extractf64x2_pd(vdc1357, 2));
- if (imask & (1 << 6))
- _mm_storeu_pd(fcs[6]->dc, _mm512_extractf64x2_pd(vdc0246, 3));
- if (imask & (1 << 7))
- _mm_storeu_pd(fcs[7]->dc, _mm512_extractf64x2_pd(vdc1357, 3));
+ __m512d vc1 = _mm512_sub_pd(_mm512_add_pd(vc0, v1), _mm512_mul_pd(_mm512_mul_pd(v2, vcosf), vq));
+
+ __m512d vdc0246 = _mm512_unpacklo_pd(vc0, vc1);
+ __m512d vdc1357 = _mm512_unpackhi_pd(vc0, vc1);
+
+ if (imask & 1)
+ _mm_storeu_pd(fcs[i]->dc, _mm512_castpd512_pd128(vdc0246));
+ if (imask & (1 << 1))
+ _mm_storeu_pd(fcs[i + 1]->dc, _mm512_castpd512_pd128(vdc1357));
+ if (imask & (1 << 2))
+ _mm_storeu_pd(fcs[i + 2]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc0246), 1));
+ if (imask & (1 << 3))
+ _mm_storeu_pd(fcs[i + 3]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc1357), 1));
+ if (imask & (1 << 4))
+ _mm_storeu_pd(fcs[i + 4]->dc, _mm512_extractf64x2_pd(vdc0246, 2));
+ if (imask & (1 << 5))
+ _mm_storeu_pd(fcs[i + 5]->dc, _mm512_extractf64x2_pd(vdc1357, 2));
+ if (imask & (1 << 6))
+ _mm_storeu_pd(fcs[i + 6]->dc, _mm512_extractf64x2_pd(vdc0246, 3));
+ if (imask & (1 << 7))
+ _mm_storeu_pd(fcs[i + 7]->dc, _mm512_extractf64x2_pd(vdc1357, 3));
+ }
}
}
static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
{
- __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(0, batch_size), counts);
-
- __m128d vdb01_0 = _mm_loadu_pd(dbs[0]);
- __m128d vdb01_1 = 1 < batch_size ? _mm_loadu_pd(dbs[1]) : _mm_setzero_pd();
- __m128d vdb01_2 = 2 < batch_size ? _mm_loadu_pd(dbs[2]) : _mm_setzero_pd();
- __m128d vdb01_3 = 3 < batch_size ? _mm_loadu_pd(dbs[3]) : _mm_setzero_pd();
- __m128d vdb01_4 = 4 < batch_size ? _mm_loadu_pd(dbs[4]) : _mm_setzero_pd();
- __m128d vdb01_5 = 5 < batch_size ? _mm_loadu_pd(dbs[5]) : _mm_setzero_pd();
- __m128d vdb01_6 = 6 < batch_size ? _mm_loadu_pd(dbs[6]) : _mm_setzero_pd();
- __m128d vdb01_7 = 7 < batch_size ? _mm_loadu_pd(dbs[7]) : _mm_setzero_pd();
-
- __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
- __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
- __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
- __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
-
- __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
- __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
-
- __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
- __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
-
- __m128d vdc01_0 = _mm_loadu_pd(dcs[0]);
- __m128d vdc01_1 = 1 < batch_size ? _mm_loadu_pd(dcs[1]) : _mm_setzero_pd();
- __m128d vdc01_2 = 2 < batch_size ? _mm_loadu_pd(dcs[2]) : _mm_setzero_pd();
- __m128d vdc01_3 = 3 < batch_size ? _mm_loadu_pd(dcs[3]) : _mm_setzero_pd();
- __m128d vdc01_4 = 4 < batch_size ? _mm_loadu_pd(dcs[4]) : _mm_setzero_pd();
- __m128d vdc01_5 = 5 < batch_size ? _mm_loadu_pd(dcs[5]) : _mm_setzero_pd();
- __m128d vdc01_6 = 6 < batch_size ? _mm_loadu_pd(dcs[6]) : _mm_setzero_pd();
- __m128d vdc01_7 = 7 < batch_size ? _mm_loadu_pd(dcs[7]) : _mm_setzero_pd();
-
- __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
- __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
- __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
- __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
-
- __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
- __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
-
- __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
- __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
-
- __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
- vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
- int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
-
- for (int32 j = 0; j < count_max; j += 8) {
- __m512d vin[8];
- vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[0]), &sps[0][j]);
-
- for (int k = 1; k < 8; k++)
- vin[k] = _mm512_maskz_loadu_pd(k < batch_size ? generate_mask8_for_count(j, counts[k]) : 0, &sps[k][j]);
-
- __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
- __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
- __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
- __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
- __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
- __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
- __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
- __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
-
- __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
- __m512d vsps[8];
- vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
- for (int k = 0; k < 8; k++) {
- __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
-
- vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
- vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
- vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
- vsps[k] = _mm512_sub_pd(vsps[k], vdb0);
- }
+ for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
+ if (i >= batch_size)
+ break;
- __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
- __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
- __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
- __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
- __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
- __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
- __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
- __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
-
- __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
- __m512d vout[8];
- vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
- vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
- vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
- for (int k = 0; k < batch_size; k++)
- _mm512_mask_storeu_pd(&sps[k][j], generate_mask8_for_count(j, counts[k]), vout[k]);
- }
+ __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
- vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
- vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
+ __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
+ __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
+ __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
+ __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
+ __m128d vdb01_4 = i + 4 < batch_size ? _mm_loadu_pd(dbs[i + 4]) : _mm_setzero_pd();
+ __m128d vdb01_5 = i + 5 < batch_size ? _mm_loadu_pd(dbs[i + 5]) : _mm_setzero_pd();
+ __m128d vdb01_6 = i + 6 < batch_size ? _mm_loadu_pd(dbs[i + 6]) : _mm_setzero_pd();
+ __m128d vdb01_7 = i + 7 < batch_size ? _mm_loadu_pd(dbs[i + 7]) : _mm_setzero_pd();
+
+ __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
+ __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
+ __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
+ __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
- __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
- __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
+ __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
+ __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
+ __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
+ __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
+
+ __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
+ __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
+ __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
+ __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
+ __m128d vdc01_4 = i + 4 < batch_size ? _mm_loadu_pd(dcs[i + 4]) : _mm_setzero_pd();
+ __m128d vdc01_5 = i + 5 < batch_size ? _mm_loadu_pd(dcs[i + 5]) : _mm_setzero_pd();
+ __m128d vdc01_6 = i + 6 < batch_size ? _mm_loadu_pd(dcs[i + 6]) : _mm_setzero_pd();
+ __m128d vdc01_7 = i + 7 < batch_size ? _mm_loadu_pd(dcs[i + 7]) : _mm_setzero_pd();
+ __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
+ __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
+ __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
+ __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
+ __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
+ __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
+ __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
+ __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
+ __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
+ vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
+ int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
+ for (int32 j = 0; j < count_max; j += 8) {
+ __m512d vin[8];
+ vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
+ for (int k = 1; k < 8; k++)
+ vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, &sps[i + k][j]);
+
+ __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
+ __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
+ __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
+ __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
+ __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
+ __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
+ __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
+ __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
+
+ __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+ __m512d vsps[8];
+ vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+ for (int k = 0; k < 8; k++) {
+ __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
+
+ vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
+ vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
+ vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
+ vsps[k] = _mm512_sub_pd(vsps[k], vdb0);
+ }
+
+ __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
+ __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
+ __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
+ __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
+ __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
+ __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
+ __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
+ __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
+
+ __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+ __m512d vout[8];
+ vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+ vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+ vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+ /* vout[] holds only the 8 voices of this batch slice, so cap k at 8
+ and at the remaining voices; k alone reaching batch_size would read
+ past vout[] and index sps[]/counts[] out of range when batch_size > 8 */
+ for (int k = 0; k < 8 && i + k < batch_size; k++)
+ _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
+ }
+
+ vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
+ vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
+
+ _mm_storeu_pd(dbs[i], _mm512_castpd512_pd128(vdb01_0246));
+
+ if (i + 1 < batch_size)
+ _mm_storeu_pd(dbs[i + 1], _mm512_castpd512_pd128(vdb01_1357));
+
+ if (i + 2 < batch_size)
+ _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
+
+ if (i + 3 < batch_size)
+ _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
+
+ if (i + 4 < batch_size)
+ _mm_storeu_pd(dbs[i + 4], _mm512_extractf64x2_pd(vdb01_0246, 2));
+
+ if (i + 5 < batch_size)
+ _mm_storeu_pd(dbs[i + 5], _mm512_extractf64x2_pd(vdb01_1357, 2));
+
+ if (i + 6 < batch_size)
+ _mm_storeu_pd(dbs[i + 6], _mm512_extractf64x2_pd(vdb01_0246, 3));
+
+ if (i + 7 < batch_size)
+ _mm_storeu_pd(dbs[i + 7], _mm512_extractf64x2_pd(vdb01_1357, 3));
+ }
}
#elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)