OSDN Git Service

[filter] Improve LPF vectorization
author Starg <starg@users.osdn.me>
Tue, 23 Mar 2021 10:12:12 +0000 (19:12 +0900)
committer Starg <starg@users.osdn.me>
Tue, 23 Mar 2021 10:12:12 +0000 (19:12 +0900)
timidity/filter.c

index 77d154b..6dd7016 100644
@@ -3951,16 +3951,7 @@ static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T
                if (i >= batch_size)
                        break;
 
-               __m256i vcounts = _mm256_set_epi32(
-                       i + 7 < batch_size ? counts[i + 7] : 0,
-                       i + 6 < batch_size ? counts[i + 6] : 0,
-                       i + 5 < batch_size ? counts[i + 5] : 0,
-                       i + 4 < batch_size ? counts[i + 4] : 0,
-                       i + 3 < batch_size ? counts[i + 3] : 0,
-                       i + 2 < batch_size ? counts[i + 2] : 0,
-                       i + 1 < batch_size ? counts[i + 1] : 0,
-                       counts[i]
-               );
+               __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
 
                __m256d vdb0123_0 = _mm256_loadu_pd(&dbs[i][0]);
                __m256d vdb0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dbs[i + 1][0]) : _mm256_setzero_pd();
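
The new masked load depends on generate_mask8_for_count(), a helper defined elsewhere in filter.c and not shown in this diff. Judging from the call sites, it presumably returns an 8-lane mask whose bit k is set exactly when pos + k < count; a minimal sketch under that assumption:

    #include <immintrin.h>
    #include <stdint.h>

    /* Assumed behavior, not the committed implementation: bit k is set
     * when pos + k < count, so masked ops touch only live lanes. */
    static inline __mmask8 generate_mask8_for_count(int32_t pos, int32_t count)
    {
        int32_t remaining = count - pos;

        if (remaining >= 8)
            return (__mmask8)0xFF;                 /* full group of 8 lanes */
        if (remaining <= 0)
            return (__mmask8)0;                    /* nothing left */
        return (__mmask8)((1U << remaining) - 1);  /* partial tail */
    }

With that contract, _mm256_maskz_loadu_epi32 zero-fills the lanes at and beyond batch_size, which is exactly what the removed _mm256_set_epi32 ladder of ternaries computed, in one instruction instead of eight compares.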
@@ -4465,7 +4456,10 @@ static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
                uint8 imask = _kor_mask8(
                        _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
                        _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
-               ) & ((1 << (batch_size - i)) - 1);
+               );
+
+               if (batch_size - i < 8)
+                       imask &= (1 << (batch_size - i)) - 1;
 
                if (imask) {
                        __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
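
The rewritten guard trims imask only for the final, partial group of eight voices; full groups skip the AND entirely, and the shift count can no longer reach 8 or beyond. A scalar illustration (hypothetical helper name, illustrative values):

    #include <stdint.h>

    /* With batch_size = 11 and i = 8, only lanes 0..2 hold live voices,
     * so the comparison mask is clamped to 0b111 before gating stores. */
    static inline uint8_t trim_group_mask(uint8_t imask, int batch_size, int i)
    {
        if (batch_size - i < 8)                 /* partial tail group only */
            imask &= (uint8_t)((1U << (batch_size - i)) - 1);
        return imask;                           /* full groups pass through */
    }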
@@ -4884,153 +4878,158 @@ static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
 
 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
 {
-       __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(0, batch_size), counts);
-
-       __m128d vdb01_0 = _mm_loadu_pd(dbs[0]);
-       __m128d vdb01_1 = 1 < batch_size ? _mm_loadu_pd(dbs[1]) : _mm_setzero_pd();
-       __m128d vdb01_2 = 2 < batch_size ? _mm_loadu_pd(dbs[2]) : _mm_setzero_pd();
-       __m128d vdb01_3 = 3 < batch_size ? _mm_loadu_pd(dbs[3]) : _mm_setzero_pd();
-       __m128d vdb01_4 = 4 < batch_size ? _mm_loadu_pd(dbs[4]) : _mm_setzero_pd();
-       __m128d vdb01_5 = 5 < batch_size ? _mm_loadu_pd(dbs[5]) : _mm_setzero_pd();
-       __m128d vdb01_6 = 6 < batch_size ? _mm_loadu_pd(dbs[6]) : _mm_setzero_pd();
-       __m128d vdb01_7 = 7 < batch_size ? _mm_loadu_pd(dbs[7]) : _mm_setzero_pd();
-
-       __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
-       __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
-       __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
-       __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
-
-       __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
-       __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
-
-       __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
-       __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
-
-       __m128d vdc01_0 = _mm_loadu_pd(dcs[0]);
-       __m128d vdc01_1 = 1 < batch_size ? _mm_loadu_pd(dcs[1]) : _mm_setzero_pd();
-       __m128d vdc01_2 = 2 < batch_size ? _mm_loadu_pd(dcs[2]) : _mm_setzero_pd();
-       __m128d vdc01_3 = 3 < batch_size ? _mm_loadu_pd(dcs[3]) : _mm_setzero_pd();
-       __m128d vdc01_4 = 4 < batch_size ? _mm_loadu_pd(dcs[4]) : _mm_setzero_pd();
-       __m128d vdc01_5 = 5 < batch_size ? _mm_loadu_pd(dcs[5]) : _mm_setzero_pd();
-       __m128d vdc01_6 = 6 < batch_size ? _mm_loadu_pd(dcs[6]) : _mm_setzero_pd();
-       __m128d vdc01_7 = 7 < batch_size ? _mm_loadu_pd(dcs[7]) : _mm_setzero_pd();
-
-       __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
-       __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
-       __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
-       __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
-
-       __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
-       __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
-
-       __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
-       __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
-
-       __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
-       vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
-       int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
-
-       for (int32 j = 0; j < count_max; j += 8) {
-               __m512d vin[8];
-               vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[0]), &sps[0][j]);
-
-               for (int k = 1; k < 8; k++)
-                       vin[k] = _mm512_maskz_loadu_pd(k < batch_size ? generate_mask8_for_count(j, counts[k]) : 0, &sps[k][j]);
-
-               __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
-               __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
-               __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
-               __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
-               __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
-               __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
-               __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
-               __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
-
-               __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               __m512d vsps[8];
-               vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               for (int k = 0; k < 8; k++) {
-                       __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
-
-                       vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
-                       vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
-                       vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
-                       vsps[k] = vdb0;
-               }
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
+               if (i >= batch_size)
+                       break;
 
-               __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
-               __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
-               __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
-               __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
-               __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
-               __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
-               __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
-               __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
-
-               __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               __m512d vout[8];
-               vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               for (int k = 0; k < batch_size; k++)
-                       _mm512_mask_storeu_pd(&sps[k][j], generate_mask8_for_count(j, counts[k]), vout[k]);
-       }
+               __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
+
+               __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
+               __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
+               __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
+               __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
+               __m128d vdb01_4 = i + 4 < batch_size ? _mm_loadu_pd(dbs[i + 4]) : _mm_setzero_pd();
+               __m128d vdb01_5 = i + 5 < batch_size ? _mm_loadu_pd(dbs[i + 5]) : _mm_setzero_pd();
+               __m128d vdb01_6 = i + 6 < batch_size ? _mm_loadu_pd(dbs[i + 6]) : _mm_setzero_pd();
+               __m128d vdb01_7 = i + 7 < batch_size ? _mm_loadu_pd(dbs[i + 7]) : _mm_setzero_pd();
+
+               __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
+               __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
+               __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
+               __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
+
+               __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
+               __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
+
+               __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
+               __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
+
+               __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
+               __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
+               __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
+               __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
+               __m128d vdc01_4 = i + 4 < batch_size ? _mm_loadu_pd(dcs[i + 4]) : _mm_setzero_pd();
+               __m128d vdc01_5 = i + 5 < batch_size ? _mm_loadu_pd(dcs[i + 5]) : _mm_setzero_pd();
+               __m128d vdc01_6 = i + 6 < batch_size ? _mm_loadu_pd(dcs[i + 6]) : _mm_setzero_pd();
+               __m128d vdc01_7 = i + 7 < batch_size ? _mm_loadu_pd(dcs[i + 7]) : _mm_setzero_pd();
+
+               __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
+               __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
+               __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
+               __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
+
+               __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
+               __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
+
+               __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
+               __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
+
+               __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
+               vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
+               int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
+
+               for (int32 j = 0; j < count_max; j += 8) {
+                       __m512d vin[8];
+                       vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
+
+                       for (int k = 1; k < 8; k++)
+                               vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, &sps[i + k][j]);
+
+                       __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
+                       __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
+                       __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
+                       __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
+                       __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
+                       __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
+                       __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
+                       __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
+
+                       __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+                       __m512d vsps[8];
+                       vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+                       for (int k = 0; k < 8; k++) {
+                               __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
+
+                               vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
+                               vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
+                               vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
+                               vsps[k] = vdb0;
+                       }
+
+                       __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
+                       __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
+                       __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
+                       __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
+                       __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
+                       __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
+                       __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
+                       __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
 
-       vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
-       vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
+                       __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
 
-       _mm_storeu_pd(dbs[0], _mm512_castpd512_pd128(vdb01_0246));
+                       __m512d vout[8];
+                       vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+                       for (int k = 0; k < batch_size; k++)
+                               _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
+               }
 
-       if (1 < batch_size)
-               _mm_storeu_pd(dbs[1], _mm512_castpd512_pd128(vdb01_1357));
+               vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
+               vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
+
+               _mm_storeu_pd(dbs[i], _mm512_castpd512_pd128(vdb01_0246));
+
+               if (i + 1 < batch_size)
+                       _mm_storeu_pd(dbs[i + 1], _mm512_castpd512_pd128(vdb01_1357));
 
-       if (2 < batch_size)
-               _mm_storeu_pd(dbs[2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
+               if (i + 2 < batch_size)
+                       _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
 
-       if (3 < batch_size)
-               _mm_storeu_pd(dbs[3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
+               if (i + 3 < batch_size)
+                       _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
 
-       if (4 < batch_size)
-               _mm_storeu_pd(dbs[4], _mm512_extractf64x2_pd(vdb01_0246, 2));
+               if (i + 4 < batch_size)
+                       _mm_storeu_pd(dbs[i + 4], _mm512_extractf64x2_pd(vdb01_0246, 2));
 
-       if (5 < batch_size)
-               _mm_storeu_pd(dbs[5], _mm512_extractf64x2_pd(vdb01_1357, 2));
+               if (i + 5 < batch_size)
+                       _mm_storeu_pd(dbs[i + 5], _mm512_extractf64x2_pd(vdb01_1357, 2));
 
-       if (6 < batch_size)
-               _mm_storeu_pd(dbs[6], _mm512_extractf64x2_pd(vdb01_0246, 3));
+               if (i + 6 < batch_size)
+                       _mm_storeu_pd(dbs[i + 6], _mm512_extractf64x2_pd(vdb01_0246, 3));
 
-       if (7 < batch_size)
-               _mm_storeu_pd(dbs[7], _mm512_extractf64x2_pd(vdb01_1357, 3));
+               if (i + 7 < batch_size)
+                       _mm_storeu_pd(dbs[i + 7], _mm512_extractf64x2_pd(vdb01_1357, 3));
+       }
 }
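
Stripped of masking and transposition, the fmadd/add/mul sequence in the inner k loop is the vector form of this scalar two-pole step, reconstructed from the intrinsics (one voice, one sample):

    /* db[0..1] is per-voice filter state, dc[0..1] the coefficients
     * produced by recalc_filter_LPF12_2_batch(). */
    static double lpf12_2_step(double in, double db[2], const double dc[2])
    {
        db[1] += (in - db[0]) * dc[1];   /* _mm512_mask3_fmadd_pd */
        db[0] += db[1];                  /* _mm512_mask_add_pd    */
        db[1] *= dc[0];                  /* _mm512_mask_mul_pd    */
        return db[0];                    /* low-pass output       */
    }

The HPF12_2 variant further down runs the same recurrence but outputs in - db[0] instead of db[0].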
 
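The unpacklo/unpackhi pairs followed by two rounds of _mm512_shuffle_f64x2 are a standard 8x8 transpose of doubles: they turn eight per-voice sample vectors into eight per-sample voice vectors so all eight filters advance in lockstep, and the same shuffles convert back before the stores. Distilled into a standalone sketch with the same immediates (0x88 = (2 << 6) | (0 << 4) | (2 << 2) | 0, 0xDD = (3 << 6) | (1 << 4) | (3 << 2) | 1):

    #include <immintrin.h>

    /* In-place 8x8 transpose: afterwards r[n] holds element n of each
     * of the eight original rows. */
    static void transpose8x8_pd(__m512d r[8])
    {
        __m512d t[8], s[8];

        for (int k = 0; k < 8; k += 2) {        /* pair rows per 128-bit lane */
            t[k]     = _mm512_unpacklo_pd(r[k], r[k + 1]);
            t[k + 1] = _mm512_unpackhi_pd(r[k], r[k + 1]);
        }

        s[0] = _mm512_shuffle_f64x2(t[0], t[2], 0x88);  /* elems 0,4 of rows 0-3 */
        s[1] = _mm512_shuffle_f64x2(t[1], t[3], 0x88);  /* elems 1,5 */
        s[2] = _mm512_shuffle_f64x2(t[0], t[2], 0xDD);  /* elems 2,6 */
        s[3] = _mm512_shuffle_f64x2(t[1], t[3], 0xDD);  /* elems 3,7 */
        s[4] = _mm512_shuffle_f64x2(t[4], t[6], 0x88);  /* same for rows 4-7 */
        s[5] = _mm512_shuffle_f64x2(t[5], t[7], 0x88);
        s[6] = _mm512_shuffle_f64x2(t[4], t[6], 0xDD);
        s[7] = _mm512_shuffle_f64x2(t[5], t[7], 0xDD);

        for (int n = 0; n < 4; n++) {           /* join the two row groups */
            r[n]     = _mm512_shuffle_f64x2(s[n], s[n + 4], 0x88);
            r[n + 4] = _mm512_shuffle_f64x2(s[n], s[n + 4], 0xDD);
        }
    }

The three-instruction reduction that derives count_max is likewise a plain horizontal maximum over the eight per-voice counts:

    #include <immintrin.h>
    #include <stdint.h>

    /* Fold 256 -> 128 bits, then halve twice with lane shuffles. */
    static int32_t hmax8_epi32(__m256i v)
    {
        __m128i m = _mm_max_epi32(_mm256_castsi256_si128(v),
                                  _mm256_extracti128_si256(v, 1));
        m = _mm_max_epi32(m, _mm_shuffle_epi32(m, (3 << 2) | 2));
        return _mm_cvtsi128_si32(_mm_max_epi32(m, _mm_shuffle_epi32(m, 1)));
    }
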
 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
@@ -5210,167 +5209,175 @@ static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T
 
 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients **fcs)
 {
-       __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[0]->range);
-       __m256d vfcrange0123_1 = 1 < batch_size ? _mm256_loadu_pd(fcs[1]->range) : _mm256_setzero_pd();
-       __m256d vfcrange0123_2 = 2 < batch_size ? _mm256_loadu_pd(fcs[2]->range) : _mm256_setzero_pd();
-       __m256d vfcrange0123_3 = 3 < batch_size ? _mm256_loadu_pd(fcs[3]->range) : _mm256_setzero_pd();
-       __m256d vfcrange0123_4 = 4 < batch_size ? _mm256_loadu_pd(fcs[4]->range) : _mm256_setzero_pd();
-       __m256d vfcrange0123_5 = 5 < batch_size ? _mm256_loadu_pd(fcs[5]->range) : _mm256_setzero_pd();
-       __m256d vfcrange0123_6 = 6 < batch_size ? _mm256_loadu_pd(fcs[6]->range) : _mm256_setzero_pd();
-       __m256d vfcrange0123_7 = 7 < batch_size ? _mm256_loadu_pd(fcs[7]->range) : _mm256_setzero_pd();
-
-       __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_0), vfcrange0123_2, 1);
-       __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_1), vfcrange0123_3, 1);
-       __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_4), vfcrange0123_6, 1);
-       __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_5), vfcrange0123_7, 1);
-
-       __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-       __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-       __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-       __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-       __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
-       __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
-       __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
-       __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
-
-       __m512d vfcfreq = _mm512_set_pd(
-               7 < batch_size ? fcs[7]->freq : 0.0,
-               6 < batch_size ? fcs[6]->freq : 0.0,
-               5 < batch_size ? fcs[5]->freq : 0.0,
-               4 < batch_size ? fcs[4]->freq : 0.0,
-               3 < batch_size ? fcs[3]->freq : 0.0,
-               2 < batch_size ? fcs[2]->freq : 0.0,
-               1 < batch_size ? fcs[1]->freq : 0.0,
-               fcs[0]->freq
-       );
-
-       __m512d vfcreso_DB = _mm512_set_pd(
-               7 < batch_size ? fcs[7]->reso_dB : 0.0,
-               6 < batch_size ? fcs[6]->reso_dB : 0.0,
-               5 < batch_size ? fcs[5]->reso_dB : 0.0,
-               4 < batch_size ? fcs[4]->reso_dB : 0.0,
-               3 < batch_size ? fcs[3]->reso_dB : 0.0,
-               2 < batch_size ? fcs[2]->reso_dB : 0.0,
-               1 < batch_size ? fcs[1]->reso_dB : 0.0,
-               fcs[0]->reso_dB
-       );
-
-       uint8 imask = _kor_mask8(
-               _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
-               _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
-       ) & ((1 << batch_size) - 1);
-
-       if (imask) {
-               __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
-               __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
-
-               vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
-               vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
-               vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
-               vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
-
-               vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
-               vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
-               vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
-               vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
+               if (i >= batch_size)
+                       break;
+
+               __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
+               __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
+               __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
+               __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
+               __m256d vfcrange0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(fcs[i + 4]->range) : _mm256_setzero_pd();
+               __m256d vfcrange0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(fcs[i + 5]->range) : _mm256_setzero_pd();
+               __m256d vfcrange0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(fcs[i + 6]->range) : _mm256_setzero_pd();
+               __m256d vfcrange0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(fcs[i + 7]->range) : _mm256_setzero_pd();
+
+               __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_0), vfcrange0123_2, 1);
+               __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_1), vfcrange0123_3, 1);
+               __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_4), vfcrange0123_6, 1);
+               __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_5), vfcrange0123_7, 1);
+
+               __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+               __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+               __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+               __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+               __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
+               __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
+               __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
+               __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
+
+               __m512d vfcfreq = _mm512_set_pd(
+                       i + 7 < batch_size ? fcs[i + 7]->freq : 0.0,
+                       i + 6 < batch_size ? fcs[i + 6]->freq : 0.0,
+                       i + 5 < batch_size ? fcs[i + 5]->freq : 0.0,
+                       i + 4 < batch_size ? fcs[i + 4]->freq : 0.0,
+                       i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
+                       i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
+                       i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
+                       fcs[i]->freq
+               );
+
+               __m512d vfcreso_DB = _mm512_set_pd(
+                       i + 7 < batch_size ? fcs[i + 7]->reso_dB : 0.0,
+                       i + 6 < batch_size ? fcs[i + 6]->reso_dB : 0.0,
+                       i + 5 < batch_size ? fcs[i + 5]->reso_dB : 0.0,
+                       i + 4 < batch_size ? fcs[i + 4]->reso_dB : 0.0,
+                       i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
+                       i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
+                       i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
+                       fcs[i]->reso_dB
+               );
+
+               uint8 imask = _kor_mask8(
+                       _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
+                       _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
+               );
+
+               if (batch_size - i < 8)
+                       imask &= (1 << (batch_size - i)) - 1;
+
+               if (imask) {
+                       __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
+                       __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
+
+                       vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
+                       vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
+                       vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
+                       vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
+
+                       vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
+                       vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
+                       vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
+                       vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
 
 #if 1
-               __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
-               __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
-               __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
-               __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
+                       __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
+                       __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
+                       __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
+                       __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
 #else
-               __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
-               __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
-               __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
-               __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
+                       __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
+                       __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
+                       __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
+                       __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
 #endif
 
-               if (imask & 1)
-                       _mm256_storeu_pd(fcs[0]->range, _mm512_castpd512_pd256(vfcrange0123_04));
+                       if (imask & 1)
+                               _mm256_storeu_pd(fcs[i]->range, _mm512_castpd512_pd256(vfcrange0123_04));
 
-               if (imask & (1 << 1))
-                       _mm256_storeu_pd(fcs[1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
+                       if (imask & (1 << 1))
+                               _mm256_storeu_pd(fcs[i + 1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
 
-               if (imask & (1 << 2))
-                       _mm256_storeu_pd(fcs[2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
+                       if (imask & (1 << 2))
+                               _mm256_storeu_pd(fcs[i + 2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
 
-               if (imask & (1 << 3))
-                       _mm256_storeu_pd(fcs[3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
+                       if (imask & (1 << 3))
+                               _mm256_storeu_pd(fcs[i + 3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
 
-               if (imask & (1 << 4))
-                       _mm256_storeu_pd(fcs[4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
+                       if (imask & (1 << 4))
+                               _mm256_storeu_pd(fcs[i + 4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
 
-               if (imask & (1 << 5))
-                       _mm256_storeu_pd(fcs[5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
+                       if (imask & (1 << 5))
+                               _mm256_storeu_pd(fcs[i + 5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
 
-               if (imask & (1 << 6))
-                       _mm256_storeu_pd(fcs[6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
+                       if (imask & (1 << 6))
+                               _mm256_storeu_pd(fcs[i + 6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
 
-               if (imask & (1 << 7))
-                       _mm256_storeu_pd(fcs[7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
+                       if (imask & (1 << 7))
+                               _mm256_storeu_pd(fcs[i + 7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
 
-               __m512d vfcdiv_flt_rate = _mm512_set_pd(
-                       7 < batch_size ? fcs[7]->div_flt_rate : fcs[0]->div_flt_rate,
-                       6 < batch_size ? fcs[6]->div_flt_rate : fcs[0]->div_flt_rate,
-                       5 < batch_size ? fcs[5]->div_flt_rate : fcs[0]->div_flt_rate,
-                       4 < batch_size ? fcs[4]->div_flt_rate : fcs[0]->div_flt_rate,
-                       3 < batch_size ? fcs[3]->div_flt_rate : fcs[0]->div_flt_rate,
-                       2 < batch_size ? fcs[2]->div_flt_rate : fcs[0]->div_flt_rate,
-                       1 < batch_size ? fcs[1]->div_flt_rate : fcs[0]->div_flt_rate,
-                       fcs[0]->div_flt_rate
-               );
+                       __m512d vfcdiv_flt_rate = _mm512_set_pd(
+                               i + 7 < batch_size ? fcs[i + 7]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 6 < batch_size ? fcs[i + 6]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 5 < batch_size ? fcs[i + 5]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 4 < batch_size ? fcs[i + 4]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
+                               fcs[i]->div_flt_rate
+                       );
 
-               __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
+                       __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
 
-               FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[0]->reso_dB);
+                       FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
 
-               __m512d vreso_db_cf_p = _mm512_set_pd(
-                       7 < batch_size ? RESO_DB_CF_P(fcs[7]->reso_dB) : reso_db_cf_p,
-                       6 < batch_size ? RESO_DB_CF_P(fcs[6]->reso_dB) : reso_db_cf_p,
-                       5 < batch_size ? RESO_DB_CF_P(fcs[5]->reso_dB) : reso_db_cf_p,
-                       4 < batch_size ? RESO_DB_CF_P(fcs[4]->reso_dB) : reso_db_cf_p,
-                       3 < batch_size ? RESO_DB_CF_P(fcs[3]->reso_dB) : reso_db_cf_p,
-                       2 < batch_size ? RESO_DB_CF_P(fcs[2]->reso_dB) : reso_db_cf_p,
-                       1 < batch_size ? RESO_DB_CF_P(fcs[1]->reso_dB) : reso_db_cf_p,
-                       reso_db_cf_p
-               );
+                       __m512d vreso_db_cf_p = _mm512_set_pd(
+                               i + 7 < batch_size ? RESO_DB_CF_P(fcs[i + 7]->reso_dB) : reso_db_cf_p,
+                               i + 6 < batch_size ? RESO_DB_CF_P(fcs[i + 6]->reso_dB) : reso_db_cf_p,
+                               i + 5 < batch_size ? RESO_DB_CF_P(fcs[i + 5]->reso_dB) : reso_db_cf_p,
+                               i + 4 < batch_size ? RESO_DB_CF_P(fcs[i + 4]->reso_dB) : reso_db_cf_p,
+                               i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
+                               i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
+                               i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
+                               reso_db_cf_p
+                       );
 
-               __m512d v1 = _mm512_set1_pd(1.0);
-               __m512d v2 = _mm512_set1_pd(2.0);
-               __m512d v0_5 = _mm512_set1_pd(0.5);
+                       __m512d v1 = _mm512_set1_pd(1.0);
+                       __m512d v2 = _mm512_set1_pd(2.0);
+                       __m512d v0_5 = _mm512_set1_pd(0.5);
 
-               __m512d vq = _mm512_sub_pd(v1, _mm512_div_pd(vf, _mm512_fmadd_pd(v2, _mm512_add_pd(vreso_db_cf_p, _mm512_div_pd(v0_5, _mm512_add_pd(v1, vf))), _mm512_sub_pd(vf, v2))));
-               __m512d vc0 = _mm512_mul_pd(vq, vq);
+                       __m512d vq = _mm512_sub_pd(v1, _mm512_div_pd(vf, _mm512_fmadd_pd(v2, _mm512_add_pd(vreso_db_cf_p, _mm512_div_pd(v0_5, _mm512_add_pd(v1, vf))), _mm512_sub_pd(vf, v2))));
+                       __m512d vc0 = _mm512_mul_pd(vq, vq);
 #ifdef USE_SVML
-               __m512d vcosf = _mm512_cos_pd(vf);
+                       __m512d vcosf = _mm512_cos_pd(vf);
 #else
-               ALIGN FLOAT_T af[8];
-               _mm512_storeu_pd(af, vf);
-               __m512d vcosf = _mm512_set_pd(cos(af[7]), cos(af[6]), cos(af[5]), cos(af[4]), cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
+                       ALIGN FLOAT_T af[8];
+                       _mm512_storeu_pd(af, vf);
+                       __m512d vcosf = _mm512_set_pd(cos(af[7]), cos(af[6]), cos(af[5]), cos(af[4]), cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
 #endif
-               __m512d vc1 = _mm512_sub_pd(_mm512_add_pd(vc0, v1), _mm512_mul_pd(_mm512_mul_pd(v2, vcosf), vq));
-
-               __m512d vdc0246 = _mm512_unpacklo_pd(vc0, vc1);
-               __m512d vdc1357 = _mm512_unpackhi_pd(vc0, vc1);
-
-               if (imask & 1)
-                       _mm_storeu_pd(fcs[0]->dc, _mm512_castpd512_pd128(vdc0246));
-               if (imask & (1 << 1))
-                       _mm_storeu_pd(fcs[1]->dc, _mm512_castpd512_pd128(vdc1357));
-               if (imask & (1 << 2))
-                       _mm_storeu_pd(fcs[2]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc0246), 1));
-               if (imask & (1 << 3))
-                       _mm_storeu_pd(fcs[3]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc1357), 1));
-               if (imask & (1 << 4))
-                       _mm_storeu_pd(fcs[4]->dc, _mm512_extractf64x2_pd(vdc0246, 2));
-               if (imask & (1 << 5))
-                       _mm_storeu_pd(fcs[5]->dc, _mm512_extractf64x2_pd(vdc1357, 2));
-               if (imask & (1 << 6))
-                       _mm_storeu_pd(fcs[6]->dc, _mm512_extractf64x2_pd(vdc0246, 3));
-               if (imask & (1 << 7))
-                       _mm_storeu_pd(fcs[7]->dc, _mm512_extractf64x2_pd(vdc1357, 3));
+                       __m512d vc1 = _mm512_sub_pd(_mm512_add_pd(vc0, v1), _mm512_mul_pd(_mm512_mul_pd(v2, vcosf), vq));
+
+                       __m512d vdc0246 = _mm512_unpacklo_pd(vc0, vc1);
+                       __m512d vdc1357 = _mm512_unpackhi_pd(vc0, vc1);
+
+                       if (imask & 1)
+                               _mm_storeu_pd(fcs[i]->dc, _mm512_castpd512_pd128(vdc0246));
+                       if (imask & (1 << 1))
+                               _mm_storeu_pd(fcs[i + 1]->dc, _mm512_castpd512_pd128(vdc1357));
+                       if (imask & (1 << 2))
+                               _mm_storeu_pd(fcs[i + 2]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc0246), 1));
+                       if (imask & (1 << 3))
+                               _mm_storeu_pd(fcs[i + 3]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc1357), 1));
+                       if (imask & (1 << 4))
+                               _mm_storeu_pd(fcs[i + 4]->dc, _mm512_extractf64x2_pd(vdc0246, 2));
+                       if (imask & (1 << 5))
+                               _mm_storeu_pd(fcs[i + 5]->dc, _mm512_extractf64x2_pd(vdc1357, 2));
+                       if (imask & (1 << 6))
+                               _mm_storeu_pd(fcs[i + 6]->dc, _mm512_extractf64x2_pd(vdc0246, 3));
+                       if (imask & (1 << 7))
+                               _mm_storeu_pd(fcs[i + 7]->dc, _mm512_extractf64x2_pd(vdc1357, 3));
+               }
        }
 }
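
Run only for voices whose freq or reso_dB has drifted outside the cached range (the imask test), the recalculation above reduces per voice to this scalar reconstruction; field and macro names are taken from the diff, and FilterCoefficients is assumed to expose the members the intrinsics touch:

    #include <math.h>

    /* One-voice equivalent of the vectorized body of the if (imask) block. */
    static void recalc_one_LPF12_2(FilterCoefficients *fc)
    {
        /* Refresh the cached acceptance window around the new parameters. */
        fc->range[0] = fc->freq * (1.0 - ext_filter_margin);
        fc->range[1] = fc->freq * (1.0 + ext_filter_margin);
        fc->range[2] = fc->reso_dB * (1.0 - ext_filter_margin);
        fc->range[3] = fc->reso_dB * (1.0 + ext_filter_margin);

        /* Derive the two filter coefficients. */
        FLOAT_T f = M_PI2 * fc->freq * fc->div_flt_rate;
        FLOAT_T r = RESO_DB_CF_P(fc->reso_dB);
        FLOAT_T q = 1.0 - f / (2.0 * (r + 0.5 / (1.0 + f)) + f - 2.0);

        fc->dc[0] = q * q;                               /* c0 */
        fc->dc[1] = fc->dc[0] + 1.0 - 2.0 * cos(f) * q;  /* c1 */
    }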
 
@@ -5603,153 +5610,158 @@ static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients** fcs
 
 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
 {
-       __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(0, batch_size), counts);
-
-       __m128d vdb01_0 = _mm_loadu_pd(dbs[0]);
-       __m128d vdb01_1 = 1 < batch_size ? _mm_loadu_pd(dbs[1]) : _mm_setzero_pd();
-       __m128d vdb01_2 = 2 < batch_size ? _mm_loadu_pd(dbs[2]) : _mm_setzero_pd();
-       __m128d vdb01_3 = 3 < batch_size ? _mm_loadu_pd(dbs[3]) : _mm_setzero_pd();
-       __m128d vdb01_4 = 4 < batch_size ? _mm_loadu_pd(dbs[4]) : _mm_setzero_pd();
-       __m128d vdb01_5 = 5 < batch_size ? _mm_loadu_pd(dbs[5]) : _mm_setzero_pd();
-       __m128d vdb01_6 = 6 < batch_size ? _mm_loadu_pd(dbs[6]) : _mm_setzero_pd();
-       __m128d vdb01_7 = 7 < batch_size ? _mm_loadu_pd(dbs[7]) : _mm_setzero_pd();
-
-       __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
-       __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
-       __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
-       __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
-
-       __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
-       __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
-
-       __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
-       __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
-
-       __m128d vdc01_0 = _mm_loadu_pd(dcs[0]);
-       __m128d vdc01_1 = 1 < batch_size ? _mm_loadu_pd(dcs[1]) : _mm_setzero_pd();
-       __m128d vdc01_2 = 2 < batch_size ? _mm_loadu_pd(dcs[2]) : _mm_setzero_pd();
-       __m128d vdc01_3 = 3 < batch_size ? _mm_loadu_pd(dcs[3]) : _mm_setzero_pd();
-       __m128d vdc01_4 = 4 < batch_size ? _mm_loadu_pd(dcs[4]) : _mm_setzero_pd();
-       __m128d vdc01_5 = 5 < batch_size ? _mm_loadu_pd(dcs[5]) : _mm_setzero_pd();
-       __m128d vdc01_6 = 6 < batch_size ? _mm_loadu_pd(dcs[6]) : _mm_setzero_pd();
-       __m128d vdc01_7 = 7 < batch_size ? _mm_loadu_pd(dcs[7]) : _mm_setzero_pd();
-
-       __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
-       __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
-       __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
-       __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
-
-       __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
-       __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
-
-       __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
-       __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
-
-       __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
-       vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
-       int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
-
-       for (int32 j = 0; j < count_max; j += 8) {
-               __m512d vin[8];
-               vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[0]), &sps[0][j]);
-
-               for (int k = 1; k < 8; k++)
-                       vin[k] = _mm512_maskz_loadu_pd(k < batch_size ? generate_mask8_for_count(j, counts[k]) : 0, &sps[k][j]);
-
-               __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
-               __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
-               __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
-               __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
-               __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
-               __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
-               __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
-               __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
-
-               __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               __m512d vsps[8];
-               vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               for (int k = 0; k < 8; k++) {
-                       __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
-
-                       vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
-                       vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
-                       vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
-                       vsps[k] = _mm512_sub_pd(vsps[k], vdb0);
-               }
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
+               if (i >= batch_size)
+                       break;
 
-               __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
-               __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
-               __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
-               __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
-               __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
-               __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
-               __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
-               __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
-
-               __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               __m512d vout[8];
-               vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-               vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
-               vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
-
-               for (int k = 0; k < batch_size; k++)
-                       _mm512_mask_storeu_pd(&sps[k][j], generate_mask8_for_count(j, counts[k]), vout[k]);
-       }
+               __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
 
-       vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
-       vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
+               __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
+               __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
+               __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
+               __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
+               __m128d vdb01_4 = i + 4 < batch_size ? _mm_loadu_pd(dbs[i + 4]) : _mm_setzero_pd();
+               __m128d vdb01_5 = i + 5 < batch_size ? _mm_loadu_pd(dbs[i + 5]) : _mm_setzero_pd();
+               __m128d vdb01_6 = i + 6 < batch_size ? _mm_loadu_pd(dbs[i + 6]) : _mm_setzero_pd();
+               __m128d vdb01_7 = i + 7 < batch_size ? _mm_loadu_pd(dbs[i + 7]) : _mm_setzero_pd();
+
+               __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
+               __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
+               __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
+               __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
 
-       _mm_storeu_pd(dbs[0], _mm512_castpd512_pd128(vdb01_0246));
+               __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
+               __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
 
-       if (1 < batch_size)
-               _mm_storeu_pd(dbs[1], _mm512_castpd512_pd128(vdb01_1357));
+               __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
+               __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
+
+               __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
+               __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
+               __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
+               __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
+               __m128d vdc01_4 = i + 4 < batch_size ? _mm_loadu_pd(dcs[i + 4]) : _mm_setzero_pd();
+               __m128d vdc01_5 = i + 5 < batch_size ? _mm_loadu_pd(dcs[i + 5]) : _mm_setzero_pd();
+               __m128d vdc01_6 = i + 6 < batch_size ? _mm_loadu_pd(dcs[i + 6]) : _mm_setzero_pd();
+               __m128d vdc01_7 = i + 7 < batch_size ? _mm_loadu_pd(dcs[i + 7]) : _mm_setzero_pd();
 
-       if (2 < batch_size)
-               _mm_storeu_pd(dbs[2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
+               __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
+               __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
+               __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
+               __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
 
-       if (3 < batch_size)
-               _mm_storeu_pd(dbs[3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
+               __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
+               __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
 
-       if (4 < batch_size)
-               _mm_storeu_pd(dbs[4], _mm512_extractf64x2_pd(vdb01_0246, 2));
+               __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
+               __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
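+               /* Same transpose for the coefficients: vdc0 and vdc1 hold dc[0] and dc[1] of all 8 voices in lane-per-voice form. */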
 
-       if (5 < batch_size)
-               _mm_storeu_pd(dbs[5], _mm512_extractf64x2_pd(vdb01_1357, 2));
+               __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
+               vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
+               int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
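+               /* Horizontal max of the 8 counts: the sample loop below runs to the longest voice, and per-lane masks idle the voices that finish earlier. */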
 
-       if (6 < batch_size)
-               _mm_storeu_pd(dbs[6], _mm512_extractf64x2_pd(vdb01_0246, 3));
+               for (int32 j = 0; j < count_max; j += 8) {
+                       __m512d vin[8];
+                       vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
 
-       if (7 < batch_size)
-               _mm_storeu_pd(dbs[7], _mm512_extractf64x2_pd(vdb01_1357, 3));
+                       for (int k = 1; k < 8; k++)
+                               vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, &sps[i + k][j]);
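+                       /* vin[k] now holds up to 8 consecutive samples of voice i + k; lanes past counts[i + k] (or past batch_size) are zeroed. */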
+
+                       __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
+                       __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
+                       __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
+                       __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
+                       __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
+                       __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
+                       __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
+                       __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
+
+                       __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+                       __m512d vsps[8];
+                       vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
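+                       /* Three-stage 8x8 transpose: vsps[k] now holds sample j + k of all 8 voices, one voice per lane. */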
+
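+                       /* One filter step per sample index, applied to all 8 voices at once; kmask freezes the state of voices whose count is already exhausted. */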
+                       for (int k = 0; k < 8; k++) {
+                               __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
+
+                               vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
+                               vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
+                               vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
+                               vsps[k] = _mm512_sub_pd(vsps[k], vdb0);
+                       }
+
+                       __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
+                       __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
+                       __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
+                       __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
+                       __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
+                       __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
+                       __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
+                       __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
+
+                       __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+
+                       __m512d vout[8];
+                       vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
+                       vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
+                       vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
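+                       /* Transpose back to per-voice layout: vout[k] holds the filtered samples of voice i + k. */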
+
+                       for (int k = 0; k < 8 && i + k < batch_size; k++)
+                               _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
+               }
+
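+               /* Re-interleave vdb0/vdb1 so each 128-bit pair is the [db[0], db[1]] state of one voice before scattering it back. */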
+               vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
+               vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
+
+               _mm_storeu_pd(dbs[i], _mm512_castpd512_pd128(vdb01_0246));
+
+               if (i + 1 < batch_size)
+                       _mm_storeu_pd(dbs[i + 1], _mm512_castpd512_pd128(vdb01_1357));
+
+               if (i + 2 < batch_size)
+                       _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
+
+               if (i + 3 < batch_size)
+                       _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
+
+               if (i + 4 < batch_size)
+                       _mm_storeu_pd(dbs[i + 4], _mm512_extractf64x2_pd(vdb01_0246, 2));
+
+               if (i + 5 < batch_size)
+                       _mm_storeu_pd(dbs[i + 5], _mm512_extractf64x2_pd(vdb01_1357, 2));
+
+               if (i + 6 < batch_size)
+                       _mm_storeu_pd(dbs[i + 6], _mm512_extractf64x2_pd(vdb01_0246, 3));
+
+               if (i + 7 < batch_size)
+                       _mm_storeu_pd(dbs[i + 7], _mm512_extractf64x2_pd(vdb01_1357, 3));
+       }
 }
 
 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)