2 TiMidity++ -- MIDI to WAVE converter and player
3 Copyright (C) 1999-2002 Masanao Izumo <mo@goice.co.jp>
4 Copyright (C) 1995 Tuukka Toivonen <tt@cgs.fi>
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 filter.c: written by Vincent Pagel ( pagel@loria.fr )
22 implements fir antialiasing filter : should help when setting sample
29 - modify "filter" so that it simulate leading and trailing 0 in the buffer
34 #endif /* HAVE_CONFIG_H */
/* External EQ/filter tuning parameters (adjusted elsewhere; defaults = unity). */
double ext_filter_shelving_gain = 1.0;
double ext_filter_shelving_reduce = 1.0;
double ext_filter_shelving_q = 1.0;
double ext_filter_peaking_gain = 1.0;
double ext_filter_peaking_reduce = 1.0;
double ext_filter_peaking_q = 1.0;
/* Coefficient-recalc tolerance: 1 cB, roughly +-20 cent. */
const double ext_filter_margin = 0.010; // 1cB,+-20cent
// (smaller margin: smaller load reduction)
61 //const double ext_filter_margin = 0.05; // 5cB,+-100cent
//const double ext_filter_margin = 0.05; // 5cB,+-100cent
// (smaller margin: larger load reduction)
67 voice_filter1(LPF), voice_filter2(HPF), resample_filter
// Filter section common to voice_filter1 (LPF), voice_filter2 (HPF) and resample_filter.
// The filters themselves take freq[Hz] and reso_dB[dB] (for EQ types, q[0.0..1.0]).
// For voice_filter (LPF) see playmidi.c init_voice_filter(int i).
74 input freq 20 < freq < 20000 , input 0 < reso < 96
// Resonance parameter ranges differ per filter (0<n<inf, 0<n<1, 1>n>0),
// and each filter applies its own mapping n=f(rez).
// Using q, 1/q and 1-1/q normalizes them to a common range 1<q<inf;
// since 0<rez, q=X^rez converts rez into q's value range,
// with per-filter limits and scaling coefficients applied on top.
// Processing order:
//  1: set the filter type with set_type() (on first call or on a type change,
//     the FilterCoefficients struct is cleared to 0)
//  2: for a special sample rate, call set_ext_rate()
//  3: when using orig_freq/orig_reso, call set_orig_freq() / set_orig_reso()
//  4: set freq/reso with set_freq() / set_reso() (for EQ types, set_q())
//  5: for sample_filter, recompute coefficients with recalc_filter()
//  6: run the filter with filter() (buffer_filter recomputes coefficients
//     before processing)
// Steps 1..5 can also be done at once with init_sample_filter()
// (for EQ types, init_sample_filter2()).
96 num filter_define type cutoff_limit (oversampling) desc
97 00 FILTER_NONE, OFF filter OFF
98 01 FILTER_LPF12, LPF sr / 6 0.16666 (~x3) Chamberlin 12dB/oct
99 02 FILTER_LPF24, LPF sr / 2 Moog VCF 24dB/oct
100 03 FILTER_LPF_BW, LPF sr / 2 butterworth elion add
101 04 FILTER_LPF12_2, LPF sr / 2 Resonant IIR 12dB/oct
102 05 FILTER_LPF24_2, LPF sr / 2 amSynth 24dB/oct
103 06 FILTER_LPF6, LPF sr / 2 One pole 6dB/oct nonrez
104 07 FILTER_LPF18, LPF sr / 2.25 0.44444 (~x2) 3pole 18dB/oct
105 08 FILTER_LPF_TFO, LPF sr / 2 two first order
107 09 FILTER_HPF_BW, HPF sr / 2 butterworth elion+
108 10 FILTER_BPF_BW, BPF sr / 2 butterworth elion+
109 11 FILTER_PEAK1, peak sr / 2
110 12 FILTER_NOTCH1, notch sr / 2
111 13 FILTER_LPF12_3, LPF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
112 14 FILTER_HPF12_3, HPF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
113 15 FILTER_BPF12_3, BPF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
114 16 FILTER_BCF12_3, BCF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
115 17 FILTER_HPF6, HPF sr / 2 One pole 6dB/oct nonrez
116 18 FILTER_HPF12_2, HPF sr / 2 Resonant IIR 12dB/oct
118 19 FILTER_HBF_L6L12, HBF sr / 2
119 20 FILTER_HBF_L12L6, HBF sr / 2
120 21 FILTER_HBF_L12H6, HBF sr / 2
121 22 FILTER_HBF_L24H6, HBF sr / 2
122 23 FILTER_HBF_L24H12, HBF sr / 2
123 24 FILTER_HBF_L12OCT, HBF sr / 2
124 25 FILTER_HBF_L24OCT, HBF sr / 2
126 26 FILTER_LPF6x2, LPF sr / 2
127 27 FILTER_LPF6x3, LPF sr / 2
128 28 FILTER_LPF6x4, LPF sr / 2
129 29 FILTER_LPF6x8, LPF sr / 2
130 30 FILTER_LPF6x16, LPF sr / 2
131 31 FILTER_LPF_BWx2, LPF sr / 2
132 32 FILTER_LPF_BWx3, LPF sr / 2
133 33 FILTER_LPF_BWx4, LPF sr / 2
134 34 FILTER_LPF24_2x2, LPF sr / 2
137 36 FILTER_SHELVING_LOW,EQ_LOW sr / 2
138 37 FILTER_SHELVING_HI, EQ_HI sr / 2
139 38 FILTER_PEAKING, EQ_MID sr / 2
140 39 FILTER_BIQUAD_LOW, LPF sr / 2
141 40 FILTER_BIQUAD_HI, HPF sr / 2
// cutoff_limit: filters whose limit is below sr/2 use oversampling
// so that cutoff frequencies up to sr/2 can still be handled.
149 #if 1 // recalc filter margin
// Coefficient recalculation is expensive and voice filters invoke it very
// often, so skip some recalculations to reduce the processing count.
// Voice-filter parameters are modulated continuously anyway, so being up to
// ~100 cents off is not noticeable; EQ parameters do not change, so EQ is
// unaffected.
// Margin-based recalc: coefficients are recomputed only when the control
// value leaves the +-ext_filter_margin window around the last computed value.
#define INIT_MARGIN_VAL { \
	fc->range[0] = fc->range[1] = fc->range[2] = fc->range[3] = fc->range[4] = fc->range[5] = fc->range[6] = fc->range[7] = 0; }
#define FLT_FREQ_MARGIN (fc->freq < fc->range[0] || fc->freq > fc->range[1])
#define FLT_RESO_MARGIN (fc->reso_dB < fc->range[2] || fc->reso_dB > fc->range[3])
#define FLT_WIDTH_MARGIN (fc->q < fc->range[4] || fc->q > fc->range[5])

#if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
#define CALC_MARGIN_VAL __m128d vec_range = _mm_set_pd(1.0 + ext_filter_margin, 1.0 - ext_filter_margin);
#define CALC_FREQ_MARGIN { _mm_storeu_pd(&fc->range[0], _mm_mul_pd(MM_LOAD1_PD(&fc->freq), vec_range));}
#define CALC_RESO_MARGIN { _mm_storeu_pd(&fc->range[2], _mm_mul_pd(MM_LOAD1_PD(&fc->reso_dB), vec_range));}
#define CALC_WIDTH_MARGIN { _mm_storeu_pd(&fc->range[4], _mm_mul_pd(MM_LOAD1_PD(&fc->q), vec_range));}
#else // scalar fallback
#define CALC_MARGIN_VAL
#define CALC_FREQ_MARGIN {fc->range[0] = fc->freq * (1.0 - ext_filter_margin); fc->range[1] = fc->freq * (1.0 + ext_filter_margin);}
#define CALC_RESO_MARGIN {fc->range[2] = fc->reso_dB * (1.0 - ext_filter_margin); fc->range[3] = fc->reso_dB * (1.0 + ext_filter_margin);}
#define CALC_WIDTH_MARGIN {fc->range[4] = fc->q * (1.0 - ext_filter_margin); fc->range[5] = fc->q * (1.0 + ext_filter_margin);}
#endif
173 #else // ! recalc filter margin
175 #define INIT_MARGIN_VAL
176 #define CALC_MARGIN_VAL
177 #define FLT_FREQ_MARGIN (fc->freq != fc->last_freq)
178 #define FLT_RESO_MARGIN (fc->reso_dB != fc->last_reso_dB)
179 #define FLT_WIDTH_MARGIN (fc->q != fc->last_q)
180 #define CALC_FREQ_MARGIN {fc->last_freq = fc->freq;}
181 #define CALC_RESO_MARGIN {fc->last_reso_dB = fc->reso_dB;}
182 #define CALC_WIDTH_MARGIN {fc->last_q = fc->q;}
// Resonance dB -> linear gain: 10^(db/40) (P) and 10^(-db/40) (M).
#if 1 // resonance use table
#define RESO_DB_CF_P(db) filter_cb_p_table[(int)(db * 10.0)]
#define RESO_DB_CF_M(db) filter_cb_m_table[(int)(db * 10.0)]
#elif 1 // resonance calc function lite
#define RESO_DB_CF_P(db) (FLOAT_T)(exp((float)(M_LN10 * DIV_40 * db)))
#define RESO_DB_CF_M(db) (FLOAT_T)(exp((float)(M_LN10 * -DIV_40 * db)))
#else // resonance calc function
#define RESO_DB_CF_P(db) pow(10.0, DIV_40 * db)
#define RESO_DB_CF_M(db) pow(10.0, -DIV_40 * db)
#endif
199 #if (OPT_MODE == 1) && !defined(DATA_T_DOUBLE) && !defined(DATA_T_FLOAT) /* fixed-point implementation */
201 static inline void sample_filter_none(FILTER_T *dc, FILTER_T *db, DATA_T *sp) {}
203 static inline void recalc_filter_none(FilterCoefficients *fc) {}
205 static inline void sample_filter_LPF12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
207 db[0] = db[0] + imuldiv28(db[2], dc[0]);
208 db[1] = (*sp << 4) - db[0] - imuldiv28(db[2], dc[1]);
209 db[2] = imuldiv28(db[1], dc[0]) + db[2];
210 *sp = db[0] >> 4; /* 4.28 to 8.24 */
213 static inline void sample_filter_LPF12_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
215 FILTER_T input = *sp << 4;
217 db[0] = db[0] + imuldiv28(db[2], dc[0]);
218 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
219 db[2] = imuldiv28(db[1], dc[0]) + db[2];
220 *sp = db[0] >> 4; /* 4.28 to 8.24 */
222 db[0] = db[0] + imuldiv28(db[2], dc[0]);
223 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
224 db[2] = imuldiv28(db[1], dc[0]) + db[2];
227 static inline void sample_filter_LPF12_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
229 FILTER_T input = *sp << 4;
231 db[0] = db[0] + imuldiv28(db[2], dc[0]);
232 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
233 db[2] = imuldiv28(db[1], dc[0]) + db[2];
234 *sp = db[0] >> 4; /* 4.28 to 8.24 */
236 db[0] = db[0] + imuldiv28(db[2], dc[0]);
237 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
238 db[2] = imuldiv28(db[1], dc[0]) + db[2];
240 db[0] = db[0] + imuldiv28(db[2], dc[0]);
241 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
242 db[2] = imuldiv28(db[1], dc[0]) + db[2];
245 static inline void recalc_filter_LPF12(FilterCoefficients *fc)
249 /* copy with applying Chamberlin's lowpass filter. */
250 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
251 fc->last_freq = fc->freq;
252 if(fc->freq < fc->flt_rate_limit1){ // <sr*DIV_6
253 fc->sample_filter = sample_filter_LPF12;
254 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
255 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*DIV_6
256 fc->sample_filter = sample_filter_LPF12_ov2;
257 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
258 }else{ // <sr*3*DIV_6
259 fc->sample_filter = sample_filter_LPF12_ov3;
260 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
262 fc->last_reso_dB = fc->reso_dB;
263 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
267 static inline void sample_filter_LPF24(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
271 da[0] = (*sp << 4) - imuldiv28(db[4], dc[2]); /* feedback */
275 db[1] = imuldiv28((db[0] + da[0]), dc[0]) - imuldiv28(db[1], dc[1]);
276 db[2] = imuldiv28((db[1] + da[1]), dc[0]) - imuldiv28(db[2], dc[1]);
277 db[3] = imuldiv28((db[2] + da[2]), dc[0]) - imuldiv28(db[3], dc[1]);
278 db[4] = imuldiv28((db[3] + da[3]), dc[0]) - imuldiv28(db[4], dc[1]);
280 *sp = db[4] >> 4; /* 4.28 to 8.24 */
283 static inline void recalc_filter_LPF24(FilterCoefficients *fc)
285 FLOAT_T f, q, p, tmp;
288 /* copy with applying Moog lowpass VCF. */
289 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
290 fc->last_freq = fc->freq;
291 f = 2.0 * fc->freq * fc->div_flt_rate;
293 fc->last_reso_dB = fc->reso_dB;
294 q = 0.80 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0.0f <= c < 0.80f
295 dc[0] = TIM_FSCALE(tmp = f + 0.8 * f * p, 28);
296 dc[1] = TIM_FSCALE(tmp + tmp - 1.0, 28);
297 dc[2] = TIM_FSCALE(q * (1.0 + 0.5 * p * (1.0 - p + 5.6 * p * p)), 28);
301 static inline void sample_filter_LPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
306 db[2] = imuldiv28(db[0], dc[0])
307 + imuldiv28(db[1], dc[1])
308 + imuldiv28(db[2], dc[2])
309 - imuldiv28(db[3], dc[3])
310 - imuldiv28(db[4], dc[4]);
312 db[3] = db[2]; // flt out
314 db[1] = db[0]; // flt in
316 *sp = db[3] >> 4; /* 4.28 to 8.24 */
319 static inline void recalc_filter_LPF_BW(FilterCoefficients *fc)
321 FILTER_T *dc = fc->dc;
322 FLOAT_T q ,p, p2, qp, tmp;
325 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
326 fc->last_freq = fc->freq;
327 fc->last_reso_dB = fc->reso_dB;
328 p = 1.0 / tan(M_PI * fc->freq * fc->div_flt_rate); // ?
329 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
332 dc[0] = TIM_FSCALE(tmp = 1.0 / (1.0 + qp + p2), 28);
333 dc[1] = TIM_FSCALE(2.0 * tmp, 28);
335 dc[3] = TIM_FSCALE(2.0 * (1.0 - p2) * tmp, 28);
336 dc[4] = TIM_FSCALE((1.0 - qp + p2) * tmp, 28);
340 static inline void sample_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
342 db[1] += imuldiv28(((*sp << 4) - db[0]), dc[1]);
344 db[1] = imuldiv28(db[1], dc[0]);
345 *sp = db[0] >> 4; /* 4.28 to 8.24 */
348 static inline void recalc_filter_LPF12_2(FilterCoefficients *fc)
350 FILTER_T *dc = fc->dc;
353 // Resonant IIR lowpass (12dB/oct) Olli Niemitalo //r
354 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
355 fc->last_freq = fc->freq;
356 fc->last_reso_dB = fc->reso_dB;
357 f = M_PI2 * fc->freq * fc->div_flt_rate;
358 //q = 1.0 - f / (2.0 * ((fc->reso_dB * DIV_96 + 1.0) + 0.5 / (1.0 + f)) + f - 2.0);
359 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
360 dc[0] = TIM_FSCALE(tmp = q * q, 28);
361 dc[1] = TIM_FSCALE(tmp + 1.0 - 2.0 * cos(f) * q, 28);
365 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
368 FILTER_T db0 = db[0], db1 = db[1], dc0 = dc[0], dc1 = dc[1];
370 for (i = 0; i < count; i++) {
371 db1 += imuldiv28(((sp[i] << 4) - db0), dc1);
373 db1 = imuldiv28(db1, dc0);
374 sp[i] = db0 >> 4; /* 4.28 to 8.24 */
380 static inline void sample_filter_LPF24_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
383 db[5] = imuldiv28(db[0], dc[0]) + db[1];
384 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
385 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
387 db[5] = imuldiv28(db[0], dc[0]) + db[3];
388 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
389 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
390 *sp = db[0] >> 4; /* 4.28 to 8.24 */
393 static inline void recalc_filter_LPF24_2(FilterCoefficients *fc)
395 FLOAT_T f, q, p, r, dc0;
398 // amSynth 24dB/ocatave resonant low-pass filter. Nick Dowell //r
399 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
400 fc->last_freq = fc->freq;
401 fc->last_reso_dB = fc->reso_dB;
402 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
403 //q = 2 * (1 - fc->reso_dB * DIV_96); // maxQ = 0.9995
404 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
406 p = 1.0 / (1.0 + (q * f) + r);
408 dc[0] = TIM_FSCALE(dc0, 28);
409 dc[1] = TIM_FSCALE(dc0 * 2, 28);
411 dc[3] = TIM_FSCALE(-2.0 * (r - 1) * p, 28);
412 dc[4] = TIM_FSCALE((-1.0 + (q * f) - r) * p, 28);
416 static inline void sample_filter_LPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
418 db[1] = imuldiv28((*sp << 4), dc[0]) + imuldiv28(db[1], dc[1]);
419 *sp = db[1] >> 4; /* 4.28 to 8.24 */
422 static inline void recalc_filter_LPF6(FilterCoefficients *fc)
427 // One pole filter, LP 6dB/Oct scoofy no resonance //r
428 if (!FP_EQ(fc->freq, fc->last_freq)) {
429 fc->last_freq = fc->freq;
430 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
431 dc[0] = TIM_FSCALE(1.0 - f, 28);
432 dc[1] = TIM_FSCALE(f, 28);
436 static inline void sample_filter_LPF18(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
443 db[0] = (*sp << 4) - imuldiv28(db[3], dc[2]);
444 db[1] = imuldiv28((db[0] + da[0]), dc[1]) - imuldiv28(db[1], dc[0]);
445 db[2] = imuldiv28((db[1] + da[1]), dc[1]) - imuldiv28(db[2], dc[0]);
446 db[3] = imuldiv28((db[2] + da[2]), dc[1]) - imuldiv28(db[3], dc[0]);
447 *sp = imuldiv28(db[3], dc[3]) >> 4; /* 4.28 to 8.24 */
450 static inline void sample_filter_LPF18_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
452 FILTER_T da[3], input = *sp << 4;
457 db[0] = input - imuldiv28(db[3], dc[2]);
458 db[1] = imuldiv28((db[0] + da[0]), dc[1]) - imuldiv28(db[1], dc[0]);
459 db[2] = imuldiv28((db[1] + da[1]), dc[1]) - imuldiv28(db[2], dc[0]);
460 db[3] = imuldiv28((db[2] + da[2]), dc[1]) - imuldiv28(db[3], dc[0]);
465 db[0] = input - imuldiv28(db[3], dc[2]);
466 db[1] = imuldiv28((db[0] + da[0]), dc[1]) - imuldiv28(db[1], dc[0]);
467 db[2] = imuldiv28((db[1] + da[1]), dc[1]) - imuldiv28(db[2], dc[0]);
468 db[3] = imuldiv28((db[2] + da[2]), dc[1]) - imuldiv28(db[3], dc[0]);
469 *sp = imuldiv28(db[3], dc[3]) >> 4; /* 4.28 to 8.24 */
472 static inline void recalc_filter_LPF18(FilterCoefficients *fc)
474 FLOAT_T f, q, p, tmp;
477 // LPF18 low-pass filter //r
478 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
479 fc->last_freq = fc->freq;
480 if(fc->freq < fc->flt_rate_limit1){ // <sr/2.25
481 fc->sample_filter = sample_filter_LPF18;
482 f = 2.0 * fc->freq * fc->div_flt_rate; // *1.0
484 fc->sample_filter = sample_filter_LPF18_ov2;
485 f = 2.0 * fc->freq * fc->div_flt_rate_ov2; // sr*2
487 dc[0] = TIM_FSCALE(tmp = ((-2.7528 * f + 3.0429) * f + 1.718) * f - 0.9984, 28);
488 fc->last_reso_dB = fc->reso_dB;
489 //q = fc->reso_dB * DIV_96;
490 q = 0.789 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0<q<0.78125
492 dc[1] = TIM_FSCALE(0.5 * p, 28);
493 dc[2] = TIM_FSCALE(tmp = q * (((-2.7079 * p + 10.963) * p - 14.934) * p + 8.4974), 28);
494 dc[3] = TIM_FSCALE(1.0 + (0.25 * (1.5 + 2.0 * tmp * (1.0 - f))), 28);
498 static inline void sample_filter_LPF_TFO(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
500 db[0] = db[0] + imuldiv28(((*sp << 4) - db[0] + imuldiv28((db[0] - db[1]), dc[1])), dc[0]);
501 db[1] = db[1] + imuldiv28((db[0] - db[1]), dc[0]);
502 *sp = db[1] >> 4; /* 4.28 to 8.24 */
505 static inline void recalc_filter_LPF_TFO(FilterCoefficients *fc)
510 // two first order low-pass filter //r
511 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
512 fc->last_freq = fc->freq;
513 fc->last_reso_dB = fc->reso_dB;
514 dc[0] = TIM_FSCALE(tmp = 2 * fc->freq * fc->div_flt_rate, 28);
515 q = 1.0 - RESO_DB_CF_M(fc->reso_dB);
516 dc[1] = TIM_FSCALE(q + q / (1.01 - tmp), 28);
520 static inline void sample_filter_HPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
525 db[2] = imuldiv28(db[0], dc[0])
526 + imuldiv28(db[1], dc[1])
527 + imuldiv28(db[2], dc[2])
528 - imuldiv28(db[3], dc[3])
529 - imuldiv28(db[4], dc[4]);
531 db[3] = db[2]; // flt out
533 db[1] = db[0]; // flt in
535 *sp = db[3] >> 4; /* 4.28 to 8.24 */
538 static inline void recalc_filter_HPF_BW(FilterCoefficients *fc)
541 FLOAT_T q, p, p2, qp, tmp;
543 // elion butterworth HPF //r
544 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
545 fc->last_freq = fc->freq;
546 fc->last_reso_dB = fc->reso_dB;
547 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
548 p = tan(M_PI * fc->freq * fc->div_flt_rate); // hpf ?
551 dc[0] = TIM_FSCALE(tmp = 1.0 / (1.0 + qp + p2), 28);
552 dc[1] = TIM_FSCALE(-2 * tmp, 28); // hpf
554 dc[3] = TIM_FSCALE(2.0 * (p2 - 1.0) * tmp, 28); // hpf
555 dc[4] = TIM_FSCALE((1.0 - qp + p2) * tmp, 28);
559 static inline void sample_filter_BPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
564 db[2] = imuldiv28(db[0], dc[0])
565 + imuldiv28(db[1], dc[1])
566 + imuldiv28(db[2], dc[2])
567 - imuldiv28(db[3], dc[3])
568 - imuldiv28(db[4], dc[4]);
570 db[3] = db[2]; // flt out
572 db[1] = db[0]; // flt in
574 db[7] = imuldiv28(db[3], dc[5])
575 + imuldiv28(db[6], dc[6])
576 + imuldiv28(db[7], dc[7])
577 - imuldiv28(db[8], dc[8])
578 - imuldiv28(db[9], dc[9]);
580 db[8] = db[7]; // flt out
582 db[6] = db[3]; // flt in
584 *sp = db[8] >> 4; /* 4.28 to 8.24 */
587 static inline void recalc_filter_BPF_BW(FilterCoefficients *fc)
589 FLOAT_T f, q, r, pl, ph, sl, sh, tmp;
593 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
594 fc->last_freq = fc->freq;
595 fc->last_reso_dB = fc->reso_dB;
596 f = fc->freq * fc->div_flt_rate;
597 r = 1.0 - RESO_DB_CF_M(fc->reso_dB);
598 q = SQRT_2 - r * SQRT_2; // q>0.1
600 pl = 1.0 / tan(M_PI * f); // ?
602 dc[0] = TIM_FSCALE(tmp = 1.0 / (1.0 + q * pl + sl), 28);
603 dc[1] = TIM_FSCALE(2.0 * tmp, 28);
605 dc[3] = TIM_FSCALE(2.0 * (1.0 - sl) * tmp, 28);
606 dc[4] = TIM_FSCALE((1.0 - q * pl + sl) * tmp, 28);
608 f = f * 0.80; // bandwidth = LPF-HPF
609 ph = tan(M_PI * f); // hpf ?
611 dc[5] = TIM_FSCALE(tmp = 1.0 / (1.0 + q * ph + sh), 28);
612 dc[6] = TIM_FSCALE(-2 * tmp, 28); // hpf
614 dc[8] = TIM_FSCALE(2.0 * (sh - 1.0) * tmp, 28); // hpf
615 dc[9] = TIM_FSCALE((1.0 - q * ph + sh) * tmp, 28);
619 static inline void recalc_filter_peak1(FilterCoefficients *fc)
621 FLOAT_T f, q, r, pl ,ph, sl, sh;
625 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
626 fc->last_freq = fc->freq;
627 fc->last_reso_dB = fc->reso_dB;
628 f = cos(M_PI2 * fc->freq * fc->div_flt_rate);
629 r = 1.0 - RESO_DB_CF_M(fc->reso_dB); // r < 0.99609375
631 dc[0] = TIM_FSCALE((1 - r) * sqrt(r * (r - 4 * (f * f) + 2.0) + 1.0), 28);
632 dc[1] = TIM_FSCALE(2 * f * r, 28);
633 dc[2] = TIM_FSCALE(-(r * r), 28);
637 static inline void sample_filter_peak1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
642 r += imuldiv28(dc[0], db[0]);
643 r += imuldiv28(dc[1], db[1]);
644 r += imuldiv28(dc[2], db[2]);
647 *sp = r >> 4; /* 4.28 to 8.24 */
650 static inline void recalc_filter_notch1(FilterCoefficients *fc)
652 FLOAT_T f, q, r, pl ,ph, sl, sh;
656 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
657 fc->last_freq = fc->freq;
658 fc->last_reso_dB = fc->reso_dB;
659 f = cos(M_PI2 * fc->freq * fc->div_flt_rate);
660 r = (1.0 - RESO_DB_CF_M(fc->reso_dB)) * 0.99609375; // r < 0.99609375
661 dc[0] = TIM_FSCALE((1 - r) * sqrt(r * (r - 4 * (f * f) + 2.0) + 1.0), 28);
662 dc[1] = TIM_FSCALE(2 * f * r, 28);
663 dc[2] = TIM_FSCALE(-(r * r), 28);
667 static inline void sample_filter_notch1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
672 r += imuldiv28(dc[0], db[0]);
673 r += imuldiv28(dc[1], db[1]);
674 r += imuldiv28(dc[2], db[2]);
677 *sp = (db[0] - r) >> 4; // notch
680 static inline void sample_filter_LPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
682 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
683 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
684 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
685 *sp = db[0] >> 4; // (db[1] + db[0]) >> 4; // notch
688 static inline void sample_filter_LPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
690 FILTER_T input = *sp << 4;
692 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
693 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
694 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
695 *sp = db[0] >> 4; // (db[1] + db[0]) >> 4; // notch
697 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
698 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
699 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
702 static inline void sample_filter_LPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
704 FILTER_T input = *sp << 4;
706 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
707 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
708 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
709 *sp = db[0] >> 4; // (db[1] + db[0]) >> 4; // notch
711 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
712 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
713 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
715 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
716 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
717 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
720 static inline void recalc_filter_LPF12_3(FilterCoefficients *fc)
724 /* Chamberlin2's lowpass filter. */
725 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
726 fc->last_freq = fc->freq;
727 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
728 fc->sample_filter = sample_filter_LPF12_3;
729 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
730 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
731 fc->sample_filter = sample_filter_LPF12_3_ov2;
732 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
733 }else{ // <sr*3*0.21875
734 fc->sample_filter = sample_filter_LPF12_3_ov3;
735 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
737 fc->last_reso_dB = fc->reso_dB;
738 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
742 static inline void sample_filter_HPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
744 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
745 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
746 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
747 *sp = db[1] >> 4; // (db[1] + db[0]) >> 4; // notch
750 static inline void sample_filter_HPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
752 FILTER_T input = *sp << 4;
754 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
755 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
756 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
757 *sp = db[1] >> 4; // (db[1] + db[0]) >> 4; // notch
759 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
760 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
761 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
764 static inline void sample_filter_HPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
766 FILTER_T input = *sp << 4;
768 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
769 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
770 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
771 *sp = db[1] >> 4; // (db[1] + db[0]) >> 4; // notch
773 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
774 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
775 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
777 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
778 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
779 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
782 static inline void recalc_filter_HPF12_3(FilterCoefficients *fc)
786 /* Chamberlin2's lowpass filter. */
787 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
788 fc->last_freq = fc->freq;
789 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
790 fc->sample_filter = sample_filter_HPF12_3;
791 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
792 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
793 fc->sample_filter = sample_filter_HPF12_3_ov2;
794 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
795 }else{ // <sr*3*0.21875
796 fc->sample_filter = sample_filter_HPF12_3_ov3;
797 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
799 fc->last_reso_dB = fc->reso_dB;
800 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
804 static inline void sample_filter_BPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
806 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
807 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
808 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
809 *sp = db[2] >> 4; // (db[1] + db[0]) >> 4; // notch
812 static inline void sample_filter_BPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
814 FILTER_T input = *sp << 4;
816 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
817 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
818 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
819 *sp = db[2] >> 4; // (db[1] + db[0]) >> 4; // notch
821 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
822 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
823 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
826 static inline void sample_filter_BPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
828 FILTER_T input = *sp << 4;
830 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
831 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
832 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
833 *sp = db[2] >> 4; // (db[1] + db[0]) >> 4; // notch
835 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
836 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
837 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
839 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
840 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
841 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
844 static inline void recalc_filter_BPF12_3(FilterCoefficients *fc)
848 /* Chamberlin2's lowpass filter. */
849 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
850 fc->last_freq = fc->freq;
851 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
852 fc->sample_filter = sample_filter_BPF12_3;
853 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
854 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
855 fc->sample_filter = sample_filter_BPF12_3_ov2;
856 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
857 }else{ // <sr*3*0.21875
858 fc->sample_filter = sample_filter_BPF12_3_ov3;
859 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
861 fc->last_reso_dB = fc->reso_dB;
862 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
866 static inline void sample_filter_BCF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
868 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
869 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
870 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
871 *sp = (db[1] + db[0]) >> 4; // notch
874 static inline void sample_filter_BCF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
876 FILTER_T input = *sp << 4;
878 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
879 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
880 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
881 *sp = (db[1] + db[0]) >> 4; // notch
883 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
884 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
885 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
888 static inline void sample_filter_BCF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
890 FILTER_T input = *sp << 4;
892 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
893 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
894 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
895 *sp = (db[1] + db[0]) >> 4; // notch
897 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
898 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
899 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
901 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
902 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
903 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
906 static inline void recalc_filter_BCF12_3(FilterCoefficients *fc)
910 /* Chamberlin2's lowpass filter. */
911 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
912 fc->last_freq = fc->freq;
913 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
914 fc->sample_filter = sample_filter_BCF12_3;
915 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
916 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
917 fc->sample_filter = sample_filter_BCF12_3_ov2;
918 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
919 }else{ // <sr*3*0.21875
920 fc->sample_filter = sample_filter_BCF12_3_ov3;
921 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
923 fc->last_reso_dB = fc->reso_dB;
924 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
928 static inline void sample_filter_HPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
930 *sp -= (db[1] = imuldiv28(dc[0], *sp << 4) + imuldiv28(dc[1], db[1])) >> 4;
933 static inline void sample_filter_HPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
935 db[1] += imuldiv28(((*sp << 4) - db[0]), dc[1]);
937 db[1] = imuldiv28(db[1], dc[0]);
938 *sp -= db[0] >> 4; /* 4.28 to 8.24 */
944 static inline void sample_filter_HBF_L6L12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid filter: resonant 12dB LPF section (dc[0..1]/db[0..1]) and one-pole
// 6dB LPF (dc[10..11]/db[11]). The output is a crossfade of out1/out2 by
// dc[16], which is smoothed toward the target dc[15] each sample
// (3/4 old + 1/4 target) to avoid zipper noise.
// NOTE(review): out1/out2 are assigned on lines not visible in this view.
946 FILTER_T input = *sp << 4, out1, out2;
947 const FILTER_T var1 = TIM_FSCALE(0.75, 28);
948 const FILTER_T var2 = TIM_FSCALE(0.25, 28);
949 const FILTER_T var3 = TIM_FSCALE(1.0, 28);
951 db[1] += imuldiv28((input - db[0]), dc[1]);
953 db[1] = imuldiv28(db[1], dc[0]);
956 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
959 dc[16] = imuldiv28(dc[16], var1) + imuldiv28(dc[15], var2);
960 *sp = imuldiv28(out1, dc[16]) + imuldiv28(out2, var3 - dc[16]);
963 static inline void recalc_filter_HBF_L6L12(FilterCoefficients *fc)
// Coefficients for HBF_L6L12: resonant LPF12_2-style pair in dc[0..1]
// (same q/p formulation as recalc_filter_LPF12_2), one-pole LPF in
// dc[10..11] = (1-e^-w, e^-w), and crossfade target dc[15] from resonance.
965 FILTER_T *dc = fc->dc;
966 FLOAT_T f, r, p, q, t;
968 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
969 fc->last_freq = fc->freq;
970 fc->last_reso_dB = fc->reso_dB;
972 f = M_PI2 * fc->freq * fc->div_flt_rate;
973 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
975 dc[0] = TIM_FSCALE(p, 28);
976 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
978 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
979 dc[10] = TIM_FSCALE(1.0 - f, 28);
980 dc[11] = TIM_FSCALE(f, 28);
982 dc[15] = TIM_FSCALE(1.0 - RESO_DB_CF_M(fc->reso_dB), 28);
986 static inline void sample_filter_HBF_L12L6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid: resonant 12dB LPF (db[0..1]) plus one-pole 6dB LPF (db[11]);
// output = out1 + out2/2, back-converted from 4.28 to 8.24.
988 FILTER_T input = *sp << 4, out1, out2;
989 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
991 db[1] += imuldiv28((input - db[0]), dc[1]);
993 db[1] = imuldiv28(db[1], dc[0]);
996 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
999 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1002 static inline void recalc_filter_HBF_L12L6(FilterCoefficients *fc)
// Coefficients for HBF_L12L6: resonant 12dB pair in dc[0..1] and
// one-pole 6dB pair in dc[10..11]; recomputed only on freq/reso change.
1004 FILTER_T *dc = fc->dc;
1007 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1008 fc->last_freq = fc->freq;
1009 fc->last_reso_dB = fc->reso_dB;
1011 f = M_PI2 * fc->freq * fc->div_flt_rate;
1012 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1014 dc[0] = TIM_FSCALE(p, 28);
1015 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1017 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
1018 dc[10] = TIM_FSCALE(1.0 - f, 28);
1019 dc[11] = TIM_FSCALE(f, 28);
1023 static inline void sample_filter_HBF_L12H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid: resonant 12dB LPF (db[0..1]) plus 6dB HPF — the high-pass is the
// input minus its one-pole low-pass (db[11]); output = out1 + out2/2.
1025 FILTER_T input = *sp << 4, out1, out2;
1026 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
1028 db[1] += imuldiv28((input - db[0]), dc[1]);
1030 db[1] = imuldiv28(db[1], dc[0]);
1033 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
1034 out2 = input - db[11];
1036 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1039 static inline void recalc_filter_HBF_L12H6(FilterCoefficients *fc)
// Coefficients for HBF_L12H6: identical layout to recalc_filter_HBF_L12L6
// (dc[0..1] resonant 12dB, dc[10..11] one-pole); the HPF is derived in the
// sample filter by input - lowpass, so no extra coefficients are needed.
1041 FILTER_T *dc = fc->dc;
1044 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1045 fc->last_freq = fc->freq;
1046 fc->last_reso_dB = fc->reso_dB;
1048 f = M_PI2 * fc->freq * fc->div_flt_rate;
1049 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1051 dc[0] = TIM_FSCALE(p, 28);
1052 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1054 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
1055 dc[10] = TIM_FSCALE(1.0 - f, 28);
1056 dc[11] = TIM_FSCALE(f, 28);
1060 static inline void sample_filter_HBF_L24H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid: 24dB LPF built from two chained biquads in transposed form
// (states db[1..5], coefficients dc[0..4]) plus a 6dB HPF
// (input - one-pole lowpass db[11]); output = out1 + out2/2.
1062 FILTER_T input = *sp << 4, out1, out2;
1063 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
1066 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1067 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1068 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1070 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1071 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1072 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1075 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
1076 out2 = input - db[11];
1078 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1081 static inline void recalc_filter_HBF_L24H6(FilterCoefficients *fc)
// Coefficients for HBF_L24H6: bilinear-transform biquad (f = tan(pi*fc/sr))
// with resonance q in dc[0..4], plus one-pole pair dc[10..11] for the HPF.
1083 FILTER_T *dc = fc->dc;
1084 FLOAT_T f, r, q ,p, s;
1086 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1087 fc->last_freq = fc->freq;
1088 fc->last_reso_dB = fc->reso_dB;
1090 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1091 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1093 p = 1 + (q * f) + r;
1095 dc[0] = TIM_FSCALE(s, 28);
1096 dc[1] = TIM_FSCALE(s * 2, 28);
1097 dc[2] = TIM_FSCALE(r / p, 28);
1098 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1099 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1101 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
1102 dc[10] = TIM_FSCALE(1.0 - f, 28);
1103 dc[11] = TIM_FSCALE(f, 28);
1107 static inline void sample_filter_HBF_L24H12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid: 24dB LPF (two biquads, db[1..5]/dc[0..4]) plus a resonant
// 12dB HPF (input minus the db[10..11]/dc[10..11] lowpass section);
// output = out1 + out2/2, back to 8.24.
1109 FILTER_T input = *sp << 4, out1, out2;
1110 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
1113 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1114 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1115 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1117 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1118 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1119 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1122 db[11] += imuldiv28((input - db[10]), dc[11]);
1124 db[11] = imuldiv28(db[11], dc[10]);
1125 out2 = input - db[10];
1127 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1130 static inline void recalc_filter_HBF_L24H12(FilterCoefficients *fc)
// Coefficients for HBF_L24H12: bilinear biquad set in dc[0..4] for the 24dB
// LPF, and the Niemitalo-style 12dB resonant pair in dc[10..11] for the HPF.
1132 FILTER_T *dc = fc->dc;
1133 FLOAT_T f, r, q ,p, s;
1135 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1136 fc->last_freq = fc->freq;
1137 fc->last_reso_dB = fc->reso_dB;
1139 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1140 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1142 p = 1 + (q * f) + r;
1144 dc[0] = TIM_FSCALE(s, 28);
1145 dc[1] = TIM_FSCALE(s * 2, 28);
1146 dc[2] = TIM_FSCALE(r / p, 28);
1147 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1148 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1150 f = M_PI2 * fc->freq * fc->div_flt_rate;
1151 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1153 dc[10] = TIM_FSCALE(p, 28);
1154 dc[11] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1158 static inline void sample_filter_HBF_L12OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid: two resonant 12dB LPF sections (db[0..1] and db[10..11]), the
// second tuned an octave apart by recalc; output mix = 2/3*out1 + 1/3*out2.
1160 FILTER_T input = *sp << 4, out1, out2;
1161 const FILTER_T var1 = TIM_FSCALE(DIV_3_2, 28);
1162 const FILTER_T var2 = TIM_FSCALE(DIV_3, 28);
1164 db[1] += imuldiv28((input - db[0]), dc[1]);
1166 db[1] = imuldiv28(db[1], dc[0]);
1169 db[11] += imuldiv28((input - db[10]), dc[11]);
1171 db[11] = imuldiv28(db[11], dc[10]);
1174 *sp = (imuldiv28(out1, var1) + imuldiv28(out2, var2)) >> 4; /* 4.28 to 8.24 */
1177 static inline void recalc_filter_HBF_L12OCT(FilterCoefficients *fc)
// Coefficients for HBF_L12OCT: two Niemitalo 12dB resonant pairs,
// dc[0..1] at the nominal cutoff and dc[10..11] at a shifted cutoff
// (f recomputed from 2*freq/sr before the second pair).
1179 FILTER_T *dc = fc->dc;
1182 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1183 fc->last_freq = fc->freq;
1184 fc->last_reso_dB = fc->reso_dB;
1186 f = M_PI2 * fc->freq * fc->div_flt_rate;
1187 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1189 dc[0] = TIM_FSCALE(p, 28);
1190 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1192 f = 2.0 * fc->freq * fc->div_flt_rate;
1196 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1198 dc[10] = TIM_FSCALE(p, 28);
1199 dc[11] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1203 static inline void sample_filter_HBF_L24OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Hybrid: two independent 24dB LPF stages (each a pair of chained biquads),
// stage 1 on dc[0..4]/db[0..5] and stage 2 on dc[10..14]/db[10..15];
// output mix = 2/3*out1 + 1/3*out2, back-converted 4.28 -> 8.24.
1205 FILTER_T input = *sp << 4, out1, out2;
1206 const FILTER_T var1 = TIM_FSCALE(DIV_3_2, 28);
1207 const FILTER_T var2 = TIM_FSCALE(DIV_3, 28);
1210 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1211 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1212 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1214 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1215 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1216 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1220 db[15] = imuldiv28(db[10], dc[10]) + db[11];
1221 db[11] = imuldiv28(db[10], dc[11]) + imuldiv28(db[15], dc[13]) + db[12];
1222 db[12] = imuldiv28(db[10], dc[12]) + imuldiv28(db[15], dc[14]);
// BUG FIX: the second biquad of stage 2 must chain through its own state
// db[13] (mirroring "+ db[3]" in stage 1 above and "+ db[13]" in
// sample_filter_LPF24_2x2); it previously read stage 1's db[3].
1224 db[15] = imuldiv28(db[10], dc[10]) + db[13];
1225 db[13] = imuldiv28(db[10], dc[11]) + imuldiv28(db[15], dc[13]) + db[14];
1226 db[14] = imuldiv28(db[10], dc[12]) + imuldiv28(db[15], dc[14]);
1229 *sp = (imuldiv28(out1, var1) + imuldiv28(out2, var2)) >> 4; /* 4.28 to 8.24 */
1232 static inline void recalc_filter_HBF_L24OCT(FilterCoefficients *fc)
// Coefficients for HBF_L24OCT: two bilinear 24dB biquad sets, dc[0..4] at
// the nominal cutoff and dc[10..14] at a shifted cutoff (f recomputed from
// 2*freq/sr before the second tan()).
1234 FILTER_T *dc = fc->dc;
1235 FLOAT_T f, r, q ,p, s;
1237 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1238 fc->last_freq = fc->freq;
1239 fc->last_reso_dB = fc->reso_dB;
1241 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1242 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1244 p = 1 + (q * f) + r;
1246 dc[0] = TIM_FSCALE(s, 28);
1247 dc[1] = TIM_FSCALE(s * 2, 28);
1248 dc[2] = TIM_FSCALE(r / p, 28);
1249 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1250 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1252 f = 2.0 * fc->freq * fc->div_flt_rate;
1255 f = tan(M_PI * f); // cutoff freq rate/2
1256 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1258 p = 1 + (q * f) + r;
1260 dc[10] = TIM_FSCALE(s, 28);
1261 dc[11] = TIM_FSCALE(s * 2, 28);
1262 dc[12] = TIM_FSCALE(r / p, 28);
1263 dc[13] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1264 dc[14] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1270 static inline void sample_filter_LPF_BWx2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Butterworth LPF, two identical direct-form-II-like sections in series
// (same dc[0..4] coefficients); db[0..4] and db[5..8] hold per-section
// in/out history. Output from the second section, 4.28 -> 8.24.
1275 db[ 2] = imuldiv28(db[ 0], dc[0])
1276 + imuldiv28(db[ 1], dc[1])
1277 + imuldiv28(db[ 2], dc[2])
1278 - imuldiv28(db[ 3], dc[3])
1279 - imuldiv28(db[ 4], dc[4]);
1281 db[ 3] = db[ 2]; // flt out
1283 db[ 1] = db[ 0]; // flt in
1285 db[ 6] = imuldiv28(db[ 3], dc[0])
1286 + imuldiv28(db[ 5], dc[1])
1287 + imuldiv28(db[ 6], dc[2])
1288 - imuldiv28(db[ 7], dc[3])
1289 - imuldiv28(db[ 8], dc[4]);
1291 db[ 7] = db[ 6]; // flt out
1293 db[ 5] = db[ 3]; // flt in
1295 *sp = db[ 7] >> 4; /* 4.28 to 8.24 */
1298 static inline void sample_filter_LPF_BWx3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Butterworth LPF, three identical sections in series (see LPF_BWx2);
// history in db[0..4], db[5..8], db[9..12]. Output from the third section.
1303 db[ 2] = imuldiv28(db[ 0], dc[0])
1304 + imuldiv28(db[ 1], dc[1])
1305 + imuldiv28(db[ 2], dc[2])
1306 - imuldiv28(db[ 3], dc[3])
1307 - imuldiv28(db[ 4], dc[4]);
1309 db[ 3] = db[ 2]; // flt out
1311 db[ 1] = db[ 0]; // flt in
1313 db[ 6] = imuldiv28(db[ 3], dc[0])
1314 + imuldiv28(db[ 5], dc[1])
1315 + imuldiv28(db[ 6], dc[2])
1316 - imuldiv28(db[ 7], dc[3])
1317 - imuldiv28(db[ 8], dc[4]);
1319 db[ 7] = db[ 6]; // flt out
1321 db[ 5] = db[ 3]; // flt in
1323 db[10] = imuldiv28(db[ 7], dc[0])
1324 + imuldiv28(db[ 9], dc[1])
1325 + imuldiv28(db[10], dc[2])
1326 - imuldiv28(db[11], dc[3])
1327 - imuldiv28(db[12], dc[4]);
1329 db[11] = db[10]; // flt out
1331 db[ 9] = db[ 7]; // flt in
1333 *sp = db[11] >> 4; /* 4.28 to 8.24 */
1336 static inline void sample_filter_LPF_BWx4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Butterworth LPF, four identical sections in series (see LPF_BWx2);
// history in db[0..4], db[5..8], db[9..12], db[13..16].
1341 db[ 2] = imuldiv28(db[ 0], dc[0])
1342 + imuldiv28(db[ 1], dc[1])
1343 + imuldiv28(db[ 2], dc[2])
1344 - imuldiv28(db[ 3], dc[3])
1345 - imuldiv28(db[ 4], dc[4]);
1347 db[ 3] = db[ 2]; // flt out
1349 db[ 1] = db[ 0]; // flt in
1351 db[ 6] = imuldiv28(db[ 3], dc[0])
1352 + imuldiv28(db[ 5], dc[1])
1353 + imuldiv28(db[ 6], dc[2])
1354 - imuldiv28(db[ 7], dc[3])
1355 - imuldiv28(db[ 8], dc[4]);
1357 db[ 7] = db[ 6]; // flt out
1359 db[ 5] = db[ 3]; // flt in
1361 db[10] = imuldiv28(db[ 7], dc[0])
1362 + imuldiv28(db[ 9], dc[1])
1363 + imuldiv28(db[10], dc[2])
1364 - imuldiv28(db[11], dc[3])
1365 - imuldiv28(db[12], dc[4]);
1367 db[11] = db[10]; // flt out
1369 db[ 9] = db[ 7]; // flt in
1371 db[14] = imuldiv28(db[11], dc[0])
1372 + imuldiv28(db[13], dc[1])
1373 + imuldiv28(db[14], dc[2])
1374 - imuldiv28(db[15], dc[3])
1375 - imuldiv28(db[16], dc[4]);
1377 db[15] = db[14]; // flt out
1379 db[13] = db[11]; // flt in
1381 *sp = db[15] >> 4; /* 4.28 to 8.24 */
1384 static inline void recalc_filter_LPF24_2x2(FilterCoefficients *fc)
// Coefficients for the doubled LPF24_2 (two 24dB stages sharing dc[0..4]):
// bilinear transform, f = tan(pi*fc/sr), resonance term q.
1386 FLOAT_T f, q, p, r, tmp;
1389 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1390 fc->last_freq = fc->freq;
1391 fc->last_reso_dB = fc->reso_dB;
1392 //f = 1.0 / tan(M_PI * fc->freq * fc->div_flt_rate);
1393 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1394 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
1396 //p = 1 + ((2.0) * f) + r;
1397 p = 1 + (q * f) + r;
1398 dc[0] = TIM_FSCALE(tmp = r / p, 28);
1399 dc[1] = TIM_FSCALE(tmp * 2, 28);
1400 dc[2] = TIM_FSCALE(r / p, 28);
1401 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1402 //dc[4] = TIM_FSCALE((1 - ((2.0) * f) + r) / (-p), 28);
1403 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1407 static inline void sample_filter_LPF24_2x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Two LPF24_2 stages in series: stage 1 in db[0..5], its output db[5] fed
// into stage 2 via db[10]; stage 2 state in db[10..15].
// NOTE(review): the output is taken from db[10] (stage 2's input node),
// not db[15] — confirm against upstream whether an intervening
// "db[10] = db[15]" assignment was intended here.
1411 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1412 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1413 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1414 db[10] = db[0] = db[5];
1415 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1416 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1417 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1419 db[15] = imuldiv28(db[10], dc[0]) + db[11];
1420 db[11] = imuldiv28(db[10], dc[1]) + imuldiv28(db[15], dc[3]) + db[12];
1421 db[12] = imuldiv28(db[10], dc[2]) + imuldiv28(db[15], dc[4]);
1423 db[15] = imuldiv28(db[10], dc[0]) + db[13];
1424 db[13] = imuldiv28(db[10], dc[1]) + imuldiv28(db[15], dc[3]) + db[14];
1425 db[14] = imuldiv28(db[10], dc[2]) + imuldiv28(db[15], dc[4]);
1426 *sp = db[10] >> 4; /* 4.28 to 8.24 */
1429 static inline void sample_filter_LPF6x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Two cascaded 6dB one-pole LPF stages (12dB total).
// NOTE(review): every stage reads db[0]/db[1] with shifted dc indices
// rather than the previous stage's output — verify the dc[] packing scheme
// against the matching recalc before changing anything here.
1432 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1433 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1434 *sp = db[2] >> 4; /* 4.28 to 8.24 */
1437 static inline void sample_filter_LPF6x3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Three cascaded 6dB one-pole LPF stages (18dB total); see the indexing
// note on sample_filter_LPF6x2.
1440 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1441 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1442 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1443 *sp = db[3] >> 4; /* 4.28 to 8.24 */
1446 static inline void sample_filter_LPF6x4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Four cascaded 6dB one-pole LPF stages (24dB total); see the indexing
// note on sample_filter_LPF6x2.
1449 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1450 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1451 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1452 db[4] = imuldiv28(db[0], dc[3]) + imuldiv28(db[1], dc[4]); // 24db
1453 *sp = db[4] >> 4; /* 4.28 to 8.24 */
1456 static inline void sample_filter_LPF6x8(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Eight cascaded 6dB one-pole LPF stages (48dB total); see the indexing
// note on sample_filter_LPF6x2.
1459 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1460 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1461 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1462 db[4] = imuldiv28(db[0], dc[3]) + imuldiv28(db[1], dc[4]); // 24db
1463 db[5] = imuldiv28(db[0], dc[4]) + imuldiv28(db[1], dc[5]);
1464 db[6] = imuldiv28(db[0], dc[5]) + imuldiv28(db[1], dc[6]); // 36db
1465 db[7] = imuldiv28(db[0], dc[6]) + imuldiv28(db[1], dc[7]);
1466 db[8] = imuldiv28(db[0], dc[7]) + imuldiv28(db[1], dc[8]); // 48db
1467 *sp = db[8] >> 4; /* 4.28 to 8.24 */
1470 static inline void sample_filter_LPF6x16(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Sixteen cascaded 6dB one-pole LPF stages (96dB total); see the indexing
// note on sample_filter_LPF6x2.
1473 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1474 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1475 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1476 db[4] = imuldiv28(db[0], dc[3]) + imuldiv28(db[1], dc[4]); // 24db
1477 db[5] = imuldiv28(db[0], dc[4]) + imuldiv28(db[1], dc[5]);
1478 db[6] = imuldiv28(db[0], dc[5]) + imuldiv28(db[1], dc[6]); // 36db
1479 db[7] = imuldiv28(db[0], dc[6]) + imuldiv28(db[1], dc[7]);
1480 db[8] = imuldiv28(db[0], dc[7]) + imuldiv28(db[1], dc[8]); // 48db
1481 db[9] = imuldiv28(db[0], dc[8]) + imuldiv28(db[1], dc[9]);
1482 db[10] = imuldiv28(db[0], dc[9]) + imuldiv28(db[1], dc[10]); // 60db
1483 db[11] = imuldiv28(db[0], dc[10]) + imuldiv28(db[1], dc[11]);
1484 db[12] = imuldiv28(db[0], dc[11]) + imuldiv28(db[1], dc[12]); // 72db
1485 db[13] = imuldiv28(db[0], dc[12]) + imuldiv28(db[1], dc[13]);
1486 db[14] = imuldiv28(db[0], dc[13]) + imuldiv28(db[1], dc[14]); // 84db
1487 db[15] = imuldiv28(db[0], dc[14]) + imuldiv28(db[1], dc[15]);
1488 db[16] = imuldiv28(db[0], dc[15]) + imuldiv28(db[1], dc[16]); // 96db
1489 *sp = db[16] >> 4; /* 4.28 to 8.24 */
1493 static inline void sample_filter_LPF_FIR(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// FIR low-pass: convolve the delay line db[] with coefficients dc[]
// (LPF_FIR_ORDER taps, 8.24 multiply), then shift the delay line.
// NOTE: the shift-loop body and output store are outside this view.
1497 for (i = 0; i < LPF_FIR_ORDER ;i++)
1498 sum += imuldiv24(db[i], dc[i]);
1499 for (i = LPF_FIR_ORDER - 2; i >= 0; i--)
1505 static void designfir(FLOAT_T *g , FLOAT_T fc, FLOAT_T att);
1507 static inline void recalc_filter_LPF_FIR(FilterCoefficients *fc)
// Design a symmetric FIR low-pass via designfir() (40 dB attenuation) at the
// normalized cutoff 2*freq/sr, then mirror the half-kernel into dc[] as
// 8.24 fixed point (coefficients are symmetric: dc[i] == dc[ORDER-1-i]).
1509 FILTER_T *dc = fc->dc;
1510 FLOAT_T fir_coef[LPF_FIR_ORDER2];
1514 if(FLT_FREQ_MARGIN){
1517 f = fc->freq * fc->div_flt_rate * 2.0;
1518 designfir(fir_coef, f, 40.0);
1519 for (i = 0; i < LPF_FIR_ORDER2; i++)
1520 dc[LPF_FIR_ORDER-1 - i] = dc[i] = TIM_FSCALE(fir_coef[LPF_FIR_ORDER2 - 1 - i], 24);
1524 // shelving (common)
1525 static inline void sample_filter_shelving(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Shelving EQ biquad: direct-form difference equation over the db[] history
// (dc[0..4] = b0,b1,b2,a1,a2 premultiplied by 1/a0), then a post-gain dc[6]
// that compensates the boost ("spgain").
1528 db[2] = imuldiv28(db[0], dc[0])
1529 + imuldiv28(db[1], dc[1])
1530 + imuldiv28(db[2], dc[2])
1531 + imuldiv28(db[3], dc[3])
1532 + imuldiv28(db[4], dc[4]);
1537 *sp = imuldiv28(db[3], dc[6]); /* 4.28 to 8.24 */ // spgain
1540 static inline void recalc_filter_shelving_low(FilterCoefficients *fc)
// Low-shelf biquad coefficients (Audio EQ Cookbook low-shelf form):
// A from gain in dB, beta = sqrt(A)/Q; dc[0..4] normalized by a0, dc[6] is
// the compensating post-gain. Recomputed when freq/gain/Q change.
1542 FILTER_T *dc = fc->dc;
1543 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
1545 if(fc->freq != fc->last_freq || fc->reso_dB != fc->last_reso_dB || fc->q != fc->last_q){
1546 fc->last_freq = fc->freq;
1547 fc->last_reso_dB = fc->reso_dB;
1549 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain);
1550 dc[6] = TIM_FSCALE(pow(10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce), 28); // spgain
1551 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
1554 beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
1555 a0 = 1.0 / ((A + 1) + (A - 1) * cs + beta * sn);
1556 a1 = 2.0 * ((A - 1) + (A + 1) * cs);
1557 a2 = -((A + 1) + (A - 1) * cs - beta * sn);
1558 b0 = A * ((A + 1) - (A - 1) * cs + beta * sn);
1559 b1 = 2.0 * A * ((A - 1) - (A + 1) * cs);
1560 b2 = A * ((A + 1) - (A - 1) * cs - beta * sn);
1561 dc[4] = TIM_FSCALE(a2* a0, 28);
1562 dc[3] = TIM_FSCALE(a1* a0, 28);
1563 dc[2] = TIM_FSCALE(b2* a0, 28);
1564 dc[1] = TIM_FSCALE(b1* a0, 28);
1565 dc[0] = TIM_FSCALE(b0* a0, 28);
1569 static inline void recalc_filter_shelving_hi(FilterCoefficients *fc)
// High-shelf biquad coefficients — mirror image of recalc_filter_shelving_low
// (signs of the cs terms flipped per the Audio EQ Cookbook high-shelf form).
1571 FILTER_T *dc = fc->dc;
1572 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
1574 if(fc->freq != fc->last_freq || fc->reso_dB != fc->last_reso_dB || fc->q != fc->last_q){
1575 fc->last_freq = fc->freq;
1576 fc->last_reso_dB = fc->reso_dB;
1578 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain);
1579 dc[6] = TIM_FSCALE(pow(10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce), 28); // spgain
1580 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
1583 beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
1584 a0 = 1.0 / ((A + 1) - (A - 1) * cs + beta * sn);
1585 a1 = (-2.0 * ((A - 1) - (A + 1) * cs));
1586 a2 = -((A + 1) - (A - 1) * cs - beta * sn);
1587 b0 = A * ((A + 1) + (A - 1) * cs + beta * sn);
1588 b1 = -2.0 * A * ((A - 1) + (A + 1) * cs);
1589 b2 = A * ((A + 1) + (A - 1) * cs - beta * sn);
1590 dc[4] = TIM_FSCALE(a2* a0, 28);
1591 dc[3] = TIM_FSCALE(a1* a0, 28);
1592 dc[2] = TIM_FSCALE(b2* a0, 28);
1593 dc[1] = TIM_FSCALE(b1* a0, 28);
1594 dc[0] = TIM_FSCALE(b0* a0, 28);
1599 static inline void sample_filter_peaking(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Peaking EQ biquad over the db[] history (note the a1/a2 terms are
// subtracted here, unlike sample_filter_shelving), with post-gain dc[6].
1602 db[2] = imuldiv28(db[0], dc[0])
1603 + imuldiv28(db[1], dc[1])
1604 + imuldiv28(db[2], dc[2])
1605 - imuldiv28(db[3], dc[3])
1606 - imuldiv28(db[4], dc[4]);
1611 *sp = imuldiv28(db[3], dc[6]); // spgain
1614 static inline void recalc_filter_peaking(FilterCoefficients *fc)
// Peaking EQ biquad coefficients (Audio EQ Cookbook): A from dB gain,
// beta = sin(w)/(2Q). For peaking EQ b1 == a1, hence dc[1] reuses a1.
// dc[6] is the compensating post-gain.
1616 FILTER_T *dc = fc->dc;
1617 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
1619 if(fc->freq != fc->last_freq || fc->reso_dB != fc->last_reso_dB || fc->q != fc->last_q){
1620 fc->last_freq = fc->freq;
1621 fc->last_reso_dB = fc->reso_dB;
1623 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_peaking_gain);
1624 dc[6] = TIM_FSCALE(pow(10.0, -(fc->reso_dB) * DIV_80 * ext_filter_peaking_reduce), 28); // spgain
1625 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
1628 beta = sn / (2.0 * fc->q * ext_filter_peaking_q); // q > 0
1629 a0 = 1.0 / (1.0 + beta / A);
1631 a2 = 1.0 - beta / A;
1632 b0 = 1.0 + beta * A;
1633 b2 = 1.0 - beta * A;
1638 dc[4] = TIM_FSCALE(a2, 28);
1639 dc[3] = TIM_FSCALE(a1, 28);
1640 dc[2] = TIM_FSCALE(b2, 28);
1641 dc[1] = TIM_FSCALE(a1, 28); // b1 = a1
1642 dc[0] = TIM_FSCALE(b0, 28);
1647 static inline void sample_filter_biquad(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Generic biquad (direct form) with Q28 coefficients; output r is scaled
// back 4.28 -> 8.24.
// NOTE(review): `input` (= *sp << 4) is not used in the visible lines —
// the dc[2] term multiplies raw *sp; confirm the scaling against upstream
// (history updates between these lines are not visible here).
1650 DATA_T input = *sp << 4, r;
1652 r = imuldiv28(db[1], dc[1])
1653 + imuldiv28(*sp + db[2], dc[2])
1654 - imuldiv28(db[3], dc[3])
1655 - imuldiv28(db[4], dc[4]); // -dc3 -dc4
1661 *sp = r >> 4; /* 4.28 to 8.24 */
1664 static inline void recalc_filter_biquad_low(FilterCoefficients *fc)
// Biquad low-pass coefficients (Audio EQ Cookbook LPF): alpha = sin(w)/(2Q);
// dc[0..4] hold the normalized b/a terms (commented block below shows the
// symbolic mapping). Recomputed when freq or Q change.
1666 FILTER_T *dc = fc->dc;
1667 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, alpha;
1669 if(fc->freq != fc->last_freq || fc->q != fc->last_q){
1670 fc->last_freq = fc->freq;
1671 fc->last_reso_dB = fc->reso_dB;
1673 omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
1676 alpha = sn / (2.0 * fc->q); // q > 0
1677 a0 = 1.0 / (1.0 + alpha);
1678 dc[1] = TIM_FSCALE((1.0 - cs) * a0, 28);
1679 dc[2] = dc[0] = TIM_FSCALE(((1.0 - cs) * DIV_2) * a0, 28);
1680 dc[3] = TIM_FSCALE((-2.0 * cs) * a0, 28);
1681 dc[4] = TIM_FSCALE((1.0 - alpha) * a0, 28);
1682 //b2 = ((1.0 - cs) * DIV_2) * a0;
1683 //b1 = (1.0 - cs) * a0;
1684 //a1 = (-2.0 * cs) * a0;
1685 //a2 = (1.0 - alpha) * a0;
1686 //dc[0] = TIM_FSCALE(b2, 28);
1687 //dc[1] = TIM_FSCALE(b1, 28);
1688 //dc[2] = TIM_FSCALE(a1, 28);
1689 //dc[3] = TIM_FSCALE(a2, 28);
1693 static inline void recalc_filter_biquad_hi(FilterCoefficients *fc)
// Biquad high-pass coefficients (Audio EQ Cookbook HPF) — same structure as
// recalc_filter_biquad_low with the (1 ± cs) numerator terms swapped.
1695 FILTER_T *dc = fc->dc;
1696 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, alpha;
1698 if(fc->freq != fc->last_freq || fc->q != fc->last_q){
1699 fc->last_freq = fc->freq;
1700 fc->last_reso_dB = fc->reso_dB;
1702 omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
1705 alpha = sn / (2.0 * fc->q); // q > 0
1706 a0 = 1.0 / (1.0 + alpha);
1707 dc[1] = TIM_FSCALE((-(1.0 + cs)) * a0, 28);
1708 dc[2] = dc[0] = TIM_FSCALE(((1.0 + cs) * DIV_2) * a0, 28);
1709 dc[3] = TIM_FSCALE((-2.0 * cs) * a0, 28);
1710 dc[4] = TIM_FSCALE((1.0 - alpha) * a0, 28);
1711 //b2 = ((1.0 + cs) * DIV_2) * a0;
1712 //b1 = (-(1.0 + cs)) * a0;
1713 //a1 = (-2.0 * cs) * a0;
1714 //a2 = (1.0 - alpha) * a0;
1715 //dc[0] = TIM_FSCALE(b2, 28);
1716 //dc[1] = TIM_FSCALE(b1, 28);
1717 //dc[2] = TIM_FSCALE(a1, 28);
1718 //dc[3] = TIM_FSCALE(a2, 28);
1722 #else /* floating-point implementation */
1724 #ifdef USE_PENTIUM_4
// Pentium 4 denormal workaround: a tiny constant (2^-64) added to filter
// feedback state so float/double values never fall into the slow denormal
// range; scalar and SIMD (SSE/AVX) variants of the constant.
1725 #define DENORMAL_FIX 1 // for pentium 4 float/double denormal fix
1726 #define DENORMAL_ADD (5.4210108624275221703311375920553e-20) // 1.0/(1<<64)
1727 const FLOAT_T denormal_add = DENORMAL_ADD; // 1.0/(1<<64)
1729 #if (USE_X86_EXT_INTRIN >= 3) && defined(FLOAT_T_DOUBLE)
1730 const __m128d vec_denormal_add = {DENORMAL_ADD, DENORMAL_ADD};
1731 #elif (USE_X86_EXT_INTRIN >= 2) && defined(FLOAT_T_FLOAT)
1732 const __m128 vec_denormal_add = {DENORMAL_ADD, DENORMAL_ADD, DENORMAL_ADD, DENORMAL_ADD};
1733 #endif // USE_X86_EXT_INTRIN
1735 #endif // USE_PENTIUM_4
1738 static inline void sample_filter_none(FILTER_T *dc, FILTER_T *db, DATA_T *sp){}
// No-op placeholders used when filtering is disabled.
1740 static inline void recalc_filter_none(FilterCoefficients *fc){}
1742 static inline void sample_filter_LPF12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Chamberlin state-variable 12dB low-pass (floating point):
// db[0]=low, db[1]=high, db[2]=band; dc[0]=frequency, dc[1]=damping.
1744 db[0] = db[0] + db[2] * dc[0];
1745 db[1] = *sp - db[0] - db[2] * dc[1];
1746 db[2] = db[1] * dc[0] + db[2];
1750 static inline void sample_filter_LPF12_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Chamberlin 12dB low-pass, 2x oversampled: the SVF update runs twice per
// input sample (coefficients computed for 2x rate in recalc_filter_LPF12).
1752 FILTER_T input = *sp;
1754 db[0] = db[0] + db[2] * dc[0];
1755 db[1] = input - db[0] - db[2] * dc[1];
1756 db[2] = db[1] * dc[0] + db[2];
1759 db[0] = db[0] + db[2] * dc[0];
1760 db[1] = input - db[0] - db[2] * dc[1];
1761 db[2] = db[1] * dc[0] + db[2];
1764 static inline void sample_filter_LPF12_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Chamberlin 12dB low-pass, 3x oversampled (see sample_filter_LPF12).
1766 FILTER_T input = *sp;
1768 db[0] = db[0] + db[2] * dc[0];
1769 db[1] = input - db[0] - db[2] * dc[1];
1770 db[2] = db[1] * dc[0] + db[2];
1773 db[0] = db[0] + db[2] * dc[0];
1774 db[1] = input - db[0] - db[2] * dc[1];
1775 db[2] = db[1] * dc[0] + db[2];
1777 db[0] = db[0] + db[2] * dc[0];
1778 db[1] = input - db[0] - db[2] * dc[1];
1779 db[2] = db[1] * dc[0] + db[2];
1782 static inline void recalc_filter_LPF12(FilterCoefficients *fc)
// Recompute LPF12 coefficients on freq/reso change; picks the 1x/2x/3x
// oversampled variant by cutoff, dc[0] = 2*sin(pi*f/sr_eff), dc[1] = damping.
1784 FILTER_T *dc = fc->dc;
1786 /* copy with applying Chamberlin's lowpass filter. */
1787 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1791 if(fc->freq < fc->flt_rate_limit1){ // <sr*DIV_6
1792 fc->sample_filter = sample_filter_LPF12;
1793 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
1794 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*DIV_6
1795 fc->sample_filter = sample_filter_LPF12_ov2;
1796 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
1797 }else{ // <sr*3*DIV_6
1798 fc->sample_filter = sample_filter_LPF12_ov3;
1799 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
1801 dc[1] = RESO_DB_CF_M(fc->reso_dB);
1805 static inline void sample_filter_LPF24(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Moog-style 24dB ladder VCF: four cascaded one-pole stages db[1..4] with
// resonance feedback dc[2]*db[4] into the input node (da[] holds the
// previous stage inputs; its declaration/updates are outside this view).
1809 da[0] = *sp - dc[2] * db[4]; /* feedback */
1813 db[1] = (db[0] + da[0]) * dc[0] - db[1] * dc[1];
1814 db[2] = (db[1] + da[1]) * dc[0] - db[2] * dc[1];
1815 db[3] = (db[2] + da[2]) * dc[0] - db[3] * dc[1];
1816 db[4] = (db[3] + da[3]) * dc[0] - db[4] * dc[1];
1821 static inline void recalc_filter_LPF24(FilterCoefficients *fc)
// Moog VCF coefficient fit: polynomial approximation in the normalized
// cutoff f, with resonance q limited to [0, 0.8).
1823 FILTER_T *dc = fc->dc, f, q ,p, r;
1825 /* copy with applying Moog lowpass VCF. */
1826 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1830 f = 2.0 * fc->freq * fc->div_flt_rate;
1832 q = 0.80 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0.0f <= c < 0.80f
1833 dc[0] = f + 0.8 * f * p;
1834 dc[1] = dc[0] + dc[0] - 1.0;
1835 dc[2] = q * (1.0 + 0.5 * p * (1.0 - p + 5.6 * p * p));
1839 static inline void sample_filter_LPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Butterworth low-pass biquad (float): db[0..1] input history,
// db[2..4] output history; optional denormal fix on the feedback state.
1844 db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
1845 #if defined(DENORMAL_FIX)
1846 db[2] += denormal_add;
1849 db[3] = db[2]; // flt out
1851 db[1] = db[0]; // flt in
1856 static inline void recalc_filter_LPF_BW(FilterCoefficients *fc)
// Butterworth LPF coefficients ("elion" design): p = 1/tan(pi*f/sr),
// q from resonance; dc[3]/dc[4] carry negated feedback terms so the
// sample filter can use plain addition.
1858 FILTER_T *dc = fc->dc;
1859 double q ,p, p2, qp, dc0;
1861 // elion butterworth
1862 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1866 p = 1.0 / tan(M_PI * fc->freq * (double)fc->div_flt_rate); // ?
1867 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
1870 dc0 = 1.0 / ( 1.0 + qp + p2);
1874 dc[3] = -2.0 * ( 1.0 - p2) * dc0; // -
1875 dc[4] = -(1.0 - qp + p2) * dc0; // -
1879 static inline void sample_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// Resonant 12dB low-pass (Niemitalo form, float); db[0]/db[1] state,
// remaining statements of this function are outside this view.
1881 db[1] += (*sp - db[0]) * dc[1];
1887 static inline void recalc_filter_LPF12_2(FilterCoefficients *fc)
// Coefficients for the Olli Niemitalo resonant 12dB low-pass; also
// precomputes expanded terms (e.g. dc[9] = b1*b1 + b2) used by the SIMD
// buffer_filter_LPF12_2 variants to process two samples per step.
1889 FILTER_T *dc = fc->dc;
1891 FLOAT_T c0, c1, a0, b1, b2;
1893 // Resonant IIR lowpass (12dB/oct) Olli Niemitalo //r
1894 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1898 f = M_PI2 * fc->freq * fc->div_flt_rate;
1899 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1902 c1 = c0 + 1.0 - 2.0 * cos(f) * q;
1905 #if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1916 dc[9] = b1 * b1 + b2;
1921 #if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1922 // SIMD optimization (double * 2)
1923 static inline void buffer_filter_LPF12_2(FILTER_T* dc, FILTER_T* db, DATA_T* sp, int32 count)
// AVX variant: processes 4 samples per iteration (count must be a multiple
// of 4). The 2-sample recurrence uses expanded coefficients dc[2..9]
// (prepared in recalc_filter_LPF12_2); y[-1], y[-2] are carried in
// vym1/vym2 and written back to db[2..3] at the end.
1926 __m256d vcx0 = _mm256_broadcast_pd((__m128d *)(dc + 2));
1927 __m256d vcx1 = _mm256_broadcast_pd((__m128d *)(dc + 4));
1928 __m128d vcym2 = _mm_loadu_pd(dc + 6);
1929 __m128d vcym1 = _mm_loadu_pd(dc + 8);
1930 __m128d vy = _mm_loadu_pd(db + 2);
1931 __m128d vym2 = _mm_unpacklo_pd(vy, vy);
1932 __m128d vym1 = _mm_unpackhi_pd(vy, vy);
1934 for (i = 0; i < count; i += 4)
1936 __m256d vin = _mm256_loadu_pd(sp + i);
1937 __m256d vx0 = _mm256_unpacklo_pd(vin, vin);
1938 __m256d vx1 = _mm256_unpackhi_pd(vin, vin);
1939 __m256d vfma2x = MM256_FMA2_PD(vcx0, vx0, vcx1, vx1);
1941 __m128d vy0 = _mm_add_pd(_mm256_castpd256_pd128(vfma2x), MM_FMA2_PD(vcym2, vym2, vcym1, vym1));
1942 _mm_storeu_pd(sp + i, vy0);
1943 vym2 = _mm_unpacklo_pd(vy0, vy0);
1944 vym1 = _mm_unpackhi_pd(vy0, vy0);
1946 __m128d vy1 = _mm_add_pd(_mm256_extractf128_pd(vfma2x, 1), MM_FMA2_PD(vcym2, vym2, vcym1, vym1));
1947 _mm_storeu_pd(sp + i + 2, vy1);
1948 vym2 = _mm_unpacklo_pd(vy1, vy1);
1949 vym1 = _mm_unpackhi_pd(vy1, vy1);
1953 _mm_storeu_pd(db + 2, vy);
1956 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1957 // SIMD optimization (double * 2)
1958 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
// SSE2 variant: 2 samples per iteration (count must be a multiple of 2),
// same expanded-coefficient scheme as the AVX path; state written back
// to db[2..3] after the loop.
1961 __m128d vcx0 = _mm_loadu_pd(dc + 2);
1962 __m128d vcx1 = _mm_loadu_pd(dc + 4);
1963 __m128d vcym2 = _mm_loadu_pd(dc + 6);
1964 __m128d vcym1 = _mm_loadu_pd(dc + 8);
1965 __m128d vy = _mm_loadu_pd(db + 2);
1966 __m128d vym2 = _mm_unpacklo_pd(vy, vy);
1967 __m128d vym1 = _mm_unpackhi_pd(vy, vy);
1969 for (i = 0; i < count; i += 2) {
1970 __m128d vin = _mm_loadu_pd(sp + i);
1971 __m128d vx0 = _mm_unpacklo_pd(vin, vin);
1972 __m128d vx1 = _mm_unpackhi_pd(vin, vin);
1973 vy = MM_FMA4_PD(vcx0, vx0, vcx1, vx1, vcym2, vym2, vcym1, vym1);
1974 _mm_storeu_pd(sp + i, vy);
1975 vym2 = _mm_unpacklo_pd(vy, vy);
1976 vym1 = _mm_unpackhi_pd(vy, vy);
1978 _mm_storeu_pd(db + 2, vy);
1982 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
// Scalar fallback: same recurrence as sample_filter_LPF12_2 applied over a
// whole buffer, with state and coefficients cached in locals for speed.
1985 FILTER_T db0 = db[0], db1 = db[1], dc0 = dc[0], dc1 = dc[1];
1987 for (i = 0; i < count; i++) {
1988 db1 += (sp[i] - db0) * dc1;
1999 static inline void sample_filter_LPF24_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// amSynth-style 24dB low-pass (float): two chained transposed biquads
// sharing dc[0..4]; db[5] is the per-biquad output node.
2002 db[5] = dc[0] * db[0] + db[1];
2003 db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
2004 db[2] = dc[2] * db[0] + dc[4] * db[5];
2006 db[5] = dc[0] * db[0] + db[3];
2007 db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
2008 db[4] = dc[2] * db[0] + dc[4] * db[5];
2012 static inline void recalc_filter_LPF24_2(FilterCoefficients *fc)
// Coefficients for the amSynth 24dB low-pass (Nick Dowell): bilinear
// transform with f = tan(pi*fc/sr) and resonance q; p is the normalizer.
2014 FILTER_T *dc = fc->dc, f, q ,p, r, dc0;
2016 // amSynth 24dB/ocatave resonant low-pass filter. Nick Dowell //r
2017 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2021 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
2022 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
2024 p = 1.0 / (1.0 + (q * f) + r);
2029 dc[3] = -2.0 * (r - 1) * p;
2030 dc[4] = (-1.0 + (q * f) - r) * p;
2034 static inline void sample_filter_LPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// One-pole 6dB/oct low-pass (no resonance); optional denormal fix.
2036 *sp = (db[1] = dc[0] * *sp + dc[1] * db[1]);
2037 #if defined(DENORMAL_FIX)
2038 db[1] += denormal_add;
2042 static inline void recalc_filter_LPF6(FilterCoefficients *fc)
// One-pole LPF coefficient: pole at f = e^(-2*pi*fc/sr) (scoofy design);
// no resonance parameter.
2044 FILTER_T *dc = fc->dc, f;
2046 // One pole filter, LP 6dB/Oct scoofy no resonance //r
2047 if(FLT_FREQ_MARGIN){
2050 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
2056 static inline void sample_filter_LPF18(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// LPF18: 18dB/oct three-stage ladder low-pass with resonance feedback
// dc[2]*db[3] and output gain dc[3]; da[] (previous stage inputs) is
// declared/updated on lines outside this view.
2063 db[0] = *sp - dc[2] * db[3];
2064 db[1] = dc[1] * (db[0] + da[0]) - dc[0] * db[1];
2065 db[2] = dc[1] * (db[1] + da[1]) - dc[0] * db[2];
2066 db[3] = dc[1] * (db[2] + da[2]) - dc[0] * db[3];
2067 *sp = db[3] * dc[3];
2070 static inline void sample_filter_LPF18_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// LPF18, 2x oversampled: the ladder update runs twice per input sample;
// the output is taken after the first pass (coefficients computed for the
// doubled rate in recalc_filter_LPF18).
2072 FILTER_T da[6], input = *sp;
2077 db[0] = input - dc[2] * db[3];
2078 db[1] = dc[1] * (db[0] + da[0]) - dc[0] * db[1];
2079 db[2] = dc[1] * (db[1] + da[1]) - dc[0] * db[2];
2080 db[3] = dc[1] * (db[2] + da[2]) - dc[0] * db[3];
2081 *sp = db[3] * dc[3];
2086 db[0] = input - dc[2] * db[3];
2087 db[1] = dc[1] * (db[0] + da[0]) - dc[0] * db[1];
2088 db[2] = dc[1] * (db[1] + da[1]) - dc[0] * db[2];
2089 db[3] = dc[1] * (db[2] + da[2]) - dc[0] * db[3];
// Recompute LPF18 coefficients. Chooses the plain or the 2x-oversampled
// sample routine depending on how close the cutoff is to the sample rate,
// then derives the polynomial-approximated pole/feedback coefficients.
static inline void recalc_filter_LPF18(FilterCoefficients *fc)
FILTER_T *dc = fc->dc, f, q , p;
// LPF18 low-pass filter //r
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
if(fc->freq < fc->flt_rate_limit1){ // <sr/2.25
fc->sample_filter = sample_filter_LPF18;
f = 2.0 * fc->freq * fc->div_flt_rate; // *1.0
}else{ // <sr*2/2.25
fc->sample_filter = sample_filter_LPF18_ov2;
f = 2.0 * fc->freq * fc->div_flt_rate_ov2; // sr*2
dc[0] = ((-2.7528 * f + 3.0429) * f + 1.718) * f - 0.9984; // pole approx.
q = 0.789 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0<q<0.78125
dc[2] = q * (((-2.7079 * p + 10.963) * p - 14.934) * p + 8.4974); // p set on an elided line
dc[3] = 1.0 + (0.25 * (1.5 + 2.0 * dc[2] * (1.0 - f))); // output gain comp.
// LPF_TFO: two cascaded first-order low-pass stages with feedback term
// dc[1] for resonance. Filters one sample *sp in place.
static inline void sample_filter_LPF_TFO(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[0] = db[0] + dc[0] * (*sp - db[0] + dc[1] * (db[0] - db[1]));
db[1] = db[1] + dc[0] * (db[0] - db[1]);
// Recompute LPF_TFO coefficients: dc[0] = normalized cutoff, dc[1] =
// resonance feedback gain (grows as cutoff approaches 1.0).
static inline void recalc_filter_LPF_TFO(FilterCoefficients *fc)
FILTER_T *dc = fc->dc, q;
// two first order low-pass filter //r
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
dc[0] = 2 * fc->freq * fc->div_flt_rate;
q = 1.0 - RESO_DB_CF_M(fc->reso_dB);
dc[1] = q + q / (1.01 - dc[0]); // 1.01 avoids division by zero at dc[0]==1
// HPF_BW: 2nd-order Butterworth high-pass, direct-form biquad.
// db[0..1] = input history, db[2..4] = output history.
static inline void sample_filter_HPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
#if defined(DENORMAL_FIX)
db[2] += denormal_add; // keep recursion out of denormal range
db[3] = db[2]; // flt out
db[1] = db[0]; // flt in
// Recompute Butterworth HPF biquad coefficients (bilinear-transform form).
// Feedback terms dc[3]/dc[4] are stored negated so the sample routine can
// use additions only.
static inline void recalc_filter_HPF_BW(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
double q, p, p2, qp, dc0;
// elion butterworth HPF //r
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
p = tan(M_PI * fc->freq * fc->div_flt_rate); // hpf ?
dc0 = 1.0 / (1.0 + qp + p2); // p2/qp computed on elided lines
dc[1] = -2 * dc0; // hpf
dc[3] = -2.0 * (p2 - 1.0) * dc0; // hpf
dc[4] = -(1.0 - qp + p2) * dc0;
// BPF_BW: band-pass built from two cascaded Butterworth biquads
// (low-pass section in dc[0..4]/db[0..4], high-pass section in
// dc[8..12]/db[8..12]); history shuffles follow each section.
static inline void sample_filter_BPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
db[10] = dc[8] * db[8] + dc[9] * db[9] + dc[10] * db[10] + dc[11] * db[11] + dc[12] * db[12];
#if defined(DENORMAL_FIX)
db[2] += denormal_add;
db[11] = db[10]; // flt out
db[9] = db[8]; // flt in
db[8] = db[4]; // db[4] -> db[8]: delaying this hand-off by a sample would also be acceptable
db[3] = db[2]; // flt out
db[1] = db[0]; // flt in
// Recompute BPF_BW coefficients: LPF biquad at f (dc[0..4]) plus HPF
// biquad at 0.8*f (dc[8..12]); the LPF-HPF gap sets the pass bandwidth.
static inline void recalc_filter_BPF_BW(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T f, q, pl, pl2, qpl, ph, ph2, qph, dc0;
// elion butterworth
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = fc->freq * fc->div_flt_rate;
q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
// low-pass section
pl = 1.0 / tan(M_PI * f);
dc0 = 1.0 / ( 1.0 + qpl + pl2);
dc[3] = -2.0 * ( 1.0 - pl2) * dc0; // -
dc[4] = -(1.0 - qpl + pl2) * dc0; // -
// high-pass section
ph = tan(M_PI * f * 0.8); // hpf // f bandwidth = LPF-HPF
dc0 = 1.0 / (1.0 + qph + ph2);
dc[9] = -2 * dc0; // hpf
dc[11] = -2.0 * (ph2 - 1.0) * dc0; // hpf
dc[12] = -(1.0 - qph + ph2) * dc0;
// peak1: resonator (peaking) filter; r is the recursive section output.
// Output/state lines are elided in this extract.
static inline void sample_filter_peak1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
r = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2];
// Recompute peak1 resonator coefficients: r = pole radius derived from
// resonance, dc[0] = gain normalization at the peak frequency.
static inline void recalc_filter_peak1(FilterCoefficients *fc)
FILTER_T *dc = fc->dc, f, q, r, pl ,ph, sl, sh;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = cos(M_PI2 * fc->freq * fc->div_flt_rate);
r = (1.0 - RESO_DB_CF_M(fc->reso_dB)) * 0.99609375; // r < 0.99609375
dc[0] = (1 - r) * sqrt(r * (r - 4 * (f * f) + 2.0) + 1.0); // peak gain norm.
// notch1: same resonator as peak1, but the resonator output is subtracted
// from the (delayed) input to form a notch.
static inline void sample_filter_notch1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
r = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2];
*sp = db[0] - r; // notch
// LPF12_3: Chamberlin state-variable filter, low-pass output.
// db[0]=low, db[1]=high, db[2]=band; dc[0]=freq coef, dc[1]=q coef.
static inline void sample_filter_LPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[0]; // low-pass output (db[1] + db[0] would be notch)
// LPF12_3 2x-oversampled: SVF run twice per input sample; *sp takes the
// low output of the first pass, the second pass only advances the state.
static inline void sample_filter_LPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[0]; // low-pass output
// pass 2 (state advance only)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// LPF12_3 3x-oversampled: SVF run three times per input sample; *sp takes
// the low output of the first pass.
static inline void sample_filter_LPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[0]; // low-pass output
// pass 2
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// pass 3
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// Recompute LPF12_3 (Chamberlin SVF) coefficients, picking the 1x/2x/3x
// oversampled sample routine by how close the cutoff is to the SVF's
// stability limit (~0.21875 * effective rate).
static inline void recalc_filter_LPF12_3(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
/* Chamberlin2's lowpass filter. */
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
fc->sample_filter = sample_filter_LPF12_3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
}else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
fc->sample_filter = sample_filter_LPF12_3_ov2;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
}else{ // <sr*3*0.21875
fc->sample_filter = sample_filter_LPF12_3_ov3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
dc[1] = RESO_DB_CF_M(fc->reso_dB); // q coefficient
// HPF12_3: same Chamberlin SVF as LPF12_3 but outputs the high node db[1].
static inline void sample_filter_HPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[1]; // high-pass output
// HPF12_3 2x-oversampled: SVF run twice per input sample; *sp takes the
// high output of the first pass.
static inline void sample_filter_HPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[1]; // high-pass output
// pass 2 (state advance only)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// HPF12_3 3x-oversampled: SVF run three times per input sample; *sp takes
// the high output of the first pass.
static inline void sample_filter_HPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[1]; // high-pass output
// pass 2
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// pass 3
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// Recompute HPF12_3 (Chamberlin SVF high-pass) coefficients; same rate
// limit / oversampling selection as recalc_filter_LPF12_3.
static inline void recalc_filter_HPF12_3(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
/* Chamberlin2's lowpass filter. */
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
fc->sample_filter = sample_filter_HPF12_3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
}else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
fc->sample_filter = sample_filter_HPF12_3_ov2;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
}else{ // <sr*3*0.21875
fc->sample_filter = sample_filter_HPF12_3_ov3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
dc[1] = RESO_DB_CF_M(fc->reso_dB); // q coefficient
// BPF12_3: Chamberlin SVF, band output db[2].
static inline void sample_filter_BPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[2]; // band-pass output
#if defined(DENORMAL_FIX)
db[0] += denormal_add; // keep state out of denormal range
// BPF12_3 2x-oversampled: SVF run twice per input sample; *sp takes the
// band output of the first pass.
static inline void sample_filter_BPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[2]; // band-pass output
#if defined(DENORMAL_FIX)
db[0] += denormal_add;
// pass 2 (state advance only)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// BPF12_3 3x-oversampled: SVF run three times per input sample; *sp takes
// the band output of the first pass.
static inline void sample_filter_BPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[2]; // band-pass output
#if defined(DENORMAL_FIX)
db[0] += denormal_add;
// pass 2
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// pass 3
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// Recompute BPF12_3 (Chamberlin SVF band-pass) coefficients; same rate
// limit / oversampling selection as recalc_filter_LPF12_3.
static inline void recalc_filter_BPF12_3(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
/* Chamberlin2's lowpass filter. */
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
fc->sample_filter = sample_filter_BPF12_3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
}else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
fc->sample_filter = sample_filter_BPF12_3_ov2;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
}else{ // <sr*3*0.21875
fc->sample_filter = sample_filter_BPF12_3_ov3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
dc[1] = RESO_DB_CF_M(fc->reso_dB); // q coefficient
// BCF12_3: Chamberlin SVF band-cut (notch): output = high + low.
static inline void sample_filter_BCF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[1] + db[0]; // notch
// BCF12_3 2x-oversampled: SVF run twice per input sample; *sp takes the
// notch (high+low) output of the first pass.
static inline void sample_filter_BCF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
*sp = db[1] + db[0]; // notch
// pass 2 (state advance only)
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// BCF12_3 3x-oversampled: SVF run three times per input sample; *sp takes
// the notch (high+low) output of the first pass.
static inline void sample_filter_BCF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FILTER_T input = *sp;
// pass 1
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
db[3] = db[1] + db[0]; // notch -- NOTE(review): this store looks redundant
// (the _ov2 variant has no db[3] write); confirm db[3] is read elsewhere
*sp = db[1] + db[0]; // notch
// pass 2
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// pass 3
db[0] = db[0] + dc[0] * db[2]; // low
db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
db[2] = dc[0] * db[1] + db[2]; // band
// Recompute BCF12_3 (Chamberlin SVF band-cut) coefficients; same rate
// limit / oversampling selection as recalc_filter_LPF12_3.
static inline void recalc_filter_BCF12_3(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
/* Chamberlin2's lowpass filter. */
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
fc->sample_filter = sample_filter_BCF12_3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
}else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
fc->sample_filter = sample_filter_BCF12_3_ov2;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
}else{ // <sr*3*0.21875
fc->sample_filter = sample_filter_BCF12_3_ov3;
dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
dc[1] = RESO_DB_CF_M(fc->reso_dB); // q coefficient
// HPF6: one-pole 6dB/oct high-pass = input minus the one-pole low-pass.
// Shares coefficients with LPF6 (see recalc_filter_LPF6 in the table).
static inline void sample_filter_HPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
*sp -= (db[1] = dc[0] * *sp + dc[1] * db[1]);
#if defined(DENORMAL_FIX)
db[1] += denormal_add; // keep state out of denormal range
// HPF12_2: 12dB/oct high-pass based on the LPF12_2 SVF state update.
// NOTE(review): the rest of the body (low stage, output) is elided in this
// extract -- only the band-stage update is visible.
static inline void sample_filter_HPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[1] += (*sp - db[0]) * dc[1]; // band stage update
// HBF_L6L12: hybrid filter -- a 12dB SVF path (db[0..], dc[0..]) and a
// 6dB one-pole path (db[11], dc[10..11]) run in parallel; the two outputs
// are crossfaded by a smoothed mix coefficient dc[16].
static inline void sample_filter_HBF_L6L12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
db[1] += (in - db[0]) * dc[1]; // SVF band stage
db[11] = dc[10] * in + dc[11] * db[11]; // one-pole 6dB stage
#if defined(DENORMAL_FIX)
db[11] += denormal_add;
dc[16] = dc[16] * 0.75 + dc[15] * 0.25; // smooth mix toward target dc[15]
*sp = (out1 * dc[16] + out2 * (1.0 - dc[16])); // crossfade the two paths
// Recompute HBF_L6L12 coefficients: resonant 12dB section (dc[0..1]),
// one-pole section (from the exp() pole), and mix target dc[15].
static inline void recalc_filter_HBF_L6L12(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = M_PI2 * fc->freq * fc->div_flt_rate;
q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
f = exp(-M_PI2 * fc->freq * fc->div_flt_rate); // one-pole section pole
dc[15] = 1.0 - RESO_DB_CF_M(fc->reso_dB); // crossfade mix target
// HBF_L12L6: hybrid 12dB SVF + 6dB one-pole; outputs are summed
// (second path halved) rather than crossfaded.
static inline void sample_filter_HBF_L12L6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
db[1] += (in - db[0]) * dc[1]; // SVF band stage
db[11] = dc[10] * in + dc[11] * db[11]; // one-pole 6dB stage
#if defined(DENORMAL_FIX)
db[11] += denormal_add;
*sp = out1 + out2 * DIV_2; // mix: 12dB path + half 6dB path
// Recompute HBF_L12L6 coefficients (resonant 12dB + one-pole sections).
static inline void recalc_filter_HBF_L12L6(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = M_PI2 * fc->freq * fc->div_flt_rate;
q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
f = exp(-M_PI2 * fc->freq * fc->div_flt_rate); // one-pole section pole
// HBF_L12H6: hybrid 12dB low-pass + 6dB high-shelf-style one-pole path;
// outputs summed (second path halved).
static inline void sample_filter_HBF_L12H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
db[1] += (in - db[0]) * dc[1]; // SVF band stage
db[11] = dc[10] * in + dc[11] * db[11]; // one-pole stage
#if defined(DENORMAL_FIX)
db[11] += denormal_add;
*sp = out1 + out2 * DIV_2; // mix the two paths
// Recompute HBF_L12H6 coefficients (resonant 12dB + one-pole sections).
static inline void recalc_filter_HBF_L12H6(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = M_PI2 * fc->freq * fc->div_flt_rate;
q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
f = exp(-M_PI2 * fc->freq * fc->div_flt_rate); // one-pole section pole
// HBF_L24H6: hybrid 24dB low-pass (two cascaded 2nd-order sections,
// LPF24_2 topology) + 6dB one-pole path; outputs summed (second halved).
static inline void sample_filter_HBF_L24H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
// 24dB path, first section
db[5] = dc[0] * db[0] + db[1];
db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
db[2] = dc[2] * db[0] + dc[4] * db[5];
// 24dB path, second section
db[5] = dc[0] * db[0] + db[3];
db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
db[4] = dc[2] * db[0] + dc[4] * db[5];
// one-pole path
db[11] = dc[10] * in + dc[11] * db[11];
#if defined(DENORMAL_FIX)
db[11] += denormal_add;
*sp = out1 + out2 * DIV_2; // mix the two paths
// Recompute HBF_L24H6 coefficients: LPF24_2-style biquad (dc[0..4]) plus
// one-pole section pole.
static inline void recalc_filter_HBF_L24H6(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T f, r, q ,p, dc0;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
p = 1.0 / (1.0 + (q * f) + r); // r computed on an elided line
dc[3] = -2.0 * (r - 1) * p;
dc[4] = (-1.0 + (q * f) - r) * p;
f = exp(-M_PI2 * fc->freq * fc->div_flt_rate); // one-pole section pole
// HBF_L24H12: hybrid 24dB low-pass (LPF24_2 topology) + 12dB SVF path;
// outputs summed (second halved).
static inline void sample_filter_HBF_L24H12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
// 24dB path, first section
db[5] = dc[0] * db[0] + db[1];
db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
db[2] = dc[2] * db[0] + dc[4] * db[5];
// 24dB path, second section
db[5] = dc[0] * db[0] + db[3];
db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
db[4] = dc[2] * db[0] + dc[4] * db[5];
// 12dB SVF path
db[11] += (in - db[10]) * dc[11];
*sp = out1 + out2 * DIV_2; // mix the two paths
// Recompute HBF_L24H12 coefficients: LPF24_2-style biquad (dc[0..4]) plus
// 12dB section (dc[10..11]).
static inline void recalc_filter_HBF_L24H12(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T f, r, q ,p, p2, qp, dc0;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
p = 1.0 / (1.0 + (q * f) + r); // r computed on an elided line
dc[3] = -2.0 * (r - 1) * p;
dc[4] = (-1.0 + (q * f) - r) * p;
// second section: NOTE(review) RESO_DB_CF_P(0) uses a fixed 0dB resonance
// here (other recalcs pass fc->reso_dB) -- presumably intentional; confirm
f = M_PI2 * fc->freq * fc->div_flt_rate;
q = 1.0 - f / (2.0 * (RESO_DB_CF_P(0) + 0.5 / (1.0 + f)) + f - 2.0);
dc[11] = dc0 + 1.0 - 2.0 * cos(f) * q;
// HBF_L12OCT: two 12dB SVF sections an octave apart; outputs mixed 2/3
// and 1/3 (DIV_3_2 / DIV_3).
static inline void sample_filter_HBF_L12OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
db[1] += (in - db[0]) * dc[1]; // section 1 band stage
db[11] += (in - db[10]) * dc[11]; // section 2 band stage (octave up)
*sp = out1 * DIV_3_2 + out2 * DIV_3; // weighted mix
// Recompute HBF_L12OCT coefficients: two resonant 12dB sections, the
// second one at 2x the cutoff (one octave up).
static inline void recalc_filter_HBF_L12OCT(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
// section 1 at fc->freq
f = M_PI2 * fc->freq * fc->div_flt_rate;
r = RESO_DB_CF_P(fc->reso_dB);
q = 1.0 - f / (2.0 * (r + 0.5 / (1.0 + f)) + f - 2.0);
dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
// section 2 at 2 * fc->freq
f = 2.0 * fc->freq * fc->div_flt_rate;
q = 1.0 - f / (2.0 * (r + 0.5 / (1.0 + f)) + f - 2.0);
dc[11] = dc[10] + 1.0 - 2.0 * cos(f) * q;
// HBF_L24OCT: two 24dB low-pass networks (LPF24_2 topology) an octave
// apart: dc[0..4]/db[0..5] and dc[10..14]/db[10..15]; outputs mixed 2/3
// and 1/3.
static inline void sample_filter_HBF_L24OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
FLOAT_T in = *sp, out1, out2;
// network 1, first section
db[5] = dc[0] * db[0] + db[1];
db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
db[2] = dc[2] * db[0] + dc[4] * db[5];
// network 1, second section
db[5] = dc[0] * db[0] + db[3];
db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
db[4] = dc[2] * db[0] + dc[4] * db[5];
// network 2 (octave up), first section
db[15] = dc[10] * db[10] + db[11];
db[11] = dc[11] * db[10] + dc[13] * db[15] + db[12];
db[12] = dc[12] * db[10] + dc[14] * db[15];
// network 2, second section
db[15] = dc[10] * db[10] + db[13];
db[13] = dc[11] * db[10] + dc[13] * db[15] + db[14];
db[14] = dc[12] * db[10] + dc[14] * db[15];
*sp = out1 * DIV_3_2 + out2 * DIV_3; // weighted mix
// Recompute HBF_L24OCT coefficients: LPF24_2-style biquad at fc->freq
// (dc[0..4]) and a second one at 2 * fc->freq (dc[10..14]).
static inline void recalc_filter_HBF_L24OCT(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T f, r, q ,p, dc0;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
// network 1 at fc->freq
f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
p = 1.0 / (1.0 + (q * f) + r); // r computed on an elided line
dc[3] = -2.0 * (r - 1) * p;
dc[4] = (-1.0 + (q * f) - r) * p;
// network 2 at 2 * fc->freq (same q)
f = 2.0 * fc->freq * fc->div_flt_rate;
f = tan(M_PI * f); // cutoff freq rate/2
// q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
p = 1.0 / (1.0 + (q * f) + r);
dc[13] = -2.0 * (r - 1) * p;
dc[14] = (-1.0 + (q * f) - r) * p;
// LPF_BWx2: two cascaded Butterworth LPF biquads sharing dc[0..4];
// each section has its own in/out history (db[0..4], db[5..8]).
static inline void sample_filter_LPF_BWx2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// section 1
db[ 2] = dc[ 0] * db[ 0] + dc[ 1] * db[ 1] + dc[ 2] * db[ 2] + dc[ 3] * db[ 3] + dc[ 4] * db[ 4];
db[ 3] = db[ 2]; // flt out
db[ 1] = db[ 0]; // flt in
// section 2 (fed by section 1 output)
db[ 6] = dc[ 0] * db[ 3] + dc[ 1] * db[ 5] + dc[ 2] * db[ 6] + dc[ 3] * db[ 7] + dc[ 4] * db[ 8];
db[ 7] = db[ 6]; // flt out
db[ 5] = db[ 3]; // flt in
// LPF_BWx3: three cascaded Butterworth LPF biquads sharing dc[0..4].
static inline void sample_filter_LPF_BWx3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// section 1
db[ 2] = dc[ 0] * db[ 0] + dc[ 1] * db[ 1] + dc[ 2] * db[ 2] + dc[ 3] * db[ 3] + dc[ 4] * db[ 4];
db[ 3] = db[ 2]; // flt out
db[ 1] = db[ 0]; // flt in
// section 2
db[ 6] = dc[ 0] * db[ 3] + dc[ 1] * db[ 5] + dc[ 2] * db[ 6] + dc[ 3] * db[ 7] + dc[ 4] * db[ 8];
db[ 7] = db[ 6]; // flt out
db[ 5] = db[ 3]; // flt in
// section 3
db[10] = dc[ 0] * db[ 7] + dc[ 1] * db[ 9] + dc[ 2] * db[10] + dc[ 3] * db[11] + dc[ 4] * db[12];
db[11] = db[10]; // flt out
db[ 9] = db[ 7]; // flt in
// LPF_BWx4: four cascaded Butterworth LPF biquads sharing dc[0..4].
static inline void sample_filter_LPF_BWx4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// section 1
db[ 2] = dc[ 0] * db[ 0] + dc[ 1] * db[ 1] + dc[ 2] * db[ 2] + dc[ 3] * db[ 3] + dc[ 4] * db[ 4];
db[ 3] = db[ 2]; // flt out
db[ 1] = db[ 0]; // flt in
// section 2
db[ 6] = dc[ 0] * db[ 3] + dc[ 1] * db[ 5] + dc[ 2] * db[ 6] + dc[ 3] * db[ 7] + dc[ 4] * db[ 8];
db[ 7] = db[ 6]; // flt out
db[ 5] = db[ 3]; // flt in
// section 3
db[10] = dc[ 0] * db[ 7] + dc[ 1] * db[ 9] + dc[ 2] * db[10] + dc[ 3] * db[11] + dc[ 4] * db[12];
db[11] = db[10]; // flt out
db[ 9] = db[ 7]; // flt in
// section 4
db[14] = dc[ 0] * db[11] + dc[ 1] * db[13] + dc[ 2] * db[14] + dc[ 3] * db[15] + dc[ 4] * db[16];
db[15] = db[14]; // flt out
db[13] = db[11]; // flt in
// LPF24_2x2: two cascaded LPF24_2 networks (48dB/oct total), sharing
// dc[0..4]; second network state in db[10..15].
static inline void sample_filter_LPF24_2x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// network 1
db[5] = dc[0] * db[0] + db[1];
db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
db[2] = dc[2] * db[0] + dc[4] * db[5];
db[10] = db[0] = db[5]; // hand network-1 node to network 2
db[5] = dc[0] * db[0] + db[3];
db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
db[4] = dc[2] * db[0] + dc[4] * db[5];
// network 2
db[15] = dc[0] * db[10] + db[11];
db[11] = dc[1] * db[10] + dc[3] * db[15] + db[12];
db[12] = dc[2] * db[10] + dc[4] * db[15];
// NOTE(review): the next line adds db[3] where the network-1 pattern
// suggests db[13] -- possible typo; confirm against upstream source
db[15] = dc[0] * db[10] + db[3];
db[13] = dc[1] * db[10] + dc[3] * db[15] + db[14];
db[14] = dc[2] * db[10] + dc[4] * db[15];
// LPF6x2: two cascaded one-pole LPF stages (12dB/oct), shared dc[0..1].
static inline void sample_filter_LPF6x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
// LPF6x3: three cascaded one-pole LPF stages (18dB/oct).
static inline void sample_filter_LPF6x3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
db[3] = dc[0] * db[2] + dc[1] * db[3]; // 18db
// LPF6x4: four cascaded one-pole LPF stages (24dB/oct).
static inline void sample_filter_LPF6x4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
db[3] = dc[0] * db[2] + dc[1] * db[3]; // 18db
db[4] = dc[0] * db[3] + dc[1] * db[4]; // 24db
// LPF6x8: eight cascaded one-pole LPF stages (48dB/oct).
static inline void sample_filter_LPF6x8(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
db[3] = dc[0] * db[2] + dc[1] * db[3];
db[4] = dc[0] * db[3] + dc[1] * db[4]; // 24db
db[5] = dc[0] * db[4] + dc[1] * db[5];
db[6] = dc[0] * db[5] + dc[1] * db[6]; // 36db
db[7] = dc[0] * db[6] + dc[1] * db[7];
db[8] = dc[0] * db[7] + dc[1] * db[8]; // 48db
// LPF6x16: sixteen cascaded one-pole LPF stages (96dB/oct).
static inline void sample_filter_LPF6x16(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
db[3] = dc[0] * db[2] + dc[1] * db[3];
db[4] = dc[0] * db[3] + dc[1] * db[4]; // 24db
db[5] = dc[0] * db[4] + dc[1] * db[5];
db[6] = dc[0] * db[5] + dc[1] * db[6]; // 36db
db[7] = dc[0] * db[6] + dc[1] * db[7];
db[8] = dc[0] * db[7] + dc[1] * db[8]; // 48db
db[9] = dc[0] * db[8] + dc[1] * db[9];
db[10] = dc[0] * db[9] + dc[1] * db[10]; // 60db
db[11] = dc[0] * db[10] + dc[1] * db[11];
db[12] = dc[0] * db[11] + dc[1] * db[12]; // 72db
db[13] = dc[0] * db[12] + dc[1] * db[13];
db[14] = dc[0] * db[13] + dc[1] * db[14]; // 84db
db[15] = dc[0] * db[14] + dc[1] * db[15];
db[16] = dc[0] * db[15] + dc[1] * db[16]; // 96db
// LPF_FIR: symmetric FIR low-pass (LPF_FIR_ORDER taps). db is the delay
// line (most recent sample first), dc the tap weights. SSE2 paths compute
// the dot product with vector multiply-adds, then shift the delay line by
// one with overlapping stores; the scalar path is the generic fallback.
static inline void sample_filter_LPF_FIR(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
#if (LPF_FIR_ORDER == 20) // optimize
#if (USE_X86_EXT_INTRIN >= 3) && defined(FLOAT_T_DOUBLE)
FLOAT_T input = *sp;
__m128d xdc0 = _mm_loadu_pd(&dc[0]), xdc2 = _mm_loadu_pd(&dc[2]),
xdc4 = _mm_loadu_pd(&dc[4]), xdc6 = _mm_loadu_pd(&dc[6]),
xdc8 = _mm_loadu_pd(&dc[8]), xdc10 = _mm_loadu_pd(&dc[10]),
xdc12 = _mm_loadu_pd(&dc[12]), xdc14 = _mm_loadu_pd(&dc[14]),
xdc16 = _mm_loadu_pd(&dc[16]), xdc18 = _mm_loadu_pd(&dc[18]);
__m128d xdb0 = _mm_loadu_pd(&db[0]), xdb2 = _mm_loadu_pd(&db[2]),
xdb4 = _mm_loadu_pd(&db[4]), xdb6 = _mm_loadu_pd(&db[6]),
xdb8 = _mm_loadu_pd(&db[8]), xdb10 = _mm_loadu_pd(&db[10]),
xdb12 = _mm_loadu_pd(&db[12]), xdb14 = _mm_loadu_pd(&db[14]),
xdb16 = _mm_loadu_pd(&db[16]), xdb18 = _mm_loadu_pd(&db[18]);
__m128d xsum = _mm_setzero_pd();
xsum = MM_FMA5_PD(xdb0, xdc0, xdb2, xdc2, xdb4, xdc4, xdb6, xdc6, xdb8, xdc8);
// NOTE(review): this second assignment appears to overwrite xsum instead
// of accumulating into it, discarding taps 0-9 (unless MM_FMA5_PD itself
// folds in xsum -- macro definition not visible here); confirm
xsum = MM_FMA5_PD(xdb10, xdc10, xdb12, xdc12, xdb14, xdc14, xdb16, xdc16, xdb18, xdc18);
xsum = _mm_add_pd(xsum, _mm_shuffle_pd(xsum, xsum, 0x1)); // v0=v0+v1 v1=v1+v0
#if defined(DATA_T_FLOAT)
_mm_store_ss(sp, _mm_cvtsd_ss(_mm_setzero_ps(), xsum));
_mm_store_sd(sp, xsum);
// shift the delay line by one via overlapping stores (highest index first)
_mm_storeu_pd(&db[19], xdb18);
_mm_storeu_pd(&db[17], xdb16);
_mm_storeu_pd(&db[15], xdb14);
_mm_storeu_pd(&db[13], xdb12);
_mm_storeu_pd(&db[11], xdb10);
_mm_storeu_pd(&db[9], xdb8);
_mm_storeu_pd(&db[7], xdb6);
_mm_storeu_pd(&db[5], xdb4);
_mm_storeu_pd(&db[3], xdb2);
_mm_storeu_pd(&db[1], xdb0);
#elif (USE_X86_EXT_INTRIN >= 2) && defined(FLOAT_T_FLOAT)
FLOAT_T input = *sp;
__m128 xdc0 = _mm_loadu_ps(&dc[0]), xdc4 = _mm_loadu_ps(&dc[4]),
xdc8 = _mm_loadu_ps(&dc[8]), xdc12 = _mm_loadu_ps(&dc[12]),
xdc16 = _mm_loadu_ps(&dc[16]);
__m128 xdb0 = _mm_loadu_ps(&db[0]), xdb4 = _mm_loadu_ps(&db[4]),
xdb8 = _mm_loadu_ps(&db[8]), xdb12 = _mm_loadu_ps(&db[12]),
xdb16 = _mm_loadu_ps(&db[16]);
__m128 xsum = _mm_setzero_ps();
// NOTE(review): the xdb0/xdc0 pair is passed twice and six pairs are given
// to a macro named "FMA5" (five pairs expected) -- looks like a copy/paste
// error that double-counts taps 0-3; confirm against MM_FMA5_PS definition
xsum = MM_FMA5_PS(xdb0, xdc0, xdb0, xdc0, xdb4, xdc4, xdb8, xdc8, xdb12, xdc12, xdb16, xdc16);
xsum = _mm_add_ps(xsum, _mm_movehl_ps(xsum, xsum)); // v0=v0+v2 v1=v1+v3 v2=-- v3=--
xsum = _mm_add_ps(xsum, _mm_shuffle_ps(xsum, xsum, 0xe1)); // v0=v0+v1
#if defined(DATA_T_FLOAT)
_mm_store_ss(sp, xsum);
#if (USE_X86_EXT_INTRIN >= 3)
_mm_store_sd(sp, _mm_cvtss_sd(xsum));
_mm_store_ss(&out, xsum);
// shift the delay line by one via overlapping stores
_mm_storeu_ps(&db[17], xdb16);
_mm_storeu_ps(&db[13], xdb12);
_mm_storeu_ps(&db[9], xdb8);
_mm_storeu_ps(&db[5], xdb4);
_mm_storeu_ps(&db[1], xdb0);
// scalar fallback: unrolled 20-tap dot product
sum += db[0] * dc[0];
sum += db[1] * dc[1];
sum += db[2] * dc[2];
sum += db[3] * dc[3];
sum += db[4] * dc[4];
sum += db[5] * dc[5];
sum += db[6] * dc[6];
sum += db[7] * dc[7];
sum += db[8] * dc[8];
sum += db[9] * dc[9];
sum += db[10] * dc[10];
sum += db[11] * dc[11];
sum += db[12] * dc[12];
sum += db[13] * dc[13];
sum += db[14] * dc[14];
sum += db[15] * dc[15];
sum += db[16] * dc[16];
sum += db[17] * dc[17];
sum += db[18] * dc[18];
sum += db[19] * dc[19];
#else // ! (LPF_FIR_ORDER == 20)
// generic order: dot product then delay-line shift
for (i = 0; i < LPF_FIR_ORDER ;i++)
sum += db[i] * dc[i];
for (i = LPF_FIR_ORDER - 1; i >= 0; i--)
// designfir: windowed-sinc FIR design helper (defined later in the file).
static void designfir(FLOAT_T *g , FLOAT_T fc, FLOAT_T att);
// Recompute the FIR low-pass taps: design LPF_FIR_ORDER2 half-taps at
// 40 dB attenuation, then mirror them into dc[] (linear-phase symmetric).
static inline void recalc_filter_LPF_FIR(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T fir_coef[LPF_FIR_ORDER2];
if(FLT_FREQ_MARGIN){
f = fc->freq * fc->div_flt_rate * 2.0; // normalized cutoff
designfir(fir_coef, f, 40.0);
for (i = 0; i < LPF_FIR_ORDER2; i++)
dc[LPF_FIR_ORDER-1 - i] = dc[i] = fir_coef[LPF_FIR_ORDER2 - 1 - i]; // mirror
// shelving (common to the low/high shelving EQ routines below)
// Shelving EQ: direct-form biquad (shared by low/high shelf), followed by
// an overall gain-compensation multiply dc[8].
static inline void sample_filter_shelving(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
#if defined(DENORMAL_FIX)
db[2] += denormal_add; // keep recursion out of denormal range
db[3] = db[2]; // flt out
db[1] = db[0]; // flt in
*sp = db[3] * dc[8]; // spgain
// Recompute low-shelf biquad coefficients (RBJ audio-EQ-cookbook form):
// A = shelf gain from reso_dB, dc[8] = compensating output gain.
static inline void recalc_filter_shelving_low(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN || FLT_WIDTH_MARGIN){
A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain); // shelf amplitude
dc[8] = pow((FLOAT_T)10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce); // spgain
omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
a0 = 1.0 / ((A + 1) + (A - 1) * cs + beta * sn); // sn/cs from elided sin/cos lines
a1 = 2.0 * ((A - 1) + (A + 1) * cs);
a2 = -((A + 1) + (A - 1) * cs - beta * sn);
b0 = A * ((A + 1) - (A - 1) * cs + beta * sn);
b1 = 2.0 * A * ((A - 1) - (A + 1) * cs);
b2 = A * ((A + 1) - (A - 1) * cs - beta * sn);
// Recompute high-shelf biquad coefficients (RBJ audio-EQ-cookbook form);
// mirror of recalc_filter_shelving_low with the cs signs flipped.
static inline void recalc_filter_shelving_hi(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN || FLT_WIDTH_MARGIN){
A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain); // shelf amplitude
dc[8] = pow((FLOAT_T)10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce); // spgain
omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
a0 = 1.0 / ((A + 1) - (A - 1) * cs + beta * sn); // sn/cs from elided sin/cos lines
a1 = (-2.0 * ((A - 1) - (A + 1) * cs));
a2 = -((A + 1) - (A - 1) * cs - beta * sn);
b0 = A * ((A + 1) + (A - 1) * cs + beta * sn);
b1 = -2.0 * A * ((A - 1) + (A + 1) * cs);
b2 = A * ((A + 1) + (A - 1) * cs - beta * sn);
// Peaking EQ: direct-form biquad followed by gain compensation dc[8];
// same structure as sample_filter_shelving.
static inline void sample_filter_peaking(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
#if defined(DENORMAL_FIX)
db[2] += denormal_add; // keep recursion out of denormal range
db[3] = db[2]; // flt out
db[1] = db[0]; // flt in
*sp = db[3] * dc[8]; // spgain
// Recompute peaking-EQ biquad coefficients (RBJ audio-EQ-cookbook form).
// Feedback terms dc[3]/dc[4] are stored negated for the sample routine.
static inline void recalc_filter_peaking(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN || FLT_WIDTH_MARGIN){
A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_peaking_gain); // peak amplitude
dc[8] = pow((FLOAT_T)10.0, -(fc->reso_dB) * DIV_80 * ext_filter_peaking_reduce); // spgain
omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
beta = sn / (2.0 * fc->q * ext_filter_peaking_q); // q > 0
a0 = 1.0 / (1.0 + beta / A);
dc[4] = -(1.0 - beta / A) * a0; // -
dc[3] = -a1 * a0; // -
dc[2] = (1.0 - beta * A) * a0;
dc[1] = a1 * a0; // b1 = a1
dc[0] = (1.0 + beta * A) * a0;
// Generic biquad (used by biquad_low/biquad_hi): exploits dc[0]==dc[2] for
// the Butterworth form to merge two multiplies.
static inline void sample_filter_biquad(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
// db[2] = db[0] * dc[0] + db[1] * dc[1] + db[2] * dc[2] + db[3] * dc[3] + db[4] * dc[4]; // dc[0]=dc[2] BW
// (the line below is equivalent to the commented-out full form above)
db[2] = db[1] * dc[1] + (*sp + db[2]) * dc[2] + db[3] * dc[3] + db[4] * dc[4]; // -dc3 -dc4
#if defined(DENORMAL_FIX)
db[2] += denormal_add; // keep recursion out of denormal range
// Recompute biquad low-pass coefficients (RBJ cookbook LPF). Feedback
// terms dc[3]/dc[4] are stored negated for the sample routine.
static inline void recalc_filter_biquad_low(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
double a0, omega, sn, cs, alpha;
if(FLT_FREQ_MARGIN || FLT_WIDTH_MARGIN){
omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
alpha = sn / (2.0 * fc->q); // q > 0
a0 = 1.0 / (1.0 + alpha);
dc[1] = (1.0 - cs) * a0; // sn/cs from elided sin/cos lines
dc[2] = dc[0] = ((1.0 - cs) * DIV_2) * a0;
dc[3] = -(-2.0 * cs) * a0; // -
dc[4] = -(1.0 - alpha) * a0; // -
// Recompute biquad high-pass coefficients (RBJ cookbook HPF); mirror of
// recalc_filter_biquad_low with (1 +/- cs) swapped.
static inline void recalc_filter_biquad_hi(FilterCoefficients *fc)
FILTER_T *dc = fc->dc;
double a0, omega, sn, cs, alpha;
if(FLT_FREQ_MARGIN || FLT_WIDTH_MARGIN){
omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
alpha = sn / (2.0 * fc->q); // q > 0
a0 = 1.0 / (1.0 + alpha);
dc[1] = (-(1.0 + cs)) * a0; // sn/cs from elided sin/cos lines
dc[2] = dc[0] = ((1.0 + cs) * DIV_2) * a0;
dc[3] = -(-2.0 * cs) * a0; // -
dc[4] = -(1.0 - alpha) * a0; // -
#endif /* OPT_MODE == 1 */
// Per-type display names (the initializer lines are lost in this
// corrupted paste - only the declaration survives).
3394 const char *filter_name[] =
// Coefficient-recalculation hooks, indexed by filter type.  Order must
// stay in sync with sample_filters[] and sample_filter_limit_rate[] below.
// NOTE(review): this paste is missing several entries (e.g. orig. lines
// 3448-3449, 3455, 3468, 3476, 3486, 3488 and the closing brace) - do not
// treat the visible list as complete.
3445 typedef void (*recalc_filter_t)(FilterCoefficients *fc);
3447 static recalc_filter_t recalc_filters[] = {
3450 recalc_filter_LPF12,
3451 recalc_filter_LPF24,
3452 recalc_filter_LPF_BW,
3453 recalc_filter_LPF12_2,
3454 recalc_filter_LPF24_2,
3456 recalc_filter_LPF18,
3457 recalc_filter_LPF_TFO,
3458 recalc_filter_HPF_BW,
3459 recalc_filter_BPF_BW,
3460 recalc_filter_peak1,
3461 recalc_filter_peak1, // notch1
3462 recalc_filter_LPF12_3,
3463 recalc_filter_HPF12_3, // HPF12_3
3464 recalc_filter_BPF12_3, // BPF12_3
3465 recalc_filter_BCF12_3, // BCF12_3
3466 recalc_filter_LPF6, // HPF6
3467 recalc_filter_LPF12_2, // HPF12_2
3469 recalc_filter_HBF_L6L12,
3470 recalc_filter_HBF_L12L6,
3471 recalc_filter_HBF_L12H6,
3472 recalc_filter_HBF_L24H6,
3473 recalc_filter_HBF_L24H12,
3474 recalc_filter_HBF_L12OCT,
3475 recalc_filter_HBF_L24OCT,
// multi-pass variants reuse the single-pass recalc (same coefficients):
3477 recalc_filter_LPF6, // LPF6x2
3478 recalc_filter_LPF6, // LPF6x3
3479 recalc_filter_LPF6, // LPF6x4
3480 recalc_filter_LPF6, // LPF6x8
3481 recalc_filter_LPF6, // LPF6x16
3482 recalc_filter_LPF_BW, // LPF_BWx2
3483 recalc_filter_LPF_BW, // LPF_BWx3
3484 recalc_filter_LPF_BW, // LPF_BWx4
3485 recalc_filter_LPF24_2, // LPF24_2x2
3487 recalc_filter_LPF_FIR, // LPF_FIR
3489 recalc_filter_shelving_low, // eq_low
3490 recalc_filter_shelving_hi, // eq_hi
3491 recalc_filter_peaking, // eq_mid
3492 recalc_filter_biquad_low,
3493 recalc_filter_biquad_hi,
// Per-sample filter kernels, indexed by filter type; positions mirror
// recalc_filters[] above (each kernel consumes the coefficients its
// paired recalc function writes into fc->dc).
// NOTE(review): this paste is missing some entries (e.g. orig. lines
// 3499-3500, 3506, 3517, 3519, 3527, 3537, 3539 and the closing brace).
3496 typedef void (*sample_filter_t)(FILTER_T *dc, FILTER_T *db, DATA_T *sp);
3498 static sample_filter_t sample_filters[] = {
3501 sample_filter_LPF12,
3502 sample_filter_LPF24,
3503 sample_filter_LPF_BW,
3504 sample_filter_LPF12_2,
3505 sample_filter_LPF24_2,
3507 sample_filter_LPF18,
3508 sample_filter_LPF_TFO,
3509 sample_filter_HPF_BW,
3510 sample_filter_BPF_BW,
3511 sample_filter_peak1,
3512 sample_filter_notch1,
3513 sample_filter_LPF12_3,
3514 sample_filter_HPF12_3,
3515 sample_filter_BPF12_3,
3516 sample_filter_BCF12_3,
3518 sample_filter_HPF12_2,
3520 sample_filter_HBF_L6L12,
3521 sample_filter_HBF_L12L6,
3522 sample_filter_HBF_L12H6,
3523 sample_filter_HBF_L24H6,
3524 sample_filter_HBF_L24H12,
3525 sample_filter_HBF_L12OCT,
3526 sample_filter_HBF_L24OCT,
// multi-pass variants have dedicated kernels (unlike their shared recalcs):
3528 sample_filter_LPF6x2,
3529 sample_filter_LPF6x3,
3530 sample_filter_LPF6x4,
3531 sample_filter_LPF6x8,
3532 sample_filter_LPF6x16,
3533 sample_filter_LPF_BWx2,
3534 sample_filter_LPF_BWx3,
3535 sample_filter_LPF_BWx4,
3536 sample_filter_LPF24_2x2,
3538 sample_filter_LPF_FIR,
3540 sample_filter_shelving, // eq_low
3541 sample_filter_shelving, // eq_hi
3542 sample_filter_peaking, // eq_mid
3543 sample_filter_biquad,
3544 sample_filter_biquad,
3547 void set_sample_filter_type(FilterCoefficients *fc, int type)
3549 if(type < FILTER_NONE || type >= FILTER_LIST_MAX)
3551 if(!fc->init || fc->type != type)
3552 memset(fc, 0, sizeof(FilterCoefficients));
3554 fc->recalc_filter = recalc_filters[type];
3555 fc->sample_filter = sample_filters[type];
// Per-type cutoff ceiling, expressed as a fraction of the sample rate
// (fc->flt_rate_limit1 = rate * entry).  Indexed by filter type; must
// stay in sync with recalc_filters[] / sample_filters[] above.
// NOTE(review): entries for the x2..x16 multi-pass LPF6/LPF_BW types
// (orig. lines 3589-3597) and the closing brace are missing in this paste.
3560 const double sample_filter_limit_rate[] = {
3561 0.16666, // type0 OFF
3562 0.16666, // type1 Chamberlin 12dB/oct fc < rate / 6
3563 0.50000, // type2 Moog VCF 24dB/oct fc < rate / 2
3564 0.50000, // type3 butterworth fc < rate / 2 elion
3565 0.50000, // type4 Resonant IIR 12dB/oct fc < rate / 2
3566 0.50000, // type5 amSynth 24dB/oct fc < rate / 2
3567 0.50000, // type6 One pole 6dB/oct non rez fc < rate / 2
3568 0.44444, // type7 resonant 3 pole 18dB/oct fc < rate / 2.25
3569 0.50000, // type8 two first order fc < rate / 2
3571 0.50000, // type9 HPF butterworth fc < rate / 2 elion +
3572 0.50000, // type10 BPF butterworth fc < rate / 2 elion +
3573 0.50000, // type11 peak fc < rate / 2
3574 0.50000, // type12 notch fc < rate / 2
3575 0.21875, // type13 LPF Chamberlin2 12dB/oct fc < rate / 2
3576 0.21875, // type14 HPF Chamberlin2 12dB/oct fc < rate / 2
3577 0.21875, // type15 BPF Chamberlin2 12dB/oct fc < rate / 2
3578 0.21875, // type16 notch Chamberlin2 12dB/oct fc < rate / 2
3579 0.50000, // type17 HPF6
3580 0.50000, // type18 HPF12_2
3582 0.50000, // type19 L6L12
3583 0.50000, // type20 L12L6
3584 0.50000, // type21 L12H6
3585 0.50000, // type22 L24H6
3586 0.50000, // type23 L24H12
3587 0.50000, // type24 L12OCT
3588 0.50000, // type25 L24OCT
3598 0.50000, // LPF24_2x2
3602 0.50000, // FILTER_SHELVING_LOW, // q
3603 0.50000, // FILTER_SHELVING_HI, // q
3604 0.50000, // FILTER_PEAKING, // q
3605 0.50000, // FILTER_BIQUAD_LOW, // q
3606 0.50000, // FILTER_BIQUAD_HI, // q
3609 void set_sample_filter_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3612 fc->flt_rate = freq;
3613 fc->div_flt_rate = 1.0 / fc->flt_rate;
3614 fc->flt_rate_div2 = fc->flt_rate * DIV_2;
3616 fc->flt_rate = play_mode->rate;
3617 fc->div_flt_rate = div_playmode_rate;
3618 fc->flt_rate_div2 = playmode_rate_div2;
3621 fc->flt_rate_limit1 = fc->flt_rate * sample_filter_limit_rate[fc->type]; // sr*limit
3622 fc->flt_rate_limit2 = fc->flt_rate_limit1 * 2.0; // sr*2*limit
3623 fc->div_flt_rate_ov2 = fc->div_flt_rate * DIV_2; // 1/sr*2
3624 fc->div_flt_rate_ov3 = fc->div_flt_rate * DIV_3; // 1/sr*3
3627 void set_sample_filter_freq(FilterCoefficients *fc, FLOAT_T freq)
3629 if(fc->flt_rate == 0) // not init filter rate
3630 set_sample_filter_ext_rate(fc, 0);
3631 if(freq < 0 || freq > fc->flt_rate_div2) // sr/2
3632 fc->freq = fc->flt_rate_div2;
3633 else if(freq < 10.0)
// Clamp resonance (dB) to at most 96 dB (the "0 < reso < 96" contract in
// the file header).  NOTE(review): this corrupted paste kept only the
// upper-clamp assignment - the comparison lines and the pass-through /
// lower-clamp branches (orig. 3642-3650) are missing.
3639 void set_sample_filter_reso(FilterCoefficients *fc, FLOAT_T reso)
3641 const FLOAT_T limit = 96.0;
3644 fc->reso_dB = limit;
// Set the EQ/biquad Q factor: default 0.7 (~Butterworth), clamped to
// [0.01, 12.0].  NOTE(review): only the constants survive this corrupted
// paste - the clamping logic (orig. 3656-3665) is missing.
3651 void set_sample_filter_q(FilterCoefficients *fc, FLOAT_T q)
3653 const FLOAT_T def = 0.7;
3654 const FLOAT_T limit = 12.0;
3655 const FLOAT_T min = 0.01;
// Convenience initializer: type first (it may memset the state), then
// cutoff and resonance.  NOTE(review): the trailing lines (orig.
// 3672-3675 - presumably the default-q set and/or recalc call plus the
// closing brace) are missing from this corrupted paste.
3667 void init_sample_filter(FilterCoefficients *fc, FLOAT_T freq, FLOAT_T reso, int type)
3669 set_sample_filter_type(fc, type);
3670 set_sample_filter_freq(fc, freq);
3671 set_sample_filter_reso(fc, reso);
// Like init_sample_filter but with an explicit Q (for the EQ/biquad
// types).  NOTE(review): trailing lines (orig. 3682-3684, presumably a
// recalc call and the closing brace) are missing from this corrupted paste.
3676 void init_sample_filter2(FilterCoefficients *fc, FLOAT_T freq, FLOAT_T reso, FLOAT_T q, int type)
3678 set_sample_filter_type(fc, type);
3679 set_sample_filter_freq(fc, freq);
3680 set_sample_filter_reso(fc, reso);
3681 set_sample_filter_q(fc, q);
// Recompute coefficients via the type-bound recalc hook.
// NOTE(review): the guard condition before the early return (orig.
// 3687-3689) is missing in this corrupted paste; judging by the comment
// it presumably checks fc->init - confirm.
3686 void recalc_filter(FilterCoefficients *fc)
3690 return; // error not init
3692 fc->recalc_filter(fc);
3695 // sample_filter (1ch mono / 2ch left)
3696 inline void sample_filter(FilterCoefficients *fc, DATA_T *sp)
3698 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], sp);
3701 // sample_filter (2ch stereo)
3702 inline void sample_filter_stereo(FilterCoefficients *fc, DATA_T *spL, DATA_T *spR)
3704 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], spL);
3705 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], spR);
3708 inline void sample_filter_stereo2(FilterCoefficients *fc, DATA_T *spLR)
3710 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &spLR[0]);
3711 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], &spLR[1]);
3714 // sample_filter (2ch left)
3715 inline void sample_filter_left(FilterCoefficients *fc, DATA_T *sp)
3717 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], sp);
3720 // sample_filter (2ch left)
3721 inline void sample_filter_right(FilterCoefficients *fc, DATA_T *sp)
3723 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], sp);
3726 // buffer filter (1ch mono)
3727 inline void buffer_filter(FilterCoefficients *fc, DATA_T *sp, int32 count)
3733 return; // error not init
3736 return; // filter none
3738 if (fc->type == FILTER_LPF12_2) {
3739 recalc_filter_LPF12_2(fc);
3740 buffer_filter_LPF12_2(fc->dc, &fc->db[FILTER_FB_L], sp, count);
3744 fc->recalc_filter(fc);
3745 for(i = 0; i < count; i++)
3746 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &sp[i]);
3749 // buffer filter (2ch stereo)
3750 inline void buffer_filter_stereo(FilterCoefficients *fc, DATA_T *sp, int32 count)
3756 return; // error not init
3759 return; // filter none
3760 fc->recalc_filter(fc);
3761 for(i = 0; i < count; i++){
3762 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &sp[i]);
3764 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], &sp[i]);
3768 // buffer filter (2ch left)
3769 inline void buffer_filter_left(FilterCoefficients *fc, DATA_T *sp, int32 count)
3775 return; // error not init
3778 return; // filter none
3779 fc->recalc_filter(fc);
3780 for(i = 0; i < count; i++)
3781 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &sp[i++]);
3784 // buffer filter (2ch right)
3785 inline void buffer_filter_right(FilterCoefficients *fc, DATA_T *sp, int32 count)
3791 return; // error not init
3794 return; // filter none
3795 fc->recalc_filter(fc);
3796 for(i = 0; i < count; i++)
3797 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], &sp[++i]);
// voice filter 1 (LPF): thin wrappers that delegate straight to the
// generic sample-filter API.  NOTE(review): braces/bodies are partially
// missing in this corrupted paste; each wrapper visibly contains exactly
// one delegating call.
3804 void set_voice_filter1_type(FilterCoefficients *fc, int type)
3807 set_sample_filter_type(fc, type);
3810 void set_voice_filter1_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3812 set_sample_filter_ext_rate(fc, freq);
3815 void set_voice_filter1_freq(FilterCoefficients *fc, FLOAT_T freq)
3817 set_sample_filter_freq(fc, freq);
3820 void set_voice_filter1_reso(FilterCoefficients *fc, FLOAT_T reso)
3822 set_sample_filter_reso(fc, reso);
3825 void voice_filter1(FilterCoefficients *fc, DATA_T *sp, int32 count)
3827 buffer_filter(fc, sp, count);
// voice filter 2 (HPF): maps the small voice-filter-2 type enum onto the
// generic filter types, then delegates.  NOTE(review): conv table entries
// for indices 0, 1 and 3 (orig. 3836-3838, 3840, 3842) are missing in
// this corrupted paste, as are most braces.
3835 static int conv_type_voice_filter2[] = {
3839 FILTER_HPF12_3, // 2
3841 FILTER_HPF12_2, // 4
3844 void set_voice_filter2_type(FilterCoefficients *fc, int type)
// out-of-range types fall back to "no filter" before the table lookup
3847 if(type < VOICE_FILTER2_NONE || type >= VOICE_FILTER2_LIST_MAX)
3848 type = VOICE_FILTER2_NONE;
3849 set_sample_filter_type(fc, conv_type_voice_filter2[type]);
3852 void set_voice_filter2_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3854 set_sample_filter_ext_rate(fc, freq);
3857 void set_voice_filter2_freq(FilterCoefficients *fc, FLOAT_T freq)
3859 set_sample_filter_freq(fc, freq);
3862 void set_voice_filter2_reso(FilterCoefficients *fc, FLOAT_T reso)
3864 set_sample_filter_reso(fc, reso);
3867 void voice_filter2(FilterCoefficients *fc, DATA_T *sp, int32 count)
3869 buffer_filter(fc, sp, count);
3872 /// voice_filter1 + voice_filter2
3873 void voice_filter(int v, DATA_T *sp, int32 count)
3875 Voice *vp = &voice[v];
3877 buffer_filter(&vp->fc, sp, count); // lpf
3878 buffer_filter(&vp->fc2, sp, count); // hpf
// resample filter: maps the resample-filter type enum onto the generic
// filter types, then delegates.  NOTE(review): all entries of
// conv_type_resample_filter (orig. 3887-3899) are missing in this
// corrupted paste - only the declaration survives.
3886 static int conv_type_resample_filter[] = {
3900 void set_resample_filter_type(FilterCoefficients *fc, int type)
// out-of-range types fall back to "no filter" before the table lookup
3903 if(type < RESAMPLE_FILTER_NONE || type >= RESAMPLE_FILTER_LIST_MAX)
3904 type = RESAMPLE_FILTER_NONE;
3905 set_sample_filter_type(fc, conv_type_resample_filter[type]);
3908 void set_resample_filter_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3910 set_sample_filter_ext_rate(fc, freq);
3913 void set_resample_filter_freq(FilterCoefficients *fc, FLOAT_T freq)
3915 set_sample_filter_freq(fc, freq);
3918 void resample_filter(int v, DATA_T *sp, int32 count)
3920 buffer_filter(&voice[v].rf_fc, sp, count);
3924 #ifdef MIX_VOICE_BATCH
3926 #if MIX_VOICE_BATCH_SIZE != 8
3927 #error invalid MIX_VOICE_BATCH_SIZE
3930 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
3932 static inline __mmask8 generate_mask8_for_count(int32 offset, int32 count)
3934 if (offset < count) {
3935 if (offset + 8 <= count)
3938 return (1 << (count - offset)) - 1;
3946 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
// AVX-512 batch Butterworth LPF: processes up to 8 voices in parallel by
// transposing per-voice coefficients (dcs) and feedback state (dbs) into
// SoA vectors (one zmm lane per voice), running 8 samples per voice per
// outer j-step, then transposing results back.  Lanes for voices past
// batch_size and samples past counts[v] are masked off.
// NOTE(review): this corrupted paste is missing interior lines - e.g. the
// loop-exit after the batch_size check (orig. 3952-3953), the vin/vsps/
// vout declarations, and notably the masked store of the filter output
// back into vsps[k] between orig. 4076 and 4079 - do not treat the
// visible body as complete or compilable.
3948 static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
3950 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
3951 if (i >= batch_size)
// per-voice sample counts, zero for lanes past batch_size
3954 __m256i vcounts = _mm256_set_epi32(
3955 i + 7 < batch_size ? counts[i + 7] : 0,
3956 i + 6 < batch_size ? counts[i + 6] : 0,
3957 i + 5 < batch_size ? counts[i + 5] : 0,
3958 i + 4 < batch_size ? counts[i + 4] : 0,
3959 i + 3 < batch_size ? counts[i + 3] : 0,
3960 i + 2 < batch_size ? counts[i + 2] : 0,
3961 i + 1 < batch_size ? counts[i + 1] : 0,
// gather + transpose feedback state db[0..4] into per-index vectors
3965 __m256d vdb0123_0 = _mm256_loadu_pd(&dbs[i][0]);
3966 __m256d vdb0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dbs[i + 1][0]) : _mm256_setzero_pd();
3967 __m256d vdb0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dbs[i + 2][0]) : _mm256_setzero_pd();
3968 __m256d vdb0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dbs[i + 3][0]) : _mm256_setzero_pd();
3969 __m256d vdb0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(&dbs[i + 4][0]) : _mm256_setzero_pd();
3970 __m256d vdb0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(&dbs[i + 5][0]) : _mm256_setzero_pd();
3971 __m256d vdb0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(&dbs[i + 6][0]) : _mm256_setzero_pd();
3972 __m256d vdb0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(&dbs[i + 7][0]) : _mm256_setzero_pd();
3974 __m512d vdb0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_0), vdb0123_2, 1);
3975 __m512d vdb0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_1), vdb0123_3, 1);
3976 __m512d vdb0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_4), vdb0123_6, 1);
3977 __m512d vdb0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_5), vdb0123_7, 1);
3979 __m512d vdb01_0246 = _mm512_shuffle_f64x2(vdb0123_02, vdb0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
3980 __m512d vdb01_1357 = _mm512_shuffle_f64x2(vdb0123_13, vdb0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
3981 __m512d vdb23_0246 = _mm512_shuffle_f64x2(vdb0123_02, vdb0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
3982 __m512d vdb23_1357 = _mm512_shuffle_f64x2(vdb0123_13, vdb0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
3984 __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
3985 __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
3986 __m512d vdb2 = _mm512_unpacklo_pd(vdb23_0246, vdb23_1357);
3987 __m512d vdb3 = _mm512_unpackhi_pd(vdb23_0246, vdb23_1357);
3988 __m512d vdb4 = _mm512_set_pd(
3989 i + 7 < batch_size ? dbs[i + 7][4] : 0.0,
3990 i + 6 < batch_size ? dbs[i + 6][4] : 0.0,
3991 i + 5 < batch_size ? dbs[i + 5][4] : 0.0,
3992 i + 4 < batch_size ? dbs[i + 4][4] : 0.0,
3993 i + 3 < batch_size ? dbs[i + 3][4] : 0.0,
3994 i + 2 < batch_size ? dbs[i + 2][4] : 0.0,
3995 i + 1 < batch_size ? dbs[i + 1][4] : 0.0,
// gather + transpose coefficients dc[0..4] the same way
3999 __m256d vdc0123_0 = _mm256_loadu_pd(&dcs[i][0]);
4000 __m256d vdc0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dcs[i + 1][0]) : _mm256_setzero_pd();
4001 __m256d vdc0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dcs[i + 2][0]) : _mm256_setzero_pd();
4002 __m256d vdc0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dcs[i + 3][0]) : _mm256_setzero_pd();
4003 __m256d vdc0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(&dcs[i + 4][0]) : _mm256_setzero_pd();
4004 __m256d vdc0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(&dcs[i + 5][0]) : _mm256_setzero_pd();
4005 __m256d vdc0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(&dcs[i + 6][0]) : _mm256_setzero_pd();
4006 __m256d vdc0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(&dcs[i + 7][0]) : _mm256_setzero_pd();
4008 __m512d vdc0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_0), vdc0123_2, 1);
4009 __m512d vdc0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_1), vdc0123_3, 1);
4010 __m512d vdc0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_4), vdc0123_6, 1);
4011 __m512d vdc0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_5), vdc0123_7, 1);
4013 __m512d vdc01_0246 = _mm512_shuffle_f64x2(vdc0123_02, vdc0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4014 __m512d vdc01_1357 = _mm512_shuffle_f64x2(vdc0123_13, vdc0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4015 __m512d vdc23_0246 = _mm512_shuffle_f64x2(vdc0123_02, vdc0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4016 __m512d vdc23_1357 = _mm512_shuffle_f64x2(vdc0123_13, vdc0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4018 __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
4019 __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
4020 __m512d vdc2 = _mm512_unpacklo_pd(vdc23_0246, vdc23_1357);
4021 __m512d vdc3 = _mm512_unpackhi_pd(vdc23_0246, vdc23_1357);
4022 __m512d vdc4 = _mm512_set_pd(
4023 i + 7 < batch_size ? dcs[i + 7][4] : 0.0,
4024 i + 6 < batch_size ? dcs[i + 6][4] : 0.0,
4025 i + 5 < batch_size ? dcs[i + 5][4] : 0.0,
4026 i + 4 < batch_size ? dcs[i + 4][4] : 0.0,
4027 i + 3 < batch_size ? dcs[i + 3][4] : 0.0,
4028 i + 2 < batch_size ? dcs[i + 2][4] : 0.0,
4029 i + 1 < batch_size ? dcs[i + 1][4] : 0.0,
// horizontal max of vcounts -> longest voice drives the j loop
4033 __m128i vcounts_halfmax = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
4034 vcounts_halfmax = _mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, (3 << 2) | 2));
4035 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
4037 for (int32 j = 0; j < count_max; j += 8) {
// masked-load 8 samples per voice (NOTE(review): vin declaration at
// orig. 4038 is missing in this paste)
4039 vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
4041 for (int k = 1; k < 8; k++)
4042 vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, & sps[i + k][j]);
// 8x8 transpose: voice-major -> sample-major
4044 __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
4045 __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
4046 __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
4047 __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
4048 __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
4049 __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
4050 __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
4051 __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
4053 __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4054 __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4055 __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4056 __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4057 __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4058 __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4059 __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4060 __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4063 vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4064 vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4065 vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4066 vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4067 vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4068 vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4069 vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4070 vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
// one biquad step per sample k across all 8 voices, masked per lane
4072 for (int k = 0; k < 8; k++) {
4073 __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
4075 vdb0 = _mm512_mask_mov_pd(vdb0, kmask, vsps[k]);
4076 vdb2 = _mm512_mask_fmadd_pd(vdb2, kmask, vdc2, _mm512_add_pd(_mm512_fmadd_pd(vdc0, vdb0, _mm512_mul_pd(vdc1, vdb1)), _mm512_fmadd_pd(vdc3, vdb3, _mm512_mul_pd(vdc4, vdb4))));
// NOTE(review): the write of the output (vdb2) back into vsps[k]
// (orig. 4077/4078) is missing from this paste.
4079 vdb2 = _mm512_mask_add_pd(vdb2, kmask, vdb2, _mm512_set1_pd(denormal_add));
// shift history: y2<-y1, y1<-y(new), x2<-x1, x1<-x0
4081 vdb4 = _mm512_mask_mov_pd(vdb4, kmask, vdb3);
4082 vdb3 = _mm512_mask_mov_pd(vdb3, kmask, vdb2);
4083 vdb2 = _mm512_mask_mov_pd(vdb2, kmask, vdb1);
4084 vdb1 = _mm512_mask_mov_pd(vdb1, kmask, vdb0);
// transpose results back: sample-major -> voice-major
4088 __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
4089 __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
4090 __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
4091 __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
4092 __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
4093 __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
4094 __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
4095 __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
4097 __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4098 __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4099 __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4100 __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4101 __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4102 __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4103 __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4104 __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4107 vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4108 vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4109 vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4110 vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4111 vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4112 vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4113 vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4114 vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4116 for (int k = 0; k < batch_size; k++)
4117 _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
// scatter updated feedback state back to each voice's db[]
4120 vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
4121 vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
4122 vdb23_0246 = _mm512_unpacklo_pd(vdb2, vdb3);
4123 vdb23_1357 = _mm512_unpackhi_pd(vdb2, vdb3);
4125 __m512d vdb0123_04 = _mm512_permutex2var_pd(vdb01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdb23_0246);
4126 __m512d vdb0123_15 = _mm512_permutex2var_pd(vdb01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdb23_1357);
4127 __m512d vdb0123_26 = _mm512_permutex2var_pd(vdb01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdb23_0246);
4128 __m512d vdb0123_37 = _mm512_permutex2var_pd(vdb01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdb23_1357);
4130 _mm256_storeu_pd(&dbs[i][0], _mm512_castpd512_pd256(vdb0123_04));
4131 _mm_storel_pd(&dbs[i][4], _mm512_castpd512_pd128(vdb4));
4133 if (i + 1 < batch_size) {
4134 _mm256_storeu_pd(&dbs[i + 1][0], _mm512_castpd512_pd256(vdb0123_15));
4135 _mm_storeh_pd(&dbs[i + 1][4], _mm512_castpd512_pd128(vdb4));
4138 if (i + 2 < batch_size) {
4139 _mm256_storeu_pd(&dbs[i + 2][0], _mm512_castpd512_pd256(vdb0123_26));
4140 _mm_storel_pd(&dbs[i + 2][4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb4), 1));
4143 if (i + 3 < batch_size) {
4144 _mm256_storeu_pd(&dbs[i + 3][0], _mm512_castpd512_pd256(vdb0123_37));
4145 _mm_storeh_pd(&dbs[i + 3][4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb4), 1));
4148 if (i + 4 < batch_size) {
4149 _mm256_storeu_pd(&dbs[i + 4][0], _mm512_extractf64x4_pd(vdb0123_04, 1));
4150 _mm_storel_pd(&dbs[i + 4][4], _mm512_extractf64x2_pd(vdb4, 2));
4153 if (i + 5 < batch_size) {
4154 _mm256_storeu_pd(&dbs[i + 5][0], _mm512_extractf64x4_pd(vdb0123_15, 1));
4155 _mm_storeh_pd(&dbs[i + 5][4], _mm512_extractf64x2_pd(vdb4, 2));
4158 if (i + 6 < batch_size) {
4159 _mm256_storeu_pd(&dbs[i + 6][0], _mm512_extractf64x4_pd(vdb0123_26, 1));
4160 _mm_storel_pd(&dbs[i + 6][4], _mm512_extractf64x2_pd(vdb4, 3));
4163 if (i + 7 < batch_size) {
4164 _mm256_storeu_pd(&dbs[i + 7][0], _mm512_extractf64x4_pd(vdb0123_37, 1));
4165 _mm_storeh_pd(&dbs[i + 7][4], _mm512_extractf64x2_pd(vdb4, 3));
4170 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
// AVX2 batch Butterworth LPF: same scheme as the AVX-512 version but 4
// voices per group (one ymm lane per voice), 4 samples per j-step.
// NOTE(review): this corrupted paste is missing interior lines (loop
// exit, vsps declaration, closing braces); two genuine bugs are flagged
// with FIXME below.
4172 static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
4174 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
4175 if (i >= batch_size)
// per-voice sample counts, zero for lanes past batch_size
4178 __m128i vcounts = _mm_set_epi32(
4179 i + 3 < batch_size ? counts[i + 3] : 0,
4180 i + 2 < batch_size ? counts[i + 2] : 0,
4181 i + 1 < batch_size ? counts[i + 1] : 0,
// gather + transpose feedback state db[0..4] into per-index vectors
4185 __m256d vdb0123_0 = _mm256_loadu_pd(&dbs[i][0]);
4186 __m256d vdb0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dbs[i + 1][0]) : _mm256_setzero_pd();
4187 __m256d vdb0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dbs[i + 2][0]) : _mm256_setzero_pd();
4188 __m256d vdb0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dbs[i + 3][0]) : _mm256_setzero_pd();
4190 __m256d vdb01_02 = _mm256_permute2f128_pd(vdb0123_0, vdb0123_2, (2 << 4) | 0);
4191 __m256d vdb01_13 = _mm256_permute2f128_pd(vdb0123_1, vdb0123_3, (2 << 4) | 0);
4192 __m256d vdb23_02 = _mm256_permute2f128_pd(vdb0123_0, vdb0123_2, (3 << 4) | 1);
4193 __m256d vdb23_13 = _mm256_permute2f128_pd(vdb0123_1, vdb0123_3, (3 << 4) | 1);
4195 __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13);
4196 __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13);
4197 __m256d vdb2 = _mm256_unpacklo_pd(vdb23_02, vdb23_13);
4198 __m256d vdb3 = _mm256_unpackhi_pd(vdb23_02, vdb23_13);
4199 __m256d vdb4 = _mm256_set_pd(
4200 i + 3 < batch_size ? dbs[i + 3][4] : 0.0,
4201 i + 2 < batch_size ? dbs[i + 2][4] : 0.0,
4202 i + 1 < batch_size ? dbs[i + 1][4] : 0.0,
// gather + transpose coefficients dc[0..4] the same way
4206 __m256d vdc0123_0 = _mm256_loadu_pd(&dcs[i][0]);
4207 __m256d vdc0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dcs[i + 1][0]) : _mm256_setzero_pd();
4208 __m256d vdc0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dcs[i + 2][0]) : _mm256_setzero_pd();
4209 __m256d vdc0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dcs[i + 3][0]) : _mm256_setzero_pd();
4211 __m256d vdc01_02 = _mm256_permute2f128_pd(vdc0123_0, vdc0123_2, (2 << 4) | 0);
4212 __m256d vdc01_13 = _mm256_permute2f128_pd(vdc0123_1, vdc0123_3, (2 << 4) | 0);
4213 __m256d vdc23_02 = _mm256_permute2f128_pd(vdc0123_0, vdc0123_2, (3 << 4) | 1);
4214 __m256d vdc23_13 = _mm256_permute2f128_pd(vdc0123_1, vdc0123_3, (3 << 4) | 1);
4216 __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13);
4217 __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13);
4218 __m256d vdc2 = _mm256_unpacklo_pd(vdc23_02, vdc23_13);
4219 __m256d vdc3 = _mm256_unpackhi_pd(vdc23_02, vdc23_13);
4220 __m256d vdc4 = _mm256_set_pd(
4221 i + 3 < batch_size ? dcs[i + 3][4] : 0.0,
4222 i + 2 < batch_size ? dcs[i + 2][4] : 0.0,
4223 i + 1 < batch_size ? dcs[i + 1][4] : 0.0,
// horizontal max of vcounts -> longest voice drives the j loop
4227 __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2));
4228 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
4230 for (int32 j = 0; j < count_max; j += 4) {
4231 __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd();
4232 __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd();
// FIXME: the next two guards test "i + 1 < batch_size" but index
// counts[i + 2] / sps[i + 2] and counts[i + 3] / sps[i + 3] -
// copy-paste bug; they should test i + 2 / i + 3 respectively
// (out-of-bounds read when batch_size is i+2 or i+3).
4233 __m256d vsp0123_2 = i + 1 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
4234 __m256d vsp0123_3 = i + 1 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
// 4x4 transpose: voice-major -> sample-major
4236 __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0);
4237 __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0);
4238 __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1);
4239 __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1);
4242 vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
4243 vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
4244 vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
4245 vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
// one biquad step per sample k across all 4 voices, blended per lane
4247 for (int k = 0; k < 4; k++) {
4248 __m256d vmask = _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
4250 vdb0 = _mm256_blendv_pd(vdb0, vsps[k], vmask);
4251 vdb2 = _mm256_blendv_pd(vdb2, MM256_FMA_PD(vdc0, vdb0, MM256_FMA4_PD(vdc1, vdb1, vdc2, vdb2, vdc3, vdb3, vdc4, vdb4)), vmask);
// FIXME: _mm_set1_pd returns __m128d but is passed to the 256-bit
// _mm256_add_pd - type error; should be _mm256_set1_pd(denormal_add).
4254 vdb2 = _mm256_blendv_pd(vdb2, _mm256_add_pd(vdb2, _mm_set1_pd(denormal_add)), vmask);
4256 vdb4 = _mm256_blendv_pd(vdb4, vdb3, vmask);
4257 vdb3 = _mm256_blendv_pd(vdb3, vdb2, vmask);
4258 vdb2 = _mm256_blendv_pd(vdb2, vdb1, vmask);
4259 vdb1 = _mm256_blendv_pd(vdb1, vdb0, vmask);
// transpose results back: sample-major -> voice-major
4263 vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]);
4264 vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
4265 vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
4266 vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
4268 vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
4269 vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
4270 vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
4271 vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
4274 _mm256_storeu_pd(&sps[i][j], vsp0123_0);
4276 if (i + 1 < batch_size && j < counts[i + 1])
4277 _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
4279 if (i + 2 < batch_size && j < counts[i + 2])
4280 _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
4282 if (i + 3 < batch_size && j < counts[i + 3])
4283 _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
// scatter updated feedback state back to each voice's db[]
4286 vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1);
4287 vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1);
4288 vdb23_02 = _mm256_unpacklo_pd(vdb2, vdb3);
4289 vdb23_13 = _mm256_unpackhi_pd(vdb2, vdb3);
4291 vdb0123_0 = _mm256_permute2f128_pd(vdb01_02, vdb23_02, (2 << 4) | 0);
4292 vdb0123_1 = _mm256_permute2f128_pd(vdb01_13, vdb23_13, (2 << 4) | 0);
4293 vdb0123_2 = _mm256_permute2f128_pd(vdb01_02, vdb23_02, (3 << 4) | 1);
4294 vdb0123_3 = _mm256_permute2f128_pd(vdb01_13, vdb23_13, (3 << 4) | 1);
4296 _mm256_storeu_pd(&dbs[i][0], vdb0123_0);
4297 _mm_storel_pd(&dbs[i][4], _mm256_castpd256_pd128(vdb4));
4299 if (i + 1 < batch_size) {
4300 _mm256_storeu_pd(&dbs[i + 1][0], vdb0123_1);
4301 _mm_storeh_pd(&dbs[i + 1][4], _mm256_castpd256_pd128(vdb4));
4304 if (i + 2 < batch_size) {
4305 _mm256_storeu_pd(&dbs[i + 2][0], vdb0123_2);
4306 _mm_storel_pd(&dbs[i + 2][4], _mm256_extractf128_pd(vdb4, 1));
4309 if (i + 3 < batch_size) {
4310 _mm256_storeu_pd(&dbs[i + 3][0], vdb0123_3);
4311 _mm_storeh_pd(&dbs[i + 3][4], _mm256_extractf128_pd(vdb4, 1));
4316 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * SSE2 variant: run the Butterworth LPF over a batch of voices,
 * two voices per iteration (one voice per __m128d lane).
 *
 * batch_size  number of valid voices (lanes past it are zero-filled/masked)
 * dcs[v][0..4]  filter coefficients of voice v
 * dbs[v][0..4]  filter state/history of voice v, updated in place
 * sps[v]        sample buffer of voice v, filtered in place
 * counts[v]     number of samples to process for voice v
 */
4318 static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
4320 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
4321 if (i >= batch_size)
/* Per-lane sample counts; an absent voice gets 0 so its lane never activates. */
4324 __m128i vcounts = _mm_set_epi32(
4327 i + 1 < batch_size ? counts[i + 1] : 0,
/* Load both voices' filter state and transpose from per-voice arrays (AoS)
 * to one vector per state element (SoA): lane 0 = voice i, lane 1 = voice i+1. */
4331 __m128d vdb01_0 = _mm_loadu_pd(&dbs[i][0]);
4332 __m128d vdb23_0 = _mm_loadu_pd(&dbs[i][2]);
4333 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(&dbs[i + 1][0]) : _mm_setzero_pd();
4334 __m128d vdb23_1 = i + 1 < batch_size ? _mm_loadu_pd(&dbs[i + 1][2]) : _mm_setzero_pd();
4336 __m128d vdb0 = _mm_unpacklo_pd(vdb01_0, vdb01_1);
4337 __m128d vdb1 = _mm_unpackhi_pd(vdb01_0, vdb01_1);
4338 __m128d vdb2 = _mm_unpacklo_pd(vdb23_0, vdb23_1);
4339 __m128d vdb3 = _mm_unpackhi_pd(vdb23_0, vdb23_1);
4340 __m128d vdb4 = _mm_set_pd(
4341 i + 1 < batch_size ? dbs[i + 1][4] : 0.0,
/* Same AoS->SoA transpose for the five coefficients. */
4345 __m128d vdc01_0 = _mm_loadu_pd(&dcs[i][0]);
4346 __m128d vdc23_0 = _mm_loadu_pd(&dcs[i][2]);
4347 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(&dcs[i + 1][0]) : _mm_setzero_pd();
4348 __m128d vdc23_1 = i + 1 < batch_size ? _mm_loadu_pd(&dcs[i + 1][2]) : _mm_setzero_pd();
4350 __m128d vdc0 = _mm_unpacklo_pd(vdc01_0, vdc01_1);
4351 __m128d vdc1 = _mm_unpackhi_pd(vdc01_0, vdc01_1);
4352 __m128d vdc2 = _mm_unpacklo_pd(vdc23_0, vdc23_1);
4353 __m128d vdc3 = _mm_unpackhi_pd(vdc23_0, vdc23_1);
4354 __m128d vdc4 = _mm_set_pd(
4355 i + 1 < batch_size ? dcs[i + 1][4] : 0.0,
/* Iterate to the longer of the two voices; shorter lanes are masked below. */
4359 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, 1)));
4361 for (int32 j = 0; j < count_max; j += 2) {
/* Load two samples per voice (zeros past a voice's own count) and
 * transpose so vsps[k] holds sample j+k of both voices. */
4362 __m128d vsp01_0 = j < counts[i] ? _mm_loadu_pd(&sps[i][j]) : _mm_setzero_pd();
4363 __m128d vsp01_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm_loadu_pd(&sps[i + 1][j]) : _mm_setzero_pd();
4366 vsps[0] = _mm_unpacklo_pd(vsp01_0, vsp01_1);
4367 vsps[1] = _mm_unpackhi_pd(vsp01_0, vsp01_1);
4369 for (int k = 0; k < 2; k++) {
/* All-ones in a lane iff j + k < counts for that voice; widened from
 * the 32-bit compare to a 64-bit double-lane mask. */
4370 __m128d vmask = _mm_castsi128_pd(_mm_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
4372 #if USE_X86_EXT_INTRIN >= 6
/* SSE4.1 path: blendv applies the filter step only in active lanes.
 * MM_FMA5_PD accumulates the 5-term dc[0..4]*db[0..4] recurrence, then
 * denormal_add nudges the state away from denormal-range slowdowns,
 * and the history pipeline shifts db0->db1->...->db4. */
4373 vdb0 = _mm_blendv_pd(vdb0, vsps[k], vmask);
4374 vdb2 = _mm_blendv_pd(vdb2, MM_FMA5_PD(vdc0, vdb0, vdc1, vdb1, vdc2, vdb2, vdc3, vdb3, vdc4, vdb4), vmask);
4377 vdb2 = _mm_blendv_pd(vdb2, _mm_add_pd(vdb2, _mm_set1_pd(denormal_add)), vmask);
4379 vdb4 = _mm_blendv_pd(vdb4, vdb3, vmask);
4380 vdb3 = _mm_blendv_pd(vdb3, vdb2, vmask);
4381 vdb2 = _mm_blendv_pd(vdb2, vdb1, vmask);
4382 vdb1 = _mm_blendv_pd(vdb1, vdb0, vmask);
/* Plain-SSE2 path: identical steps, with blendv emulated via and/andnot/or. */
4384 vdb0 = _mm_or_pd(_mm_andnot_pd(vmask, vdb0), _mm_and_pd(vmask, vsps[k]));
4385 vdb2 = _mm_or_pd(_mm_andnot_pd(vmask, vdb2), _mm_and_pd(vmask, MM_FMA5_PD(vdc0, vdb0, vdc1, vdb1, vdc2, vdb2, vdc3, vdb3, vdc4, vdb4)));
4388 vdb2 = _mm_or_pd(_mm_andnot_pd(vmask, vdb2), _mm_and_pd(vmask, _mm_add_pd(vdb2, _mm_set1_pd(denormal_add))));
4390 vdb4 = _mm_or_pd(_mm_andnot_pd(vmask, vdb4), _mm_and_pd(vmask, vdb3));
4391 vdb3 = _mm_or_pd(_mm_andnot_pd(vmask, vdb3), _mm_and_pd(vmask, vdb2));
4392 vdb2 = _mm_or_pd(_mm_andnot_pd(vmask, vdb2), _mm_and_pd(vmask, vdb1));
4393 vdb1 = _mm_or_pd(_mm_andnot_pd(vmask, vdb1), _mm_and_pd(vmask, vdb0));
/* Transpose the (now filtered) sample pair back to per-voice order and
 * store only into voices that still have samples at index j. */
4398 vsp01_0 = _mm_unpacklo_pd(vsps[0], vsps[1]);
4399 vsp01_1 = _mm_unpackhi_pd(vsps[0], vsps[1]);
4402 _mm_storeu_pd(&sps[i][j], vsp01_0);
4404 if (i + 1 < batch_size && j < counts[i + 1])
4405 _mm_storeu_pd(&sps[i + 1][j], vsp01_1);
/* Transpose the updated filter state back and write it to each voice. */
4408 vdb01_0 = _mm_unpacklo_pd(vdb0, vdb1);
4409 vdb01_1 = _mm_unpackhi_pd(vdb0, vdb1);
4410 vdb23_0 = _mm_unpacklo_pd(vdb2, vdb3);
4411 vdb23_1 = _mm_unpackhi_pd(vdb2, vdb3);
4413 _mm_storeu_pd(&dbs[i][0], vdb01_0);
4414 _mm_storeu_pd(&dbs[i][2], vdb23_0);
4415 _mm_storel_pd(&dbs[i][4], vdb4);
4417 if (i + 1 < batch_size) {
4418 _mm_storeu_pd(&dbs[i + 1][0], vdb01_1);
4419 _mm_storeu_pd(&dbs[i + 1][2], vdb23_1);
4420 _mm_storeh_pd(&dbs[i + 1][4], vdb4);
4427 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * AVX-512 variant: recompute Butterworth LPF coefficients for a batch of
 * voices, eight at a time (one voice per __m512d lane).
 *
 * A voice is recalculated only when its current freq/reso_dB has left the
 * cached hysteresis window fc->range[0..3]; the window is then re-centered
 * at freq*(1 +- ext_filter_margin) and reso_dB*(1 +- ext_filter_margin).
 *
 * NOTE(review): several gathers/stores below index fcs[0..7] / "7 < batch_size"
 * instead of fcs[i + 0..7] / "i + 7 < batch_size" (compare the AVX2 variant,
 * which consistently uses i + k). That is only correct if this outer loop
 * executes a single iteration, i.e. MIX_VOICE_BATCH_SIZE == 8 — confirm.
 */
4429 static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
4431 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
4432 if (i >= batch_size)
/* Gather each voice's 4-element range[] and transpose 8x4 (AoS -> SoA) so
 * vfcrange0..3 each hold one range bound across all 8 voice lanes. */
4435 __m256d vfcrange0123[8];
4436 vfcrange0123[0] = _mm256_loadu_pd(fcs[i]->range);
4438 for (int j = 1; j < 8; j++)
4439 vfcrange0123[j] = i + j < batch_size ? _mm256_loadu_pd(fcs[i + j]->range) : _mm256_setzero_pd();
4441 __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[0]), vfcrange0123[2], 1);
4442 __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[1]), vfcrange0123[3], 1);
4443 __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[4]), vfcrange0123[6], 1);
4444 __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[5]), vfcrange0123[7], 1);
4446 __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4447 __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4448 __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4449 __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4451 __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
4452 __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
4453 __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
4454 __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
/* Current cutoff frequency per lane (absolute indices — see NOTE above). */
4456 __m512d vfcfreq = _mm512_set_pd(
4457 7 < batch_size ? fcs[7]->freq : 0.0,
4458 6 < batch_size ? fcs[6]->freq : 0.0,
4459 5 < batch_size ? fcs[5]->freq : 0.0,
4460 4 < batch_size ? fcs[4]->freq : 0.0,
4461 3 < batch_size ? fcs[3]->freq : 0.0,
4462 2 < batch_size ? fcs[2]->freq : 0.0,
4463 1 < batch_size ? fcs[1]->freq : 0.0,
/* Current resonance per lane. */
4467 __m512d vfcreso_DB = _mm512_set_pd(
4468 7 < batch_size ? fcs[7]->reso_dB : 0.0,
4469 6 < batch_size ? fcs[6]->reso_dB : 0.0,
4470 5 < batch_size ? fcs[5]->reso_dB : 0.0,
4471 4 < batch_size ? fcs[4]->reso_dB : 0.0,
4472 3 < batch_size ? fcs[3]->reso_dB : 0.0,
4473 2 < batch_size ? fcs[2]->reso_dB : 0.0,
4474 1 < batch_size ? fcs[1]->reso_dB : 0.0,
/* Bit k of imask = voice k's freq or reso_dB left its cached window;
 * the final AND limits the mask to valid voices. */
4478 uint8 imask = _kor_mask8(
4479 _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
4480 _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
4481 ) & ((1 << (batch_size - i)) - 1);
/* Re-center the hysteresis windows around the current parameters. */
4484 __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
4485 __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
4487 vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
4488 vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
4489 vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
4490 vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
/* Transpose SoA back to per-voice range[0..3] layout. */
4492 vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
4493 vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
4494 vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
4495 vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
/* Two equivalent lane-regrouping strategies (selected by a build option). */
4498 __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
4499 __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
4500 __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
4501 __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
4503 __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
4504 __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
4505 __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
4506 __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
/* Store new windows only for voices whose imask bit says "recalculate". */
4510 _mm256_storeu_pd(fcs[0]->range, _mm512_castpd512_pd256(vfcrange0123_04));
4512 if (imask & (1 << 1))
4513 _mm256_storeu_pd(fcs[1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
4515 if (imask & (1 << 2))
4516 _mm256_storeu_pd(fcs[2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
4518 if (imask & (1 << 3))
4519 _mm256_storeu_pd(fcs[3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
4521 if (imask & (1 << 4))
4522 _mm256_storeu_pd(fcs[4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
4524 if (imask & (1 << 5))
4525 _mm256_storeu_pd(fcs[5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
4527 if (imask & (1 << 6))
4528 _mm256_storeu_pd(fcs[6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
4530 if (imask & (1 << 7))
4531 _mm256_storeu_pd(fcs[7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
/* Inactive lanes reuse voice 0's rate so the math below stays finite. */
4533 __m512d vfcdiv_flt_rate = _mm512_set_pd(
4534 7 < batch_size ? fcs[7]->div_flt_rate : fcs[0]->div_flt_rate,
4535 6 < batch_size ? fcs[6]->div_flt_rate : fcs[0]->div_flt_rate,
4536 5 < batch_size ? fcs[5]->div_flt_rate : fcs[0]->div_flt_rate,
4537 4 < batch_size ? fcs[4]->div_flt_rate : fcs[0]->div_flt_rate,
4538 3 < batch_size ? fcs[3]->div_flt_rate : fcs[0]->div_flt_rate,
4539 2 < batch_size ? fcs[2]->div_flt_rate : fcs[0]->div_flt_rate,
4540 1 < batch_size ? fcs[1]->div_flt_rate : fcs[0]->div_flt_rate,
4541 fcs[0]->div_flt_rate
/* f = pi * freq / sample_rate (div_flt_rate is the reciprocal rate). */
4544 __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI), vfcfreq), vfcdiv_flt_rate);
/* tan(f): vectorized via SVML when available, else scalar fallback. */
4547 __m512d vtanf = _mm512_tan_pd(vf);
4549 ALIGN FLOAT_T af[8];
4550 _mm512_storeu_pd(af, vf);
4551 __m512d vtanf = _mm512_set_pd(tan(af[7]), tan(af[6]), tan(af[5]), tan(af[4]), tan(af[3]), tan(af[2]), tan(af[1]), tan(af[0]));
4554 __m512d v1 = _mm512_set1_pd(1.0);
4555 __m512d v2 = _mm512_set1_pd(2.0);
/* p = cot(f); basis of the bilinear-transform coefficient formulas below. */
4556 __m512d vp = _mm512_div_pd(v1, vtanf);
4558 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
4560 __m512d vreso_db_cf_p = _mm512_set_pd(
4561 i + 7 < batch_size ? RESO_DB_CF_P(fcs[i + 7]->reso_dB) : reso_db_cf_p,
4562 i + 6 < batch_size ? RESO_DB_CF_P(fcs[i + 6]->reso_dB) : reso_db_cf_p,
4563 i + 5 < batch_size ? RESO_DB_CF_P(fcs[i + 5]->reso_dB) : reso_db_cf_p,
4564 i + 4 < batch_size ? RESO_DB_CF_P(fcs[i + 4]->reso_dB) : reso_db_cf_p,
4565 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
4566 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
4567 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
/* 2nd-order LPF coefficients: dc0 = 1/(1 + q*p + p^2), dc1 = 2*dc0,
 * dc2 = dc0, dc3 = 2*(p^2-1)*dc0, dc4 = (q*p - 1 - p^2)*dc0. */
4571 __m512d vq = _mm512_mul_pd(vreso_db_cf_p, _mm512_set1_pd(SQRT_2));
4572 __m512d vp2 = _mm512_mul_pd(vp, vp);
4573 __m512d vqp = _mm512_mul_pd(vq, vp);
4574 __m512d vdc0 = _mm512_div_pd(v1, _mm512_add_pd(_mm512_add_pd(v1, vqp), vp2));
4575 __m512d vdc1 = _mm512_mul_pd(v2, vdc0);
4576 __m512d vdc2 = vdc0;
4577 __m512d vdc3 = _mm512_mul_pd(_mm512_mul_pd(v2, _mm512_sub_pd(vp2, v1)), vdc0);
4578 __m512d vdc4 = _mm512_mul_pd(_mm512_sub_pd(_mm512_sub_pd(vqp, v1), vp2), vdc0);
/* Transpose dc0..dc3 from SoA back to per-voice dc[0..3] quadruples. */
4580 __m512d vdc01_0246 = _mm512_unpacklo_pd(vdc0, vdc1);
4581 __m512d vdc01_1357 = _mm512_unpackhi_pd(vdc0, vdc1);
4582 __m512d vdc23_0246 = _mm512_unpacklo_pd(vdc2, vdc3);
4583 __m512d vdc23_1357 = _mm512_unpackhi_pd(vdc2, vdc3);
4585 __m512d vdc0123_04 = _mm512_permutex2var_pd(vdc01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdc23_0246);
4586 __m512d vdc0123_26 = _mm512_permutex2var_pd(vdc01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdc23_0246);
4587 __m512d vdc0123_15 = _mm512_permutex2var_pd(vdc01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdc23_1357);
4588 __m512d vdc0123_37 = _mm512_permutex2var_pd(vdc01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdc23_1357);
/* Write dc[0..3] plus the dc[4] lane for each voice flagged in imask. */
4591 _mm256_storeu_pd(&fcs[i]->dc[0], _mm512_castpd512_pd256(vdc0123_04));
4592 _mm_storel_pd(&fcs[i]->dc[4], _mm512_castpd512_pd128(vdc4));
4595 if (imask & (1 << 1)) {
4596 _mm256_storeu_pd(&fcs[i + 1]->dc[0], _mm512_castpd512_pd256(vdc0123_15));
4597 _mm_storeh_pd(&fcs[i + 1]->dc[4], _mm512_castpd512_pd128(vdc4));
4600 if (imask & (1 << 2)) {
4601 _mm256_storeu_pd(&fcs[i + 2]->dc[0], _mm512_castpd512_pd256(vdc0123_26));
4602 _mm_storel_pd(&fcs[i + 2]->dc[4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc4), 1));
4605 if (imask & (1 << 3)) {
4606 _mm256_storeu_pd(&fcs[i + 3]->dc[0], _mm512_castpd512_pd256(vdc0123_37));
4607 _mm_storeh_pd(&fcs[i + 3]->dc[4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc4), 1));
4610 if (imask & (1 << 4)) {
4611 _mm256_storeu_pd(&fcs[i + 4]->dc[0], _mm512_extractf64x4_pd(vdc0123_04, 1));
4612 _mm_storel_pd(&fcs[i + 4]->dc[4], _mm512_extractf64x2_pd(vdc4, 2));
4615 if (imask & (1 << 5)) {
4616 _mm256_storeu_pd(&fcs[i + 5]->dc[0], _mm512_extractf64x4_pd(vdc0123_15, 1));
4617 _mm_storeh_pd(&fcs[i + 5]->dc[4], _mm512_extractf64x2_pd(vdc4, 2));
4620 if (imask & (1 << 6)) {
4621 _mm256_storeu_pd(&fcs[i + 6]->dc[0], _mm512_extractf64x4_pd(vdc0123_26, 1));
4622 _mm_storel_pd(&fcs[i + 6]->dc[4], _mm512_extractf64x2_pd(vdc4, 3));
4625 if (imask & (1 << 7)) {
4626 _mm256_storeu_pd(&fcs[i + 7]->dc[0], _mm512_extractf64x4_pd(vdc0123_37, 1));
4627 _mm_storeh_pd(&fcs[i + 7]->dc[4], _mm512_extractf64x2_pd(vdc4, 3));
4633 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * AVX2 variant: recompute Butterworth LPF coefficients for a batch of
 * voices, four at a time (one voice per __m256d lane).
 *
 * A voice is recalculated only when its freq/reso_dB has drifted outside
 * the cached window fc->range[0..3]; the window is then re-centered at
 * freq*(1 +- ext_filter_margin) and reso_dB*(1 +- ext_filter_margin).
 */
4635 static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
4637 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
4638 if (i >= batch_size)
/* Gather each voice's range[0..3] and transpose 4x4 (AoS -> SoA):
 * vfcrange0..3 each hold one bound across the four voice lanes. */
4641 __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
4642 __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
4643 __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
4644 __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
4646 __m256d vfcrange01_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (2 << 4) | 0);
4647 __m256d vfcrange01_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (2 << 4) | 0);
4648 __m256d vfcrange23_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (3 << 4) | 1);
4649 __m256d vfcrange23_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (3 << 4) | 1);
4651 __m256d vfcrange0 = _mm256_unpacklo_pd(vfcrange01_02, vfcrange01_13);
4652 __m256d vfcrange1 = _mm256_unpackhi_pd(vfcrange01_02, vfcrange01_13);
4653 __m256d vfcrange2 = _mm256_unpacklo_pd(vfcrange23_02, vfcrange23_13);
4654 __m256d vfcrange3 = _mm256_unpackhi_pd(vfcrange23_02, vfcrange23_13);
/* Current cutoff frequency and resonance per lane (0 for absent voices). */
4656 __m256d vfcfreq = _mm256_set_pd(
4657 i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
4658 i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
4659 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
4663 __m256d vfcreso_DB = _mm256_set_pd(
4664 i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
4665 i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
4666 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
/* Lane is all-ones iff freq or reso_dB left its cached window. */
4670 __m256d vmask = _mm256_or_pd(
4671 _mm256_or_pd(_mm256_cmp_pd(vfcfreq, vfcrange0, _CMP_LT_OS), _mm256_cmp_pd(vfcfreq, vfcrange1, _CMP_GT_OS)),
4672 _mm256_or_pd(_mm256_cmp_pd(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm256_cmp_pd(vfcreso_DB, vfcrange3, _CMP_GT_OS))
/* Bit k of imask = voice i+k needs recalculation; limited to valid voices. */
4675 int imask = _mm256_movemask_pd(vmask) & ((1 << (batch_size - i)) - 1);
/* Re-center the hysteresis windows around the current parameters. */
4678 __m256d v1mmargin = _mm256_set1_pd(1.0 - ext_filter_margin);
4679 __m256d v1pmargin = _mm256_set1_pd(1.0 + ext_filter_margin);
4681 vfcrange0 = _mm256_mul_pd(vfcfreq, v1mmargin);
4682 vfcrange1 = _mm256_mul_pd(vfcfreq, v1pmargin);
4683 vfcrange2 = _mm256_mul_pd(vfcreso_DB, v1mmargin);
4684 vfcrange3 = _mm256_mul_pd(vfcreso_DB, v1pmargin);
/* Transpose back to per-voice range[0..3] layout. */
4686 vfcrange01_02 = _mm256_unpacklo_pd(vfcrange0, vfcrange1);
4687 vfcrange01_13 = _mm256_unpackhi_pd(vfcrange0, vfcrange1);
4688 vfcrange23_02 = _mm256_unpacklo_pd(vfcrange2, vfcrange3);
4689 vfcrange23_13 = _mm256_unpackhi_pd(vfcrange2, vfcrange3);
4691 vfcrange0123_0 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (2 << 4) | 0);
4692 vfcrange0123_1 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (2 << 4) | 0);
4693 vfcrange0123_2 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (3 << 4) | 1);
4694 vfcrange0123_3 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (3 << 4) | 1);
/* Store new windows only for voices flagged in imask. */
4697 _mm256_storeu_pd(fcs[i]->range, vfcrange0123_0);
4699 if (imask & (1 << 1))
4700 _mm256_storeu_pd(fcs[i + 1]->range, vfcrange0123_1);
4702 if (imask & (1 << 2))
4703 _mm256_storeu_pd(fcs[i + 2]->range, vfcrange0123_2);
4705 if (imask & (1 << 3))
4706 _mm256_storeu_pd(fcs[i + 3]->range, vfcrange0123_3);
/* Inactive lanes reuse voice i's rate so the math below stays finite. */
4708 __m256d vfcdiv_flt_rate = _mm256_set_pd(
4709 i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
4710 i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
4711 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
4712 fcs[i]->div_flt_rate
/* f = pi * freq / sample_rate (div_flt_rate is the reciprocal rate). */
4715 __m256d vf = _mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(M_PI), vfcfreq), vfcdiv_flt_rate);
/* tan(f): vectorized via SVML when available, else scalar fallback. */
4718 __m256d vtanf = _mm256_tan_pd(vf);
4720 ALIGN FLOAT_T af[4];
4721 _mm256_storeu_pd(af, vf);
4722 __m256d vtanf = _mm256_set_pd(tan(af[3]), tan(af[2]), tan(af[1]), tan(af[0]));
4725 __m256d v1 = _mm256_set1_pd(1.0);
4726 __m256d v2 = _mm256_set1_pd(2.0);
/* p = cot(f); basis of the bilinear-transform coefficient formulas below. */
4727 __m256d vp = _mm256_div_pd(v1, vtanf);
4729 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
4731 __m256d vreso_db_cf_p = _mm256_set_pd(
4732 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
4733 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
4734 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
/* 2nd-order LPF coefficients: dc0 = 1/(1 + q*p + p^2), dc1 = 2*dc0,
 * dc2 = dc0, dc3 = 2*(p^2-1)*dc0, dc4 = (q*p - 1 - p^2)*dc0. */
4738 __m256d vq = _mm256_mul_pd(vreso_db_cf_p, _mm256_set1_pd(SQRT_2));
4739 __m256d vp2 = _mm256_mul_pd(vp, vp);
4740 __m256d vqp = _mm256_mul_pd(vq, vp);
4741 __m256d vdc0 = _mm256_div_pd(v1, _mm256_add_pd(_mm256_add_pd(v1, vqp), vp2));
4742 __m256d vdc1 = _mm256_mul_pd(v2, vdc0);
4743 __m256d vdc2 = vdc0;
4744 __m256d vdc3 = _mm256_mul_pd(_mm256_mul_pd(v2, _mm256_sub_pd(vp2, v1)), vdc0);
4745 __m256d vdc4 = _mm256_mul_pd(_mm256_sub_pd(_mm256_sub_pd(vqp, v1), vp2), vdc0);
/* Transpose dc0..dc3 from SoA back to per-voice dc[0..3] quadruples. */
4747 __m256d vdc01_02 = _mm256_unpacklo_pd(vdc0, vdc1);
4748 __m256d vdc01_13 = _mm256_unpackhi_pd(vdc0, vdc1);
4749 __m256d vdc23_02 = _mm256_unpacklo_pd(vdc2, vdc3);
4750 __m256d vdc23_13 = _mm256_unpackhi_pd(vdc2, vdc3);
4752 __m256d vdc0123_0 = _mm256_permute2f128_pd(vdc01_02, vdc23_02, (2 << 4) | 0);
4753 __m256d vdc0123_1 = _mm256_permute2f128_pd(vdc01_13, vdc23_13, (2 << 4) | 0);
4754 __m256d vdc0123_2 = _mm256_permute2f128_pd(vdc01_02, vdc23_02, (3 << 4) | 1);
4755 __m256d vdc0123_3 = _mm256_permute2f128_pd(vdc01_13, vdc23_13, (3 << 4) | 1);
/* Write dc[0..3] plus the dc[4] lane for each voice flagged in imask. */
4758 _mm256_storeu_pd(&fcs[i]->dc[0], vdc0123_0);
4759 _mm_storel_pd(&fcs[i]->dc[4], _mm256_castpd256_pd128(vdc4));
4762 if (imask & (1 << 1)) {
4763 _mm256_storeu_pd(&fcs[i + 1]->dc[0], vdc0123_1);
4764 _mm_storeh_pd(&fcs[i + 1]->dc[4], _mm256_castpd256_pd128(vdc4));
4767 if (imask & (1 << 2)) {
4768 _mm256_storeu_pd(&fcs[i + 2]->dc[0], vdc0123_2);
4769 _mm_storel_pd(&fcs[i + 2]->dc[4], _mm256_extractf128_pd(vdc4, 1));
4772 if (imask & (1 << 3)) {
4773 _mm256_storeu_pd(&fcs[i + 3]->dc[0], vdc0123_3);
4774 _mm_storeh_pd(&fcs[i + 3]->dc[4], _mm256_extractf128_pd(vdc4, 1));
4780 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * SSE2 fallback: recompute Butterworth LPF coefficients for a batch of
 * voices, two at a time (one voice per __m128d lane).
 *
 * A voice is recalculated only when its freq/reso_dB has drifted outside
 * the cached window fc->range[0..3]; the window is then re-centered at
 * freq*(1 +- ext_filter_margin) and reso_dB*(1 +- ext_filter_margin).
 */
4782 static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
4784 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
4785 if (i >= batch_size)
/* Gather range[0..3] of both voices and transpose (AoS -> SoA). */
4788 __m128d vfcrange01_0 = _mm_loadu_pd(fcs[i]->range);
4789 __m128d vfcrange23_0 = _mm_loadu_pd(&fcs[i]->range[2]);
4790 __m128d vfcrange01_1 = i + 1 < batch_size ? _mm_loadu_pd(fcs[i + 1]->range) : _mm_setzero_pd();
4791 __m128d vfcrange23_1 = i + 1 < batch_size ? _mm_loadu_pd(&fcs[i + 1]->range[2]) : _mm_setzero_pd();
4793 __m128d vfcrange0 = _mm_unpacklo_pd(vfcrange01_0, vfcrange01_1);
4794 __m128d vfcrange1 = _mm_unpackhi_pd(vfcrange01_0, vfcrange01_1);
4795 __m128d vfcrange2 = _mm_unpacklo_pd(vfcrange23_0, vfcrange23_1);
4796 __m128d vfcrange3 = _mm_unpackhi_pd(vfcrange23_0, vfcrange23_1);
/* Current cutoff frequency and resonance per lane (0 for absent voice). */
4798 __m128d vfcfreq = _mm_set_pd(
4799 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
4803 __m128d vfcreso_DB = _mm_set_pd(
4804 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
/* Lane is all-ones iff freq or reso_dB left its cached window. */
4808 __m128d vmask = _mm_or_pd(
4809 _mm_or_pd(_mm_cmplt_pd(vfcfreq, vfcrange0), _mm_cmpgt_pd(vfcfreq, vfcrange1)),
4810 _mm_or_pd(_mm_cmplt_pd(vfcreso_DB, vfcrange2), _mm_cmpgt_pd(vfcreso_DB, vfcrange3))
/* Bit k of imask = voice i+k needs recalculation; limited to valid voices. */
4813 int imask = _mm_movemask_pd(vmask) & ((1 << (batch_size - i)) - 1);
/* Re-center the hysteresis windows around the current parameters. */
4816 __m128d v1mmargin = _mm_set1_pd(1.0 - ext_filter_margin);
4817 __m128d v1pmargin = _mm_set1_pd(1.0 + ext_filter_margin);
4819 vfcrange0 = _mm_mul_pd(vfcfreq, v1mmargin);
4820 vfcrange1 = _mm_mul_pd(vfcfreq, v1pmargin);
4821 vfcrange2 = _mm_mul_pd(vfcreso_DB, v1mmargin);
4822 vfcrange3 = _mm_mul_pd(vfcreso_DB, v1pmargin);
/* Transpose back to per-voice range[0..3] layout. */
4824 vfcrange01_0 = _mm_unpacklo_pd(vfcrange0, vfcrange1);
4825 vfcrange01_1 = _mm_unpackhi_pd(vfcrange0, vfcrange1);
4826 vfcrange23_0 = _mm_unpacklo_pd(vfcrange2, vfcrange3);
4827 vfcrange23_1 = _mm_unpackhi_pd(vfcrange2, vfcrange3);
/* Store new windows only for voices flagged in imask. */
4830 _mm_storeu_pd(fcs[i]->range, vfcrange01_0);
4831 _mm_storeu_pd(&fcs[i]->range[2], vfcrange23_0);
4834 if (imask & (1 << 1)) {
4835 _mm_storeu_pd(fcs[i + 1]->range, vfcrange01_1);
4836 _mm_storeu_pd(&fcs[i + 1]->range[2], vfcrange23_1);
/* Inactive lane reuses voice i's rate so the math below stays finite. */
4839 __m128d vfcdiv_flt_rate = _mm_set_pd(
4840 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
4841 fcs[i]->div_flt_rate
/* f = pi * freq / sample_rate (div_flt_rate is the reciprocal rate). */
4844 __m128d vf = _mm_mul_pd(_mm_mul_pd(_mm_set1_pd(M_PI), vfcfreq), vfcdiv_flt_rate);
/* tan(f): vectorized via SVML when available, else scalar fallback. */
4847 __m128d vtanf = _mm_tan_pd(vf);
4849 ALIGN FLOAT_T af[2];
4850 _mm_storeu_pd(af, vf);
4851 __m128d vtanf = _mm_set_pd(tan(af[1]), tan(af[0]));
4854 __m128d v1 = _mm_set1_pd(1.0);
4855 __m128d v2 = _mm_set1_pd(2.0);
/* p = cot(f); basis of the bilinear-transform coefficient formulas below. */
4856 __m128d vp = _mm_div_pd(v1, vtanf);
4858 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
4860 __m128d vreso_db_cf_p = _mm_set_pd(
4861 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
/* 2nd-order LPF coefficients: dc0 = 1/(1 + q*p + p^2), dc1 = 2*dc0,
 * dc2 = dc0, dc3 = 2*(p^2-1)*dc0, dc4 = (q*p - 1 - p^2)*dc0. */
4865 __m128d vq = _mm_mul_pd(vreso_db_cf_p, _mm_set1_pd(SQRT_2));
4866 __m128d vp2 = _mm_mul_pd(vp, vp);
4867 __m128d vqp = _mm_mul_pd(vq, vp);
4868 __m128d vdc0 = _mm_div_pd(v1, _mm_add_pd(_mm_add_pd(v1, vqp), vp2));
4869 __m128d vdc1 = _mm_mul_pd(v2, vdc0);
4870 __m128d vdc2 = vdc0;
4871 __m128d vdc3 = _mm_mul_pd(_mm_mul_pd(v2, _mm_sub_pd(vp2, v1)), vdc0);
4872 __m128d vdc4 = _mm_mul_pd(_mm_sub_pd(_mm_sub_pd(vqp, v1), vp2), vdc0);
/* Transpose dc0..dc3 from SoA back to per-voice order. */
4874 __m128d vdc01_0 = _mm_unpacklo_pd(vdc0, vdc1);
4875 __m128d vdc01_1 = _mm_unpackhi_pd(vdc0, vdc1);
4876 __m128d vdc23_0 = _mm_unpacklo_pd(vdc2, vdc3);
4877 __m128d vdc23_1 = _mm_unpackhi_pd(vdc2, vdc3);
/* Write dc[0..4] for each voice flagged in imask. */
4880 _mm_storeu_pd(&fcs[i]->dc[0], vdc01_0);
4881 _mm_storeu_pd(&fcs[i]->dc[2], vdc23_0);
4882 _mm_storel_pd(&fcs[i]->dc[4], vdc4);
4885 if (imask & (1 << 1)) {
4886 _mm_storeu_pd(&fcs[i + 1]->dc[0], vdc01_1);
4887 _mm_storeu_pd(&fcs[i + 1]->dc[2], vdc23_1);
4888 _mm_storeh_pd(&fcs[i + 1]->dc[4], vdc4);
4896 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * AVX-512 variant: run the 12dB/oct two-state LPF over the whole batch at
 * once, one voice per __m512d lane (no outer loop — assumes the batch holds
 * at most 8 voices; NOTE(review): confirm MIX_VOICE_BATCH_SIZE == 8).
 *
 * dcs[v][0..1]  filter coefficients of voice v
 * dbs[v][0..1]  filter state of voice v, updated in place
 * sps[v]        sample buffer of voice v, filtered in place
 * counts[v]     number of samples to process for voice v
 */
4898 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
/* Masked load: lanes past batch_size read as 0 and never activate. */
4900 __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(0, batch_size), counts)
4902 __m128d vdb01_0 = _mm_loadu_pd(dbs[0]);
4903 __m128d vdb01_1 = 1 < batch_size ? _mm_loadu_pd(dbs[1]) : _mm_setzero_pd();
4904 __m128d vdb01_2 = 2 < batch_size ? _mm_loadu_pd(dbs[2]) : _mm_setzero_pd();
4905 __m128d vdb01_3 = 3 < batch_size ? _mm_loadu_pd(dbs[3]) : _mm_setzero_pd();
4906 __m128d vdb01_4 = 4 < batch_size ? _mm_loadu_pd(dbs[4]) : _mm_setzero_pd();
4907 __m128d vdb01_5 = 5 < batch_size ? _mm_loadu_pd(dbs[5]) : _mm_setzero_pd();
4908 __m128d vdb01_6 = 6 < batch_size ? _mm_loadu_pd(dbs[6]) : _mm_setzero_pd();
4909 __m128d vdb01_7 = 7 < batch_size ? _mm_loadu_pd(dbs[7]) : _mm_setzero_pd();
/* Transpose the 8 per-voice state pairs (AoS) into SoA:
 * vdb0/vdb1 each hold one state element across all 8 voice lanes. */
4911 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
4912 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
4913 __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
4914 __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
4916 __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
4917 __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
4919 __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
4920 __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
/* Same AoS->SoA transpose for the coefficient pairs. */
4922 __m128d vdc01_0 = _mm_loadu_pd(dcs[0]);
4923 __m128d vdc01_1 = 1 < batch_size ? _mm_loadu_pd(dcs[1]) : _mm_setzero_pd();
4924 __m128d vdc01_2 = 2 < batch_size ? _mm_loadu_pd(dcs[2]) : _mm_setzero_pd();
4925 __m128d vdc01_3 = 3 < batch_size ? _mm_loadu_pd(dcs[3]) : _mm_setzero_pd();
4926 __m128d vdc01_4 = 4 < batch_size ? _mm_loadu_pd(dcs[4]) : _mm_setzero_pd();
4927 __m128d vdc01_5 = 5 < batch_size ? _mm_loadu_pd(dcs[5]) : _mm_setzero_pd();
4928 __m128d vdc01_6 = 6 < batch_size ? _mm_loadu_pd(dcs[6]) : _mm_setzero_pd();
4929 __m128d vdc01_7 = 7 < batch_size ? _mm_loadu_pd(dcs[7]) : _mm_setzero_pd();
4931 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
4932 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
4933 __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
4934 __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
4936 __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
4937 __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
4939 __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
4940 __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
/* Horizontal max of the 8 counts: iterate to the longest voice; shorter
 * voices are masked off per-sample below. */
4942 __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
4943 vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
4944 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
4946 for (int32 j = 0; j < count_max; j += 8) {
/* Masked loads: each voice reads only up to its own count; 8x8 transpose
 * follows so vsps[k] holds sample j+k of every voice. */
4948 vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[0]), &sps[0][j]);
4950 for (int k = 1; k < 8; k++)
4951 vin[k] = _mm512_maskz_loadu_pd(k < batch_size ? generate_mask8_for_count(j, counts[k]) : 0, &sps[k][j]);
4953 __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
4954 __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
4955 __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
4956 __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
4957 __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
4958 __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
4959 __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
4960 __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
4962 __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4963 __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4964 __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4965 __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4966 __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4967 __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4968 __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4969 __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4972 vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4973 vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4974 vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4975 vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4976 vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4977 vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4978 vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4979 vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4981 for (int k = 0; k < 8; k++) {
/* kmask lane = 1 iff voice still has a sample at index j + k; inactive
 * lanes keep their state unchanged through the masked ops below. */
4982 __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
/* Per-sample recurrence: db1 += dc1 * (in - db0); db0 += db1; db1 *= dc0. */
4984 vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
4985 vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
4986 vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
/* Inverse 8x8 transpose: per-sample vectors back to per-voice runs. */
4990 __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
4991 __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
4992 __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
4993 __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
4994 __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
4995 __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
4996 __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
4997 __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
4999 __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5000 __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5001 __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5002 __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5003 __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5004 __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5005 __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5006 __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5009 vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5010 vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5011 vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5012 vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5013 vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5014 vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5015 vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5016 vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
/* Masked write-back: each voice stores only up to its own count. */
5018 for (int k = 0; k < batch_size; k++)
5019 _mm512_mask_storeu_pd(&sps[k][j], generate_mask8_for_count(j, counts[k]), vout[k]);
/* Transpose the final state back and write each voice's dbs[v][0..1]. */
5022 vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
5023 vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
5025 _mm_storeu_pd(dbs[0], _mm512_castpd512_pd128(vdb01_0246));
5028 _mm_storeu_pd(dbs[1], _mm512_castpd512_pd128(vdb01_1357));
5031 _mm_storeu_pd(dbs[2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
5034 _mm_storeu_pd(dbs[3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
5037 _mm_storeu_pd(dbs[4], _mm512_extractf64x2_pd(vdb01_0246, 2));
5040 _mm_storeu_pd(dbs[5], _mm512_extractf64x2_pd(vdb01_1357, 2));
5043 _mm_storeu_pd(dbs[6], _mm512_extractf64x2_pd(vdb01_0246, 3));
5046 _mm_storeu_pd(dbs[7], _mm512_extractf64x2_pd(vdb01_1357, 3));
5049 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5051 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
// AVX2 batch form of the LPF12-2 resonant low-pass sample filter.
// Processes up to MIX_VOICE_BATCH_SIZE voices, four per outer iteration:
// the per-voice coefficient pairs dcs[v][0..1] and state pairs dbs[v][0..1]
// are transposed from per-voice (AoS) layout into lane-per-voice (SoA)
// vectors vdc0/vdc1 and vdb0/vdb1, then the filter recurrence is applied to
// four samples of four voices at a time.
//   dcs[v]    - filter coefficients {dc0, dc1} of voice v
//   dbs[v]    - filter state {db0, db1} of voice v, updated in place
//   sps[v]    - sample buffer of voice v, filtered in place
//   counts[v] - number of valid samples in sps[v]
5053 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
5054 if (i >= batch_size)
// Per-lane sample counts; voices past batch_size get 0 so their lanes
// are always masked off by the blend below.
5057 __m128i vcounts = _mm_set_epi32(
5058 i + 3 < batch_size ? counts[i + 3] : 0,
5059 i + 2 < batch_size ? counts[i + 2] : 0,
5060 i + 1 < batch_size ? counts[i + 1] : 0,
// Gather the {db0, db1} state pairs of the four voices ...
5064 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5065 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5066 __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
5067 __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
// ... and transpose so vdb0 holds the four db0 values, vdb1 the four db1.
5069 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
5070 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
5072 __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13);
5073 __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13);
// Same AoS->SoA transpose for the {dc0, dc1} coefficient pairs.
5075 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5076 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5077 __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
5078 __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
5080 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
5081 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
5083 __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13);
5084 __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13);
// Horizontal max of the four counts: the sample loop runs to the longest voice.
5086 __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2));
5087 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
// Four samples per voice per iteration; exhausted voices load zeros.
5089 for (int32 j = 0; j < count_max; j += 4) {
5090 __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd();
5091 __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd();
5092 __m256d vsp0123_2 = i + 2 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
5093 __m256d vsp0123_3 = i + 3 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
// 4x4 transpose: vsps[k] gathers sample j+k of each of the four voices.
5095 __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0);
5096 __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0);
5097 __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1);
5098 __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1);
5101 vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
5102 vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
5103 vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
5104 vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
5106 for (int k = 0; k < 4; k++) {
// vmask selects lanes whose voice still has samples left (j + k < count);
// the blends leave the other lanes' state untouched.
5107 __m256d vmask = _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
// LPF12-2 recurrence per lane: db1 += (sp - db0) * dc1; db0 += db1; db1 *= dc0.
5109 vdb1 = _mm256_blendv_pd(vdb1, MM256_FMA_PD(_mm256_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5110 vdb0 = _mm256_blendv_pd(vdb0, _mm256_add_pd(vdb0, vdb1), vmask);
5111 vdb1 = _mm256_blendv_pd(vdb1, _mm256_mul_pd(vdb1, vdc0), vmask);
// Transpose the (now filtered) samples back into per-voice order.
5115 vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]);
5116 vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
5117 vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
5118 vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
5120 vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
5121 vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
5122 vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
5123 vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
// Write back only voices that still had samples at this offset.
5126 _mm256_storeu_pd(&sps[i][j], vsp0123_0);
5128 if (i + 1 < batch_size && j < counts[i + 1])
5129 _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
5131 if (i + 2 < batch_size && j < counts[i + 2])
5132 _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
5134 if (i + 3 < batch_size && j < counts[i + 3])
5135 _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
// Repack the SoA state and store the updated {db0, db1} pair of each voice.
5138 vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1);
5139 vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1);
5141 _mm_storeu_pd(dbs[i], _mm256_castpd256_pd128(vdb01_02));
5143 if (i + 1 < batch_size)
5144 _mm_storeu_pd(dbs[i + 1], _mm256_castpd256_pd128(vdb01_13));
5146 if (i + 2 < batch_size)
5147 _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(vdb01_02, 1));
5149 if (i + 3 < batch_size)
5150 _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(vdb01_13, 1));
5154 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5156 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
// SSE fallback of the batched LPF12-2 low-pass sample filter: two voices
// per outer iteration, one voice per __m128d lane.  Same scheme as the
// wider versions: transpose the {dc0,dc1}/{db0,db1} pairs into SoA form,
// run the recurrence two samples at a time, transpose back and store.
5158 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
5159 if (i >= batch_size)
// Lane sample counts; a missing second voice gets 0 so its lane is masked.
5162 __m128i vcounts = _mm_set_epi32(
5165 i + 1 < batch_size ? counts[i + 1] : 0,
5169 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5170 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
// After the unpacks: lane 0 = voice i, lane 1 = voice i+1.
5172 __m128d vdb0 = _mm_unpacklo_pd(vdb01_0, vdb01_1);
5173 __m128d vdb1 = _mm_unpackhi_pd(vdb01_0, vdb01_1);
5175 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5176 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5178 __m128d vdc0 = _mm_unpacklo_pd(vdc01_0, vdc01_1);
5179 __m128d vdc1 = _mm_unpackhi_pd(vdc01_0, vdc01_1);
// max(counts[i], counts[i+1]) -- run the sample loop to the longer voice.
5181 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, 1)));
5183 for (int32 j = 0; j < count_max; j += 2) {
5184 __m128d vsp01_0 = j < counts[i] ? _mm_loadu_pd(&sps[i][j]) : _mm_setzero_pd();
5185 __m128d vsp01_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm_loadu_pd(&sps[i + 1][j]) : _mm_setzero_pd();
// 2x2 transpose: vsps[k] holds sample j+k of both voices.
5188 vsps[0] = _mm_unpacklo_pd(vsp01_0, vsp01_1);
5189 vsps[1] = _mm_unpackhi_pd(vsp01_0, vsp01_1);
5191 for (int k = 0; k < 2; k++) {
// Mask of lanes whose voice still has samples at offset j + k.
5192 __m128d vmask = _mm_castsi128_pd(_mm_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
// Recurrence: db1 += (sp - db0) * dc1; db0 += db1; db1 *= dc0.
// With SSE4.1 use blendv; otherwise emulate the blend via and/andnot/or.
5194 #if USE_X86_EXT_INTRIN >= 6
5195 vdb1 = _mm_blendv_pd(vdb1, MM_FMA_PD(_mm_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5196 vdb0 = _mm_blendv_pd(vdb0, _mm_add_pd(vdb0, vdb1), vmask);
5197 vdb1 = _mm_blendv_pd(vdb1, _mm_mul_pd(vdb1, vdc0), vmask);
5199 vdb1 = _mm_or_pd(_mm_andnot_pd(vmask, vdb1), _mm_and_pd(vmask, MM_FMA_PD(_mm_sub_pd(vsps[k], vdb0), vdc1, vdb1)));
5200 vdb0 = _mm_or_pd(_mm_andnot_pd(vmask, vdb0), _mm_and_pd(vmask, _mm_add_pd(vdb0, vdb1)));
5201 vdb1 = _mm_or_pd(_mm_andnot_pd(vmask, vdb1), _mm_and_pd(vmask, _mm_mul_pd(vdb1, vdc0)));
// Transpose back to per-voice order and write the filtered samples out.
5206 vsp01_0 = _mm_unpacklo_pd(vsps[0], vsps[1]);
5207 vsp01_1 = _mm_unpackhi_pd(vsps[0], vsps[1]);
5210 _mm_storeu_pd(&sps[i][j], vsp01_0);
5212 if (i + 1 < batch_size && j < counts[i + 1])
5213 _mm_storeu_pd(&sps[i + 1][j], vsp01_1);
// Store the updated {db0, db1} state of each voice.
5216 vdb01_0 = _mm_unpacklo_pd(vdb0, vdb1);
5217 vdb01_1 = _mm_unpackhi_pd(vdb0, vdb1);
5219 _mm_storeu_pd(dbs[i], vdb01_0);
5221 if (i + 1 < batch_size)
5222 _mm_storeu_pd(dbs[i + 1], vdb01_1);
5228 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5230 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients **fcs)
// AVX-512 batch recalculation of LPF12-2 filter coefficients for up to 8
// voices, one voice per lane.  fc->range caches a hysteresis window around
// the freq/reso_dB the coefficients were last computed for:
//   range[0..1] = freq    * (1 -/+ ext_filter_margin)
//   range[2..3] = reso_dB * (1 -/+ ext_filter_margin)
// Only voices whose current freq or reso_dB moved outside that window
// (corresponding bit set in imask) get new ranges and dc[0..1] stored back.
// Gather range[0..3] of each voice; lanes past batch_size are zero-filled.
5232 __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[0]->range);
5233 __m256d vfcrange0123_1 = 1 < batch_size ? _mm256_loadu_pd(fcs[1]->range) : _mm256_setzero_pd();
5234 __m256d vfcrange0123_2 = 2 < batch_size ? _mm256_loadu_pd(fcs[2]->range) : _mm256_setzero_pd();
5235 __m256d vfcrange0123_3 = 3 < batch_size ? _mm256_loadu_pd(fcs[3]->range) : _mm256_setzero_pd();
5236 __m256d vfcrange0123_4 = 4 < batch_size ? _mm256_loadu_pd(fcs[4]->range) : _mm256_setzero_pd();
5237 __m256d vfcrange0123_5 = 5 < batch_size ? _mm256_loadu_pd(fcs[5]->range) : _mm256_setzero_pd();
5238 __m256d vfcrange0123_6 = 6 < batch_size ? _mm256_loadu_pd(fcs[6]->range) : _mm256_setzero_pd();
5239 __m256d vfcrange0123_7 = 7 < batch_size ? _mm256_loadu_pd(fcs[7]->range) : _mm256_setzero_pd();
// Transpose 8 voices x 4 range components into SoA: vfcrangeN holds
// component N of all eight voices.
5241 __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_0), vfcrange0123_2, 1);
5242 __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_1), vfcrange0123_3, 1);
5243 __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_4), vfcrange0123_6, 1);
5244 __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_5), vfcrange0123_7, 1);
5246 __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5247 __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5248 __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5249 __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5251 __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
5252 __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
5253 __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
5254 __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
// Current cutoff frequency of each voice (missing lanes = 0).
5256 __m512d vfcfreq = _mm512_set_pd(
5257 7 < batch_size ? fcs[7]->freq : 0.0,
5258 6 < batch_size ? fcs[6]->freq : 0.0,
5259 5 < batch_size ? fcs[5]->freq : 0.0,
5260 4 < batch_size ? fcs[4]->freq : 0.0,
5261 3 < batch_size ? fcs[3]->freq : 0.0,
5262 2 < batch_size ? fcs[2]->freq : 0.0,
5263 1 < batch_size ? fcs[1]->freq : 0.0,
// Current resonance [dB] of each voice (missing lanes = 0).
5267 __m512d vfcreso_DB = _mm512_set_pd(
5268 7 < batch_size ? fcs[7]->reso_dB : 0.0,
5269 6 < batch_size ? fcs[6]->reso_dB : 0.0,
5270 5 < batch_size ? fcs[5]->reso_dB : 0.0,
5271 4 < batch_size ? fcs[4]->reso_dB : 0.0,
5272 3 < batch_size ? fcs[3]->reso_dB : 0.0,
5273 2 < batch_size ? fcs[2]->reso_dB : 0.0,
5274 1 < batch_size ? fcs[1]->reso_dB : 0.0,
// Bit v of imask = voice v's freq or reso_dB left its cached window;
// the trailing AND limits the mask to the first batch_size lanes.
5278 uint8 imask = _kor_mask8(
5279 _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
5280 _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
5281 ) & ((1 << batch_size) - 1);
// Rebuild the hysteresis windows around the current values.
5284 __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
5285 __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
5287 vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
5288 vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
5289 vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
5290 vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
// Transpose the new windows back into per-voice range[0..3] order.
5292 vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
5293 vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
5294 vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
5295 vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
// Two equivalent lane-merge implementations (permutex2var vs mask_permutex);
// which one compiles is selected by preprocessor lines elided here.
5298 __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
5299 __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
5300 __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
5301 __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
5303 __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
5304 __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
5305 __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
5306 __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
// Store the refreshed windows, but only for voices flagged in imask.
5310 _mm256_storeu_pd(fcs[0]->range, _mm512_castpd512_pd256(vfcrange0123_04));
5312 if (imask & (1 << 1))
5313 _mm256_storeu_pd(fcs[1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
5315 if (imask & (1 << 2))
5316 _mm256_storeu_pd(fcs[2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
5318 if (imask & (1 << 3))
5319 _mm256_storeu_pd(fcs[3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
5321 if (imask & (1 << 4))
5322 _mm256_storeu_pd(fcs[4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
5324 if (imask & (1 << 5))
5325 _mm256_storeu_pd(fcs[5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
5327 if (imask & (1 << 6))
5328 _mm256_storeu_pd(fcs[6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
5330 if (imask & (1 << 7))
5331 _mm256_storeu_pd(fcs[7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
// Missing lanes fall back to voice 0's values below so the vector math
// stays well-defined (no division by garbage).
5333 __m512d vfcdiv_flt_rate = _mm512_set_pd(
5334 7 < batch_size ? fcs[7]->div_flt_rate : fcs[0]->div_flt_rate,
5335 6 < batch_size ? fcs[6]->div_flt_rate : fcs[0]->div_flt_rate,
5336 5 < batch_size ? fcs[5]->div_flt_rate : fcs[0]->div_flt_rate,
5337 4 < batch_size ? fcs[4]->div_flt_rate : fcs[0]->div_flt_rate,
5338 3 < batch_size ? fcs[3]->div_flt_rate : fcs[0]->div_flt_rate,
5339 2 < batch_size ? fcs[2]->div_flt_rate : fcs[0]->div_flt_rate,
5340 1 < batch_size ? fcs[1]->div_flt_rate : fcs[0]->div_flt_rate,
5341 fcs[0]->div_flt_rate
// Per-lane angular cutoff: f = 2*pi * freq * div_flt_rate.
5344 __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
5346 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[0]->reso_dB);
5348 __m512d vreso_db_cf_p = _mm512_set_pd(
5349 7 < batch_size ? RESO_DB_CF_P(fcs[7]->reso_dB) : reso_db_cf_p,
5350 6 < batch_size ? RESO_DB_CF_P(fcs[6]->reso_dB) : reso_db_cf_p,
5351 5 < batch_size ? RESO_DB_CF_P(fcs[5]->reso_dB) : reso_db_cf_p,
5352 4 < batch_size ? RESO_DB_CF_P(fcs[4]->reso_dB) : reso_db_cf_p,
5353 3 < batch_size ? RESO_DB_CF_P(fcs[3]->reso_dB) : reso_db_cf_p,
5354 2 < batch_size ? RESO_DB_CF_P(fcs[2]->reso_dB) : reso_db_cf_p,
5355 1 < batch_size ? RESO_DB_CF_P(fcs[1]->reso_dB) : reso_db_cf_p,
5359 __m512d v1 = _mm512_set1_pd(1.0);
5360 __m512d v2 = _mm512_set1_pd(2.0);
5361 __m512d v0_5 = _mm512_set1_pd(0.5);
// q  = 1 - f / (2*(reso_cf_p + 0.5/(1 + f)) + f - 2)
// c0 = q*q
// c1 = c0 + 1 - 2*cos(f)*q
5363 __m512d vq = _mm512_sub_pd(v1, _mm512_div_pd(vf, _mm512_fmadd_pd(v2, _mm512_add_pd(vreso_db_cf_p, _mm512_div_pd(v0_5, _mm512_add_pd(v1, vf))), _mm512_sub_pd(vf, v2))));
5364 __m512d vc0 = _mm512_mul_pd(vq, vq);
// cos(f): vectorized SVML call when available, scalar libm fallback otherwise
// (branch selection by preprocessor lines elided here).
5366 __m512d vcosf = _mm512_cos_pd(vf);
5368 ALIGN FLOAT_T af[8];
5369 _mm512_storeu_pd(af, vf);
5370 __m512d vcosf = _mm512_set_pd(cos(af[7]), cos(af[6]), cos(af[5]), cos(af[4]), cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
5372 __m512d vc1 = _mm512_sub_pd(_mm512_add_pd(vc0, v1), _mm512_mul_pd(_mm512_mul_pd(v2, vcosf), vq));
// Interleave {c0, c1} back into per-voice pairs and store dc for each
// voice that actually needed recalculation.
5374 __m512d vdc0246 = _mm512_unpacklo_pd(vc0, vc1);
5375 __m512d vdc1357 = _mm512_unpackhi_pd(vc0, vc1);
5378 _mm_storeu_pd(fcs[0]->dc, _mm512_castpd512_pd128(vdc0246));
5379 if (imask & (1 << 1))
5380 _mm_storeu_pd(fcs[1]->dc, _mm512_castpd512_pd128(vdc1357));
5381 if (imask & (1 << 2))
5382 _mm_storeu_pd(fcs[2]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc0246), 1));
5383 if (imask & (1 << 3))
5384 _mm_storeu_pd(fcs[3]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc1357), 1));
5385 if (imask & (1 << 4))
5386 _mm_storeu_pd(fcs[4]->dc, _mm512_extractf64x2_pd(vdc0246, 2));
5387 if (imask & (1 << 5))
5388 _mm_storeu_pd(fcs[5]->dc, _mm512_extractf64x2_pd(vdc1357, 2));
5389 if (imask & (1 << 6))
5390 _mm_storeu_pd(fcs[6]->dc, _mm512_extractf64x2_pd(vdc0246, 3));
5391 if (imask & (1 << 7))
5392 _mm_storeu_pd(fcs[7]->dc, _mm512_extractf64x2_pd(vdc1357, 3));
5396 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5398 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients** fcs)
// AVX2 batch recalculation of LPF12-2 coefficients, four voices per
// iteration (one per __m256d lane).  fc->range caches a hysteresis window
// (freq and reso_dB each scaled by 1 -/+ ext_filter_margin); only voices
// whose current parameters left the window (bit set in imask) get new
// ranges and dc[0..1] written back.
5400 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
5401 if (i >= batch_size)
// Gather range[0..3] of the four voices (missing lanes zero-filled) ...
5404 __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
5405 __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
5406 __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
5407 __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
// ... and 4x4-transpose so vfcrangeN holds component N of all four voices.
5409 __m256d vfcrange01_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (2 << 4) | 0);
5410 __m256d vfcrange01_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (2 << 4) | 0);
5411 __m256d vfcrange23_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (3 << 4) | 1);
5412 __m256d vfcrange23_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (3 << 4) | 1);
5414 __m256d vfcrange0 = _mm256_unpacklo_pd(vfcrange01_02, vfcrange01_13);
5415 __m256d vfcrange1 = _mm256_unpackhi_pd(vfcrange01_02, vfcrange01_13);
5416 __m256d vfcrange2 = _mm256_unpacklo_pd(vfcrange23_02, vfcrange23_13);
5417 __m256d vfcrange3 = _mm256_unpackhi_pd(vfcrange23_02, vfcrange23_13);
5419 __m256d vfcfreq = _mm256_set_pd(
5420 i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
5421 i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
5422 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
5426 __m256d vfcreso_DB = _mm256_set_pd(
5427 i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
5428 i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
5429 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
// Lanewise "parameter left its cached window" test.
5433 __m256d vmask = _mm256_or_pd(
5434 _mm256_or_pd(_mm256_cmp_pd(vfcfreq, vfcrange0, _CMP_LT_OS), _mm256_cmp_pd(vfcfreq, vfcrange1, _CMP_GT_OS)),
5435 _mm256_or_pd(_mm256_cmp_pd(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm256_cmp_pd(vfcreso_DB, vfcrange3, _CMP_GT_OS))
// Collapse to a bitmask limited to the lanes that hold real voices.
5438 int imask = _mm256_movemask_pd(vmask) & ((1 << (batch_size - i)) - 1);
// Rebuild the hysteresis windows around the current values.
5441 __m256d v1mmargin = _mm256_set1_pd(1.0 - ext_filter_margin);
5442 __m256d v1pmargin = _mm256_set1_pd(1.0 + ext_filter_margin);
5444 vfcrange0 = _mm256_mul_pd(vfcfreq, v1mmargin);
5445 vfcrange1 = _mm256_mul_pd(vfcfreq, v1pmargin);
5446 vfcrange2 = _mm256_mul_pd(vfcreso_DB, v1mmargin);
5447 vfcrange3 = _mm256_mul_pd(vfcreso_DB, v1pmargin);
// Transpose back to per-voice range[0..3] order.
5449 vfcrange01_02 = _mm256_unpacklo_pd(vfcrange0, vfcrange1);
5450 vfcrange01_13 = _mm256_unpackhi_pd(vfcrange0, vfcrange1);
5451 vfcrange23_02 = _mm256_unpacklo_pd(vfcrange2, vfcrange3);
5452 vfcrange23_13 = _mm256_unpackhi_pd(vfcrange2, vfcrange3);
5454 vfcrange0123_0 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (2 << 4) | 0);
5455 vfcrange0123_1 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (2 << 4) | 0);
5456 vfcrange0123_2 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (3 << 4) | 1);
5457 vfcrange0123_3 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (3 << 4) | 1);
// Store new windows only for voices flagged in imask.
5460 _mm256_storeu_pd(fcs[i]->range, vfcrange0123_0);
5462 if (imask & (1 << 1))
5463 _mm256_storeu_pd(fcs[i + 1]->range, vfcrange0123_1);
5465 if (imask & (1 << 2))
5466 _mm256_storeu_pd(fcs[i + 2]->range, vfcrange0123_2);
5468 if (imask & (1 << 3))
5469 _mm256_storeu_pd(fcs[i + 3]->range, vfcrange0123_3);
// Missing lanes reuse voice i's rate/resonance so the math stays finite.
5471 __m256d vfcdiv_flt_rate = _mm256_set_pd(
5472 i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
5473 i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
5474 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
5475 fcs[i]->div_flt_rate
// Per-lane angular cutoff: f = 2*pi * freq * div_flt_rate.
5478 __m256d vf = _mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
5480 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
5482 __m256d vreso_db_cf_p = _mm256_set_pd(
5483 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
5484 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
5485 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
5489 __m256d v1 = _mm256_set1_pd(1.0);
5490 __m256d v2 = _mm256_set1_pd(2.0);
5491 __m256d v0_5 = _mm256_set1_pd(0.5);
// q = 1 - f / (2*(reso_cf_p + 0.5/(1 + f)) + f - 2); c0 = q*q;
// c1 = c0 + 1 - 2*cos(f)*q.
5493 __m256d vq = _mm256_sub_pd(v1, _mm256_div_pd(vf, MM256_FMA_PD(v2, _mm256_add_pd(vreso_db_cf_p, _mm256_div_pd(v0_5, _mm256_add_pd(v1, vf))), _mm256_sub_pd(vf, v2))));
5494 __m256d vc0 = _mm256_mul_pd(vq, vq);
// cos(f): SVML vector call when available, scalar libm fallback otherwise
// (selection by preprocessor lines elided here).
5496 __m256d vcosf = _mm256_cos_pd(vf);
5498 ALIGN FLOAT_T af[4];
5499 _mm256_storeu_pd(af, vf);
5500 __m256d vcosf = _mm256_set_pd(cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
5502 __m256d vc1 = _mm256_sub_pd(_mm256_add_pd(vc0, v1), _mm256_mul_pd(_mm256_mul_pd(v2, vcosf), vq));
// Interleave {c0, c1} pairs and store dc for each recalculated voice.
5504 __m256d vdc02 = _mm256_unpacklo_pd(vc0, vc1);
5505 __m256d vdc13 = _mm256_unpackhi_pd(vc0, vc1);
5508 _mm_storeu_pd(fcs[i]->dc, _mm256_castpd256_pd128(vdc02));
5509 if (imask & (1 << 1))
5510 _mm_storeu_pd(fcs[i + 1]->dc, _mm256_castpd256_pd128(vdc13));
5511 if (imask & (1 << 2))
5512 _mm_storeu_pd(fcs[i + 2]->dc, _mm256_extractf128_pd(vdc02, 1));
5513 if (imask & (1 << 3))
5514 _mm_storeu_pd(fcs[i + 3]->dc, _mm256_extractf128_pd(vdc13, 1));
5519 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5521 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients** fcs)
// SSE fallback of the batched LPF12-2 coefficient recalculation: two
// voices per iteration, one per __m128d lane.  fc->range caches a
// hysteresis window (value * (1 -/+ ext_filter_margin)) around the last
// freq/reso_dB; only voices whose parameters left the window (imask bit
// set) get new ranges and dc[0..1] stored.
5523 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
5524 if (i >= batch_size)
// Load range[0..1] (freq window) and range[2..3] (reso window) per voice.
5527 __m128d vfcrange01_0 = _mm_loadu_pd(fcs[i]->range);
5528 __m128d vfcrange23_0 = _mm_loadu_pd(&fcs[i]->range[2]);
5529 __m128d vfcrange01_1 = i + 1 < batch_size ? _mm_loadu_pd(fcs[i + 1]->range) : _mm_setzero_pd();
5530 __m128d vfcrange23_1 = i + 1 < batch_size ? _mm_loadu_pd(&fcs[i + 1]->range[2]) : _mm_setzero_pd();
// Transpose: vfcrangeN holds component N for both voices.
5532 __m128d vfcrange0 = _mm_unpacklo_pd(vfcrange01_0, vfcrange01_1);
5533 __m128d vfcrange1 = _mm_unpackhi_pd(vfcrange01_0, vfcrange01_1);
5534 __m128d vfcrange2 = _mm_unpacklo_pd(vfcrange23_0, vfcrange23_1);
5535 __m128d vfcrange3 = _mm_unpackhi_pd(vfcrange23_0, vfcrange23_1);
5537 __m128d vfcfreq = _mm_set_pd(
5538 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
5542 __m128d vfcreso_DB = _mm_set_pd(
5543 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
// Lanewise "parameter left its cached window" test.
5547 __m128d vmask = _mm_or_pd(
5548 _mm_or_pd(_mm_cmplt_pd(vfcfreq, vfcrange0), _mm_cmpgt_pd(vfcfreq, vfcrange1)),
5549 _mm_or_pd(_mm_cmplt_pd(vfcreso_DB, vfcrange2), _mm_cmpgt_pd(vfcreso_DB, vfcrange3))
// Collapse to a bitmask limited to the lanes holding real voices.
5552 int imask = _mm_movemask_pd(vmask) & ((1 << (batch_size - i)) - 1);
// Rebuild the hysteresis windows around the current values.
5555 __m128d v1mmargin = _mm_set1_pd(1.0 - ext_filter_margin);
5556 __m128d v1pmargin = _mm_set1_pd(1.0 + ext_filter_margin);
5558 vfcrange0 = _mm_mul_pd(vfcfreq, v1mmargin);
5559 vfcrange1 = _mm_mul_pd(vfcfreq, v1pmargin);
5560 vfcrange2 = _mm_mul_pd(vfcreso_DB, v1mmargin);
5561 vfcrange3 = _mm_mul_pd(vfcreso_DB, v1pmargin);
// Transpose back to per-voice order and store only the flagged voices.
5563 vfcrange01_0 = _mm_unpacklo_pd(vfcrange0, vfcrange1);
5564 vfcrange01_1 = _mm_unpackhi_pd(vfcrange0, vfcrange1);
5565 vfcrange23_0 = _mm_unpacklo_pd(vfcrange2, vfcrange3);
5566 vfcrange23_1 = _mm_unpackhi_pd(vfcrange2, vfcrange3);
5569 _mm_storeu_pd(fcs[i]->range, vfcrange01_0);
5570 _mm_storeu_pd(&fcs[i]->range[2], vfcrange23_0);
5573 if (imask & (1 << 1)) {
5574 _mm_storeu_pd(fcs[i + 1]->range, vfcrange01_1);
5575 _mm_storeu_pd(&fcs[i + 1]->range[2], vfcrange23_1);
// A missing second lane reuses voice i's values so the math stays finite.
5578 __m128d vfcdiv_flt_rate = _mm_set_pd(
5579 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
5580 fcs[i]->div_flt_rate
// Per-lane angular cutoff: f = 2*pi * freq * div_flt_rate.
5583 __m128d vf = _mm_mul_pd(_mm_mul_pd(_mm_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
5585 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
5587 __m128d vreso_db_cf_p = _mm_set_pd(
5588 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
5592 __m128d v1 = _mm_set1_pd(1.0);
5593 __m128d v2 = _mm_set1_pd(2.0);
5594 __m128d v0_5 = _mm_set1_pd(0.5);
// q = 1 - f / (2*(reso_cf_p + 0.5/(1 + f)) + f - 2); c0 = q*q;
// c1 = c0 + 1 - 2*cos(f)*q.
5596 __m128d vq = _mm_sub_pd(v1, _mm_div_pd(vf, MM_FMA_PD(v2, _mm_add_pd(vreso_db_cf_p, _mm_div_pd(v0_5, _mm_add_pd(v1, vf))), _mm_sub_pd(vf, v2))));
5597 __m128d vc0 = _mm_mul_pd(vq, vq);
// cos(f): SVML vector call when available, scalar libm fallback otherwise
// (selection by preprocessor lines elided here).
5599 __m128d vcosf = _mm_cos_pd(vf);
5601 ALIGN FLOAT_T af[2];
5602 _mm_storeu_pd(af, vf);
5603 __m128d vcosf = _mm_set_pd(cos(af[1]), cos(af[0]));
5605 __m128d vc1 = _mm_sub_pd(_mm_add_pd(vc0, v1), _mm_mul_pd(_mm_mul_pd(v2, vcosf), vq));
// Interleave {c0, c1} and store dc for each recalculated voice.
5607 __m128d vdc0 = _mm_unpacklo_pd(vc0, vc1);
5608 __m128d vdc1 = _mm_unpackhi_pd(vc0, vc1);
5611 _mm_storeu_pd(fcs[i]->dc, vdc0);
5613 if (imask & (1 << 1))
5614 _mm_storeu_pd(fcs[i + 1]->dc, vdc1);
5621 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5623 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
// AVX-512 batch form of the HPF12-2 high-pass sample filter for up to 8
// voices, one voice per lane.  Uses the same SoA scheme as the LPF batch:
// the internal recurrence is the low-pass one, and the high-pass output is
// the input minus the low-pass value (sp - db0).
//   dcs[v]    - coefficients {dc0, dc1} of voice v
//   dbs[v]    - filter state {db0, db1} of voice v, updated in place
//   sps[v]    - sample buffer of voice v, filtered in place
//   counts[v] - number of valid samples in sps[v]
// Masked load of up to 8 counts; lanes past batch_size read as 0.
5625 __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(0, batch_size), counts);
// Gather the {db0, db1} state pairs of all eight voices ...
5627 __m128d vdb01_0 = _mm_loadu_pd(dbs[0]);
5628 __m128d vdb01_1 = 1 < batch_size ? _mm_loadu_pd(dbs[1]) : _mm_setzero_pd();
5629 __m128d vdb01_2 = 2 < batch_size ? _mm_loadu_pd(dbs[2]) : _mm_setzero_pd();
5630 __m128d vdb01_3 = 3 < batch_size ? _mm_loadu_pd(dbs[3]) : _mm_setzero_pd();
5631 __m128d vdb01_4 = 4 < batch_size ? _mm_loadu_pd(dbs[4]) : _mm_setzero_pd();
5632 __m128d vdb01_5 = 5 < batch_size ? _mm_loadu_pd(dbs[5]) : _mm_setzero_pd();
5633 __m128d vdb01_6 = 6 < batch_size ? _mm_loadu_pd(dbs[6]) : _mm_setzero_pd();
5634 __m128d vdb01_7 = 7 < batch_size ? _mm_loadu_pd(dbs[7]) : _mm_setzero_pd();
// ... and transpose so vdb0 holds the eight db0 values, vdb1 the eight db1.
5636 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
5637 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
5638 __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
5639 __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
5641 __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
5642 __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
5644 __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
5645 __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
// Same AoS->SoA transpose for the {dc0, dc1} coefficient pairs.
5647 __m128d vdc01_0 = _mm_loadu_pd(dcs[0]);
5648 __m128d vdc01_1 = 1 < batch_size ? _mm_loadu_pd(dcs[1]) : _mm_setzero_pd();
5649 __m128d vdc01_2 = 2 < batch_size ? _mm_loadu_pd(dcs[2]) : _mm_setzero_pd();
5650 __m128d vdc01_3 = 3 < batch_size ? _mm_loadu_pd(dcs[3]) : _mm_setzero_pd();
5651 __m128d vdc01_4 = 4 < batch_size ? _mm_loadu_pd(dcs[4]) : _mm_setzero_pd();
5652 __m128d vdc01_5 = 5 < batch_size ? _mm_loadu_pd(dcs[5]) : _mm_setzero_pd();
5653 __m128d vdc01_6 = 6 < batch_size ? _mm_loadu_pd(dcs[6]) : _mm_setzero_pd();
5654 __m128d vdc01_7 = 7 < batch_size ? _mm_loadu_pd(dcs[7]) : _mm_setzero_pd();
5656 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
5657 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
5658 __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
5659 __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
5661 __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
5662 __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
5664 __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
5665 __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
// Horizontal max of the eight counts: sample loop runs to the longest voice.
5667 __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
5668 vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
5669 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
// Eight samples per voice per iteration; tail/exhausted lanes are
// zero-masked by generate_mask8_for_count.
5671 for (int32 j = 0; j < count_max; j += 8) {
5673 vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[0]), &sps[0][j]);
5675 for (int k = 1; k < 8; k++)
5676 vin[k] = _mm512_maskz_loadu_pd(k < batch_size ? generate_mask8_for_count(j, counts[k]) : 0, &sps[k][j]);
// 8x8 transpose (unpack + 128-bit-lane shuffles): vsps[k] gathers sample
// j+k of each of the eight voices.
5678 __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
5679 __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
5680 __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
5681 __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
5682 __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
5683 __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
5684 __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
5685 __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
5687 __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5688 __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5689 __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5690 __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5691 __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5692 __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5693 __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5694 __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5697 vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5698 vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5699 vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5700 vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5701 vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5702 vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5703 vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5704 vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5706 for (int k = 0; k < 8; k++) {
// kmask = lanes whose voice still has samples at offset j + k; masked ops
// leave exhausted lanes' state unchanged.
5707 __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
// Low-pass recurrence: db1 += (sp - db0)*dc1; db0 += db1; db1 *= dc0;
// then the high-pass output is input minus low-pass: sp -= db0.
5709 vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
5710 vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
5711 vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
5712 vsps[k] = _mm512_sub_pd(vsps[k], vdb0);
// Inverse 8x8 transpose: back to per-voice sample runs.
5715 __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
5716 __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
5717 __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
5718 __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
5719 __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
5720 __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
5721 __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
5722 __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
5724 __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5725 __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5726 __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5727 __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5728 __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5729 __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5730 __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5731 __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5734 vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5735 vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5736 vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5737 vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5738 vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5739 vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5740 vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5741 vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
// Masked store: only the still-valid tail of each voice's buffer is written.
5743 for (int k = 0; k < batch_size; k++)
5744 _mm512_mask_storeu_pd(&sps[k][j], generate_mask8_for_count(j, counts[k]), vout[k]);
// Repack the SoA state and write each voice's {db0, db1} pair back.
5747 vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
5748 vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
5750 _mm_storeu_pd(dbs[0], _mm512_castpd512_pd128(vdb01_0246));
5753 _mm_storeu_pd(dbs[1], _mm512_castpd512_pd128(vdb01_1357));
5756 _mm_storeu_pd(dbs[2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
5759 _mm_storeu_pd(dbs[3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
5762 _mm_storeu_pd(dbs[4], _mm512_extractf64x2_pd(vdb01_0246, 2));
5765 _mm_storeu_pd(dbs[5], _mm512_extractf64x2_pd(vdb01_1357, 2));
5768 _mm_storeu_pd(dbs[6], _mm512_extractf64x2_pd(vdb01_0246, 3));
5771 _mm_storeu_pd(dbs[7], _mm512_extractf64x2_pd(vdb01_1357, 3));
5774 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * Batched 12 dB/oct high-pass filter, AVX2 path: processes up to 4 voices
 * at once by transposing per-voice state into structure-of-arrays __m256d
 * lanes.  dcs[v] = per-voice coefficient pair, dbs[v] = per-voice feedback
 * state pair, sps[v] = sample buffer, counts[v] = sample count of voice v.
 * NOTE(review): this chunk is a garbled extract -- every line carries a
 * stray leading line number and several structural lines (braces, the
 * `vsps` declaration, some guards) are missing from the dump; the comments
 * below describe only the visible logic.
 */
5776 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
/* Outer loop: one group of 4 voices per iteration. */
5778 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
5779 if (i >= batch_size)
/* Per-lane sample counts; slots beyond batch_size count as 0 (always masked off). */
5782 __m128i vcounts = _mm_set_epi32(
5783 i + 3 < batch_size ? counts[i + 3] : 0,
5784 i + 2 < batch_size ? counts[i + 2] : 0,
5785 i + 1 < batch_size ? counts[i + 1] : 0,
/* Load the per-voice feedback pairs db[0..1] ... */
5789 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5790 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5791 __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
5792 __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
/* ... and transpose them to SoA: vdb0 = db[0] of all 4 voices, vdb1 = db[1]. */
5794 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
5795 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
5797 __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13);
5798 __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13);
/* Same load + transpose for the coefficient pairs dc[0..1]. */
5800 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5801 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5802 __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
5803 __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
5805 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
5806 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
5808 __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13);
5809 __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13);
/* Horizontal max of the 4 counts: iterate up to the longest voice. */
5811 __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2));
5812 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
/* 4 samples per voice per iteration; exhausted voices contribute zeros.
   NOTE(review): a full 4-double load is done whenever j < counts[v], even if
   fewer than 4 samples remain -- presumably the buffers are padded; confirm. */
5814 for (int32 j = 0; j < count_max; j += 4) {
5815 __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd();
5816 __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd();
5817 __m256d vsp0123_2 = i + 2 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
5818 __m256d vsp0123_3 = i + 3 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
/* 4x4 transpose: vsps[k] holds sample j+k of all 4 voices. */
5820 __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0);
5821 __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0);
5822 __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1);
5823 __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1);
5826 vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
5827 vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
5828 vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
5829 vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
/* Per-sample filter step.  vmask keeps lanes whose voice still has samples
   (j + k < count); masked lanes leave db0/db1 untouched.
   Step: db1 += (x - db0) * dc1;  db0 += db1;  db1 *= dc0;  out = x - db0
   (high-pass output = input minus the low-pass state). */
5831 for (int k = 0; k < 4; k++) {
5832 __m256d vmask = _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
5834 vdb1 = _mm256_blendv_pd(vdb1, MM256_FMA_PD(_mm256_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5835 vdb0 = _mm256_blendv_pd(vdb0, _mm256_add_pd(vdb0, vdb1), vmask);
5836 vdb1 = _mm256_blendv_pd(vdb1, _mm256_mul_pd(vdb1, vdc0), vmask);
5837 vsps[k] = _mm256_sub_pd(vsps[k], vdb0);
/* Transpose the filtered samples back to per-voice order. */
5840 vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]);
5841 vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
5842 vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
5843 vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
5845 vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
5846 vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
5847 vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
5848 vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
/* Store back only for voices that still have samples at index j. */
5851 _mm256_storeu_pd(&sps[i][j], vsp0123_0);
5853 if (i + 1 < batch_size && j < counts[i + 1])
5854 _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
5856 if (i + 2 < batch_size && j < counts[i + 2])
5857 _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
5859 if (i + 3 < batch_size && j < counts[i + 3])
5860 _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
/* Repack the SoA feedback state into per-voice pairs and write it back. */
5863 vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1);
5864 vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1);
5866 _mm_storeu_pd(dbs[i], _mm256_castpd256_pd128(vdb01_02));
5868 if (i + 1 < batch_size)
5869 _mm_storeu_pd(dbs[i + 1], _mm256_castpd256_pd128(vdb01_13));
5871 if (i + 2 < batch_size)
5872 _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(vdb01_02, 1));
5874 if (i + 3 < batch_size)
5875 _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(vdb01_13, 1));
5879 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
/*
 * Batched 12 dB/oct high-pass filter, SSE path: 2 voices per pass, one
 * voice per __m128d lane.  Same algorithm as the AVX2 path above.
 * NOTE(review): garbled extract -- stray leading line numbers, and some
 * structural lines (braces, the `vsps` declaration, #else/#endif of the
 * blend fallback) are missing from this dump.
 */
5881 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
5883 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
5884 if (i >= batch_size)
/* Per-lane sample counts; the missing-slot lane counts as 0. */
5887 __m128i vcounts = _mm_set_epi32(
5890 i + 1 < batch_size ? counts[i + 1] : 0,
/* Load and transpose feedback state: vdb0 = db[0] of both voices, vdb1 = db[1]. */
5894 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5895 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5897 __m128d vdb0 = _mm_unpacklo_pd(vdb01_0, vdb01_1);
5898 __m128d vdb1 = _mm_unpackhi_pd(vdb01_0, vdb01_1);
/* Same transpose for the coefficient pairs. */
5900 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5901 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5903 __m128d vdc0 = _mm_unpacklo_pd(vdc01_0, vdc01_1);
5904 __m128d vdc1 = _mm_unpackhi_pd(vdc01_0, vdc01_1);
/* Iterate up to the longer of the two voices. */
5906 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, 1)));
5908 for (int32 j = 0; j < count_max; j += 2) {
5909 __m128d vsp01_0 = j < counts[i] ? _mm_loadu_pd(&sps[i][j]) : _mm_setzero_pd();
5910 __m128d vsp01_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm_loadu_pd(&sps[i + 1][j]) : _mm_setzero_pd();
/* 2x2 transpose: vsps[k] = sample j+k of both voices. */
5913 vsps[0] = _mm_unpacklo_pd(vsp01_0, vsp01_1);
5914 vsps[1] = _mm_unpackhi_pd(vsp01_0, vsp01_1);
/* Masked filter step: db1 += (x - db0) * dc1; db0 += db1; db1 *= dc0;
   out = x - db0.  Lanes whose voice is exhausted keep their state. */
5916 for (int k = 0; k < 2; k++) {
5917 __m128d vmask = _mm_castsi128_pd(_mm_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
/* SSE4.1 path uses blendv; the fallback below emulates it with and/andnot/or. */
5919 #if USE_X86_EXT_INTRIN >= 6
5920 vdb1 = _mm_blendv_pd(vdb1, MM_FMA_PD(_mm_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5921 vdb0 = _mm_blendv_pd(vdb0, _mm_add_pd(vdb0, vdb1), vmask);
5922 vdb1 = _mm_blendv_pd(vdb1, _mm_mul_pd(vdb1, vdc0), vmask);
5924 vdb1 = _mm_or_pd(_mm_andnot_pd(vmask, vdb1), _mm_and_pd(vmask, MM_FMA_PD(_mm_sub_pd(vsps[k], vdb0), vdc1, vdb1)));
5925 vdb0 = _mm_or_pd(_mm_andnot_pd(vmask, vdb0), _mm_and_pd(vmask, _mm_add_pd(vdb0, vdb1)));
5926 vdb1 = _mm_or_pd(_mm_andnot_pd(vmask, vdb1), _mm_and_pd(vmask, _mm_mul_pd(vdb1, vdc0)));
5928 vsps[k] = _mm_sub_pd(vsps[k], vdb0);
/* Transpose back and store only voices that still have samples. */
5931 vsp01_0 = _mm_unpacklo_pd(vsps[0], vsps[1]);
5932 vsp01_1 = _mm_unpackhi_pd(vsps[0], vsps[1]);
5935 _mm_storeu_pd(&sps[i][j], vsp01_0);
5937 if (i + 1 < batch_size && j < counts[i + 1])
5938 _mm_storeu_pd(&sps[i + 1][j], vsp01_1);
/* Write the updated feedback state back per voice. */
5941 vdb01_0 = _mm_unpacklo_pd(vdb0, vdb1);
5942 vdb01_1 = _mm_unpackhi_pd(vdb0, vdb1);
5944 _mm_storeu_pd(dbs[i], vdb01_0);
5946 if (i + 1 < batch_size)
5947 _mm_storeu_pd(dbs[i + 1], vdb01_1);
/*
 * Apply one shared filter type to a batch of voice sample buffers.
 * All fcs[] entries must carry the same filter type; fcs[0] is the
 * reference.  Dispatches to the batched recalc + sample kernels.
 * NOTE(review): garbled extract -- stray leading line numbers, and the
 * `if` condition guarding the "not initialized" message, `return`/`break`
 * statements and the FILTER_LPF_BW case label are missing from this dump.
 */
5953 void buffer_filter_batch(int batch_size, FilterCoefficients **fcs, DATA_T **sps, int32 *counts)
/* Reject the whole batch if any per-voice filter is unusable. */
5956 for (int i = 0; i < batch_size; i++) {
5958 ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: filter not initialized");
/* Every voice must use the same filter type as fcs[0]. */
5963 for (int i = 1; i < batch_size; i++) {
5964 if (fcs[0]->type != fcs[i]->type) {
5965 ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: filter type mismatch");
/* Filtering disabled: nothing to do. */
5970 if (fcs[0]->type == FILTER_NONE)
/* Gather per-voice coefficient / feedback-state pointers for the kernels. */
5973 FILTER_T *dcs[MIX_VOICE_BATCH_SIZE];
5974 FILTER_T *dbs[MIX_VOICE_BATCH_SIZE];
5976 for (int i = 0; i < batch_size; i++) {
5977 dcs[i] = &fcs[i]->dc;
5978 dbs[i] = &fcs[i]->db[FILTER_FB_L];
/* Dispatch: recalc coefficients for the batch, then run the sample kernel. */
5981 switch (fcs[0]->type) {
5983 recalc_filter_LPF_BW_batch(batch_size, fcs);
5984 sample_filter_LPF_BW_batch(batch_size, dcs, dbs, sps, counts);
5987 case FILTER_LPF12_2:
5988 recalc_filter_LPF12_2_batch(batch_size, fcs);
5989 sample_filter_LPF12_2_batch(batch_size, dcs, dbs, sps, counts);
/* HPF12_2 shares the LPF12_2 coefficient recalculation; only the kernel differs. */
5992 case FILTER_HPF12_2:
5993 recalc_filter_LPF12_2_batch(batch_size, fcs);
5994 sample_filter_HPF12_2_batch(batch_size, dcs, dbs, sps, counts);
5998 ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: unsupported filter type");
6003 void voice_filter_batch(int batch_size, int *vs, DATA_T **sps, int32 *counts)
6005 if (batch_size <= 0)
6008 FilterCoefficients *fcs[MIX_VOICE_BATCH_SIZE];
6010 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++)
6011 fcs[i] = (i < batch_size ? &voice[vs[i]].fc : NULL);
6013 buffer_filter_batch(batch_size, fcs, sps, counts);
6015 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++)
6016 fcs[i] = (i < batch_size ? &voice[vs[i]].fc2 : NULL);
6018 buffer_filter_batch(batch_size, fcs, sps, counts);
6022 #endif // MIX_VOICE_BATCH
6025 /*************** antialiasing ********************/
6029 /* bessel function */
6030 static FLOAT_T ino(FLOAT_T x)
6032 FLOAT_T y, de, e, sde;
6040 de = de * y / (FLOAT_T) i;
6043 } while (!( (e * 1.0e-08 - sde > 0) || (i++ > 25) ));
6047 /* Kaiser Window (symetric) */
6048 static void kaiser(FLOAT_T *w,int n,FLOAT_T beta)
6053 xind = (2*n - 1) * (2*n - 1);
6054 for (i =0; i<n ; i++)
6057 w[i] = ino((FLOAT_T)(beta * sqrt((double)(1. - 4 * xi * xi / xind))))
6058 / ino((FLOAT_T)beta);
6063 * fir coef in g, cuttoff frequency in fc
6065 static void designfir(FLOAT_T *g , FLOAT_T fc, FLOAT_T att)
6067 /* attenuation in db */
6069 FLOAT_T xi, omega, beta ;
6070 FLOAT_T w[LPF_FIR_ORDER2];
6072 for (i =0; i < LPF_FIR_ORDER2 ;i++)
6074 xi = (FLOAT_T) i + 0.5;
6076 g[i] = sin( (double) omega * fc) / omega;
6079 beta = (FLOAT_T) exp(log((double)0.58417 * (att - 20.96)) * 0.4) + 0.07886
6081 kaiser( w, LPF_FIR_ORDER2, beta);
6083 /* Matrix product */
6084 for (i =0; i < LPF_FIR_ORDER2 ; i++)
6089 * FIR filtering -> apply the filter given by coef[] to the data buffer
6090 * Note that we simulate leading and trailing 0 at the border of the
6093 static void filter(int16 *result,int16 *data, int32 length,FLOAT_T coef[])
6095 int32 sample,i,sample_window;
6099 /* Simulate leading 0 at the begining of the buffer */
6100 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6103 sample_window= sample - LPF_FIR_ORDER2;
6105 for (i = 0; i < LPF_FIR_ORDER ;i++)
6107 ((sample_window<0)? 0.0 : data[sample_window++]) ;
6109 /* Saturation ??? */
6110 if (sum> 32767.) { sum=32767.; peak++; }
6111 if (sum< -32768.) { sum=-32768; peak++; }
6112 result[sample] = (int16) sum;
6115 /* The core of the buffer */
6116 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6119 sample_window= sample - LPF_FIR_ORDER2;
6121 for (i = 0; i < LPF_FIR_ORDER ;i++)
6122 sum += data[sample_window++] * coef[i];
6124 /* Saturation ??? */
6125 if (sum> 32767.) { sum=32767.; peak++; }
6126 if (sum< -32768.) { sum=-32768; peak++; }
6127 result[sample] = (int16) sum;
6130 /* Simulate 0 at the end of the buffer */
6131 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6134 sample_window= sample - LPF_FIR_ORDER2;
6136 for (i = 0; i < LPF_FIR_ORDER ;i++)
6138 ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6140 /* Saturation ??? */
6141 if (sum> 32767.) { sum=32767.; peak++; }
6142 if (sum< -32768.) { sum=-32768; peak++; }
6143 result[sample] = (int16) sum;
6147 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Saturation %2.3f %%.", 100.0*peak/ (FLOAT_T) length);
6150 static void filter_int8(int8 *result,int8 *data, int32 length,FLOAT_T coef[])
6152 int32 sample,i,sample_window;
6156 /* Simulate leading 0 at the begining of the buffer */
6157 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6160 sample_window= sample - LPF_FIR_ORDER2;
6162 for (i = 0; i < LPF_FIR_ORDER ;i++)
6164 ((sample_window<0)? 0.0 : data[sample_window++]) ;
6166 /* Saturation ??? */
6167 if (sum> 127.) { sum=127.; peak++; }
6168 if (sum< -128.) { sum=-128; peak++; }
6169 result[sample] = (int8) sum;
6172 /* The core of the buffer */
6173 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6176 sample_window= sample - LPF_FIR_ORDER2;
6178 for (i = 0; i < LPF_FIR_ORDER ;i++)
6179 sum += data[sample_window++] * coef[i];
6181 /* Saturation ??? */
6182 if (sum> 127.) { sum=127.; peak++; }
6183 if (sum< -128.) { sum=-128; peak++; }
6184 result[sample] = (int8) sum;
6187 /* Simulate 0 at the end of the buffer */
6188 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6191 sample_window= sample - LPF_FIR_ORDER2;
6193 for (i = 0; i < LPF_FIR_ORDER ;i++)
6195 ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6197 /* Saturation ??? */
6198 if (sum> 127.) { sum=127.; peak++; }
6199 if (sum< -128.) { sum=-128; peak++; }
6200 result[sample] = (int8) sum;
6204 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Saturation %2.3f %%.", 100.0*peak/ (FLOAT_T) length);
6207 static void filter_int32(int32 *result,int32 *data, int32 length,FLOAT_T coef[])
6209 int32 sample,i,sample_window;
6213 /* Simulate leading 0 at the begining of the buffer */
6214 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6217 sample_window= sample - LPF_FIR_ORDER2;
6219 for (i = 0; i < LPF_FIR_ORDER ;i++)
6221 ((sample_window<0)? 0.0 : data[sample_window++]) ;
6223 /* Saturation ??? */
6224 if (sum> 2147483647.) { sum=2147483647.; peak++; }
6225 if (sum< -2147483648.) { sum=-2147483648.; peak++; }
6226 result[sample] = (int32) sum;
6229 /* The core of the buffer */
6230 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6233 sample_window= sample - LPF_FIR_ORDER2;
6235 for (i = 0; i < LPF_FIR_ORDER ;i++)
6236 sum += data[sample_window++] * coef[i];
6238 /* Saturation ??? */
6239 if (sum> 2147483647.) { sum=2147483647.; peak++; }
6240 if (sum< -2147483648.) { sum=-2147483648.; peak++; }
6241 result[sample] = (int32) sum;
6244 /* Simulate 0 at the end of the buffer */
6245 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6248 sample_window= sample - LPF_FIR_ORDER2;
6250 for (i = 0; i < LPF_FIR_ORDER ;i++)
6252 ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6254 /* Saturation ??? */
6255 if (sum> 2147483647.) { sum=2147483647.; peak++; }
6256 if (sum< -2147483648.) { sum=-2147483648.; peak++; }
6257 result[sample] = (int32) sum;
6261 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Saturation %2.3f %%.", 100.0*peak/ (FLOAT_T) length);
6264 static void filter_float(float *result,float *data, int32 length,FLOAT_T coef[])
6266 int32 sample,i,sample_window;
6269 /* Simulate leading 0 at the begining of the buffer */
6270 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6273 sample_window= sample - LPF_FIR_ORDER2;
6275 for (i = 0; i < LPF_FIR_ORDER ;i++)
6276 sum += coef[i] * ((sample_window<0)? 0.0 : data[sample_window++]) ;
6277 result[sample] = (float)sum;
6280 /* The core of the buffer */
6281 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6284 sample_window= sample - LPF_FIR_ORDER2;
6286 for (i = 0; i < LPF_FIR_ORDER ;i++)
6287 sum += data[sample_window++] * coef[i];
6288 result[sample] = (float)sum;
6291 /* Simulate 0 at the end of the buffer */
6292 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6295 sample_window= sample - LPF_FIR_ORDER2;
6297 for (i = 0; i < LPF_FIR_ORDER ;i++)
6298 sum += coef[i] * ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6299 result[sample] = (float)sum;
6303 static void filter_double(double *result,double *data, int32 length,FLOAT_T coef[])
6305 int32 sample,i,sample_window;
6308 /* Simulate leading 0 at the begining of the buffer */
6309 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6312 sample_window= sample - LPF_FIR_ORDER2;
6314 for (i = 0; i < LPF_FIR_ORDER ;i++)
6315 sum += coef[i] * ((sample_window<0)? 0.0 : data[sample_window++]) ;
6316 result[sample] = (double)sum;
6319 /* The core of the buffer */
6320 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6323 sample_window= sample - LPF_FIR_ORDER2;
6325 for (i = 0; i < LPF_FIR_ORDER ;i++)
6326 sum += data[sample_window++] * coef[i];
6327 result[sample] = (double)sum;
6330 /* Simulate 0 at the end of the buffer */
6331 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6334 sample_window= sample - LPF_FIR_ORDER2;
6336 for (i = 0; i < LPF_FIR_ORDER ;i++)
6337 sum += coef[i] * ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6338 result[sample] = (double)sum;
6341 /***********************************************************************/
6342 /* Prevent aliasing by filtering any freq above the output_rate */
6344 /* I don't worry about looping point -> they will remain soft if they */
6346 /***********************************************************************/
6347 void antialiasing(int16 *data, int32 data_length, int32 sample_rate, int32 output_rate)
6352 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6353 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6354 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6357 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6360 /* No oversampling */
6361 if (output_rate>=sample_rate)
6364 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6365 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6368 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6370 /* Make the filter symetric */
6371 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6372 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6374 /* We apply the filter we have designed on a copy of the patch */
6378 bytes = sizeof(int16) * data_length;
6379 temp = (int16 *)safe_malloc(bytes);
6380 memcpy(temp, data, bytes);
6381 filter(data, temp, data_length, fir_symetric);
6385 void antialiasing_int8(int8 *data, int32 data_length, int32 sample_rate, int32 output_rate)
6390 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6391 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6392 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6395 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6398 /* No oversampling */
6399 if (output_rate>=sample_rate)
6402 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6403 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6406 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6408 /* Make the filter symetric */
6409 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6410 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6412 /* We apply the filter we have designed on a copy of the patch */
6416 bytes = sizeof(int8) * data_length;
6417 temp = (int8 *)safe_malloc(bytes);
6418 memcpy(temp, data, bytes);
6419 filter_int8(data, temp, data_length, fir_symetric);
6423 void antialiasing_int32(int32 *data, int32 data_length, int32 sample_rate, int32 output_rate)
6428 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6429 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6430 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6433 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6436 /* No oversampling */
6437 if (output_rate>=sample_rate)
6440 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6441 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6444 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6446 /* Make the filter symetric */
6447 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6448 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6450 /* We apply the filter we have designed on a copy of the patch */
6454 bytes = sizeof(int32) * data_length;
6455 temp = (int32 *)safe_malloc(bytes);
6456 memcpy(temp, data, bytes);
6457 filter_int32(data, temp, data_length, fir_symetric);
6461 void antialiasing_float(float *data, int32 data_length, int32 sample_rate, int32 output_rate)
6466 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6467 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6468 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6471 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6474 /* No oversampling */
6475 if (output_rate>=sample_rate)
6478 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6479 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6482 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6484 /* Make the filter symetric */
6485 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6486 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6488 /* We apply the filter we have designed on a copy of the patch */
6492 bytes = sizeof(float) * data_length;
6493 temp = (float *)safe_malloc(bytes);
6494 memcpy(temp, data, bytes);
6495 filter_float(data, temp, data_length, fir_symetric);
6500 void antialiasing_double(double *data, int32 data_length, int32 sample_rate, int32 output_rate)
6505 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6506 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6507 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6509 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6512 /* No oversampling */
6513 if (output_rate>=sample_rate)
6516 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6517 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6520 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6522 /* Make the filter symetric */
6523 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6524 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6526 /* We apply the filter we have designed on a copy of the patch */
6530 bytes = sizeof(double) * data_length;
6531 temp = (double *)safe_malloc(bytes);
6532 memcpy(temp, data, bytes);
6533 filter_double(data, temp, data_length, fir_symetric);
6539 /*************** fir_eq ********************/
/*
 * (Re)build the FIR equalizer: zero the state, set default band
 * frequencies, clamp band/bit settings, and compute the impulse response
 * fc->dc[] by integrating a piecewise (log-frequency interpolated) target
 * magnitude curve.
 * NOTE(review): garbled extract -- stray leading line numbers, and many
 * lines (default freq[0..3]/gain setup, clamping branches, loop bodies,
 * the k==0 special case selection) are missing from this dump; comments
 * describe only the visible logic.
 */
6541 void init_fir_eq(FIR_EQ *fc)
6543 int32 i, j, k, l, count1 = 0, count2 = 0, flg = 0, size_2;
6544 double amp[FIR_EQ_BAND_MAX*16], bounds[FIR_EQ_BAND_MAX*16][2]; // max_band*16 , [0]:f [1]:g
6545 double fL, gL, fR, gR, log_fL, log_fR, ft, gt, kT, div_kT2, hk, gain, f0, f1, w0, w1;
6546 const double div_2pi = 1.0 / (2 * M_PI);
/* Start from a clean state, then fill in defaults. */
6549 memset(fc, 0, sizeof(FIR_EQ));
6552 fc->st = 1; // stereo
6559 fc->freq[4] = 12000;
6560 fc->freq[5] = 20000;
/* Window size derived from the sampling rate (sr / 100 Hz). */
6568 { // calc window size
6569 int32 t = play_mode->rate / 100; // sr/ 100Hz
/* Clamp the band count and detect a change against the previous setting. */
6578 else if(fc->band > FIR_EQ_BAND_MAX)
6579 fc->band = FIR_EQ_BAND_MAX;
6580 if(fc->band != fc->band_p)
6582 fc->band_p = fc->band;
/* Clamp the FFT-size exponent (max 12) and detect a change. */
6585 else if(fc->bit > 12)
6587 if(fc->bit != fc->bit_p)
6589 fc->bit_p = fc->bit;
6590 fc->size = 1 << fc->bit;
6591 for(i = 0; i < FIR_EQ_SIZE_MAX; ++i){
/* Skip the rebuild entirely when all band gains are flat (presumably). */
6594 for(i = 0; i < fc->band; ++i){
6595 if(fc->gain[0] != 0)
6601 size_2 = fc->size / 2;
/* Convert band gains from dB to linear amplitude. */
6602 for(i = 0; i < fc->band; ++i){
6603 amp[i] = pow(10, fc->gain[i] * DIV_20);
6605 bounds[0][1] = amp[0];
/* Build the target curve: 16 sub-points per band pair, frequency
   interpolated in log space, gain interpolated linearly. */
6606 for(i = 0; i < (fc->band - 2); i++){
6609 fR = fc->freq[i + 1];
6613 for(j = 0; j < 16; ++j){
6614 ft = ((double)j + 0.5) * DIV_16;
6615 bounds[count1][0] = exp(log_fL * (1 - ft) + log_fR * ft);
6616 gt = (double)j * DIV_16;
6617 bounds[count1][1] = gL * (1.0 - gt) + gR * gt;
/* One filter tap per k: integrate the target magnitude over each
   [f0, f1) segment (inverse-DTFT of the piecewise-constant curve). */
6621 for(k = 0; k < size_2; ++k){
6622 kT = k * div_playmode_rate;
6626 while(count2 != count1){
6627 gain = bounds[count2][1];
6628 f0 = bounds[count2][0];
/* Last segment extends to Nyquist (rate / 2). */
6630 f1 = count2 == count1 ? play_mode->rate * DIV_2 : bounds[count2][0];
/* kT == 0 limit: the integral degenerates to gain * 2 * (w1 - w0). */
6634 hk += gain * (w1 - w0 + (-w0) - (-w1));
6636 hk += gain * (sin(w1 * kT) - sin(w0 * kT)) * div_kT2; // * 2 / kT
6639 hk *= div_playmode_rate * div_2pi; // / (2 * M_PI);
/* Mirror the tap to both halves: the impulse response is symmetric. */
6640 fc->dc[size_2 - 1 - k] = hk;
6641 fc->dc[size_2 - 1 + k] = hk;
/* Clear the delay-line history. */
6645 memset(fc->buff, 0, sizeof(fc->buff));
6651 void apply_fir_eq(FIR_EQ *fc, DATA_T *buf, int32 count)
6654 const int32 mask = FIR_EQ_SIZE_MAX - 1;
6658 #if (USE_X86_EXT_INTRIN >= 3) && defined(FLOAT_T_DOUBLE) && defined(DATA_T_DOUBLE)
6659 if(fc->st){ // stereo
6660 __m128d vout[2], tmp[2]; // DATA_T out[2];
6661 for(i = 0; i < count; i += 2){
6662 int32 sbuff = fc->count + FIR_EQ_SIZE_MAX;
6663 fc->buff[0][fc->count] = buf[i];
6664 fc->buff[0][sbuff] = buf[i]; // for SIMD
6665 fc->buff[1][fc->count] = buf[i + 1];
6666 fc->buff[1][sbuff] = buf[i + 1]; // for SIMD
6667 fc->count = (fc->count + 1) & mask;
6668 offset = (fc->count - fc->size) & mask;
6669 vout[0] = _mm_setzero_pd(); // out[0] = 0;
6670 vout[1] = _mm_setzero_pd(); // out[1] = 0;
6671 for(j = 0; j < fc->size; j += 2){
6672 int32 ofs = offset + j;
6673 __m128d vdc = _mm_loadu_pd(&fc->dc[j]);
6674 vout[0] = MM_FMA_PD(vdc, _mm_loadu_pd(&fc->buff[0][ofs]), vout[0]); // out[0] += fc->dc[j] * fc->buff[0][ofs];
6675 vout[1] = MM_FMA_PD(vdc, _mm_loadu_pd(&fc->buff[1][ofs]), vout[1]); // out[1] += fc->dc[j] * fc->buff[1][ofs];
6677 // vout[0](L0,L1) vout[1](R0,R1)
6678 tmp[0] = _mm_unpacklo_pd(vout[0], vout[1]); // (L0,R0)
6679 tmp[1] = _mm_unpackhi_pd(vout[0], vout[1]); // (L1,R1)
6680 tmp[0] = _mm_add_pd(tmp[0], tmp[1]); // (L0+L1,R0+R1)
6681 _mm_store_pd(&buf[i], tmp[0]); // buf[i] = out[0]; buf[i + 1] = out[1];
6684 __m128d vout; //DATA_T out;
6685 for(i = 0; i < count; i++){
6686 int32 sbuff = fc->count + FIR_EQ_SIZE_MAX;
6687 fc->buff[0][fc->count] = buf[i];
6688 fc->buff[0][sbuff] = buf[i]; // for SIMD
6689 fc->count = (fc->count + 1) & mask;
6690 offset = (fc->count - fc->size) & mask;
6691 vout = _mm_setzero_pd(); // out = 0;
6692 for(j = 0; j < fc->size; j += 2){
6693 int32 ofs = (offset + j) & mask;
6694 __m128d vdc = _mm_loadu_pd(&fc->dc[j]);
6695 vout = MM_FMA_PD(vdc, _mm_loadu_pd(&fc->buff[0][ofs]), vout); // out += fc->dc[j] * fc->buff[0][ofs];
6697 vout= _mm_add_pd(vout, _mm_shuffle_pd(vout, vout, 0x1)); // v0=v0+v1 v1=v1+v0
6698 _mm_store_sd(&buf[i], vout); // buf[i] = out;
6702 if(fc->st){ // stereo
6704 for(i = 0; i < count; i += 2){
6705 fc->buff[0][fc->count] = buf[i];
6706 fc->buff[1][fc->count] = buf[i + 1];
6707 fc->count = (fc->count + 1) & mask;
6708 offset = fc->count - fc->size;
6709 //offset &= fc->mask;
6710 //buf[i] = fc->buff[0][offset];
6711 //buf[i + 1] = fc->buff[1][offset];
6712 out[0] = 0; out[1] = 0;
6713 for(j = 0; j < fc->size; ++j){
6714 int32 ofs = (offset + j) & mask;
6715 out[0] += fc->dc[j] * fc->buff[0][ofs];
6716 out[1] += fc->dc[j] * fc->buff[1][ofs];
6718 buf[i] = out[0]; buf[i + 1] = out[1];
6722 for(i = 0; i < count; i++){
6723 fc->buff[0][fc->count] = buf[i];
6724 fc->count = (fc->count + 1) & mask;
6725 offset = fc->count - fc->size;
6727 for(j = 0; j < fc->size; ++j){
6728 int32 ofs = (offset + j) & mask;
6729 out += fc->dc[j] * fc->buff[0][ofs];