2 TiMidity++ -- MIDI to WAVE converter and player
3 Copyright (C) 1999-2002 Masanao Izumo <mo@goice.co.jp>
4 Copyright (C) 1995 Tuukka Toivonen <tt@cgs.fi>
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 filter.c: written by Vincent Pagel ( pagel@loria.fr )
22 implements fir antialiasing filter : should help when setting sample
29 - modify "filter" so that it simulate leading and trailing 0 in the buffer
34 #endif /* HAVE_CONFIG_H */
54 double ext_filter_shelving_gain = 1.0;
55 double ext_filter_shelving_reduce = 1.0;
56 double ext_filter_shelving_q = 1.0;
57 double ext_filter_peaking_gain = 1.0;
58 double ext_filter_peaking_reduce = 1.0;
59 double ext_filter_peaking_q = 1.0;
60 const double ext_filter_margin = 0.010; // 1cB,+-20cent
\82æ
\82è
\8f¬
\82³
\82¢
\95\89\89×
\8c¸
\8f
\8f¬
61 //const double ext_filter_margin = 0.05; // 5cB,+-100cent
\82æ
\82è
\8f¬
\82³
\82¢
\95\89\89×
\8c¸
\8f
\91å
67 voice_filter1(LPF), voice_filter2(HPF), resample_filter
68 \83t
\83B
\83\8b\83^
\95\94\95ª
\8b¤
\92Ê
69 \83t
\83B
\83\8b\83^
\8e©
\91Ì
\82Í freq[Hz], reso_dB[dB] ,(EQ
\82Ì
\8fê
\8d\87 q[0.0~1.0]
71 voice_filter(LPF)
\82Ì
\8fê
\8d\87
72 playmidi.c init_voice_filter(int i)
74 input freq 20 < freq < 20000 , input 0 < reso < 96
76 0<n<
\81\87, 0<n<1, 1>n>0 (
\8ae
\83t
\83B
\83\8b\83^
\91¤
\83\8c\83]
\83i
\83\93\83X
\95\94\95ª
\92l
\88æ
\82ª
\82±
\82ñ
\82È
\8a´
\82¶
\82Å
\83o
\83\89\83o
\83\89
77 n=f(rez) (
\8ae
\83t
\83B
\83\8b\83^
\82Ìf()
\82ª
\89½
\82È
\82Ì
\82©
78 q, 1/q, 1-1/q
\82±
\82¤
\82·
\82é
\82Æ
\8b¤
\92Ê
\82µ
\82½
\92l
\88æ 1<q<
\81\87 \82É
\82È
\82é
79 0<rez
\82È
\82Ì
\82Å q=X^rez
\82Å q
\82Ì
\92l
\88æ
\82É
\95Ï
\8a·
80 \82±
\82ê
\82É
\90§
\8cÀ
\82â
\82ç
\8cW
\90\94\82ª
\82Â
\82
82 \8f\88\97\9d\8f\87\8f\98
83 1: type
\82ð
\8ew
\92è set_type() (
\8f\89\89ñ
\82Ü
\82½
\82Ítype
\95Ï
\89»
\82Ì
\8fê
\8d\87 FilterCoefficients 0
\83N
\83\8a\83A
84 2:
\93Á
\8eê
\82È
\83T
\83\93\83v
\83\8b\83\8c\81[
\83g
\82Ì
\8fê
\8d\87\82Í set_ext_rate()
85 3: orig_freq,orig_reso
\82ð
\8eg
\97p
\82·
\82é
\8fê
\8d\87\82Í set_orig_freq() set_orig_reso()
86 4: freq,reso
\82ð
\8ew
\92è set_freq() set_reso() (EQ
\82Ì
\8fê
\8d\87 set_q()
87 5: sample_filter
\82Ì
\8fê
\8d\87\82Í
\8cW
\90\94\8cv
\8eZ recalc_filter()
88 6:
\83t
\83B
\83\8b\83^
\8f\88\97\9d filter() (buffer_filter
\82Í
\8f\88\97\9d\91O
\82É
\8cW
\90\94\8cv
\8eZ
\82³
\82ê
\82é
90 1:~5:
\82ð
\82Ü
\82Æ
\82ß
\82½ init_sample_filter()
\82Å
\82à
\82¢
\82¢ (EQ
\82Ì
\8fê
\8d\87 init_sample_filter2()
96 num filter_define type cutoff_limit (oversampling) desc
97 00 FILTER_NONE, OFF filter OFF
98 01 FILTER_LPF12, LPF sr / 6 0.16666 (~x3) Chamberlin 12dB/oct
99 02 FILTER_LPF24, LPF sr / 2 Moog VCF 24dB/oct
100 03 FILTER_LPF_BW, LPF sr / 2 butterworth elion add
101 04 FILTER_LPF12_2, LPF sr / 2 Resonant IIR 12dB/oct
102 05 FILTER_LPF24_2, LPF sr / 2 amSynth 24dB/oct
103 06 FILTER_LPF6, LPF sr / 2 One pole 6dB/oct nonrez
104 07 FILTER_LPF18, LPF sr / 2.25 0.44444 (~x2) 3pole 18dB/oct
105 08 FILTER_LPF_TFO, LPF sr / 2 two first order
107 09 FILTER_HPF_BW, HPF sr / 2 butterworth elion+
108 10 FILTER_BPF_BW, BPF sr / 2 butterworth elion+
109 11 FILTER_PEAK1, peak sr / 2
110 12 FILTER_NOTCH1, notch sr / 2
111 13 FILTER_LPF12_3, LPF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
112 14 FILTER_HPF12_3, HPF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
113 15 FILTER_BPF12_3, BPF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
114 16 FILTER_BCF12_3, BCF sr / 4.6 0.21875 (~x3) Chamberlin2 12dB/oct
115 17 FILTER_HPF6, HPF sr / 2 One pole 6dB/oct nonrez
116 18 FILTER_HPF12_2, HPF sr / 2 Resonant IIR 12dB/oct
118 19 FILTER_HBF_L6L12, HBF sr / 2
119 20 FILTER_HBF_L12L6, HBF sr / 2
120 21 FILTER_HBF_L12H6, HBF sr / 2
121 22 FILTER_HBF_L24H6, HBF sr / 2
122 23 FILTER_HBF_L24H12, HBF sr / 2
123 24 FILTER_HBF_L12OCT, HBF sr / 2
124 25 FILTER_HBF_L24OCT, HBF sr / 2
126 26 FILTER_LPF6x2, LPF sr / 2
127 27 FILTER_LPF6x3, LPF sr / 2
128 28 FILTER_LPF6x4, LPF sr / 2
129 29 FILTER_LPF6x8, LPF sr / 2
130 30 FILTER_LPF6x16, LPF sr / 2
131 31 FILTER_LPF_BWx2, LPF sr / 2
132 32 FILTER_LPF_BWx3, LPF sr / 2
133 33 FILTER_LPF_BWx4, LPF sr / 2
134 34 FILTER_LPF24_2x2, LPF sr / 2
137 36 FILTER_SHELVING_LOW,EQ_LOW sr / 2
138 37 FILTER_SHELVING_HI, EQ_HI sr / 2
139 38 FILTER_PEAKING, EQ_MID sr / 2
140 39 FILTER_BIQUAD_LOW, LPF sr / 2
141 40 FILTER_BIQUAD_HI, HPF sr / 2
144 cutoff_limit sr/2
\96¢
\96\9e\82Ì
\82à
\82Ì
\82Íoversampling
\82Åsr/2
\82ð
\88µ
\82¦
\82é
\82æ
\82¤
\82É
\82·
\82é
149 #if 1 // recalc filter margin
151 \8cW
\90\94\8dÄ
\8cv
\8eZ
\82Í
\95\89\89×
\82ª
\91å
\82«
\82
\83{
\83C
\83X
\83t
\83B
\83\8b\83^
\82Å
\8eg
\97p
\89ñ
\90\94\82à
\91½
\82¢
\82Ì
\82Å
\82 \82é
\92ö
\93x
\8dí
\82Á
\82Ä
\8f\88\97\9d\89ñ
\90\94\82ð
\8c¸
\82ç
\82·
152 \83{
\83C
\83X
\83t
\83B
\83\8b\83^
\82Í
\95Ï
\93®
\82µ
\82Ä
\82é
\82à
\82Ì
\82¾
\82©
\82ç100cent
\83Y
\83\8c\82Ä
\82à
\88á
\82¢
\82Í
\82í
\82©
\82ç
\82È
\82¢ EQ
\82Í
\95Ï
\93®
\82µ
\82È
\82¢
\82µ
155 #define INIT_MARGIN_VAL { \
156 fc->range[0] = fc->range[1] = fc->range[2] = fc->range[3] = fc->range[4] = fc->range[5] = fc->range[6] = fc->range[7] = 0; }
157 #define FLT_FREQ_MARGIN (fc->freq < fc->range[0] || fc->freq > fc->range[1])
158 #define FLT_RESO_MARGIN (fc->reso_dB < fc->range[2] || fc->reso_dB > fc->range[3])
159 #define FLT_WIDTH_MARGIN (fc->q < fc->range[4] || fc->q > fc->range[5])
161 #if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
162 #define CALC_MARGIN_VAL __m128d vec_range = _mm_set_pd(1.0 + ext_filter_margin, 1.0 - ext_filter_margin);
163 #define CALC_FREQ_MARGIN { _mm_storeu_pd(&fc->range[0], _mm_mul_pd(MM_LOAD1_PD(&fc->freq), vec_range));}
164 #define CALC_RESO_MARGIN { _mm_storeu_pd(&fc->range[2], _mm_mul_pd(MM_LOAD1_PD(&fc->reso_dB), vec_range));}
165 #define CALC_WIDTH_MARGIN { _mm_storeu_pd(&fc->range[4], _mm_mul_pd(MM_LOAD1_PD(&fc->q), vec_range));}
167 #define CALC_MARGIN_VAL
168 #define CALC_FREQ_MARGIN {fc->range[0] = fc->freq * (1.0 - ext_filter_margin); fc->range[1] = fc->freq * (1.0 + ext_filter_margin);}
169 #define CALC_RESO_MARGIN {fc->range[2] = fc->reso_dB * (1.0 - ext_filter_margin); fc->range[3] = fc->reso_dB * (1.0 + ext_filter_margin);}
170 #define CALC_WIDTH_MARGIN {fc->range[4] = fc->q * (1.0 - ext_filter_margin); fc->range[5] = fc->q * (1.0 + ext_filter_margin);}
173 #else // ! recalc filter margin
175 #define INIT_MARGIN_VAL
176 #define CALC_MARGIN_VAL
177 #define FLT_FREQ_MARGIN (fc->freq != fc->last_freq)
178 #define FLT_RESO_MARGIN (fc->reso_dB != fc->last_reso_dB)
179 #define FLT_WIDTH_MARGIN (fc->q != fc->last_q)
180 #define CALC_FREQ_MARGIN {fc->last_freq = fc->freq;}
181 #define CALC_RESO_MARGIN {fc->last_reso_dB = fc->reso_dB;}
182 #define CALC_WIDTH_MARGIN {fc->last_q = fc->q;}
185 #if 1 // resonance use table
186 #define RESO_DB_CF_P(db) filter_cb_p_table[(int)(db * 10.0)]
187 #define RESO_DB_CF_M(db) filter_cb_m_table[(int)(db * 10.0)]
188 #elif 1 // resonance calc function lite
189 #define RESO_DB_CF_P(db) (FLOAT_T)(exp((float)(M_LN10 * DIV_40 * db)))
190 #define RESO_DB_CF_M(db) (FLOAT_T)(exp((float)(M_LN10 * -DIV_40 * db)))
191 #else // resonance calc function
192 #define RESO_DB_CF_P(db) pow(10.0, DIV_40 * db)
193 #define RESO_DB_CF_M(db) pow(10.0, -DIV_40 * db)
199 #if (OPT_MODE == 1) && !defined(DATA_T_DOUBLE) && !defined(DATA_T_FLOAT) /* fixed-point implementation */
201 static inline void sample_filter_none(FILTER_T *dc, FILTER_T *db, DATA_T *sp) {}
203 static inline void recalc_filter_none(FilterCoefficients *fc) {}
205 static inline void sample_filter_LPF12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
207 db[0] = db[0] + imuldiv28(db[2], dc[0]);
208 db[1] = (*sp << 4) - db[0] - imuldiv28(db[2], dc[1]);
209 db[2] = imuldiv28(db[1], dc[0]) + db[2];
210 *sp = db[0] >> 4; /* 4.28 to 8.24 */
213 static inline void sample_filter_LPF12_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
215 FILTER_T input = *sp << 4;
217 db[0] = db[0] + imuldiv28(db[2], dc[0]);
218 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
219 db[2] = imuldiv28(db[1], dc[0]) + db[2];
220 *sp = db[0] >> 4; /* 4.28 to 8.24 */
222 db[0] = db[0] + imuldiv28(db[2], dc[0]);
223 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
224 db[2] = imuldiv28(db[1], dc[0]) + db[2];
227 static inline void sample_filter_LPF12_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
229 FILTER_T input = *sp << 4;
231 db[0] = db[0] + imuldiv28(db[2], dc[0]);
232 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
233 db[2] = imuldiv28(db[1], dc[0]) + db[2];
234 *sp = db[0] >> 4; /* 4.28 to 8.24 */
236 db[0] = db[0] + imuldiv28(db[2], dc[0]);
237 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
238 db[2] = imuldiv28(db[1], dc[0]) + db[2];
240 db[0] = db[0] + imuldiv28(db[2], dc[0]);
241 db[1] = input - db[0] - imuldiv28(db[2], dc[1]);
242 db[2] = imuldiv28(db[1], dc[0]) + db[2];
245 static inline void recalc_filter_LPF12(FilterCoefficients *fc)
249 /* copy with applying Chamberlin's lowpass filter. */
250 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
251 fc->last_freq = fc->freq;
252 if(fc->freq < fc->flt_rate_limit1){ // <sr*DIV_6
253 fc->sample_filter = sample_filter_LPF12;
254 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
255 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*DIV_6
256 fc->sample_filter = sample_filter_LPF12_ov2;
257 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
258 }else{ // <sr*3*DIV_6
259 fc->sample_filter = sample_filter_LPF12_ov3;
260 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
262 fc->last_reso_dB = fc->reso_dB;
263 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
267 static inline void sample_filter_LPF24(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
271 da[0] = (*sp << 4) - imuldiv28(db[4], dc[2]); /* feedback */
275 db[1] = imuldiv28((db[0] + da[0]), dc[0]) - imuldiv28(db[1], dc[1]);
276 db[2] = imuldiv28((db[1] + da[1]), dc[0]) - imuldiv28(db[2], dc[1]);
277 db[3] = imuldiv28((db[2] + da[2]), dc[0]) - imuldiv28(db[3], dc[1]);
278 db[4] = imuldiv28((db[3] + da[3]), dc[0]) - imuldiv28(db[4], dc[1]);
280 *sp = db[4] >> 4; /* 4.28 to 8.24 */
283 static inline void recalc_filter_LPF24(FilterCoefficients *fc)
285 FLOAT_T f, q, p, tmp;
288 /* copy with applying Moog lowpass VCF. */
289 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
290 fc->last_freq = fc->freq;
291 f = 2.0 * fc->freq * fc->div_flt_rate;
293 fc->last_reso_dB = fc->reso_dB;
294 q = 0.80 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0.0f <= c < 0.80f
295 dc[0] = TIM_FSCALE(tmp = f + 0.8 * f * p, 28);
296 dc[1] = TIM_FSCALE(tmp + tmp - 1.0, 28);
297 dc[2] = TIM_FSCALE(q * (1.0 + 0.5 * p * (1.0 - p + 5.6 * p * p)), 28);
301 static inline void sample_filter_LPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
306 db[2] = imuldiv28(db[0], dc[0])
307 + imuldiv28(db[1], dc[1])
308 + imuldiv28(db[2], dc[2])
309 - imuldiv28(db[3], dc[3])
310 - imuldiv28(db[4], dc[4]);
312 db[3] = db[2]; // flt out
314 db[1] = db[0]; // flt in
316 *sp = db[3] >> 4; /* 4.28 to 8.24 */
319 static inline void recalc_filter_LPF_BW(FilterCoefficients *fc)
321 FILTER_T *dc = fc->dc;
322 FLOAT_T q ,p, p2, qp, tmp;
325 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
326 fc->last_freq = fc->freq;
327 fc->last_reso_dB = fc->reso_dB;
328 p = 1.0 / tan(M_PI * fc->freq * fc->div_flt_rate); // ?
329 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
332 dc[0] = TIM_FSCALE(tmp = 1.0 / (1.0 + qp + p2), 28);
333 dc[1] = TIM_FSCALE(2.0 * tmp, 28);
335 dc[3] = TIM_FSCALE(2.0 * (1.0 - p2) * tmp, 28);
336 dc[4] = TIM_FSCALE((1.0 - qp + p2) * tmp, 28);
340 static inline void sample_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
342 db[1] += imuldiv28(((*sp << 4) - db[0]), dc[1]);
344 db[1] = imuldiv28(db[1], dc[0]);
345 *sp = db[0] >> 4; /* 4.28 to 8.24 */
348 static inline void recalc_filter_LPF12_2(FilterCoefficients *fc)
350 FILTER_T *dc = fc->dc;
353 // Resonant IIR lowpass (12dB/oct) Olli Niemitalo //r
354 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
355 fc->last_freq = fc->freq;
356 fc->last_reso_dB = fc->reso_dB;
357 f = M_PI2 * fc->freq * fc->div_flt_rate;
358 //q = 1.0 - f / (2.0 * ((fc->reso_dB * DIV_96 + 1.0) + 0.5 / (1.0 + f)) + f - 2.0);
359 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
360 dc[0] = TIM_FSCALE(tmp = q * q, 28);
361 dc[1] = TIM_FSCALE(tmp + 1.0 - 2.0 * cos(f) * q, 28);
365 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
368 FILTER_T db0 = db[0], db1 = db[1], dc0 = dc[0], dc1 = dc[1];
370 for (i = 0; i < count; i++) {
371 db1 += imuldiv28(((sp[i] << 4) - db0), dc1);
373 db1 = imuldiv28(db1, dc0);
374 sp[i] = db0 >> 4; /* 4.28 to 8.24 */
380 static inline void sample_filter_LPF24_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
383 db[5] = imuldiv28(db[0], dc[0]) + db[1];
384 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
385 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
387 db[5] = imuldiv28(db[0], dc[0]) + db[3];
388 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
389 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
390 *sp = db[0] >> 4; /* 4.28 to 8.24 */
393 static inline void recalc_filter_LPF24_2(FilterCoefficients *fc)
395 FLOAT_T f, q, p, r, dc0;
398 // amSynth 24dB/ocatave resonant low-pass filter. Nick Dowell //r
399 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
400 fc->last_freq = fc->freq;
401 fc->last_reso_dB = fc->reso_dB;
402 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
403 //q = 2 * (1 - fc->reso_dB * DIV_96); // maxQ = 0.9995
404 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
406 p = 1.0 / (1.0 + (q * f) + r);
408 dc[0] = TIM_FSCALE(dc0, 28);
409 dc[1] = TIM_FSCALE(dc0 * 2, 28);
411 dc[3] = TIM_FSCALE(-2.0 * (r - 1) * p, 28);
412 dc[4] = TIM_FSCALE((-1.0 + (q * f) - r) * p, 28);
416 static inline void sample_filter_LPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
418 db[1] = imuldiv28((*sp << 4), dc[0]) + imuldiv28(db[1], dc[1]);
419 *sp = db[1] >> 4; /* 4.28 to 8.24 */
422 static inline void recalc_filter_LPF6(FilterCoefficients *fc)
427 // One pole filter, LP 6dB/Oct scoofy no resonance //r
428 if (!FP_EQ(fc->freq, fc->last_freq)) {
429 fc->last_freq = fc->freq;
430 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
431 dc[0] = TIM_FSCALE(1.0 - f, 28);
432 dc[1] = TIM_FSCALE(f, 28);
436 static inline void sample_filter_LPF18(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
443 db[0] = (*sp << 4) - imuldiv28(db[3], dc[2]);
444 db[1] = imuldiv28((db[0] + da[0]), dc[1]) - imuldiv28(db[1], dc[0]);
445 db[2] = imuldiv28((db[1] + da[1]), dc[1]) - imuldiv28(db[2], dc[0]);
446 db[3] = imuldiv28((db[2] + da[2]), dc[1]) - imuldiv28(db[3], dc[0]);
447 *sp = imuldiv28(db[3], dc[3]) >> 4; /* 4.28 to 8.24 */
450 static inline void sample_filter_LPF18_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
452 FILTER_T da[3], input = *sp << 4;
457 db[0] = input - imuldiv28(db[3], dc[2]);
458 db[1] = imuldiv28((db[0] + da[0]), dc[1]) - imuldiv28(db[1], dc[0]);
459 db[2] = imuldiv28((db[1] + da[1]), dc[1]) - imuldiv28(db[2], dc[0]);
460 db[3] = imuldiv28((db[2] + da[2]), dc[1]) - imuldiv28(db[3], dc[0]);
465 db[0] = input - imuldiv28(db[3], dc[2]);
466 db[1] = imuldiv28((db[0] + da[0]), dc[1]) - imuldiv28(db[1], dc[0]);
467 db[2] = imuldiv28((db[1] + da[1]), dc[1]) - imuldiv28(db[2], dc[0]);
468 db[3] = imuldiv28((db[2] + da[2]), dc[1]) - imuldiv28(db[3], dc[0]);
469 *sp = imuldiv28(db[3], dc[3]) >> 4; /* 4.28 to 8.24 */
472 static inline void recalc_filter_LPF18(FilterCoefficients *fc)
474 FLOAT_T f, q, p, tmp;
477 // LPF18 low-pass filter //r
478 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
479 fc->last_freq = fc->freq;
480 if(fc->freq < fc->flt_rate_limit1){ // <sr/2.25
481 fc->sample_filter = sample_filter_LPF18;
482 f = 2.0 * fc->freq * fc->div_flt_rate; // *1.0
484 fc->sample_filter = sample_filter_LPF18_ov2;
485 f = 2.0 * fc->freq * fc->div_flt_rate_ov2; // sr*2
487 dc[0] = TIM_FSCALE(tmp = ((-2.7528 * f + 3.0429) * f + 1.718) * f - 0.9984, 28);
488 fc->last_reso_dB = fc->reso_dB;
489 //q = fc->reso_dB * DIV_96;
490 q = 0.789 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0<q<0.78125
492 dc[1] = TIM_FSCALE(0.5 * p, 28);
493 dc[2] = TIM_FSCALE(tmp = q * (((-2.7079 * p + 10.963) * p - 14.934) * p + 8.4974), 28);
494 dc[3] = TIM_FSCALE(1.0 + (0.25 * (1.5 + 2.0 * tmp * (1.0 - f))), 28);
498 static inline void sample_filter_LPF_TFO(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
500 db[0] = db[0] + imuldiv28(((*sp << 4) - db[0] + imuldiv28((db[0] - db[1]), dc[1])), dc[0]);
501 db[1] = db[1] + imuldiv28((db[0] - db[1]), dc[0]);
502 *sp = db[1] >> 4; /* 4.28 to 8.24 */
505 static inline void recalc_filter_LPF_TFO(FilterCoefficients *fc)
510 // two first order low-pass filter //r
511 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
512 fc->last_freq = fc->freq;
513 fc->last_reso_dB = fc->reso_dB;
514 dc[0] = TIM_FSCALE(tmp = 2 * fc->freq * fc->div_flt_rate, 28);
515 q = 1.0 - RESO_DB_CF_M(fc->reso_dB);
516 dc[1] = TIM_FSCALE(q + q / (1.01 - tmp), 28);
520 static inline void sample_filter_HPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
525 db[2] = imuldiv28(db[0], dc[0])
526 + imuldiv28(db[1], dc[1])
527 + imuldiv28(db[2], dc[2])
528 - imuldiv28(db[3], dc[3])
529 - imuldiv28(db[4], dc[4]);
531 db[3] = db[2]; // flt out
533 db[1] = db[0]; // flt in
535 *sp = db[3] >> 4; /* 4.28 to 8.24 */
538 static inline void recalc_filter_HPF_BW(FilterCoefficients *fc)
541 FLOAT_T q, p, p2, qp, tmp;
543 // elion butterworth HPF //r
544 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
545 fc->last_freq = fc->freq;
546 fc->last_reso_dB = fc->reso_dB;
547 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
548 p = tan(M_PI * fc->freq * fc->div_flt_rate); // hpf ?
551 dc[0] = TIM_FSCALE(tmp = 1.0 / (1.0 + qp + p2), 28);
552 dc[1] = TIM_FSCALE(-2 * tmp, 28); // hpf
554 dc[3] = TIM_FSCALE(2.0 * (p2 - 1.0) * tmp, 28); // hpf
555 dc[4] = TIM_FSCALE((1.0 - qp + p2) * tmp, 28);
559 static inline void sample_filter_BPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
564 db[2] = imuldiv28(db[0], dc[0])
565 + imuldiv28(db[1], dc[1])
566 + imuldiv28(db[2], dc[2])
567 - imuldiv28(db[3], dc[3])
568 - imuldiv28(db[4], dc[4]);
570 db[3] = db[2]; // flt out
572 db[1] = db[0]; // flt in
574 db[7] = imuldiv28(db[3], dc[5])
575 + imuldiv28(db[6], dc[6])
576 + imuldiv28(db[7], dc[7])
577 - imuldiv28(db[8], dc[8])
578 - imuldiv28(db[9], dc[9]);
580 db[8] = db[7]; // flt out
582 db[6] = db[3]; // flt in
584 *sp = db[8] >> 4; /* 4.28 to 8.24 */
587 static inline void recalc_filter_BPF_BW(FilterCoefficients *fc)
589 FLOAT_T f, q, r, pl, ph, sl, sh, tmp;
593 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
594 fc->last_freq = fc->freq;
595 fc->last_reso_dB = fc->reso_dB;
596 f = fc->freq * fc->div_flt_rate;
597 r = 1.0 - RESO_DB_CF_M(fc->reso_dB);
598 q = SQRT_2 - r * SQRT_2; // q>0.1
600 pl = 1.0 / tan(M_PI * f); // ?
602 dc[0] = TIM_FSCALE(tmp = 1.0 / (1.0 + q * pl + sl), 28);
603 dc[1] = TIM_FSCALE(2.0 * tmp, 28);
605 dc[3] = TIM_FSCALE(2.0 * (1.0 - sl) * tmp, 28);
606 dc[4] = TIM_FSCALE((1.0 - q * pl + sl) * tmp, 28);
608 f = f * 0.80; // bandwidth = LPF-HPF
609 ph = tan(M_PI * f); // hpf ?
611 dc[5] = TIM_FSCALE(tmp = 1.0 / (1.0 + q * ph + sh), 28);
612 dc[6] = TIM_FSCALE(-2 * tmp, 28); // hpf
614 dc[8] = TIM_FSCALE(2.0 * (sh - 1.0) * tmp, 28); // hpf
615 dc[9] = TIM_FSCALE((1.0 - q * ph + sh) * tmp, 28);
619 static inline void recalc_filter_peak1(FilterCoefficients *fc)
621 FLOAT_T f, q, r, pl ,ph, sl, sh;
625 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
626 fc->last_freq = fc->freq;
627 fc->last_reso_dB = fc->reso_dB;
628 f = cos(M_PI2 * fc->freq * fc->div_flt_rate);
629 r = 1.0 - RESO_DB_CF_M(fc->reso_dB); // r < 0.99609375
631 dc[0] = TIM_FSCALE((1 - r) * sqrt(r * (r - 4 * (f * f) + 2.0) + 1.0), 28);
632 dc[1] = TIM_FSCALE(2 * f * r, 28);
633 dc[2] = TIM_FSCALE(-(r * r), 28);
637 static inline void sample_filter_peak1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
642 r += imuldiv28(dc[0], db[0]);
643 r += imuldiv28(dc[1], db[1]);
644 r += imuldiv28(dc[2], db[2]);
647 *sp = r >> 4; /* 4.28 to 8.24 */
650 static inline void recalc_filter_notch1(FilterCoefficients *fc)
652 FLOAT_T f, q, r, pl ,ph, sl, sh;
656 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
657 fc->last_freq = fc->freq;
658 fc->last_reso_dB = fc->reso_dB;
659 f = cos(M_PI2 * fc->freq * fc->div_flt_rate);
660 r = (1.0 - RESO_DB_CF_M(fc->reso_dB)) * 0.99609375; // r < 0.99609375
661 dc[0] = TIM_FSCALE((1 - r) * sqrt(r * (r - 4 * (f * f) + 2.0) + 1.0), 28);
662 dc[1] = TIM_FSCALE(2 * f * r, 28);
663 dc[2] = TIM_FSCALE(-(r * r), 28);
667 static inline void sample_filter_notch1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
672 r += imuldiv28(dc[0], db[0]);
673 r += imuldiv28(dc[1], db[1]);
674 r += imuldiv28(dc[2], db[2]);
677 *sp = (db[0] - r) >> 4; // notch
680 static inline void sample_filter_LPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
682 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
683 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
684 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
685 *sp = db[0] >> 4; // (db[1] + db[0]) >> 4; // notch
688 static inline void sample_filter_LPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
690 FILTER_T input = *sp << 4;
692 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
693 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
694 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
695 *sp = db[0] >> 4; // (db[1] + db[0]) >> 4; // notch
697 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
698 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
699 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
702 static inline void sample_filter_LPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
704 FILTER_T input = *sp << 4;
706 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
707 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
708 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
709 *sp = db[0] >> 4; // (db[1] + db[0]) >> 4; // notch
711 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
712 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
713 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
715 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
716 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
717 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
720 static inline void recalc_filter_LPF12_3(FilterCoefficients *fc)
724 /* Chamberlin2's lowpass filter. */
725 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
726 fc->last_freq = fc->freq;
727 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
728 fc->sample_filter = sample_filter_LPF12_3;
729 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
730 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
731 fc->sample_filter = sample_filter_LPF12_3_ov2;
732 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
733 }else{ // <sr*3*0.21875
734 fc->sample_filter = sample_filter_LPF12_3_ov3;
735 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
737 fc->last_reso_dB = fc->reso_dB;
738 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
742 static inline void sample_filter_HPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
744 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
745 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
746 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
747 *sp = db[1] >> 4; // (db[1] + db[0]) >> 4; // notch
750 static inline void sample_filter_HPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
752 FILTER_T input = *sp << 4;
754 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
755 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
756 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
757 *sp = db[1] >> 4; // (db[1] + db[0]) >> 4; // notch
759 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
760 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
761 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
764 static inline void sample_filter_HPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
766 FILTER_T input = *sp << 4;
768 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
769 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
770 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
771 *sp = db[1] >> 4; // (db[1] + db[0]) >> 4; // notch
773 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
774 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
775 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
777 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
778 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
779 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
782 static inline void recalc_filter_HPF12_3(FilterCoefficients *fc)
786 /* Chamberlin2's lowpass filter. */
787 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
788 fc->last_freq = fc->freq;
789 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
790 fc->sample_filter = sample_filter_HPF12_3;
791 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
792 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
793 fc->sample_filter = sample_filter_HPF12_3_ov2;
794 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
795 }else{ // <sr*3*0.21875
796 fc->sample_filter = sample_filter_HPF12_3_ov3;
797 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
799 fc->last_reso_dB = fc->reso_dB;
800 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
804 static inline void sample_filter_BPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
806 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
807 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
808 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
809 *sp = db[2] >> 4; // (db[1] + db[0]) >> 4; // notch
812 static inline void sample_filter_BPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
814 FILTER_T input = *sp << 4;
816 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
817 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
818 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
819 *sp = db[2] >> 4; // (db[1] + db[0]) >> 4; // notch
821 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
822 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
823 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
826 static inline void sample_filter_BPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
828 FILTER_T input = *sp << 4;
830 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
831 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
832 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
833 *sp = db[2] >> 4; // (db[1] + db[0]) >> 4; // notch
835 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
836 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
837 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
839 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
840 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
841 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
844 static inline void recalc_filter_BPF12_3(FilterCoefficients *fc)
848 /* Chamberlin2's lowpass filter. */
849 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
850 fc->last_freq = fc->freq;
851 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
852 fc->sample_filter = sample_filter_BPF12_3;
853 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
854 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
855 fc->sample_filter = sample_filter_BPF12_3_ov2;
856 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
857 }else{ // <sr*3*0.21875
858 fc->sample_filter = sample_filter_BPF12_3_ov3;
859 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
861 fc->last_reso_dB = fc->reso_dB;
862 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
866 static inline void sample_filter_BCF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
868 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
869 db[1] = imuldiv28(dc[1], *sp << 4) - db[0] - imuldiv28(dc[1], db[2]); // high
870 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
871 *sp = (db[1] + db[0]) >> 4; // notch
874 static inline void sample_filter_BCF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
876 FILTER_T input = *sp << 4;
878 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
879 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
880 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
881 *sp = (db[1] + db[0]) >> 4; // notch
883 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
884 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
885 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
888 static inline void sample_filter_BCF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
890 FILTER_T input = *sp << 4;
892 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
893 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
894 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
895 *sp = (db[1] + db[0]) >> 4; // notch
897 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
898 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
899 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
901 db[0] = db[0] + imuldiv28(dc[0], db[2]); // low
902 db[1] = imuldiv28(dc[1], input) - db[0] - imuldiv28(dc[1], db[2]); // high
903 db[2] = imuldiv28(dc[0], db[1]) + db[2]; // band
906 static inline void recalc_filter_BCF12_3(FilterCoefficients *fc)
910 /* Chamberlin2's lowpass filter. */
911 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
912 fc->last_freq = fc->freq;
913 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
914 fc->sample_filter = sample_filter_BCF12_3;
915 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate), 28); // *1.0
916 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
917 fc->sample_filter = sample_filter_BCF12_3_ov2;
918 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2), 28); // sr*2
919 }else{ // <sr*3*0.21875
920 fc->sample_filter = sample_filter_BCF12_3_ov3;
921 dc[0] = TIM_FSCALE(2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3), 28); // sr*3
923 fc->last_reso_dB = fc->reso_dB;
924 dc[1] = TIM_FSCALE(RESO_DB_CF_M(fc->reso_dB), 28);
928 static inline void sample_filter_HPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
930 *sp -= (db[1] = imuldiv28(dc[0], *sp << 4) + imuldiv28(dc[1], db[1])) >> 4;
933 static inline void sample_filter_HPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
935 db[1] += imuldiv28(((*sp << 4) - db[0]), dc[1]);
937 db[1] = imuldiv28(db[1], dc[0]);
938 *sp -= db[0] >> 4; /* 4.28 to 8.24 */
944 static inline void sample_filter_HBF_L6L12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
946 FILTER_T input = *sp << 4, out1, out2;
947 const FILTER_T var1 = TIM_FSCALE(0.75, 28);
948 const FILTER_T var2 = TIM_FSCALE(0.25, 28);
949 const FILTER_T var3 = TIM_FSCALE(1.0, 28);
951 db[1] += imuldiv28((input - db[0]), dc[1]);
953 db[1] = imuldiv28(db[1], dc[0]);
956 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
959 dc[16] = imuldiv28(dc[16], var1) + imuldiv28(dc[15], var2);
960 *sp = imuldiv28(out1, dc[16]) + imuldiv28(out2, var3 - dc[16]);
963 static inline void recalc_filter_HBF_L6L12(FilterCoefficients *fc)
965 FILTER_T *dc = fc->dc;
966 FLOAT_T f, r, p, q, t;
968 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
969 fc->last_freq = fc->freq;
970 fc->last_reso_dB = fc->reso_dB;
972 f = M_PI2 * fc->freq * fc->div_flt_rate;
973 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
975 dc[0] = TIM_FSCALE(p, 28);
976 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
978 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
979 dc[10] = TIM_FSCALE(1.0 - f, 28);
980 dc[11] = TIM_FSCALE(f, 28);
982 dc[15] = TIM_FSCALE(1.0 - RESO_DB_CF_M(fc->reso_dB), 28);
986 static inline void sample_filter_HBF_L12L6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
988 FILTER_T input = *sp << 4, out1, out2;
989 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
991 db[1] += imuldiv28((input - db[0]), dc[1]);
993 db[1] = imuldiv28(db[1], dc[0]);
996 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
999 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1002 static inline void recalc_filter_HBF_L12L6(FilterCoefficients *fc)
1004 FILTER_T *dc = fc->dc;
1007 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1008 fc->last_freq = fc->freq;
1009 fc->last_reso_dB = fc->reso_dB;
1011 f = M_PI2 * fc->freq * fc->div_flt_rate;
1012 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1014 dc[0] = TIM_FSCALE(p, 28);
1015 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1017 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
1018 dc[10] = TIM_FSCALE(1.0 - f, 28);
1019 dc[11] = TIM_FSCALE(f, 28);
1023 static inline void sample_filter_HBF_L12H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1025 FILTER_T input = *sp << 4, out1, out2;
1026 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
1028 db[1] += imuldiv28((input - db[0]), dc[1]);
1030 db[1] = imuldiv28(db[1], dc[0]);
1033 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
1034 out2 = input - db[11];
1036 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1039 static inline void recalc_filter_HBF_L12H6(FilterCoefficients *fc)
1041 FILTER_T *dc = fc->dc;
1044 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1045 fc->last_freq = fc->freq;
1046 fc->last_reso_dB = fc->reso_dB;
1048 f = M_PI2 * fc->freq * fc->div_flt_rate;
1049 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1051 dc[0] = TIM_FSCALE(p, 28);
1052 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1054 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
1055 dc[10] = TIM_FSCALE(1.0 - f, 28);
1056 dc[11] = TIM_FSCALE(f, 28);
1060 static inline void sample_filter_HBF_L24H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1062 FILTER_T input = *sp << 4, out1, out2;
1063 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
1066 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1067 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1068 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1070 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1071 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1072 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1075 db[11] = imuldiv28(input, dc[10]) + imuldiv28(db[11], dc[11]);
1076 out2 = input - db[11];
1078 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1081 static inline void recalc_filter_HBF_L24H6(FilterCoefficients *fc)
1083 FILTER_T *dc = fc->dc;
1084 FLOAT_T f, r, q ,p, s;
1086 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1087 fc->last_freq = fc->freq;
1088 fc->last_reso_dB = fc->reso_dB;
1090 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1091 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1093 p = 1 + (q * f) + r;
1095 dc[0] = TIM_FSCALE(s, 28);
1096 dc[1] = TIM_FSCALE(s * 2, 28);
1097 dc[2] = TIM_FSCALE(r / p, 28);
1098 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1099 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1101 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
1102 dc[10] = TIM_FSCALE(1.0 - f, 28);
1103 dc[11] = TIM_FSCALE(f, 28);
1107 static inline void sample_filter_HBF_L24H12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1109 FILTER_T input = *sp << 4, out1, out2;
1110 const FILTER_T var = TIM_FSCALE(DIV_2, 28);
1113 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1114 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1115 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1117 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1118 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1119 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1122 db[11] += imuldiv28((input - db[10]), dc[11]);
1124 db[11] = imuldiv28(db[11], dc[10]);
1125 out2 = input - db[10];
1127 *sp = (out1 + imuldiv28(out2, var)) >> 4; /* 4.28 to 8.24 */
1130 static inline void recalc_filter_HBF_L24H12(FilterCoefficients *fc)
1132 FILTER_T *dc = fc->dc;
1133 FLOAT_T f, r, q ,p, s;
1135 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1136 fc->last_freq = fc->freq;
1137 fc->last_reso_dB = fc->reso_dB;
1139 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1140 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1142 p = 1 + (q * f) + r;
1144 dc[0] = TIM_FSCALE(s, 28);
1145 dc[1] = TIM_FSCALE(s * 2, 28);
1146 dc[2] = TIM_FSCALE(r / p, 28);
1147 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1148 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1150 f = M_PI2 * fc->freq * fc->div_flt_rate;
1151 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1153 dc[10] = TIM_FSCALE(p, 28);
1154 dc[11] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1158 static inline void sample_filter_HBF_L12OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1160 FILTER_T input = *sp << 4, out1, out2;
1161 const FILTER_T var1 = TIM_FSCALE(DIV_3_2, 28);
1162 const FILTER_T var2 = TIM_FSCALE(DIV_3, 28);
1164 db[1] += imuldiv28((input - db[0]), dc[1]);
1166 db[1] = imuldiv28(db[1], dc[0]);
1169 db[11] += imuldiv28((input - db[10]), dc[11]);
1171 db[11] = imuldiv28(db[11], dc[10]);
1174 *sp = (imuldiv28(out1, var1) + imuldiv28(out2, var2)) >> 4; /* 4.28 to 8.24 */
1177 static inline void recalc_filter_HBF_L12OCT(FilterCoefficients *fc)
1179 FILTER_T *dc = fc->dc;
1182 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1183 fc->last_freq = fc->freq;
1184 fc->last_reso_dB = fc->reso_dB;
1186 f = M_PI2 * fc->freq * fc->div_flt_rate;
1187 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1189 dc[0] = TIM_FSCALE(p, 28);
1190 dc[1] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1192 f = 2.0 * fc->freq * fc->div_flt_rate;
1196 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1198 dc[10] = TIM_FSCALE(p, 28);
1199 dc[11] = TIM_FSCALE(p + 1.0 - 2.0 * cos(f) * q, 28);
1203 static inline void sample_filter_HBF_L24OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1205 FILTER_T input = *sp << 4, out1, out2;
1206 const FILTER_T var1 = TIM_FSCALE(DIV_3_2, 28);
1207 const FILTER_T var2 = TIM_FSCALE(DIV_3, 28);
1210 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1211 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1212 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1214 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1215 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1216 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1220 db[15] = imuldiv28(db[10], dc[10]) + db[11];
1221 db[11] = imuldiv28(db[10], dc[11]) + imuldiv28(db[15], dc[13]) + db[12];
1222 db[12] = imuldiv28(db[10], dc[12]) + imuldiv28(db[15], dc[14]);
1224 db[15] = imuldiv28(db[10], dc[10]) + db[3];
1225 db[13] = imuldiv28(db[10], dc[11]) + imuldiv28(db[15], dc[13]) + db[14];
1226 db[14] = imuldiv28(db[10], dc[12]) + imuldiv28(db[15], dc[14]);
1229 *sp = (imuldiv28(out1, var1) + imuldiv28(out2, var2)) >> 4; /* 4.28 to 8.24 */
1232 static inline void recalc_filter_HBF_L24OCT(FilterCoefficients *fc)
1234 FILTER_T *dc = fc->dc;
1235 FLOAT_T f, r, q ,p, s;
1237 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1238 fc->last_freq = fc->freq;
1239 fc->last_reso_dB = fc->reso_dB;
1241 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1242 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1244 p = 1 + (q * f) + r;
1246 dc[0] = TIM_FSCALE(s, 28);
1247 dc[1] = TIM_FSCALE(s * 2, 28);
1248 dc[2] = TIM_FSCALE(r / p, 28);
1249 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1250 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1252 f = 2.0 * fc->freq * fc->div_flt_rate;
1255 f = tan(M_PI * f); // cutoff freq rate/2
1256 q = 2.0 * RESO_DB_CF_M( fc->reso_dB);
1258 p = 1 + (q * f) + r;
1260 dc[10] = TIM_FSCALE(s, 28);
1261 dc[11] = TIM_FSCALE(s * 2, 28);
1262 dc[12] = TIM_FSCALE(r / p, 28);
1263 dc[13] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1264 dc[14] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1270 static inline void sample_filter_LPF_BWx2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1275 db[ 2] = imuldiv28(db[ 0], dc[0])
1276 + imuldiv28(db[ 1], dc[1])
1277 + imuldiv28(db[ 2], dc[2])
1278 - imuldiv28(db[ 3], dc[3])
1279 - imuldiv28(db[ 4], dc[4]);
1281 db[ 3] = db[ 2]; // flt out
1283 db[ 1] = db[ 0]; // flt in
1285 db[ 6] = imuldiv28(db[ 3], dc[0])
1286 + imuldiv28(db[ 5], dc[1])
1287 + imuldiv28(db[ 6], dc[2])
1288 - imuldiv28(db[ 7], dc[3])
1289 - imuldiv28(db[ 8], dc[4]);
1291 db[ 7] = db[ 6]; // flt out
1293 db[ 5] = db[ 3]; // flt in
1295 *sp = db[ 7] >> 4; /* 4.28 to 8.24 */
1298 static inline void sample_filter_LPF_BWx3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1303 db[ 2] = imuldiv28(db[ 0], dc[0])
1304 + imuldiv28(db[ 1], dc[1])
1305 + imuldiv28(db[ 2], dc[2])
1306 - imuldiv28(db[ 3], dc[3])
1307 - imuldiv28(db[ 4], dc[4]);
1309 db[ 3] = db[ 2]; // flt out
1311 db[ 1] = db[ 0]; // flt in
1313 db[ 6] = imuldiv28(db[ 3], dc[0])
1314 + imuldiv28(db[ 5], dc[1])
1315 + imuldiv28(db[ 6], dc[2])
1316 - imuldiv28(db[ 7], dc[3])
1317 - imuldiv28(db[ 8], dc[4]);
1319 db[ 7] = db[ 6]; // flt out
1321 db[ 5] = db[ 3]; // flt in
1323 db[10] = imuldiv28(db[ 7], dc[0])
1324 + imuldiv28(db[ 9], dc[1])
1325 + imuldiv28(db[10], dc[2])
1326 - imuldiv28(db[11], dc[3])
1327 - imuldiv28(db[12], dc[4]);
1329 db[11] = db[10]; // flt out
1331 db[ 9] = db[ 7]; // flt in
1333 *sp = db[11] >> 4; /* 4.28 to 8.24 */
1336 static inline void sample_filter_LPF_BWx4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1341 db[ 2] = imuldiv28(db[ 0], dc[0])
1342 + imuldiv28(db[ 1], dc[1])
1343 + imuldiv28(db[ 2], dc[2])
1344 - imuldiv28(db[ 3], dc[3])
1345 - imuldiv28(db[ 4], dc[4]);
1347 db[ 3] = db[ 2]; // flt out
1349 db[ 1] = db[ 0]; // flt in
1351 db[ 6] = imuldiv28(db[ 3], dc[0])
1352 + imuldiv28(db[ 5], dc[1])
1353 + imuldiv28(db[ 6], dc[2])
1354 - imuldiv28(db[ 7], dc[3])
1355 - imuldiv28(db[ 8], dc[4]);
1357 db[ 7] = db[ 6]; // flt out
1359 db[ 5] = db[ 3]; // flt in
1361 db[10] = imuldiv28(db[ 7], dc[0])
1362 + imuldiv28(db[ 9], dc[1])
1363 + imuldiv28(db[10], dc[2])
1364 - imuldiv28(db[11], dc[3])
1365 - imuldiv28(db[12], dc[4]);
1367 db[11] = db[10]; // flt out
1369 db[ 9] = db[ 7]; // flt in
1371 db[14] = imuldiv28(db[11], dc[0])
1372 + imuldiv28(db[13], dc[1])
1373 + imuldiv28(db[14], dc[2])
1374 - imuldiv28(db[15], dc[3])
1375 - imuldiv28(db[16], dc[4]);
1377 db[15] = db[14]; // flt out
1379 db[13] = db[11]; // flt in
1381 *sp = db[15] >> 4; /* 4.28 to 8.24 */
1384 static inline void recalc_filter_LPF24_2x2(FilterCoefficients *fc)
1386 FLOAT_T f, q, p, r, tmp;
1389 if (!FP_EQ(fc->freq, fc->last_freq) || !FP_EQ(fc->reso_dB, fc->last_reso_dB)) {
1390 fc->last_freq = fc->freq;
1391 fc->last_reso_dB = fc->reso_dB;
1392 //f = 1.0 / tan(M_PI * fc->freq * fc->div_flt_rate);
1393 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
1394 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
1396 //p = 1 + ((2.0) * f) + r;
1397 p = 1 + (q * f) + r;
1398 dc[0] = TIM_FSCALE(tmp = r / p, 28);
1399 dc[1] = TIM_FSCALE(tmp * 2, 28);
1400 dc[2] = TIM_FSCALE(r / p, 28);
1401 dc[3] = TIM_FSCALE(2 * (r - 1) / (-p), 28);
1402 //dc[4] = TIM_FSCALE((1 - ((2.0) * f) + r) / (-p), 28);
1403 dc[4] = TIM_FSCALE((1 - (q * f) + r) / (-p), 28);
1407 static inline void sample_filter_LPF24_2x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1411 db[5] = imuldiv28(db[0], dc[0]) + db[1];
1412 db[1] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[2];
1413 db[2] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1414 db[10] = db[0] = db[5];
1415 db[5] = imuldiv28(db[0], dc[0]) + db[3];
1416 db[3] = imuldiv28(db[0], dc[1]) + imuldiv28(db[5], dc[3]) + db[4];
1417 db[4] = imuldiv28(db[0], dc[2]) + imuldiv28(db[5], dc[4]);
1419 db[15] = imuldiv28(db[10], dc[0]) + db[11];
1420 db[11] = imuldiv28(db[10], dc[1]) + imuldiv28(db[15], dc[3]) + db[12];
1421 db[12] = imuldiv28(db[10], dc[2]) + imuldiv28(db[15], dc[4]);
1423 db[15] = imuldiv28(db[10], dc[0]) + db[13];
1424 db[13] = imuldiv28(db[10], dc[1]) + imuldiv28(db[15], dc[3]) + db[14];
1425 db[14] = imuldiv28(db[10], dc[2]) + imuldiv28(db[15], dc[4]);
1426 *sp = db[10] >> 4; /* 4.28 to 8.24 */
1429 static inline void sample_filter_LPF6x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1432 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1433 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1434 *sp = db[2] >> 4; /* 4.28 to 8.24 */
1437 static inline void sample_filter_LPF6x3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1440 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1441 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1442 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1443 *sp = db[3] >> 4; /* 4.28 to 8.24 */
1446 static inline void sample_filter_LPF6x4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1449 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1450 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1451 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1452 db[4] = imuldiv28(db[0], dc[3]) + imuldiv28(db[1], dc[4]); // 24db
1453 *sp = db[4] >> 4; /* 4.28 to 8.24 */
1456 static inline void sample_filter_LPF6x8(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1459 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1460 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1461 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1462 db[4] = imuldiv28(db[0], dc[3]) + imuldiv28(db[1], dc[4]); // 24db
1463 db[5] = imuldiv28(db[0], dc[4]) + imuldiv28(db[1], dc[5]);
1464 db[6] = imuldiv28(db[0], dc[5]) + imuldiv28(db[1], dc[6]); // 36db
1465 db[7] = imuldiv28(db[0], dc[6]) + imuldiv28(db[1], dc[7]);
1466 db[8] = imuldiv28(db[0], dc[7]) + imuldiv28(db[1], dc[8]); // 48db
1467 *sp = db[8] >> 4; /* 4.28 to 8.24 */
1470 static inline void sample_filter_LPF6x16(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1473 db[1] = imuldiv28(db[0], dc[0]) + imuldiv28(db[1], dc[1]); // 6db
1474 db[2] = imuldiv28(db[0], dc[1]) + imuldiv28(db[1], dc[2]); // 12db
1475 db[3] = imuldiv28(db[0], dc[2]) + imuldiv28(db[1], dc[3]);
1476 db[4] = imuldiv28(db[0], dc[3]) + imuldiv28(db[1], dc[4]); // 24db
1477 db[5] = imuldiv28(db[0], dc[4]) + imuldiv28(db[1], dc[5]);
1478 db[6] = imuldiv28(db[0], dc[5]) + imuldiv28(db[1], dc[6]); // 36db
1479 db[7] = imuldiv28(db[0], dc[6]) + imuldiv28(db[1], dc[7]);
1480 db[8] = imuldiv28(db[0], dc[7]) + imuldiv28(db[1], dc[8]); // 48db
1481 db[9] = imuldiv28(db[0], dc[8]) + imuldiv28(db[1], dc[9]);
1482 db[10] = imuldiv28(db[0], dc[9]) + imuldiv28(db[1], dc[10]); // 60db
1483 db[11] = imuldiv28(db[0], dc[10]) + imuldiv28(db[1], dc[11]);
1484 db[12] = imuldiv28(db[0], dc[11]) + imuldiv28(db[1], dc[12]); // 72db
1485 db[13] = imuldiv28(db[0], dc[12]) + imuldiv28(db[1], dc[13]);
1486 db[14] = imuldiv28(db[0], dc[13]) + imuldiv28(db[1], dc[14]); // 84db
1487 db[15] = imuldiv28(db[0], dc[14]) + imuldiv28(db[1], dc[15]);
1488 db[16] = imuldiv28(db[0], dc[15]) + imuldiv28(db[1], dc[16]); // 96db
1489 *sp = db[16] >> 4; /* 4.28 to 8.24 */
1493 static inline void sample_filter_LPF_FIR(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1497 for (i = 0; i < LPF_FIR_ORDER ;i++)
1498 sum += imuldiv24(db[i], dc[i]);
1499 for (i = LPF_FIR_ORDER - 2; i >= 0; i--)
1505 static void designfir(FLOAT_T *g , FLOAT_T fc, FLOAT_T att);
1507 static inline void recalc_filter_LPF_FIR(FilterCoefficients *fc)
1509 FILTER_T *dc = fc->dc;
1510 FLOAT_T fir_coef[LPF_FIR_ORDER2];
1514 if(FLT_FREQ_MARGIN){
1517 f = fc->freq * fc->div_flt_rate * 2.0;
1518 designfir(fir_coef, f, 40.0);
1519 for (i = 0; i < LPF_FIR_ORDER2; i++)
1520 dc[LPF_FIR_ORDER-1 - i] = dc[i] = TIM_FSCALE(fir_coef[LPF_FIR_ORDER2 - 1 - i], 24);
1524 // shelving
\8b¤
\92Ê
1525 static inline void sample_filter_shelving(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1528 db[2] = imuldiv28(db[0], dc[0])
1529 + imuldiv28(db[1], dc[1])
1530 + imuldiv28(db[2], dc[2])
1531 + imuldiv28(db[3], dc[3])
1532 + imuldiv28(db[4], dc[4]);
1537 *sp = imuldiv28(db[3], dc[6]); /* 4.28 to 8.24 */ // spgain
1540 static inline void recalc_filter_shelving_low(FilterCoefficients *fc)
1542 FILTER_T *dc = fc->dc;
1543 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
1545 if(fc->freq != fc->last_freq || fc->reso_dB != fc->last_reso_dB || fc->q != fc->last_q){
1546 fc->last_freq = fc->freq;
1547 fc->last_reso_dB = fc->reso_dB;
1549 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain);
1550 dc[6] = TIM_FSCALE(pow(10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce), 28); // spgain
1551 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
1554 beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
1555 a0 = 1.0 / ((A + 1) + (A - 1) * cs + beta * sn);
1556 a1 = 2.0 * ((A - 1) + (A + 1) * cs);
1557 a2 = -((A + 1) + (A - 1) * cs - beta * sn);
1558 b0 = A * ((A + 1) - (A - 1) * cs + beta * sn);
1559 b1 = 2.0 * A * ((A - 1) - (A + 1) * cs);
1560 b2 = A * ((A + 1) - (A - 1) * cs - beta * sn);
1561 dc[4] = TIM_FSCALE(a2* a0, 28);
1562 dc[3] = TIM_FSCALE(a1* a0, 28);
1563 dc[2] = TIM_FSCALE(b2* a0, 28);
1564 dc[1] = TIM_FSCALE(b1* a0, 28);
1565 dc[0] = TIM_FSCALE(b0* a0, 28);
1569 static inline void recalc_filter_shelving_hi(FilterCoefficients *fc)
1571 FILTER_T *dc = fc->dc;
1572 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
1574 if(fc->freq != fc->last_freq || fc->reso_dB != fc->last_reso_dB || fc->q != fc->last_q){
1575 fc->last_freq = fc->freq;
1576 fc->last_reso_dB = fc->reso_dB;
1578 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain);
1579 dc[6] = TIM_FSCALE(pow(10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce), 28); // spgain
1580 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
1583 beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
1584 a0 = 1.0 / ((A + 1) - (A - 1) * cs + beta * sn);
1585 a1 = (-2.0 * ((A - 1) - (A + 1) * cs));
1586 a2 = -((A + 1) - (A - 1) * cs - beta * sn);
1587 b0 = A * ((A + 1) + (A - 1) * cs + beta * sn);
1588 b1 = -2.0 * A * ((A - 1) + (A + 1) * cs);
1589 b2 = A * ((A + 1) + (A - 1) * cs - beta * sn);
1590 dc[4] = TIM_FSCALE(a2* a0, 28);
1591 dc[3] = TIM_FSCALE(a1* a0, 28);
1592 dc[2] = TIM_FSCALE(b2* a0, 28);
1593 dc[1] = TIM_FSCALE(b1* a0, 28);
1594 dc[0] = TIM_FSCALE(b0* a0, 28);
1599 static inline void sample_filter_peaking(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1602 db[2] = imuldiv28(db[0], dc[0])
1603 + imuldiv28(db[1], dc[1])
1604 + imuldiv28(db[2], dc[2])
1605 - imuldiv28(db[3], dc[3])
1606 - imuldiv28(db[4], dc[4]);
1611 *sp = imuldiv28(db[3], dc[6]); // spgain
1614 static inline void recalc_filter_peaking(FilterCoefficients *fc)
1616 FILTER_T *dc = fc->dc;
1617 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
1619 if(fc->freq != fc->last_freq || fc->reso_dB != fc->last_reso_dB || fc->q != fc->last_q){
1620 fc->last_freq = fc->freq;
1621 fc->last_reso_dB = fc->reso_dB;
1623 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_peaking_gain);
1624 dc[6] = TIM_FSCALE(pow(10.0, -(fc->reso_dB) * DIV_80 * ext_filter_peaking_reduce), 28); // spgain
1625 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
1628 beta = sn / (2.0 * fc->q * ext_filter_peaking_q); // q > 0
1629 a0 = 1.0 / (1.0 + beta / A);
1631 a2 = 1.0 - beta / A;
1632 b0 = 1.0 + beta * A;
1633 b2 = 1.0 - beta * A;
1638 dc[4] = TIM_FSCALE(a2, 28);
1639 dc[3] = TIM_FSCALE(a1, 28);
1640 dc[2] = TIM_FSCALE(b2, 28);
1641 dc[1] = TIM_FSCALE(a1, 28); // b1 = a1
1642 dc[0] = TIM_FSCALE(b0, 28);
1647 static inline void sample_filter_biquad(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1650 DATA_T input = *sp << 4, r;
1652 r = imuldiv28(db[1], dc[1])
1653 + imuldiv28(*sp + db[2], dc[2])
1654 - imuldiv28(db[3], dc[3])
1655 - imuldiv28(db[4], dc[4]); // -dc3 -dc4
1661 *sp = r >> 4; /* 4.28 to 8.24 */
1664 static inline void recalc_filter_biquad_low(FilterCoefficients *fc)
1666 FILTER_T *dc = fc->dc;
1667 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, alpha;
1669 if(fc->freq != fc->last_freq || fc->q != fc->last_q){
1670 fc->last_freq = fc->freq;
1671 fc->last_reso_dB = fc->reso_dB;
1673 omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
1676 alpha = sn / (2.0 * fc->q); // q > 0
1677 a0 = 1.0 / (1.0 + alpha);
1678 dc[1] = TIM_FSCALE((1.0 - cs) * a0, 28);
1679 dc[2] = dc[0] = TIM_FSCALE(((1.0 - cs) * DIV_2) * a0, 28);
1680 dc[3] = TIM_FSCALE((-2.0 * cs) * a0, 28);
1681 dc[4] = TIM_FSCALE((1.0 - alpha) * a0, 28);
1682 //b2 = ((1.0 - cs) * DIV_2) * a0;
1683 //b1 = (1.0 - cs) * a0;
1684 //a1 = (-2.0 * cs) * a0;
1685 //a2 = (1.0 - alpha) * a0;
1686 //dc[0] = TIM_FSCALE(b2, 28);
1687 //dc[1] = TIM_FSCALE(b1, 28);
1688 //dc[2] = TIM_FSCALE(a1, 28);
1689 //dc[3] = TIM_FSCALE(a2, 28);
1693 static inline void recalc_filter_biquad_hi(FilterCoefficients *fc)
1695 FILTER_T *dc = fc->dc;
1696 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, alpha;
1698 if(fc->freq != fc->last_freq || fc->q != fc->last_q){
1699 fc->last_freq = fc->freq;
1700 fc->last_reso_dB = fc->reso_dB;
1702 omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
1705 alpha = sn / (2.0 * fc->q); // q > 0
1706 a0 = 1.0 / (1.0 + alpha);
1707 dc[1] = TIM_FSCALE((-(1.0 + cs)) * a0, 28);
1708 dc[2] = dc[0] = TIM_FSCALE(((1.0 + cs) * DIV_2) * a0, 28);
1709 dc[3] = TIM_FSCALE((-2.0 * cs) * a0, 28);
1710 dc[4] = TIM_FSCALE((1.0 - alpha) * a0, 28);
1711 //b2 = ((1.0 + cs) * DIV_2) * a0;
1712 //b1 = (-(1.0 + cs)) * a0;
1713 //a1 = (-2.0 * cs) * a0;
1714 //a2 = (1.0 - alpha) * a0;
1715 //dc[0] = TIM_FSCALE(b2, 28);
1716 //dc[1] = TIM_FSCALE(b1, 28);
1717 //dc[2] = TIM_FSCALE(a1, 28);
1718 //dc[3] = TIM_FSCALE(a2, 28);
1722 #else /* floating-point implementation */
1724 #ifdef USE_PENTIUM_4
1725 #define DENORMAL_FIX 1 // for pentium 4 float/double denormal fix
1726 #define DENORMAL_ADD (5.4210108624275221703311375920553e-20) // 1.0/(1<<64)
1727 const FLOAT_T denormal_add = DENORMAL_ADD; // 1.0/(1<<64)
1729 #if (USE_X86_EXT_INTRIN >= 3) && defined(FLOAT_T_DOUBLE)
1730 const __m128d vec_denormal_add = {DENORMAL_ADD, DENORMAL_ADD};
1731 #elif (USE_X86_EXT_INTRIN >= 2) && defined(FLOAT_T_FLOAT)
1732 const __m128 vec_denormal_add = {DENORMAL_ADD, DENORMAL_ADD, DENORMAL_ADD, DENORMAL_ADD};
1733 #endif // USE_X86_EXT_INTRIN
1735 #endif // USE_PENTIUM_4
1738 static inline void sample_filter_none(FILTER_T *dc, FILTER_T *db, DATA_T *sp){}
1740 static inline void recalc_filter_none(FilterCoefficients *fc){}
1742 static inline void sample_filter_LPF12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1744 db[0] = db[0] + db[2] * dc[0];
1745 db[1] = *sp - db[0] - db[2] * dc[1];
1746 db[2] = db[1] * dc[0] + db[2];
1750 static inline void sample_filter_LPF12_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1752 FILTER_T input = *sp;
1754 db[0] = db[0] + db[2] * dc[0];
1755 db[1] = input - db[0] - db[2] * dc[1];
1756 db[2] = db[1] * dc[0] + db[2];
1759 db[0] = db[0] + db[2] * dc[0];
1760 db[1] = input - db[0] - db[2] * dc[1];
1761 db[2] = db[1] * dc[0] + db[2];
1764 static inline void sample_filter_LPF12_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1766 FILTER_T input = *sp;
1768 db[0] = db[0] + db[2] * dc[0];
1769 db[1] = input - db[0] - db[2] * dc[1];
1770 db[2] = db[1] * dc[0] + db[2];
1773 db[0] = db[0] + db[2] * dc[0];
1774 db[1] = input - db[0] - db[2] * dc[1];
1775 db[2] = db[1] * dc[0] + db[2];
1777 db[0] = db[0] + db[2] * dc[0];
1778 db[1] = input - db[0] - db[2] * dc[1];
1779 db[2] = db[1] * dc[0] + db[2];
1782 static inline void recalc_filter_LPF12(FilterCoefficients *fc)
1784 FILTER_T *dc = fc->dc;
1786 /* copy with applying Chamberlin's lowpass filter. */
1787 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1791 if(fc->freq < fc->flt_rate_limit1){ // <sr*DIV_6
1792 fc->sample_filter = sample_filter_LPF12;
1793 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
1794 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*DIV_6
1795 fc->sample_filter = sample_filter_LPF12_ov2;
1796 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
1797 }else{ // <sr*3*DIV_6
1798 fc->sample_filter = sample_filter_LPF12_ov3;
1799 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
1801 dc[1] = RESO_DB_CF_M(fc->reso_dB);
1805 static inline void sample_filter_LPF24(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1809 da[0] = *sp - dc[2] * db[4]; /* feedback */
1813 db[1] = (db[0] + da[0]) * dc[0] - db[1] * dc[1];
1814 db[2] = (db[1] + da[1]) * dc[0] - db[2] * dc[1];
1815 db[3] = (db[2] + da[2]) * dc[0] - db[3] * dc[1];
1816 db[4] = (db[3] + da[3]) * dc[0] - db[4] * dc[1];
1821 static inline void recalc_filter_LPF24(FilterCoefficients *fc)
1823 FILTER_T *dc = fc->dc, f, q ,p, r;
1825 /* copy with applying Moog lowpass VCF. */
1826 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1830 f = 2.0 * fc->freq * fc->div_flt_rate;
1832 q = 0.80 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0.0f <= c < 0.80f
1833 dc[0] = f + 0.8 * f * p;
1834 dc[1] = dc[0] + dc[0] - 1.0;
1835 dc[2] = q * (1.0 + 0.5 * p * (1.0 - p + 5.6 * p * p));
1839 static inline void sample_filter_LPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1844 db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
1845 #if defined(DENORMAL_FIX)
1846 db[2] += denormal_add;
1849 db[3] = db[2]; // flt out
1851 db[1] = db[0]; // flt in
1856 static inline void recalc_filter_LPF_BW(FilterCoefficients *fc)
1858 FILTER_T *dc = fc->dc;
1859 double q ,p, p2, qp, dc0;
1861 // elion butterworth
1862 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1866 p = 1.0 / tan(M_PI * fc->freq * (double)fc->div_flt_rate); // ?
1867 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
1870 dc0 = 1.0 / ( 1.0 + qp + p2);
1874 dc[3] = -2.0 * ( 1.0 - p2) * dc0; // -
1875 dc[4] = -(1.0 - qp + p2) * dc0; // -
1879 static inline void sample_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
1881 db[1] += (*sp - db[0]) * dc[1];
1887 static inline void recalc_filter_LPF12_2(FilterCoefficients *fc)
1889 FILTER_T *dc = fc->dc;
1891 FLOAT_T c0, c1, a0, b1, b2;
1893 // Resonant IIR lowpass (12dB/oct) Olli Niemitalo //r
1894 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
1898 f = M_PI2 * fc->freq * fc->div_flt_rate;
1899 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
1902 c1 = c0 + 1.0 - 2.0 * cos(f) * q;
1905 #if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1916 dc[9] = b1 * b1 + b2;
1921 #if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1922 // SIMD optimization (double * 2)
1923 static inline void buffer_filter_LPF12_2(FILTER_T* dc, FILTER_T* db, DATA_T* sp, int32 count)
1926 __m256d vcx0 = _mm256_broadcast_pd((__m128d *)(dc + 2));
1927 __m256d vcx1 = _mm256_broadcast_pd((__m128d *)(dc + 4));
1928 __m128d vcym2 = _mm_loadu_pd(dc + 6);
1929 __m128d vcym1 = _mm_loadu_pd(dc + 8);
1930 __m128d vy = _mm_loadu_pd(db + 2);
1931 __m128d vym2 = _mm_unpacklo_pd(vy, vy);
1932 __m128d vym1 = _mm_unpackhi_pd(vy, vy);
1934 for (i = 0; i < count; i += 4)
1936 __m256d vin = _mm256_loadu_pd(sp + i);
1937 __m256d vx0 = _mm256_unpacklo_pd(vin, vin);
1938 __m256d vx1 = _mm256_unpackhi_pd(vin, vin);
1939 __m256d vfma2x = MM256_FMA2_PD(vcx0, vx0, vcx1, vx1);
1941 __m128d vy0 = _mm_add_pd(_mm256_castpd256_pd128(vfma2x), MM_FMA2_PD(vcym2, vym2, vcym1, vym1));
1942 _mm_storeu_pd(sp + i, vy0);
1943 vym2 = _mm_unpacklo_pd(vy0, vy0);
1944 vym1 = _mm_unpackhi_pd(vy0, vy0);
1946 __m128d vy1 = _mm_add_pd(_mm256_extractf128_pd(vfma2x, 1), MM_FMA2_PD(vcym2, vym2, vcym1, vym1));
1947 _mm_storeu_pd(sp + i + 2, vy1);
1948 vym2 = _mm_unpacklo_pd(vy1, vy1);
1949 vym1 = _mm_unpackhi_pd(vy1, vy1);
1953 _mm_storeu_pd(db + 2, vy);
1956 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1957 // SIMD optimization (double * 2)
1958 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
1961 __m128d vcx0 = _mm_loadu_pd(dc + 2);
1962 __m128d vcx1 = _mm_loadu_pd(dc + 4);
1963 __m128d vcym2 = _mm_loadu_pd(dc + 6);
1964 __m128d vcym1 = _mm_loadu_pd(dc + 8);
1965 __m128d vy = _mm_loadu_pd(db + 2);
1966 __m128d vym2 = _mm_unpacklo_pd(vy, vy);
1967 __m128d vym1 = _mm_unpackhi_pd(vy, vy);
1969 for (i = 0; i < count; i += 2) {
1970 __m128d vin = _mm_loadu_pd(sp + i);
1971 __m128d vx0 = _mm_unpacklo_pd(vin, vin);
1972 __m128d vx1 = _mm_unpackhi_pd(vin, vin);
1973 vy = MM_FMA4_PD(vcx0, vx0, vcx1, vx1, vcym2, vym2, vcym1, vym1);
1974 _mm_storeu_pd(sp + i, vy);
1975 vym2 = _mm_unpacklo_pd(vy, vy);
1976 vym1 = _mm_unpackhi_pd(vy, vy);
1978 _mm_storeu_pd(db + 2, vy);
1982 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
1985 FILTER_T db0 = db[0], db1 = db[1], dc0 = dc[0], dc1 = dc[1];
1987 for (i = 0; i < count; i++) {
1988 db1 += (sp[i] - db0) * dc1;
1997 #endif // (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
1999 static inline void sample_filter_LPF24_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2002 db[5] = dc[0] * db[0] + db[1];
2003 db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
2004 db[2] = dc[2] * db[0] + dc[4] * db[5];
2006 db[5] = dc[0] * db[0] + db[3];
2007 db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
2008 db[4] = dc[2] * db[0] + dc[4] * db[5];
2012 static inline void recalc_filter_LPF24_2(FilterCoefficients *fc)
2014 FILTER_T *dc = fc->dc, f, q ,p, r, dc0;
2016 // amSynth 24dB/ocatave resonant low-pass filter. Nick Dowell //r
2017 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2021 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
2022 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
2024 p = 1.0 / (1.0 + (q * f) + r);
2029 dc[3] = -2.0 * (r - 1) * p;
2030 dc[4] = (-1.0 + (q * f) - r) * p;
2034 static inline void sample_filter_LPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2036 *sp = (db[1] = dc[0] * *sp + dc[1] * db[1]);
2037 #if defined(DENORMAL_FIX)
2038 db[1] += denormal_add;
2042 static inline void recalc_filter_LPF6(FilterCoefficients *fc)
2044 FILTER_T *dc = fc->dc, f;
2046 // One pole filter, LP 6dB/Oct scoofy no resonance //r
2047 if(FLT_FREQ_MARGIN){
2050 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
2056 static inline void sample_filter_LPF18(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2063 db[0] = *sp - dc[2] * db[3];
2064 db[1] = dc[1] * (db[0] + da[0]) - dc[0] * db[1];
2065 db[2] = dc[1] * (db[1] + da[1]) - dc[0] * db[2];
2066 db[3] = dc[1] * (db[2] + da[2]) - dc[0] * db[3];
2067 *sp = db[3] * dc[3];
2070 static inline void sample_filter_LPF18_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2072 FILTER_T da[6], input = *sp;
2077 db[0] = input - dc[2] * db[3];
2078 db[1] = dc[1] * (db[0] + da[0]) - dc[0] * db[1];
2079 db[2] = dc[1] * (db[1] + da[1]) - dc[0] * db[2];
2080 db[3] = dc[1] * (db[2] + da[2]) - dc[0] * db[3];
2081 *sp = db[3] * dc[3];
2086 db[0] = input - dc[2] * db[3];
2087 db[1] = dc[1] * (db[0] + da[0]) - dc[0] * db[1];
2088 db[2] = dc[1] * (db[1] + da[1]) - dc[0] * db[2];
2089 db[3] = dc[1] * (db[2] + da[2]) - dc[0] * db[3];
2092 static inline void recalc_filter_LPF18(FilterCoefficients *fc)
2094 FILTER_T *dc = fc->dc, f, q , p;
2095 // LPF18 low-pass filter //r
2096 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2100 if(fc->freq < fc->flt_rate_limit1){ // <sr/2.25
2101 fc->sample_filter = sample_filter_LPF18;
2102 f = 2.0 * fc->freq * fc->div_flt_rate; // *1.0
2103 }else{ // <sr*2/2.25
2104 fc->sample_filter = sample_filter_LPF18_ov2;
2105 f = 2.0 * fc->freq * fc->div_flt_rate_ov2; // sr*2
2107 dc[0] = ((-2.7528 * f + 3.0429) * f + 1.718) * f - 0.9984;
2108 q = 0.789 * (1.0 - RESO_DB_CF_M(fc->reso_dB)); // 0<q<0.78125
2111 dc[2] = q * (((-2.7079 * p + 10.963) * p - 14.934) * p + 8.4974);
2112 dc[3] = 1.0 + (0.25 * (1.5 + 2.0 * dc[2] * (1.0 - f)));
2116 static inline void sample_filter_LPF_TFO(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2118 db[0] = db[0] + dc[0] * (*sp - db[0] + dc[1] * (db[0] - db[1]));
2119 db[1] = db[1] + dc[0] * (db[0] - db[1]);
2123 static inline void recalc_filter_LPF_TFO(FilterCoefficients *fc)
2125 FILTER_T *dc = fc->dc, q;
2127 // two first order low-pass filter //r
2128 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2132 dc[0] = 2 * fc->freq * fc->div_flt_rate;
2133 q = 1.0 - RESO_DB_CF_M(fc->reso_dB);
2134 dc[1] = q + q / (1.01 - dc[0]);
2138 static inline void sample_filter_HPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2143 db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
2144 #if defined(DENORMAL_FIX)
2145 db[2] += denormal_add;
2148 db[3] = db[2]; // flt out
2150 db[1] = db[0]; // flt in
2155 static inline void recalc_filter_HPF_BW(FilterCoefficients *fc)
2157 FILTER_T *dc = fc->dc;
2158 double q, p, p2, qp, dc0;
2160 // elion butterworth HPF //r
2161 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2165 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
2166 p = tan(M_PI * fc->freq * fc->div_flt_rate); // hpf ?
2169 dc0 = 1.0 / (1.0 + qp + p2);
2171 dc[1] = -2 * dc0; // hpf
2173 dc[3] = -2.0 * (p2 - 1.0) * dc0; // hpf
2174 dc[4] = -(1.0 - qp + p2) * dc0;
2178 static inline void sample_filter_BPF_BW(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2183 db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
2185 db[10] = dc[8] * db[8] + dc[9] * db[9] + dc[10] * db[10] + dc[11] * db[11] + dc[12] * db[12];
2186 #if defined(DENORMAL_FIX)
2187 db[2] += denormal_add;
2191 db[11] = db[10]; // flt out
2193 db[9] = db[8]; // flt in
2195 db[8] = db[4]; // db[4]
\82©
\82çdb[8]
\82Ö
\82Í
\92x
\89\84\82µ
\82Ä
\82à
\82¢
\82¢
2198 db[3] = db[2]; // flt out
2200 db[1] = db[0]; // flt in
2205 static inline void recalc_filter_BPF_BW(FilterCoefficients *fc)
2207 FILTER_T *dc = fc->dc;
2208 FLOAT_T f, q, pl, pl2, qpl, ph, ph2, qph, dc0;
2210 // elion butterworth
2211 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2215 f = fc->freq * fc->div_flt_rate;
2216 q = RESO_DB_CF_M(fc->reso_dB) * SQRT_2; // q>0.1
2218 pl = 1.0 / tan(M_PI * f);
2221 dc0 = 1.0 / ( 1.0 + qpl + pl2);
2225 dc[3] = -2.0 * ( 1.0 - pl2) * dc0; // -
2226 dc[4] = -(1.0 - qpl + pl2) * dc0; // -
2228 ph = tan(M_PI * f * 0.8); // hpf // f bandwidth = LPF-HPF
2231 dc0 = 1.0 / (1.0 + qph + ph2);
2233 dc[9] = -2 * dc0; // hpf
2235 dc[11] = -2.0 * (ph2 - 1.0) * dc0; // hpf
2236 dc[12] = -(1.0 - qph + ph2) * dc0;
2240 static inline void sample_filter_peak1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2245 r = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2];
2251 static inline void recalc_filter_peak1(FilterCoefficients *fc)
2253 FILTER_T *dc = fc->dc, f, q, r, pl ,ph, sl, sh;
2255 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2259 f = cos(M_PI2 * fc->freq * fc->div_flt_rate);
2260 r = (1.0 - RESO_DB_CF_M(fc->reso_dB)) * 0.99609375; // r < 0.99609375
2261 dc[0] = (1 - r) * sqrt(r * (r - 4 * (f * f) + 2.0) + 1.0);
2267 static inline void sample_filter_notch1(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2272 r = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2];
2275 *sp = db[0] - r; // notch
2278 static inline void sample_filter_LPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2280 db[0] = db[0] + dc[0] * db[2]; // low
2281 db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
2282 db[2] = dc[0] * db[1] + db[2]; // band
2283 *sp = db[0]; // db[1] + db[0]; // notch
2286 static inline void sample_filter_LPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2288 FILTER_T input = *sp;
2290 db[0] = db[0] + dc[0] * db[2]; // low
2291 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2292 db[2] = dc[0] * db[1] + db[2]; // band
2293 *sp = db[0]; // db[1] + db[0]; // notch
2295 db[0] = db[0] + dc[0] * db[2]; // low
2296 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2297 db[2] = dc[0] * db[1] + db[2]; // band
2300 static inline void sample_filter_LPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2302 FILTER_T input = *sp;
2304 db[0] = db[0] + dc[0] * db[2]; // low
2305 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2306 db[2] = dc[0] * db[1] + db[2]; // band
2307 *sp = db[0]; // db[1] + db[0]; // notch
2309 db[0] = db[0] + dc[0] * db[2]; // low
2310 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2311 db[2] = dc[0] * db[1] + db[2]; // band
2313 db[0] = db[0] + dc[0] * db[2]; // low
2314 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2315 db[2] = dc[0] * db[1] + db[2]; // band
2318 static inline void recalc_filter_LPF12_3(FilterCoefficients *fc)
2320 FILTER_T *dc = fc->dc;
2322 /* Chamberlin2's lowpass filter. */
2323 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2327 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
2328 fc->sample_filter = sample_filter_LPF12_3;
2329 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
2330 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
2331 fc->sample_filter = sample_filter_LPF12_3_ov2;
2332 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
2333 }else{ // <sr*3*0.21875
2334 fc->sample_filter = sample_filter_LPF12_3_ov3;
2335 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
2337 dc[1] = RESO_DB_CF_M(fc->reso_dB);
2341 static inline void sample_filter_HPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2343 db[0] = db[0] + dc[0] * db[2]; // low
2344 db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
2345 db[2] = dc[0] * db[1] + db[2]; // band
2346 *sp = db[1]; // db[1] + db[0]; // notch
2349 static inline void sample_filter_HPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2351 FILTER_T input = *sp;
2353 db[0] = db[0] + dc[0] * db[2]; // low
2354 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2355 db[2] = dc[0] * db[1] + db[2]; // band
2356 *sp = db[1]; // db[1] + db[0]; // notch
2358 db[0] = db[0] + dc[0] * db[2]; // low
2359 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2360 db[2] = dc[0] * db[1] + db[2]; // band
2363 static inline void sample_filter_HPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2365 FILTER_T input = *sp;
2367 db[0] = db[0] + dc[0] * db[2]; // low
2368 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2369 db[2] = dc[0] * db[1] + db[2]; // band
2370 *sp = db[1]; // db[1] + db[0]; // notch
2372 db[0] = db[0] + dc[0] * db[2]; // low
2373 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2374 db[2] = dc[0] * db[1] + db[2]; // band
2376 db[0] = db[0] + dc[0] * db[2]; // low
2377 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2378 db[2] = dc[0] * db[1] + db[2]; // band
2381 static inline void recalc_filter_HPF12_3(FilterCoefficients *fc)
2383 FILTER_T *dc = fc->dc;
2385 /* Chamberlin2's lowpass filter. */
2386 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2390 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
2391 fc->sample_filter = sample_filter_HPF12_3;
2392 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
2393 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
2394 fc->sample_filter = sample_filter_HPF12_3_ov2;
2395 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
2396 }else{ // <sr*3*0.21875
2397 fc->sample_filter = sample_filter_HPF12_3_ov3;
2398 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
2400 dc[1] = RESO_DB_CF_M(fc->reso_dB);
2404 static inline void sample_filter_BPF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2406 db[0] = db[0] + dc[0] * db[2]; // low
2407 db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
2408 db[2] = dc[0] * db[1] + db[2]; // band
2409 *sp = db[2]; // db[1] + db[0]; // notch
2410 #if defined(DENORMAL_FIX)
2411 db[0] += denormal_add;
2415 static inline void sample_filter_BPF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2417 FILTER_T input = *sp;
2419 db[0] = db[0] + dc[0] * db[2]; // low
2420 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2421 db[2] = dc[0] * db[1] + db[2]; // band
2422 *sp = db[2]; // db[1] + db[0]; // notch
2423 #if defined(DENORMAL_FIX)
2424 db[0] += denormal_add;
2427 db[0] = db[0] + dc[0] * db[2]; // low
2428 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2429 db[2] = dc[0] * db[1] + db[2]; // band
2432 static inline void sample_filter_BPF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2434 FILTER_T input = *sp;
2436 db[0] = db[0] + dc[0] * db[2]; // low
2437 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2438 db[2] = dc[0] * db[1] + db[2]; // band
2439 *sp = db[2]; // db[1] + db[0]; // notch
2440 #if defined(DENORMAL_FIX)
2441 db[0] += denormal_add;
2444 db[0] = db[0] + dc[0] * db[2]; // low
2445 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2446 db[2] = dc[0] * db[1] + db[2]; // band
2448 db[0] = db[0] + dc[0] * db[2]; // low
2449 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2450 db[2] = dc[0] * db[1] + db[2]; // band
2453 static inline void recalc_filter_BPF12_3(FilterCoefficients *fc)
2455 FILTER_T *dc = fc->dc;
2457 /* Chamberlin2's lowpass filter. */
2458 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2462 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
2463 fc->sample_filter = sample_filter_BPF12_3;
2464 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
2465 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
2466 fc->sample_filter = sample_filter_BPF12_3_ov2;
2467 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
2468 }else{ // <sr*3*0.21875
2469 fc->sample_filter = sample_filter_BPF12_3_ov3;
2470 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
2472 dc[1] = RESO_DB_CF_M(fc->reso_dB);
2476 static inline void sample_filter_BCF12_3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2478 db[0] = db[0] + dc[0] * db[2]; // low
2479 db[1] = dc[1] * *sp - db[0] - dc[1] * db[2]; // high
2480 db[2] = dc[0] * db[1] + db[2]; // band
2481 *sp = db[1] + db[0]; // notch
2484 static inline void sample_filter_BCF12_3_ov2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2486 FILTER_T input = *sp;
2488 db[0] = db[0] + dc[0] * db[2]; // low
2489 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2490 db[2] = dc[0] * db[1] + db[2]; // band
2491 *sp = db[1] + db[0]; // notch
2493 db[0] = db[0] + dc[0] * db[2]; // low
2494 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2495 db[2] = dc[0] * db[1] + db[2]; // band
2498 static inline void sample_filter_BCF12_3_ov3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2500 FILTER_T input = *sp;
2502 db[0] = db[0] + dc[0] * db[2]; // low
2503 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2504 db[2] = dc[0] * db[1] + db[2]; // band
2505 db[3] = db[1] + db[0]; // notch
2506 *sp = db[1] + db[0]; // notch
2508 db[0] = db[0] + dc[0] * db[2]; // low
2509 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2510 db[2] = dc[0] * db[1] + db[2]; // band
2512 db[0] = db[0] + dc[0] * db[2]; // low
2513 db[1] = dc[1] * input - db[0] - dc[1] * db[2]; // high
2514 db[2] = dc[0] * db[1] + db[2]; // band
2517 static inline void recalc_filter_BCF12_3(FilterCoefficients *fc)
2519 FILTER_T *dc = fc->dc;
2521 /* Chamberlin2's lowpass filter. */
2522 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2526 if(fc->freq < fc->flt_rate_limit1){ // <sr*0.21875
2527 fc->sample_filter = sample_filter_BCF12_3;
2528 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate); // *1.0
2529 }else if(fc->freq < fc->flt_rate_limit2){ // <sr*2*0.21875
2530 fc->sample_filter = sample_filter_BCF12_3_ov2;
2531 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov2); // sr*2
2532 }else{ // <sr*3*0.21875
2533 fc->sample_filter = sample_filter_BCF12_3_ov3;
2534 dc[0] = 2.0 * sin(M_PI * fc->freq * fc->div_flt_rate_ov3); // sr*3
2536 dc[1] = RESO_DB_CF_M(fc->reso_dB);
2540 static inline void sample_filter_HPF6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2542 *sp -= (db[1] = dc[0] * *sp + dc[1] * db[1]);
2543 #if defined(DENORMAL_FIX)
2544 db[1] += denormal_add;
2548 static inline void sample_filter_HPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2550 db[1] += (*sp - db[0]) * dc[1];
2558 static inline void sample_filter_HBF_L6L12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2560 FLOAT_T in = *sp, out1, out2;
2562 db[1] += (in - db[0]) * dc[1];
2567 db[11] = dc[10] * in + dc[11] * db[11];
2569 #if defined(DENORMAL_FIX)
2570 db[11] += denormal_add;
2573 dc[16] = dc[16] * 0.75 + dc[15] * 0.25;
2574 *sp = (out1 * dc[16] + out2 * (1.0 - dc[16]));
2577 static inline void recalc_filter_HBF_L6L12(FilterCoefficients *fc)
2579 FILTER_T *dc = fc->dc;
2582 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2587 f = M_PI2 * fc->freq * fc->div_flt_rate;
2588 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
2590 dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
2592 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
2596 dc[15] = 1.0 - RESO_DB_CF_M(fc->reso_dB);
2600 static inline void sample_filter_HBF_L12L6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2602 FLOAT_T in = *sp, out1, out2;
2604 db[1] += (in - db[0]) * dc[1];
2609 db[11] = dc[10] * in + dc[11] * db[11];
2611 #if defined(DENORMAL_FIX)
2612 db[11] += denormal_add;
2615 *sp = out1 + out2 * DIV_2;
2618 static inline void recalc_filter_HBF_L12L6(FilterCoefficients *fc)
2620 FILTER_T *dc = fc->dc;
2623 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2628 f = M_PI2 * fc->freq * fc->div_flt_rate;
2629 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
2631 dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
2633 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
2639 static inline void sample_filter_HBF_L12H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2641 FLOAT_T in = *sp, out1, out2;
2643 db[1] += (in - db[0]) * dc[1];
2648 db[11] = dc[10] * in + dc[11] * db[11];
2650 #if defined(DENORMAL_FIX)
2651 db[11] += denormal_add;
2654 *sp = out1 + out2 * DIV_2;
2657 static inline void recalc_filter_HBF_L12H6(FilterCoefficients *fc)
2659 FILTER_T *dc = fc->dc;
2662 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2667 f = M_PI2 * fc->freq * fc->div_flt_rate;
2668 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(fc->reso_dB) + 0.5 / (1.0 + f)) + f - 2.0);
2670 dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
2672 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
2678 static inline void sample_filter_HBF_L24H6(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2680 FLOAT_T in = *sp, out1, out2;
2683 db[5] = dc[0] * db[0] + db[1];
2684 db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
2685 db[2] = dc[2] * db[0] + dc[4] * db[5];
2687 db[5] = dc[0] * db[0] + db[3];
2688 db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
2689 db[4] = dc[2] * db[0] + dc[4] * db[5];
2692 db[11] = dc[10] * in + dc[11] * db[11];
2694 #if defined(DENORMAL_FIX)
2695 db[11] += denormal_add;
2698 *sp = out1 + out2 * DIV_2;
2701 static inline void recalc_filter_HBF_L24H6(FilterCoefficients *fc)
2703 FILTER_T *dc = fc->dc;
2704 FLOAT_T f, r, q ,p, dc0;
2706 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2711 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
2712 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
2714 p = 1.0 / (1.0 + (q * f) + r);
2719 dc[3] = -2.0 * (r - 1) * p;
2720 dc[4] = (-1.0 + (q * f) - r) * p;
2722 f = exp(-M_PI2 * fc->freq * fc->div_flt_rate);
2729 static inline void sample_filter_HBF_L24H12(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2731 FLOAT_T in = *sp, out1, out2;
2734 db[5] = dc[0] * db[0] + db[1];
2735 db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
2736 db[2] = dc[2] * db[0] + dc[4] * db[5];
2738 db[5] = dc[0] * db[0] + db[3];
2739 db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
2740 db[4] = dc[2] * db[0] + dc[4] * db[5];
2743 db[11] += (in - db[10]) * dc[11];
2748 *sp = out1 + out2 * DIV_2;
2751 static inline void recalc_filter_HBF_L24H12(FilterCoefficients *fc)
2753 FILTER_T *dc = fc->dc;
2754 FLOAT_T f, r, q ,p, p2, qp, dc0;
2756 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2761 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
2762 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
2764 p = 1.0 / (1.0 + (q * f) + r);
2769 dc[3] = -2.0 * (r - 1) * p;
2770 dc[4] = (-1.0 + (q * f) - r) * p;
2772 f = M_PI2 * fc->freq * fc->div_flt_rate;
2773 q = 1.0 - f / (2.0 * (RESO_DB_CF_P(0) + 0.5 / (1.0 + f)) + f - 2.0);
2776 dc[11] = dc0 + 1.0 - 2.0 * cos(f) * q;
2780 static inline void sample_filter_HBF_L12OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2782 FLOAT_T in = *sp, out1, out2;
2784 db[1] += (in - db[0]) * dc[1];
2789 db[11] += (in - db[10]) * dc[11];
2794 *sp = out1 * DIV_3_2 + out2 * DIV_3;
2797 static inline void recalc_filter_HBF_L12OCT(FilterCoefficients *fc)
2799 FILTER_T *dc = fc->dc;
2802 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2807 f = M_PI2 * fc->freq * fc->div_flt_rate;
2808 r = RESO_DB_CF_P(fc->reso_dB);
2809 q = 1.0 - f / (2.0 * (r + 0.5 / (1.0 + f)) + f - 2.0);
2811 dc[1] = dc[0] + 1.0 - 2.0 * cos(f) * q;
2813 f = 2.0 * fc->freq * fc->div_flt_rate;
2817 q = 1.0 - f / (2.0 * (r + 0.5 / (1.0 + f)) + f - 2.0);
2819 dc[11] = dc[10] + 1.0 - 2.0 * cos(f) * q;
2823 static inline void sample_filter_HBF_L24OCT(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2825 FLOAT_T in = *sp, out1, out2;
2828 db[5] = dc[0] * db[0] + db[1];
2829 db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
2830 db[2] = dc[2] * db[0] + dc[4] * db[5];
2832 db[5] = dc[0] * db[0] + db[3];
2833 db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
2834 db[4] = dc[2] * db[0] + dc[4] * db[5];
2838 db[15] = dc[10] * db[10] + db[11];
2839 db[11] = dc[11] * db[10] + dc[13] * db[15] + db[12];
2840 db[12] = dc[12] * db[10] + dc[14] * db[15];
2842 db[15] = dc[10] * db[10] + db[13];
2843 db[13] = dc[11] * db[10] + dc[13] * db[15] + db[14];
2844 db[14] = dc[12] * db[10] + dc[14] * db[15];
2847 *sp = out1 * DIV_3_2 + out2 * DIV_3;
2850 static inline void recalc_filter_HBF_L24OCT(FilterCoefficients *fc)
2852 FILTER_T *dc = fc->dc;
2853 FLOAT_T f, r, q ,p, dc0;
2855 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN){
2860 f = tan(M_PI * fc->freq * fc->div_flt_rate); // cutoff freq rate/2
2861 q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
2863 p = 1.0 / (1.0 + (q * f) + r);
2868 dc[3] = -2.0 * (r - 1) * p;
2869 dc[4] = (-1.0 + (q * f) - r) * p;
2871 f = 2.0 * fc->freq * fc->div_flt_rate;
2874 f = tan(M_PI * f); // cutoff freq rate/2
2875 // q = 2.0 * RESO_DB_CF_M(fc->reso_dB);
2877 p = 1.0 / (1.0 + (q * f) + r);
2882 dc[13] = -2.0 * (r - 1) * p;
2883 dc[14] = (-1.0 + (q * f) - r) * p;
2890 static inline void sample_filter_LPF_BWx2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2895 db[ 2] = dc[ 0] * db[ 0] + dc[ 1] * db[ 1] + dc[ 2] * db[ 2] + dc[ 3] * db[ 3] + dc[ 4] * db[ 4];
2897 db[ 3] = db[ 2]; // flt out
2899 db[ 1] = db[ 0]; // flt in
2901 db[ 6] = dc[ 0] * db[ 3] + dc[ 1] * db[ 5] + dc[ 2] * db[ 6] + dc[ 3] * db[ 7] + dc[ 4] * db[ 8];
2903 db[ 7] = db[ 6]; // flt out
2905 db[ 5] = db[ 3]; // flt in
2910 static inline void sample_filter_LPF_BWx3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2914 db[ 2] = dc[ 0] * db[ 0] + dc[ 1] * db[ 1] + dc[ 2] * db[ 2] + dc[ 3] * db[ 3] + dc[ 4] * db[ 4];
2916 db[ 3] = db[ 2]; // flt out
2918 db[ 1] = db[ 0]; // flt in
2920 db[ 6] = dc[ 0] * db[ 3] + dc[ 1] * db[ 5] + dc[ 2] * db[ 6] + dc[ 3] * db[ 7] + dc[ 4] * db[ 8];
2922 db[ 7] = db[ 6]; // flt out
2924 db[ 5] = db[ 3]; // flt in
2926 db[10] = dc[ 0] * db[ 7] + dc[ 1] * db[ 9] + dc[ 2] * db[10] + dc[ 3] * db[11] + dc[ 4] * db[12];
2928 db[11] = db[10]; // flt out
2930 db[ 9] = db[ 7]; // flt in
2935 static inline void sample_filter_LPF_BWx4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2940 db[ 2] = dc[ 0] * db[ 0] + dc[ 1] * db[ 1] + dc[ 2] * db[ 2] + dc[ 3] * db[ 3] + dc[ 4] * db[ 4];
2942 db[ 3] = db[ 2]; // flt out
2944 db[ 1] = db[ 0]; // flt in
2946 db[ 6] = dc[ 0] * db[ 3] + dc[ 1] * db[ 5] + dc[ 2] * db[ 6] + dc[ 3] * db[ 7] + dc[ 4] * db[ 8];
2948 db[ 7] = db[ 6]; // flt out
2950 db[ 5] = db[ 3]; // flt in
2952 db[10] = dc[ 0] * db[ 7] + dc[ 1] * db[ 9] + dc[ 2] * db[10] + dc[ 3] * db[11] + dc[ 4] * db[12];
2954 db[11] = db[10]; // flt out
2956 db[ 9] = db[ 7]; // flt in
2958 db[14] = dc[ 0] * db[11] + dc[ 1] * db[13] + dc[ 2] * db[14] + dc[ 3] * db[15] + dc[ 4] * db[16];
2960 db[15] = db[14]; // flt out
2962 db[13] = db[11]; // flt in
2967 static inline void sample_filter_LPF24_2x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2971 db[5] = dc[0] * db[0] + db[1];
2972 db[1] = dc[1] * db[0] + dc[3] * db[5] + db[2];
2973 db[2] = dc[2] * db[0] + dc[4] * db[5];
2974 db[10] = db[0] = db[5];
2975 db[5] = dc[0] * db[0] + db[3];
2976 db[3] = dc[1] * db[0] + dc[3] * db[5] + db[4];
2977 db[4] = dc[2] * db[0] + dc[4] * db[5];
2979 db[15] = dc[0] * db[10] + db[11];
2980 db[11] = dc[1] * db[10] + dc[3] * db[15] + db[12];
2981 db[12] = dc[2] * db[10] + dc[4] * db[15];
2983 db[15] = dc[0] * db[10] + db[3];
2984 db[13] = dc[1] * db[10] + dc[3] * db[15] + db[14];
2985 db[14] = dc[2] * db[10] + dc[4] * db[15];
2989 static inline void sample_filter_LPF6x2(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
2992 db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
2993 db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
2997 static inline void sample_filter_LPF6x3(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3000 db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
3001 db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
3002 db[3] = dc[0] * db[2] + dc[1] * db[3]; // 18db
3006 static inline void sample_filter_LPF6x4(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3009 db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
3010 db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
3011 db[3] = dc[0] * db[2] + dc[1] * db[3]; // 18db
3012 db[4] = dc[0] * db[3] + dc[1] * db[4]; // 24db
3016 static inline void sample_filter_LPF6x8(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3019 db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
3020 db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
3021 db[3] = dc[0] * db[2] + dc[1] * db[3];
3022 db[4] = dc[0] * db[3] + dc[1] * db[4]; // 24db
3023 db[5] = dc[0] * db[4] + dc[1] * db[5];
3024 db[6] = dc[0] * db[5] + dc[1] * db[6]; // 36db
3025 db[7] = dc[0] * db[6] + dc[1] * db[7];
3026 db[8] = dc[0] * db[7] + dc[1] * db[8]; // 48db
3030 static inline void sample_filter_LPF6x16(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3033 db[1] = dc[0] * db[0] + dc[1] * db[1]; // 6db
3034 db[2] = dc[0] * db[1] + dc[1] * db[2]; // 12db
3035 db[3] = dc[0] * db[2] + dc[1] * db[3];
3036 db[4] = dc[0] * db[3] + dc[1] * db[4]; // 24db
3037 db[5] = dc[0] * db[4] + dc[1] * db[5];
3038 db[6] = dc[0] * db[5] + dc[1] * db[6]; // 36db
3039 db[7] = dc[0] * db[6] + dc[1] * db[7];
3040 db[8] = dc[0] * db[7] + dc[1] * db[8]; // 48db
3041 db[9] = dc[0] * db[8] + dc[1] * db[9];
3042 db[10] = dc[0] * db[9] + dc[1] * db[10]; // 60db
3043 db[11] = dc[0] * db[10] + dc[1] * db[11];
3044 db[12] = dc[0] * db[11] + dc[1] * db[12]; // 72db
3045 db[13] = dc[0] * db[12] + dc[1] * db[13];
3046 db[14] = dc[0] * db[13] + dc[1] * db[14]; // 84db
3047 db[15] = dc[0] * db[14] + dc[1] * db[15];
3048 db[16] = dc[0] * db[15] + dc[1] * db[16]; // 96db
3055 static inline void sample_filter_LPF_FIR(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3057 #if (LPF_FIR_ORDER == 20) // optimize
3058 #if (USE_X86_EXT_INTRIN >= 3) && defined(FLOAT_T_DOUBLE)
3059 FLOAT_T input = *sp;
3060 __m128d xdc0 = _mm_loadu_pd(&dc[0]), xdc2 = _mm_loadu_pd(&dc[2]),
3061 xdc4 = _mm_loadu_pd(&dc[4]), xdc6 = _mm_loadu_pd(&dc[6]),
3062 xdc8 = _mm_loadu_pd(&dc[8]), xdc10 = _mm_loadu_pd(&dc[10]),
3063 xdc12 = _mm_loadu_pd(&dc[12]), xdc14 = _mm_loadu_pd(&dc[14]),
3064 xdc16 = _mm_loadu_pd(&dc[16]), xdc18 = _mm_loadu_pd(&dc[18]);
3065 __m128d xdb0 = _mm_loadu_pd(&db[0]), xdb2 = _mm_loadu_pd(&db[2]),
3066 xdb4 = _mm_loadu_pd(&db[4]), xdb6 = _mm_loadu_pd(&db[6]),
3067 xdb8 = _mm_loadu_pd(&db[8]), xdb10 = _mm_loadu_pd(&db[10]),
3068 xdb12 = _mm_loadu_pd(&db[12]), xdb14 = _mm_loadu_pd(&db[14]),
3069 xdb16 = _mm_loadu_pd(&db[16]), xdb18 = _mm_loadu_pd(&db[18]);
3070 __m128d xsum = _mm_setzero_pd();
3071 xsum = MM_FMA5_PD(xdb0, xdc0, xdb2, xdc2, xdb4, xdc4, xdb6, xdc6, xdb8, xdc8);
3072 xsum = MM_FMA5_PD(xdb10, xdc10, xdb12, xdc12, xdb14, xdc14, xdb16, xdc16, xdb18, xdc18);
3073 xsum = _mm_add_pd(xsum, _mm_shuffle_pd(xsum, xsum, 0x1)); // v0=v0+v1 v1=v1+v0
3074 #if defined(DATA_T_FLOAT)
3075 _mm_store_ss(sp, _mm_cvtsd_ss(_mm_setzero_ps(), xsum));
3077 _mm_store_sd(sp, xsum);
3079 _mm_storeu_pd(&db[19], xdb18);
3080 _mm_storeu_pd(&db[17], xdb16);
3081 _mm_storeu_pd(&db[15], xdb14);
3082 _mm_storeu_pd(&db[13], xdb12);
3083 _mm_storeu_pd(&db[11], xdb10);
3084 _mm_storeu_pd(&db[9], xdb8);
3085 _mm_storeu_pd(&db[7], xdb6);
3086 _mm_storeu_pd(&db[5], xdb4);
3087 _mm_storeu_pd(&db[3], xdb2);
3088 _mm_storeu_pd(&db[1], xdb0);
3090 #elif (USE_X86_EXT_INTRIN >= 2) && defined(FLOAT_T_FLOAT)
3091 FLOAT_T input = *sp;
3092 __m128 xdc0 = _mm_loadu_ps(&dc[0]), xdc4 = _mm_loadu_ps(&dc[4]),
3093 xdc8 = _mm_loadu_ps(&dc[8]), xdc12 = _mm_loadu_ps(&dc[12]),
3094 xdc16 = _mm_loadu_ps(&dc[16]);
3095 __m128 xdb0 = _mm_loadu_ps(&db[0]), xdb4 = _mm_loadu_ps(&db[4]),
3096 xdb8 = _mm_loadu_ps(&db[8]), xdb12 = _mm_loadu_ps(&db[12]),
3097 xdb16 = _mm_loadu_ps(&db[16]);
3098 __m128 xsum = _mm_setzero_ps();
3099 xsum = MM_FMA5_PS(xdb0, xdc0, xdb0, xdc0, xdb4, xdc4, xdb8, xdc8, xdb12, xdc12, xdb16, xdc16);
3100 xsum = _mm_add_ps(xsum, _mm_movehl_ps(xsum, xsum)); // v0=v0+v2 v1=v1+v3 v2=-- v3=--
3101 xsum = _mm_add_ps(xsum, _mm_shuffle_ps(xsum, xsum, 0xe1)); // v0=v0+v1
3102 #if defined(DATA_T_FLOAT)
3103 _mm_store_ss(sp, xsum);
3105 #if (USE_X86_EXT_INTRIN >= 3)
3106 _mm_store_sd(sp, _mm_cvtss_sd(xsum));
3110 _mm_store_ss(&out, xsum);
3115 _mm_storeu_ps(&db[17], xdb16);
3116 _mm_storeu_ps(&db[13], xdb12);
3117 _mm_storeu_ps(&db[9], xdb8);
3118 _mm_storeu_ps(&db[5], xdb4);
3119 _mm_storeu_ps(&db[1], xdb0);
3123 sum += db[0] * dc[0];
3124 sum += db[1] * dc[1];
3125 sum += db[2] * dc[2];
3126 sum += db[3] * dc[3];
3127 sum += db[4] * dc[4];
3128 sum += db[5] * dc[5];
3129 sum += db[6] * dc[6];
3130 sum += db[7] * dc[7];
3131 sum += db[8] * dc[8];
3132 sum += db[9] * dc[9];
3133 sum += db[10] * dc[10];
3134 sum += db[11] * dc[11];
3135 sum += db[12] * dc[12];
3136 sum += db[13] * dc[13];
3137 sum += db[14] * dc[14];
3138 sum += db[15] * dc[15];
3139 sum += db[16] * dc[16];
3140 sum += db[17] * dc[17];
3141 sum += db[18] * dc[18];
3142 sum += db[19] * dc[19];
3165 #else // ! (LPF_FIR_ORDER == 20)
3168 for (i = 0; i < LPF_FIR_ORDER ;i++)
3169 sum += db[i] * dc[i];
3170 for (i = LPF_FIR_ORDER - 1; i >= 0; i--)
3177 static void designfir(FLOAT_T *g , FLOAT_T fc, FLOAT_T att);
3179 static inline void recalc_filter_LPF_FIR(FilterCoefficients *fc)
3181 FILTER_T *dc = fc->dc;
3182 FLOAT_T fir_coef[LPF_FIR_ORDER2];
3186 if(FLT_FREQ_MARGIN){
3189 f = fc->freq * fc->div_flt_rate * 2.0;
3190 designfir(fir_coef, f, 40.0);
3191 for (i = 0; i < LPF_FIR_ORDER2; i++)
3192 dc[LPF_FIR_ORDER-1 - i] = dc[i] = fir_coef[LPF_FIR_ORDER2 - 1 - i];
3196 // shelving
\8b¤
\92Ê
3197 static inline void sample_filter_shelving(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3202 db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
3203 #if defined(DENORMAL_FIX)
3204 db[2] += denormal_add;
3207 db[3] = db[2]; // flt out
3209 db[1] = db[0]; // flt in
3211 *sp = db[3] * dc[8]; // spgain
3214 static inline void recalc_filter_shelving_low(FilterCoefficients *fc)
3216 FILTER_T *dc = fc->dc;
3217 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
3219 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN || FLT_WIDTH_MARGIN){
3224 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain);
3226 dc[8] = pow((FLOAT_T)10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce); // spgain
3229 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
3232 beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
3233 a0 = 1.0 / ((A + 1) + (A - 1) * cs + beta * sn);
3234 a1 = 2.0 * ((A - 1) + (A + 1) * cs);
3235 a2 = -((A + 1) + (A - 1) * cs - beta * sn);
3236 b0 = A * ((A + 1) - (A - 1) * cs + beta * sn);
3237 b1 = 2.0 * A * ((A - 1) - (A + 1) * cs);
3238 b2 = A * ((A + 1) - (A - 1) * cs - beta * sn);
3247 static inline void recalc_filter_shelving_hi(FilterCoefficients *fc)
3249 FILTER_T *dc = fc->dc;
3250 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
3252 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN || FLT_WIDTH_MARGIN){
3257 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_shelving_gain);
3259 dc[8] = pow((FLOAT_T)10.0, -(fc->reso_dB) * DIV_80 * ext_filter_shelving_reduce); // spgain
3262 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
3265 beta = sqrt(A) / (fc->q * ext_filter_shelving_q); // q > 0
3266 a0 = 1.0 / ((A + 1) - (A - 1) * cs + beta * sn);
3267 a1 = (-2.0 * ((A - 1) - (A + 1) * cs));
3268 a2 = -((A + 1) - (A - 1) * cs - beta * sn);
3269 b0 = A * ((A + 1) + (A - 1) * cs + beta * sn);
3270 b1 = -2.0 * A * ((A - 1) + (A + 1) * cs);
3271 b2 = A * ((A + 1) + (A - 1) * cs - beta * sn);
3281 static inline void sample_filter_peaking(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3286 db[2] = dc[0] * db[0] + dc[1] * db[1] + dc[2] * db[2] + dc[3] * db[3] + dc[4] * db[4];
3287 #if defined(DENORMAL_FIX)
3288 db[2] += denormal_add;
3291 db[3] = db[2]; // flt out
3293 db[1] = db[0]; // flt in
3295 *sp = db[3] * dc[8]; // spgain
3298 static inline void recalc_filter_peaking(FilterCoefficients *fc)
3300 FILTER_T *dc = fc->dc;
3301 FLOAT_T a0, a1, a2, b1, b2, b0, omega, sn, cs, A, beta;
3303 if(FLT_FREQ_MARGIN || FLT_RESO_MARGIN || FLT_WIDTH_MARGIN){
3308 A = pow(10.0, fc->reso_dB * DIV_40 * ext_filter_peaking_gain);
3310 dc[8] = pow((FLOAT_T)10.0, -(fc->reso_dB) * DIV_80 * ext_filter_peaking_reduce); // spgain
3313 omega = (FLOAT_T)2.0 * M_PI * fc->freq * fc->div_flt_rate;
3316 beta = sn / (2.0 * fc->q * ext_filter_peaking_q); // q > 0
3317 a0 = 1.0 / (1.0 + beta / A);
3319 dc[4] = -(1.0 - beta / A) * a0; // -
3320 dc[3] = -a1 * a0; // -
3321 dc[2] = (1.0 - beta * A) * a0;
3322 dc[1] = a1 * a0; // b1 = a1
3323 dc[0] = (1.0 + beta * A) * a0;
3329 static inline void sample_filter_biquad(FILTER_T *dc, FILTER_T *db, DATA_T *sp)
3331 // db[2] = db[0] * dc[0] + db[1] * dc[1] + db[2] * dc[2] + db[3] * dc[3] + db[4] * dc[4]; // dc[0]=dc[2] BW
\82Æ
\93¯
\82¶
3332 db[2] = db[1] * dc[1] + (*sp + db[2]) * dc[2] + db[3] * dc[3] + db[4] * dc[4]; // -dc3 -dc4
3333 #if defined(DENORMAL_FIX)
3334 db[2] += denormal_add;
3343 static inline void recalc_filter_biquad_low(FilterCoefficients *fc)
3345 FILTER_T *dc = fc->dc;
3346 double a0, omega, sn, cs, alpha;
3348 if(FLT_FREQ_MARGIN || FLT_WIDTH_MARGIN){
3353 omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
3356 alpha = sn / (2.0 * fc->q); // q > 0
3357 a0 = 1.0 / (1.0 + alpha);
3358 dc[1] = (1.0 - cs) * a0;
3359 dc[2] = dc[0] = ((1.0 - cs) * DIV_2) * a0;
3360 dc[3] = -(-2.0 * cs) * a0; // -
3361 dc[4] = -(1.0 - alpha) * a0; // -
3365 static inline void recalc_filter_biquad_hi(FilterCoefficients *fc)
3367 FILTER_T *dc = fc->dc;
3368 double a0, omega, sn, cs, alpha;
3370 if(FLT_FREQ_MARGIN || FLT_WIDTH_MARGIN){
3375 omega = 2.0 * M_PI * fc->freq * fc->div_flt_rate;
3378 alpha = sn / (2.0 * fc->q); // q > 0
3379 a0 = 1.0 / (1.0 + alpha);
3380 dc[1] = (-(1.0 + cs)) * a0;
3381 dc[2] = dc[0] = ((1.0 + cs) * DIV_2) * a0;
3382 dc[3] = -(-2.0 * cs) * a0; // -
3383 dc[4] = -(1.0 - alpha) * a0; // -
3387 #endif /* OPT_MODE == 1 */
3394 const char *filter_name[] =
3445 typedef void (*recalc_filter_t)(FilterCoefficients *fc);
3447 static recalc_filter_t recalc_filters[] = {
3450 recalc_filter_LPF12,
3451 recalc_filter_LPF24,
3452 recalc_filter_LPF_BW,
3453 recalc_filter_LPF12_2,
3454 recalc_filter_LPF24_2,
3456 recalc_filter_LPF18,
3457 recalc_filter_LPF_TFO,
3458 recalc_filter_HPF_BW,
3459 recalc_filter_BPF_BW,
3460 recalc_filter_peak1,
3461 recalc_filter_peak1, // notch1
3462 recalc_filter_LPF12_3,
3463 recalc_filter_HPF12_3, // HPF12_3
3464 recalc_filter_BPF12_3, // BPF12_3
3465 recalc_filter_BCF12_3, // BCF12_3
3466 recalc_filter_LPF6, // HPF6
3467 recalc_filter_LPF12_2, // HPF12_2
3469 recalc_filter_HBF_L6L12,
3470 recalc_filter_HBF_L12L6,
3471 recalc_filter_HBF_L12H6,
3472 recalc_filter_HBF_L24H6,
3473 recalc_filter_HBF_L24H12,
3474 recalc_filter_HBF_L12OCT,
3475 recalc_filter_HBF_L24OCT,
3477 recalc_filter_LPF6, // LPF6x2
3478 recalc_filter_LPF6, // LPF6x3
3479 recalc_filter_LPF6, // LPF6x4
3480 recalc_filter_LPF6, // LPF6x8
3481 recalc_filter_LPF6, // LPF6x16
3482 recalc_filter_LPF_BW, // LPF_BWx2
3483 recalc_filter_LPF_BW, // LPF_BWx3
3484 recalc_filter_LPF_BW, // LPF_BWx4
3485 recalc_filter_LPF24_2, // LPF24_2x2
3487 recalc_filter_LPF_FIR, // LPF_FIR
3489 recalc_filter_shelving_low, // eq_low
3490 recalc_filter_shelving_hi, // eq_hi
3491 recalc_filter_peaking, // eq_mid
3492 recalc_filter_biquad_low,
3493 recalc_filter_biquad_hi,
3496 typedef void (*sample_filter_t)(FILTER_T *dc, FILTER_T *db, DATA_T *sp);
3498 static sample_filter_t sample_filters[] = {
3501 sample_filter_LPF12,
3502 sample_filter_LPF24,
3503 sample_filter_LPF_BW,
3504 sample_filter_LPF12_2,
3505 sample_filter_LPF24_2,
3507 sample_filter_LPF18,
3508 sample_filter_LPF_TFO,
3509 sample_filter_HPF_BW,
3510 sample_filter_BPF_BW,
3511 sample_filter_peak1,
3512 sample_filter_notch1,
3513 sample_filter_LPF12_3,
3514 sample_filter_HPF12_3,
3515 sample_filter_BPF12_3,
3516 sample_filter_BCF12_3,
3518 sample_filter_HPF12_2,
3520 sample_filter_HBF_L6L12,
3521 sample_filter_HBF_L12L6,
3522 sample_filter_HBF_L12H6,
3523 sample_filter_HBF_L24H6,
3524 sample_filter_HBF_L24H12,
3525 sample_filter_HBF_L12OCT,
3526 sample_filter_HBF_L24OCT,
3528 sample_filter_LPF6x2,
3529 sample_filter_LPF6x3,
3530 sample_filter_LPF6x4,
3531 sample_filter_LPF6x8,
3532 sample_filter_LPF6x16,
3533 sample_filter_LPF_BWx2,
3534 sample_filter_LPF_BWx3,
3535 sample_filter_LPF_BWx4,
3536 sample_filter_LPF24_2x2,
3538 sample_filter_LPF_FIR,
3540 sample_filter_shelving, // eq_low
3541 sample_filter_shelving, // eq_hi
3542 sample_filter_peaking, // eq_mid
3543 sample_filter_biquad,
3544 sample_filter_biquad,
3547 void set_sample_filter_type(FilterCoefficients *fc, int type)
3549 if(type < FILTER_NONE || type >= FILTER_LIST_MAX)
3551 if(!fc->init || fc->type != type)
3552 memset(fc, 0, sizeof(FilterCoefficients));
3554 fc->recalc_filter = recalc_filters[type];
3555 fc->sample_filter = sample_filters[type];
3560 const double sample_filter_limit_rate[] = {
3561 0.16666, // type0 OFF
3562 0.16666, // type1 Chamberlin 12dB/oct fc < rate / 6
3563 0.50000, // type2 Moog VCF 24dB/oct fc < rate / 2
3564 0.50000, // type3 butterworth fc < rate / 2 elion
3565 0.50000, // type4 Resonant IIR 12dB/oct fc < rate / 2
3566 0.50000, // type5 amSynth 24dB/oct fc < rate / 2
3567 0.50000, // type6 One pole 6dB/oct non rez fc < rate / 2
3568 0.44444, // type7 resonant 3 pole 18dB/oct fc < rate / 2.25
3569 0.50000, // type8 two first order fc < rate / 2
3571 0.50000, // type9 HPF butterworth fc < rate / 2 elion +
3572 0.50000, // type10 BPF butterworth fc < rate / 2 elion +
3573 0.50000, // type11 peak fc < rate / 2
3574 0.50000, // type12 notch fc < rate / 2
3575 0.21875, // type13 LPF Chamberlin2 12dB/oct fc < rate / 2
3576 0.21875, // type14 HPF Chamberlin2 12dB/oct fc < rate / 2
3577 0.21875, // type15 BPF Chamberlin2 12dB/oct fc < rate / 2
3578 0.21875, // type16 notch Chamberlin2 12dB/oct fc < rate / 2
3579 0.50000, // type17 HPF6
3580 0.50000, // type18 HPF12_2
3582 0.50000, // type19 L6L12
3583 0.50000, // type20 L12L6
3584 0.50000, // type21 L12H6
3585 0.50000, // type22 L24H6
3586 0.50000, // type23 L24H12
3587 0.50000, // type24 L12OCT
3588 0.50000, // type25 L24OCT
3598 0.50000, // LPF24_2x2
3602 0.50000, // FILTER_SHELVING_LOW, // q
3603 0.50000, // FILTER_SHELVING_HI, // q
3604 0.50000, // FILTER_PEAKING, // q
3605 0.50000, // FILTER_BIQUAD_LOW, // q
3606 0.50000, // FILTER_BIQUAD_HI, // q
3609 void set_sample_filter_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3612 fc->flt_rate = freq;
3613 fc->div_flt_rate = 1.0 / fc->flt_rate;
3614 fc->flt_rate_div2 = fc->flt_rate * DIV_2;
3616 fc->flt_rate = play_mode->rate;
3617 fc->div_flt_rate = div_playmode_rate;
3618 fc->flt_rate_div2 = playmode_rate_div2;
3621 fc->flt_rate_limit1 = fc->flt_rate * sample_filter_limit_rate[fc->type]; // sr*limit
3622 fc->flt_rate_limit2 = fc->flt_rate_limit1 * 2.0; // sr*2*limit
3623 fc->div_flt_rate_ov2 = fc->div_flt_rate * DIV_2; // 1/sr*2
3624 fc->div_flt_rate_ov3 = fc->div_flt_rate * DIV_3; // 1/sr*3
3627 void set_sample_filter_freq(FilterCoefficients *fc, FLOAT_T freq)
3629 if(fc->flt_rate == 0) // not init filter rate
3630 set_sample_filter_ext_rate(fc, 0);
3631 if(freq < 0 || freq > fc->flt_rate_div2) // sr/2
3632 fc->freq = fc->flt_rate_div2;
3633 else if(freq < 10.0)
3639 void set_sample_filter_reso(FilterCoefficients *fc, FLOAT_T reso)
3641 const FLOAT_T limit = 96.0;
3644 fc->reso_dB = limit;
3651 void set_sample_filter_q(FilterCoefficients *fc, FLOAT_T q)
3653 const FLOAT_T def = 0.7;
3654 const FLOAT_T limit = 12.0;
3655 const FLOAT_T min = 0.01;
3667 void init_sample_filter(FilterCoefficients *fc, FLOAT_T freq, FLOAT_T reso, int type)
3669 set_sample_filter_type(fc, type);
3670 set_sample_filter_freq(fc, freq);
3671 set_sample_filter_reso(fc, reso);
3676 void init_sample_filter2(FilterCoefficients *fc, FLOAT_T freq, FLOAT_T reso, FLOAT_T q, int type)
3678 set_sample_filter_type(fc, type);
3679 set_sample_filter_freq(fc, freq);
3680 set_sample_filter_reso(fc, reso);
3681 set_sample_filter_q(fc, q);
3686 void recalc_filter(FilterCoefficients *fc)
3690 return; // error not init
3692 fc->recalc_filter(fc);
3695 // sample_filter (1ch mono / 2ch left)
3696 inline void sample_filter(FilterCoefficients *fc, DATA_T *sp)
3698 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], sp);
3701 // sample_filter (2ch stereo)
3702 inline void sample_filter_stereo(FilterCoefficients *fc, DATA_T *spL, DATA_T *spR)
3704 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], spL);
3705 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], spR);
3708 inline void sample_filter_stereo2(FilterCoefficients *fc, DATA_T *spLR)
3710 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &spLR[0]);
3711 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], &spLR[1]);
3714 // sample_filter (2ch left)
3715 inline void sample_filter_left(FilterCoefficients *fc, DATA_T *sp)
3717 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], sp);
3720 // sample_filter (2ch left)
3721 inline void sample_filter_right(FilterCoefficients *fc, DATA_T *sp)
3723 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], sp);
3726 // buffer filter (1ch mono)
3727 inline void buffer_filter(FilterCoefficients *fc, DATA_T *sp, int32 count)
3733 return; // error not init
3736 return; // filter none
3738 if (fc->type == FILTER_LPF12_2) {
3739 recalc_filter_LPF12_2(fc);
3740 buffer_filter_LPF12_2(fc->dc, &fc->db[FILTER_FB_L], sp, count);
3744 fc->recalc_filter(fc);
3745 for(i = 0; i < count; i++)
3746 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &sp[i]);
3749 // buffer filter (2ch stereo)
3750 inline void buffer_filter_stereo(FilterCoefficients *fc, DATA_T *sp, int32 count)
3756 return; // error not init
3759 return; // filter none
3760 fc->recalc_filter(fc);
3761 for(i = 0; i < count; i++){
3762 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &sp[i]);
3764 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], &sp[i]);
3768 // buffer filter (2ch left)
3769 inline void buffer_filter_left(FilterCoefficients *fc, DATA_T *sp, int32 count)
3775 return; // error not init
3778 return; // filter none
3779 fc->recalc_filter(fc);
3780 for(i = 0; i < count; i++)
3781 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_L], &sp[i++]);
3784 // buffer filter (2ch right)
3785 inline void buffer_filter_right(FilterCoefficients *fc, DATA_T *sp, int32 count)
3791 return; // error not init
3794 return; // filter none
3795 fc->recalc_filter(fc);
3796 for(i = 0; i < count; i++)
3797 fc->sample_filter(fc->dc, &fc->db[FILTER_FB_R], &sp[++i]);
3804 void set_voice_filter1_type(FilterCoefficients *fc, int type)
3807 set_sample_filter_type(fc, type);
3810 void set_voice_filter1_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3812 set_sample_filter_ext_rate(fc, freq);
3815 void set_voice_filter1_freq(FilterCoefficients *fc, FLOAT_T freq)
3817 set_sample_filter_freq(fc, freq);
3820 void set_voice_filter1_reso(FilterCoefficients *fc, FLOAT_T reso)
3822 set_sample_filter_reso(fc, reso);
3825 void voice_filter1(FilterCoefficients *fc, DATA_T *sp, int32 count)
3827 buffer_filter(fc, sp, count);
3835 static int conv_type_voice_filter2[] = {
3839 FILTER_HPF12_3, // 2
3841 FILTER_HPF12_2, // 4
3844 void set_voice_filter2_type(FilterCoefficients *fc, int type)
3847 if(type < VOICE_FILTER2_NONE || type >= VOICE_FILTER2_LIST_MAX)
3848 type = VOICE_FILTER2_NONE;
3849 set_sample_filter_type(fc, conv_type_voice_filter2[type]);
3852 void set_voice_filter2_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3854 set_sample_filter_ext_rate(fc, freq);
3857 void set_voice_filter2_freq(FilterCoefficients *fc, FLOAT_T freq)
3859 set_sample_filter_freq(fc, freq);
3862 void set_voice_filter2_reso(FilterCoefficients *fc, FLOAT_T reso)
3864 set_sample_filter_reso(fc, reso);
3867 void voice_filter2(FilterCoefficients *fc, DATA_T *sp, int32 count)
3869 buffer_filter(fc, sp, count);
3872 /// voice_filter1 + voice_filter2
3873 void voice_filter(int v, DATA_T *sp, int32 count)
3875 Voice *vp = &voice[v];
3877 buffer_filter(&vp->fc, sp, count); // lpf
3878 buffer_filter(&vp->fc2, sp, count); // hpf
3886 static int conv_type_resample_filter[] = {
3900 void set_resample_filter_type(FilterCoefficients *fc, int type)
3903 if(type < RESAMPLE_FILTER_NONE || type >= RESAMPLE_FILTER_LIST_MAX)
3904 type = RESAMPLE_FILTER_NONE;
3905 set_sample_filter_type(fc, conv_type_resample_filter[type]);
3908 void set_resample_filter_ext_rate(FilterCoefficients *fc, FLOAT_T freq)
3910 set_sample_filter_ext_rate(fc, freq);
3913 void set_resample_filter_freq(FilterCoefficients *fc, FLOAT_T freq)
3915 set_sample_filter_freq(fc, freq);
3918 void resample_filter(int v, DATA_T *sp, int32 count)
3920 buffer_filter(&voice[v].rf_fc, sp, count);
3924 #ifdef MIX_VOICE_BATCH
3926 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
3928 static inline __mmask8 generate_mask8_for_count(int32 offset, int32 count)
3930 if (offset < count) {
3931 if (offset + 8 <= count)
3934 return (1 << (count - offset)) - 1;
3942 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
3944 static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
3946 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
3947 if (i >= batch_size)
3950 __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
3952 __m256d vdb0123_0 = _mm256_loadu_pd(&dbs[i][0]);
3953 __m256d vdb0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dbs[i + 1][0]) : _mm256_setzero_pd();
3954 __m256d vdb0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dbs[i + 2][0]) : _mm256_setzero_pd();
3955 __m256d vdb0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dbs[i + 3][0]) : _mm256_setzero_pd();
3956 __m256d vdb0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(&dbs[i + 4][0]) : _mm256_setzero_pd();
3957 __m256d vdb0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(&dbs[i + 5][0]) : _mm256_setzero_pd();
3958 __m256d vdb0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(&dbs[i + 6][0]) : _mm256_setzero_pd();
3959 __m256d vdb0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(&dbs[i + 7][0]) : _mm256_setzero_pd();
3961 __m512d vdb0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_0), vdb0123_2, 1);
3962 __m512d vdb0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_1), vdb0123_3, 1);
3963 __m512d vdb0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_4), vdb0123_6, 1);
3964 __m512d vdb0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb0123_5), vdb0123_7, 1);
3966 __m512d vdb01_0246 = _mm512_shuffle_f64x2(vdb0123_02, vdb0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
3967 __m512d vdb01_1357 = _mm512_shuffle_f64x2(vdb0123_13, vdb0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
3968 __m512d vdb23_0246 = _mm512_shuffle_f64x2(vdb0123_02, vdb0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
3969 __m512d vdb23_1357 = _mm512_shuffle_f64x2(vdb0123_13, vdb0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
3971 __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
3972 __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
3973 __m512d vdb2 = _mm512_unpacklo_pd(vdb23_0246, vdb23_1357);
3974 __m512d vdb3 = _mm512_unpackhi_pd(vdb23_0246, vdb23_1357);
3975 __m512d vdb4 = _mm512_set_pd(
3976 i + 7 < batch_size ? dbs[i + 7][4] : 0.0,
3977 i + 6 < batch_size ? dbs[i + 6][4] : 0.0,
3978 i + 5 < batch_size ? dbs[i + 5][4] : 0.0,
3979 i + 4 < batch_size ? dbs[i + 4][4] : 0.0,
3980 i + 3 < batch_size ? dbs[i + 3][4] : 0.0,
3981 i + 2 < batch_size ? dbs[i + 2][4] : 0.0,
3982 i + 1 < batch_size ? dbs[i + 1][4] : 0.0,
3986 __m256d vdc0123_0 = _mm256_loadu_pd(&dcs[i][0]);
3987 __m256d vdc0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dcs[i + 1][0]) : _mm256_setzero_pd();
3988 __m256d vdc0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dcs[i + 2][0]) : _mm256_setzero_pd();
3989 __m256d vdc0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dcs[i + 3][0]) : _mm256_setzero_pd();
3990 __m256d vdc0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(&dcs[i + 4][0]) : _mm256_setzero_pd();
3991 __m256d vdc0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(&dcs[i + 5][0]) : _mm256_setzero_pd();
3992 __m256d vdc0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(&dcs[i + 6][0]) : _mm256_setzero_pd();
3993 __m256d vdc0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(&dcs[i + 7][0]) : _mm256_setzero_pd();
3995 __m512d vdc0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_0), vdc0123_2, 1);
3996 __m512d vdc0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_1), vdc0123_3, 1);
3997 __m512d vdc0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_4), vdc0123_6, 1);
3998 __m512d vdc0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc0123_5), vdc0123_7, 1);
4000 __m512d vdc01_0246 = _mm512_shuffle_f64x2(vdc0123_02, vdc0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4001 __m512d vdc01_1357 = _mm512_shuffle_f64x2(vdc0123_13, vdc0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4002 __m512d vdc23_0246 = _mm512_shuffle_f64x2(vdc0123_02, vdc0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4003 __m512d vdc23_1357 = _mm512_shuffle_f64x2(vdc0123_13, vdc0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4005 __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
4006 __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
4007 __m512d vdc2 = _mm512_unpacklo_pd(vdc23_0246, vdc23_1357);
4008 __m512d vdc3 = _mm512_unpackhi_pd(vdc23_0246, vdc23_1357);
4009 __m512d vdc4 = _mm512_set_pd(
4010 i + 7 < batch_size ? dcs[i + 7][4] : 0.0,
4011 i + 6 < batch_size ? dcs[i + 6][4] : 0.0,
4012 i + 5 < batch_size ? dcs[i + 5][4] : 0.0,
4013 i + 4 < batch_size ? dcs[i + 4][4] : 0.0,
4014 i + 3 < batch_size ? dcs[i + 3][4] : 0.0,
4015 i + 2 < batch_size ? dcs[i + 2][4] : 0.0,
4016 i + 1 < batch_size ? dcs[i + 1][4] : 0.0,
4020 __m128i vcounts_halfmax = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
4021 vcounts_halfmax = _mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, (3 << 2) | 2));
4022 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
4024 for (int32 j = 0; j < count_max; j += 8) {
4026 vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
4028 for (int k = 1; k < 8; k++)
4029 vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, & sps[i + k][j]);
4031 __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
4032 __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
4033 __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
4034 __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
4035 __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
4036 __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
4037 __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
4038 __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
4040 __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4041 __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4042 __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4043 __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4044 __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4045 __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4046 __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4047 __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4050 vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4051 vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4052 vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4053 vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4054 vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4055 vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4056 vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4057 vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4059 for (int k = 0; k < 8; k++) {
4060 __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
4062 vdb0 = _mm512_mask_mov_pd(vdb0, kmask, vsps[k]);
4063 vdb2 = _mm512_mask_fmadd_pd(vdb2, kmask, vdc2, _mm512_add_pd(_mm512_fmadd_pd(vdc0, vdb0, _mm512_mul_pd(vdc1, vdb1)), _mm512_fmadd_pd(vdc3, vdb3, _mm512_mul_pd(vdc4, vdb4))));
4066 vdb2 = _mm512_mask_add_pd(vdb2, kmask, vdb2, _mm512_set1_pd(denormal_add));
4068 vdb4 = _mm512_mask_mov_pd(vdb4, kmask, vdb3);
4069 vdb3 = _mm512_mask_mov_pd(vdb3, kmask, vdb2);
4070 vdb2 = _mm512_mask_mov_pd(vdb2, kmask, vdb1);
4071 vdb1 = _mm512_mask_mov_pd(vdb1, kmask, vdb0);
4075 __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
4076 __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
4077 __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
4078 __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
4079 __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
4080 __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
4081 __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
4082 __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
4084 __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4085 __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4086 __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4087 __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4088 __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4089 __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4090 __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4091 __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4094 vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4095 vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4096 vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4097 vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4098 vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4099 vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4100 vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4101 vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4103 for (int k = 0; k < 8; k++)
4104 _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
4107 vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
4108 vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
4109 vdb23_0246 = _mm512_unpacklo_pd(vdb2, vdb3);
4110 vdb23_1357 = _mm512_unpackhi_pd(vdb2, vdb3);
4112 __m512d vdb0123_04 = _mm512_permutex2var_pd(vdb01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdb23_0246);
4113 __m512d vdb0123_15 = _mm512_permutex2var_pd(vdb01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdb23_1357);
4114 __m512d vdb0123_26 = _mm512_permutex2var_pd(vdb01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdb23_0246);
4115 __m512d vdb0123_37 = _mm512_permutex2var_pd(vdb01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdb23_1357);
4117 _mm256_storeu_pd(&dbs[i][0], _mm512_castpd512_pd256(vdb0123_04));
4118 _mm_storel_pd(&dbs[i][4], _mm512_castpd512_pd128(vdb4));
4120 if (i + 1 < batch_size) {
4121 _mm256_storeu_pd(&dbs[i + 1][0], _mm512_castpd512_pd256(vdb0123_15));
4122 _mm_storeh_pd(&dbs[i + 1][4], _mm512_castpd512_pd128(vdb4));
4125 if (i + 2 < batch_size) {
4126 _mm256_storeu_pd(&dbs[i + 2][0], _mm512_castpd512_pd256(vdb0123_26));
4127 _mm_storel_pd(&dbs[i + 2][4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb4), 1));
4130 if (i + 3 < batch_size) {
4131 _mm256_storeu_pd(&dbs[i + 3][0], _mm512_castpd512_pd256(vdb0123_37));
4132 _mm_storeh_pd(&dbs[i + 3][4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb4), 1));
4135 if (i + 4 < batch_size) {
4136 _mm256_storeu_pd(&dbs[i + 4][0], _mm512_extractf64x4_pd(vdb0123_04, 1));
4137 _mm_storel_pd(&dbs[i + 4][4], _mm512_extractf64x2_pd(vdb4, 2));
4140 if (i + 5 < batch_size) {
4141 _mm256_storeu_pd(&dbs[i + 5][0], _mm512_extractf64x4_pd(vdb0123_15, 1));
4142 _mm_storeh_pd(&dbs[i + 5][4], _mm512_extractf64x2_pd(vdb4, 2));
4145 if (i + 6 < batch_size) {
4146 _mm256_storeu_pd(&dbs[i + 6][0], _mm512_extractf64x4_pd(vdb0123_26, 1));
4147 _mm_storel_pd(&dbs[i + 6][4], _mm512_extractf64x2_pd(vdb4, 3));
4150 if (i + 7 < batch_size) {
4151 _mm256_storeu_pd(&dbs[i + 7][0], _mm512_extractf64x4_pd(vdb0123_37, 1));
4152 _mm_storeh_pd(&dbs[i + 7][4], _mm512_extractf64x2_pd(vdb4, 3));
4157 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
4159 static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
4161 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
4162 if (i >= batch_size)
4165 __m128i vcounts = _mm_set_epi32(
4166 i + 3 < batch_size ? counts[i + 3] : 0,
4167 i + 2 < batch_size ? counts[i + 2] : 0,
4168 i + 1 < batch_size ? counts[i + 1] : 0,
4172 __m256d vdb0123_0 = _mm256_loadu_pd(&dbs[i][0]);
4173 __m256d vdb0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dbs[i + 1][0]) : _mm256_setzero_pd();
4174 __m256d vdb0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dbs[i + 2][0]) : _mm256_setzero_pd();
4175 __m256d vdb0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dbs[i + 3][0]) : _mm256_setzero_pd();
4177 __m256d vdb01_02 = _mm256_permute2f128_pd(vdb0123_0, vdb0123_2, (2 << 4) | 0);
4178 __m256d vdb01_13 = _mm256_permute2f128_pd(vdb0123_1, vdb0123_3, (2 << 4) | 0);
4179 __m256d vdb23_02 = _mm256_permute2f128_pd(vdb0123_0, vdb0123_2, (3 << 4) | 1);
4180 __m256d vdb23_13 = _mm256_permute2f128_pd(vdb0123_1, vdb0123_3, (3 << 4) | 1);
4182 __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13);
4183 __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13);
4184 __m256d vdb2 = _mm256_unpacklo_pd(vdb23_02, vdb23_13);
4185 __m256d vdb3 = _mm256_unpackhi_pd(vdb23_02, vdb23_13);
4186 __m256d vdb4 = _mm256_set_pd(
4187 i + 3 < batch_size ? dbs[i + 3][4] : 0.0,
4188 i + 2 < batch_size ? dbs[i + 2][4] : 0.0,
4189 i + 1 < batch_size ? dbs[i + 1][4] : 0.0,
4193 __m256d vdc0123_0 = _mm256_loadu_pd(&dcs[i][0]);
4194 __m256d vdc0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(&dcs[i + 1][0]) : _mm256_setzero_pd();
4195 __m256d vdc0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(&dcs[i + 2][0]) : _mm256_setzero_pd();
4196 __m256d vdc0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(&dcs[i + 3][0]) : _mm256_setzero_pd();
4198 __m256d vdc01_02 = _mm256_permute2f128_pd(vdc0123_0, vdc0123_2, (2 << 4) | 0);
4199 __m256d vdc01_13 = _mm256_permute2f128_pd(vdc0123_1, vdc0123_3, (2 << 4) | 0);
4200 __m256d vdc23_02 = _mm256_permute2f128_pd(vdc0123_0, vdc0123_2, (3 << 4) | 1);
4201 __m256d vdc23_13 = _mm256_permute2f128_pd(vdc0123_1, vdc0123_3, (3 << 4) | 1);
4203 __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13);
4204 __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13);
4205 __m256d vdc2 = _mm256_unpacklo_pd(vdc23_02, vdc23_13);
4206 __m256d vdc3 = _mm256_unpackhi_pd(vdc23_02, vdc23_13);
4207 __m256d vdc4 = _mm256_set_pd(
4208 i + 3 < batch_size ? dcs[i + 3][4] : 0.0,
4209 i + 2 < batch_size ? dcs[i + 2][4] : 0.0,
4210 i + 1 < batch_size ? dcs[i + 1][4] : 0.0,
4214 __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2));
4215 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
4217 for (int32 j = 0; j < count_max; j += 4) {
4218 __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd();
4219 __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd();
4220 __m256d vsp0123_2 = i + 1 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
4221 __m256d vsp0123_3 = i + 1 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
4223 __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0);
4224 __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0);
4225 __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1);
4226 __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1);
4229 vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
4230 vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
4231 vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
4232 vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
4234 for (int k = 0; k < 4; k++) {
4235 __m256d vmask = _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
4237 vdb0 = _mm256_blendv_pd(vdb0, vsps[k], vmask);
4238 vdb2 = _mm256_blendv_pd(vdb2, MM256_FMA_PD(vdc0, vdb0, MM256_FMA4_PD(vdc1, vdb1, vdc2, vdb2, vdc3, vdb3, vdc4, vdb4)), vmask);
4241 vdb2 = _mm256_blendv_pd(vdb2, _mm256_add_pd(vdb2, _mm_set1_pd(denormal_add)), vmask);
4243 vdb4 = _mm256_blendv_pd(vdb4, vdb3, vmask);
4244 vdb3 = _mm256_blendv_pd(vdb3, vdb2, vmask);
4245 vdb2 = _mm256_blendv_pd(vdb2, vdb1, vmask);
4246 vdb1 = _mm256_blendv_pd(vdb1, vdb0, vmask);
4250 vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]);
4251 vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
4252 vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
4253 vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
4255 vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
4256 vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
4257 vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
4258 vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
4261 _mm256_storeu_pd(&sps[i][j], vsp0123_0);
4263 if (i + 1 < batch_size && j < counts[i + 1])
4264 _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
4266 if (i + 2 < batch_size && j < counts[i + 2])
4267 _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
4269 if (i + 3 < batch_size && j < counts[i + 3])
4270 _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
4273 vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1);
4274 vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1);
4275 vdb23_02 = _mm256_unpacklo_pd(vdb2, vdb3);
4276 vdb23_13 = _mm256_unpackhi_pd(vdb2, vdb3);
4278 vdb0123_0 = _mm256_permute2f128_pd(vdb01_02, vdb23_02, (2 << 4) | 0);
4279 vdb0123_1 = _mm256_permute2f128_pd(vdb01_13, vdb23_13, (2 << 4) | 0);
4280 vdb0123_2 = _mm256_permute2f128_pd(vdb01_02, vdb23_02, (3 << 4) | 1);
4281 vdb0123_3 = _mm256_permute2f128_pd(vdb01_13, vdb23_13, (3 << 4) | 1);
4283 _mm256_storeu_pd(&dbs[i][0], vdb0123_0);
4284 _mm_storel_pd(&dbs[i][4], _mm256_castpd256_pd128(vdb4));
4286 if (i + 1 < batch_size) {
4287 _mm256_storeu_pd(&dbs[i + 1][0], vdb0123_1);
4288 _mm_storeh_pd(&dbs[i + 1][4], _mm256_castpd256_pd128(vdb4));
4291 if (i + 2 < batch_size) {
4292 _mm256_storeu_pd(&dbs[i + 2][0], vdb0123_2);
4293 _mm_storel_pd(&dbs[i + 2][4], _mm256_extractf128_pd(vdb4, 1));
4296 if (i + 3 < batch_size) {
4297 _mm256_storeu_pd(&dbs[i + 3][0], vdb0123_3);
4298 _mm_storeh_pd(&dbs[i + 3][4], _mm256_extractf128_pd(vdb4, 1));
4303 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
4305 static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
4307 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
4308 if (i >= batch_size)
4311 __m128i vcounts = _mm_set_epi32(
4314 i + 1 < batch_size ? counts[i + 1] : 0,
4318 __m128d vdb01_0 = _mm_loadu_pd(&dbs[i][0]);
4319 __m128d vdb23_0 = _mm_loadu_pd(&dbs[i][2]);
4320 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(&dbs[i + 1][0]) : _mm_setzero_pd();
4321 __m128d vdb23_1 = i + 1 < batch_size ? _mm_loadu_pd(&dbs[i + 1][2]) : _mm_setzero_pd();
4323 __m128d vdb0 = _mm_unpacklo_pd(vdb01_0, vdb01_1);
4324 __m128d vdb1 = _mm_unpackhi_pd(vdb01_0, vdb01_1);
4325 __m128d vdb2 = _mm_unpacklo_pd(vdb23_0, vdb23_1);
4326 __m128d vdb3 = _mm_unpackhi_pd(vdb23_0, vdb23_1);
4327 __m128d vdb4 = _mm_set_pd(
4328 i + 1 < batch_size ? dbs[i + 1][4] : 0.0,
4332 __m128d vdc01_0 = _mm_loadu_pd(&dcs[i][0]);
4333 __m128d vdc23_0 = _mm_loadu_pd(&dcs[i][2]);
4334 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(&dcs[i + 1][0]) : _mm_setzero_pd();
4335 __m128d vdc23_1 = i + 1 < batch_size ? _mm_loadu_pd(&dcs[i + 1][2]) : _mm_setzero_pd();
4337 __m128d vdc0 = _mm_unpacklo_pd(vdc01_0, vdc01_1);
4338 __m128d vdc1 = _mm_unpackhi_pd(vdc01_0, vdc01_1);
4339 __m128d vdc2 = _mm_unpacklo_pd(vdc23_0, vdc23_1);
4340 __m128d vdc3 = _mm_unpackhi_pd(vdc23_0, vdc23_1);
4341 __m128d vdc4 = _mm_set_pd(
4342 i + 1 < batch_size ? dcs[i + 1][4] : 0.0,
4346 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, 1)));
4348 for (int32 j = 0; j < count_max; j += 2) {
4349 __m128d vsp01_0 = j < counts[i] ? _mm_loadu_pd(&sps[i][j]) : _mm_setzero_pd();
4350 __m128d vsp01_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm_loadu_pd(&sps[i + 1][j]) : _mm_setzero_pd();
4353 vsps[0] = _mm_unpacklo_pd(vsp01_0, vsp01_1);
4354 vsps[1] = _mm_unpackhi_pd(vsp01_0, vsp01_1);
4356 for (int k = 0; k < 2; k++) {
4357 __m128d vmask = _mm_castsi128_pd(_mm_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
4359 vdb0 = MM_BLENDV_PD(vdb0, vsps[k], vmask);
4360 vdb2 = MM_BLENDV_PD(vdb2, MM_FMA5_PD(vdc0, vdb0, vdc1, vdb1, vdc2, vdb2, vdc3, vdb3, vdc4, vdb4), vmask);
4363 vdb2 = MM_BLENDV_PD(vdb2, _mm_add_pd(vdb2, _mm_set1_pd(denormal_add)), vmask);
4365 vdb4 = MM_BLENDV_PD(vdb4, vdb3, vmask);
4366 vdb3 = MM_BLENDV_PD(vdb3, vdb2, vmask);
4367 vdb2 = MM_BLENDV_PD(vdb2, vdb1, vmask);
4368 vdb1 = MM_BLENDV_PD(vdb1, vdb0, vmask);
4372 vsp01_0 = _mm_unpacklo_pd(vsps[0], vsps[1]);
4373 vsp01_1 = _mm_unpackhi_pd(vsps[0], vsps[1]);
4376 _mm_storeu_pd(&sps[i][j], vsp01_0);
4378 if (i + 1 < batch_size && j < counts[i + 1])
4379 _mm_storeu_pd(&sps[i + 1][j], vsp01_1);
4382 vdb01_0 = _mm_unpacklo_pd(vdb0, vdb1);
4383 vdb01_1 = _mm_unpackhi_pd(vdb0, vdb1);
4384 vdb23_0 = _mm_unpacklo_pd(vdb2, vdb3);
4385 vdb23_1 = _mm_unpackhi_pd(vdb2, vdb3);
4387 _mm_storeu_pd(&dbs[i][0], vdb01_0);
4388 _mm_storeu_pd(&dbs[i][2], vdb23_0);
4389 _mm_storel_pd(&dbs[i][4], vdb4);
4391 if (i + 1 < batch_size) {
4392 _mm_storeu_pd(&dbs[i + 1][0], vdb01_1);
4393 _mm_storeu_pd(&dbs[i + 1][2], vdb23_1);
4394 _mm_storeh_pd(&dbs[i + 1][4], vdb4);
4401 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
4403 static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
4405 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
4406 if (i >= batch_size)
4409 __m256d vfcrange0123[8];
4410 vfcrange0123[0] = _mm256_loadu_pd(fcs[i]->range);
4412 for (int j = 1; j < 8; j++)
4413 vfcrange0123[j] = i + j < batch_size ? _mm256_loadu_pd(fcs[i + j]->range) : _mm256_setzero_pd();
4415 __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[0]), vfcrange0123[2], 1);
4416 __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[1]), vfcrange0123[3], 1);
4417 __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[4]), vfcrange0123[6], 1);
4418 __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123[5]), vfcrange0123[7], 1);
4420 __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4421 __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4422 __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4423 __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4425 __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
4426 __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
4427 __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
4428 __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
4430 __m512d vfcfreq = _mm512_set_pd(
4431 i + 7 < batch_size ? fcs[i + 7]->freq : 0.0,
4432 i + 6 < batch_size ? fcs[i + 6]->freq : 0.0,
4433 i + 5 < batch_size ? fcs[i + 5]->freq : 0.0,
4434 i + 4 < batch_size ? fcs[i + 4]->freq : 0.0,
4435 i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
4436 i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
4437 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
4441 __m512d vfcreso_DB = _mm512_set_pd(
4442 i + 7 < batch_size ? fcs[i + 7]->reso_dB : 0.0,
4443 i + 6 < batch_size ? fcs[i + 6]->reso_dB : 0.0,
4444 i + 5 < batch_size ? fcs[i + 5]->reso_dB : 0.0,
4445 i + 4 < batch_size ? fcs[i + 4]->reso_dB : 0.0,
4446 i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
4447 i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
4448 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
4452 uint8 imask = _kor_mask8(
4453 _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
4454 _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
4457 if (batch_size - i < 8)
4458 imask &= (1 << (batch_size - i)) - 1;
4461 __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
4462 __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
4464 vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
4465 vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
4466 vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
4467 vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
4469 vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
4470 vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
4471 vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
4472 vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
4475 __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
4476 __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
4477 __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
4478 __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
4480 __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
4481 __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
4482 __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
4483 __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
4487 _mm256_storeu_pd(fcs[i]->range, _mm512_castpd512_pd256(vfcrange0123_04));
4489 if (imask & (1 << 1))
4490 _mm256_storeu_pd(fcs[i + 1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
4492 if (imask & (1 << 2))
4493 _mm256_storeu_pd(fcs[i + 2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
4495 if (imask & (1 << 3))
4496 _mm256_storeu_pd(fcs[i + 3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
4498 if (imask & (1 << 4))
4499 _mm256_storeu_pd(fcs[i + 4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
4501 if (imask & (1 << 5))
4502 _mm256_storeu_pd(fcs[i + 5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
4504 if (imask & (1 << 6))
4505 _mm256_storeu_pd(fcs[i + 6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
4507 if (imask & (1 << 7))
4508 _mm256_storeu_pd(fcs[i + 7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
4510 __m512d vfcdiv_flt_rate = _mm512_set_pd(
4511 i + 7 < batch_size ? fcs[i + 7]->div_flt_rate : fcs[i]->div_flt_rate,
4512 i + 6 < batch_size ? fcs[i + 6]->div_flt_rate : fcs[i]->div_flt_rate,
4513 i + 5 < batch_size ? fcs[i + 5]->div_flt_rate : fcs[i]->div_flt_rate,
4514 i + 4 < batch_size ? fcs[i + 4]->div_flt_rate : fcs[i]->div_flt_rate,
4515 i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
4516 i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
4517 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
4518 fcs[i]->div_flt_rate
4521 __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI), vfcfreq), vfcdiv_flt_rate);
4524 __m512d vtanf = _mm512_tan_pd(vf);
4526 ALIGN FLOAT_T af[8];
4527 _mm512_storeu_pd(af, vf);
4528 __m512d vtanf = _mm512_set_pd(tan(af[7]), tan(af[6]), tan(af[5]), tan(af[4]), tan(af[3]), tan(af[2]), tan(af[1]), tan(af[0]));
4531 __m512d v1 = _mm512_set1_pd(1.0);
4532 __m512d v2 = _mm512_set1_pd(2.0);
4533 __m512d vp = _mm512_div_pd(v1, vtanf);
4535 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
4537 __m512d vreso_db_cf_p = _mm512_set_pd(
4538 i + 7 < batch_size ? RESO_DB_CF_P(fcs[i + 7]->reso_dB) : reso_db_cf_p,
4539 i + 6 < batch_size ? RESO_DB_CF_P(fcs[i + 6]->reso_dB) : reso_db_cf_p,
4540 i + 5 < batch_size ? RESO_DB_CF_P(fcs[i + 5]->reso_dB) : reso_db_cf_p,
4541 i + 4 < batch_size ? RESO_DB_CF_P(fcs[i + 4]->reso_dB) : reso_db_cf_p,
4542 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
4543 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
4544 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
4548 __m512d vq = _mm512_mul_pd(vreso_db_cf_p, _mm512_set1_pd(SQRT_2));
4549 __m512d vp2 = _mm512_mul_pd(vp, vp);
4550 __m512d vqp = _mm512_mul_pd(vq, vp);
4551 __m512d vdc0 = _mm512_div_pd(v1, _mm512_add_pd(_mm512_add_pd(v1, vqp), vp2));
4552 __m512d vdc1 = _mm512_mul_pd(v2, vdc0);
4553 __m512d vdc2 = vdc0;
4554 __m512d vdc3 = _mm512_mul_pd(_mm512_mul_pd(v2, _mm512_sub_pd(vp2, v1)), vdc0);
4555 __m512d vdc4 = _mm512_mul_pd(_mm512_sub_pd(_mm512_sub_pd(vqp, v1), vp2), vdc0);
4557 __m512d vdc01_0246 = _mm512_unpacklo_pd(vdc0, vdc1);
4558 __m512d vdc01_1357 = _mm512_unpackhi_pd(vdc0, vdc1);
4559 __m512d vdc23_0246 = _mm512_unpacklo_pd(vdc2, vdc3);
4560 __m512d vdc23_1357 = _mm512_unpackhi_pd(vdc2, vdc3);
4562 __m512d vdc0123_04 = _mm512_permutex2var_pd(vdc01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdc23_0246);
4563 __m512d vdc0123_26 = _mm512_permutex2var_pd(vdc01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdc23_0246);
4564 __m512d vdc0123_15 = _mm512_permutex2var_pd(vdc01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vdc23_1357);
4565 __m512d vdc0123_37 = _mm512_permutex2var_pd(vdc01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdc23_1357);
4568 _mm256_storeu_pd(&fcs[i]->dc[0], _mm512_castpd512_pd256(vdc0123_04));
4569 _mm_storel_pd(&fcs[i]->dc[4], _mm512_castpd512_pd128(vdc4));
4572 if (imask & (1 << 1)) {
4573 _mm256_storeu_pd(&fcs[i + 1]->dc[0], _mm512_castpd512_pd256(vdc0123_15));
4574 _mm_storeh_pd(&fcs[i + 1]->dc[4], _mm512_castpd512_pd128(vdc4));
4577 if (imask & (1 << 2)) {
4578 _mm256_storeu_pd(&fcs[i + 2]->dc[0], _mm512_castpd512_pd256(vdc0123_26));
4579 _mm_storel_pd(&fcs[i + 2]->dc[4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc4), 1));
4582 if (imask & (1 << 3)) {
4583 _mm256_storeu_pd(&fcs[i + 3]->dc[0], _mm512_castpd512_pd256(vdc0123_37));
4584 _mm_storeh_pd(&fcs[i + 3]->dc[4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc4), 1));
4587 if (imask & (1 << 4)) {
4588 _mm256_storeu_pd(&fcs[i + 4]->dc[0], _mm512_extractf64x4_pd(vdc0123_04, 1));
4589 _mm_storel_pd(&fcs[i + 4]->dc[4], _mm512_extractf64x2_pd(vdc4, 2));
4592 if (imask & (1 << 5)) {
4593 _mm256_storeu_pd(&fcs[i + 5]->dc[0], _mm512_extractf64x4_pd(vdc0123_15, 1));
4594 _mm_storeh_pd(&fcs[i + 5]->dc[4], _mm512_extractf64x2_pd(vdc4, 2));
4597 if (imask & (1 << 6)) {
4598 _mm256_storeu_pd(&fcs[i + 6]->dc[0], _mm512_extractf64x4_pd(vdc0123_26, 1));
4599 _mm_storel_pd(&fcs[i + 6]->dc[4], _mm512_extractf64x2_pd(vdc4, 3));
4602 if (imask & (1 << 7)) {
4603 _mm256_storeu_pd(&fcs[i + 7]->dc[0], _mm512_extractf64x4_pd(vdc0123_37, 1));
4604 _mm_storeh_pd(&fcs[i + 7]->dc[4], _mm512_extractf64x2_pd(vdc4, 3));
4610 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
4612 static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
4614 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
4615 if (i >= batch_size)
4618 __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
4619 __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
4620 __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
4621 __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
4623 __m256d vfcrange01_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (2 << 4) | 0);
4624 __m256d vfcrange01_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (2 << 4) | 0);
4625 __m256d vfcrange23_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (3 << 4) | 1);
4626 __m256d vfcrange23_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (3 << 4) | 1);
4628 __m256d vfcrange0 = _mm256_unpacklo_pd(vfcrange01_02, vfcrange01_13);
4629 __m256d vfcrange1 = _mm256_unpackhi_pd(vfcrange01_02, vfcrange01_13);
4630 __m256d vfcrange2 = _mm256_unpacklo_pd(vfcrange23_02, vfcrange23_13);
4631 __m256d vfcrange3 = _mm256_unpackhi_pd(vfcrange23_02, vfcrange23_13);
4633 __m256d vfcfreq = _mm256_set_pd(
4634 i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
4635 i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
4636 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
4640 __m256d vfcreso_DB = _mm256_set_pd(
4641 i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
4642 i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
4643 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
4647 __m256d vmask = _mm256_or_pd(
4648 _mm256_or_pd(_mm256_cmp_pd(vfcfreq, vfcrange0, _CMP_LT_OS), _mm256_cmp_pd(vfcfreq, vfcrange1, _CMP_GT_OS)),
4649 _mm256_or_pd(_mm256_cmp_pd(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm256_cmp_pd(vfcreso_DB, vfcrange3, _CMP_GT_OS))
4652 int imask = _mm256_movemask_pd(vmask);
4654 if (batch_size - i < 4)
4655 imask &= (1 << (batch_size - i)) - 1;
4658 __m256d v1mmargin = _mm256_set1_pd(1.0 - ext_filter_margin);
4659 __m256d v1pmargin = _mm256_set1_pd(1.0 + ext_filter_margin);
4661 vfcrange0 = _mm256_mul_pd(vfcfreq, v1mmargin);
4662 vfcrange1 = _mm256_mul_pd(vfcfreq, v1pmargin);
4663 vfcrange2 = _mm256_mul_pd(vfcreso_DB, v1mmargin);
4664 vfcrange3 = _mm256_mul_pd(vfcreso_DB, v1pmargin);
4666 vfcrange01_02 = _mm256_unpacklo_pd(vfcrange0, vfcrange1);
4667 vfcrange01_13 = _mm256_unpackhi_pd(vfcrange0, vfcrange1);
4668 vfcrange23_02 = _mm256_unpacklo_pd(vfcrange2, vfcrange3);
4669 vfcrange23_13 = _mm256_unpackhi_pd(vfcrange2, vfcrange3);
4671 vfcrange0123_0 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (2 << 4) | 0);
4672 vfcrange0123_1 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (2 << 4) | 0);
4673 vfcrange0123_2 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (3 << 4) | 1);
4674 vfcrange0123_3 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (3 << 4) | 1);
4677 _mm256_storeu_pd(fcs[i]->range, vfcrange0123_0);
4679 if (imask & (1 << 1))
4680 _mm256_storeu_pd(fcs[i + 1]->range, vfcrange0123_1);
4682 if (imask & (1 << 2))
4683 _mm256_storeu_pd(fcs[i + 2]->range, vfcrange0123_2);
4685 if (imask & (1 << 3))
4686 _mm256_storeu_pd(fcs[i + 3]->range, vfcrange0123_3);
4688 __m256d vfcdiv_flt_rate = _mm256_set_pd(
4689 i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
4690 i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
4691 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
4692 fcs[i]->div_flt_rate
4695 __m256d vf = _mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(M_PI), vfcfreq), vfcdiv_flt_rate);
4698 __m256d vtanf = _mm256_tan_pd(vf);
4700 ALIGN FLOAT_T af[4];
4701 _mm256_storeu_pd(af, vf);
4702 __m256d vtanf = _mm256_set_pd(tan(af[3]), tan(af[2]), tan(af[1]), tan(af[0]));
4705 __m256d v1 = _mm256_set1_pd(1.0);
4706 __m256d v2 = _mm256_set1_pd(2.0);
4707 __m256d vp = _mm256_div_pd(v1, vtanf);
4709 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
4711 __m256d vreso_db_cf_p = _mm256_set_pd(
4712 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
4713 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
4714 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
4718 __m256d vq = _mm256_mul_pd(vreso_db_cf_p, _mm256_set1_pd(SQRT_2));
4719 __m256d vp2 = _mm256_mul_pd(vp, vp);
4720 __m256d vqp = _mm256_mul_pd(vq, vp);
4721 __m256d vdc0 = _mm256_div_pd(v1, _mm256_add_pd(_mm256_add_pd(v1, vqp), vp2));
4722 __m256d vdc1 = _mm256_mul_pd(v2, vdc0);
4723 __m256d vdc2 = vdc0;
4724 __m256d vdc3 = _mm256_mul_pd(_mm256_mul_pd(v2, _mm256_sub_pd(vp2, v1)), vdc0);
4725 __m256d vdc4 = _mm256_mul_pd(_mm256_sub_pd(_mm256_sub_pd(vqp, v1), vp2), vdc0);
4727 __m256d vdc01_02 = _mm256_unpacklo_pd(vdc0, vdc1);
4728 __m256d vdc01_13 = _mm256_unpackhi_pd(vdc0, vdc1);
4729 __m256d vdc23_02 = _mm256_unpacklo_pd(vdc2, vdc3);
4730 __m256d vdc23_13 = _mm256_unpackhi_pd(vdc2, vdc3);
4732 __m256d vdc0123_0 = _mm256_permute2f128_pd(vdc01_02, vdc23_02, (2 << 4) | 0);
4733 __m256d vdc0123_1 = _mm256_permute2f128_pd(vdc01_13, vdc23_13, (2 << 4) | 0);
4734 __m256d vdc0123_2 = _mm256_permute2f128_pd(vdc01_02, vdc23_02, (3 << 4) | 1);
4735 __m256d vdc0123_3 = _mm256_permute2f128_pd(vdc01_13, vdc23_13, (3 << 4) | 1);
4738 _mm256_storeu_pd(&fcs[i]->dc[0], vdc0123_0);
4739 _mm_storel_pd(&fcs[i]->dc[4], _mm256_castpd256_pd128(vdc4));
4742 if (imask & (1 << 1)) {
4743 _mm256_storeu_pd(&fcs[i + 1]->dc[0], vdc0123_1);
4744 _mm_storeh_pd(&fcs[i + 1]->dc[4], _mm256_castpd256_pd128(vdc4));
4747 if (imask & (1 << 2)) {
4748 _mm256_storeu_pd(&fcs[i + 2]->dc[0], vdc0123_2);
4749 _mm_storel_pd(&fcs[i + 2]->dc[4], _mm256_extractf128_pd(vdc4, 1));
4752 if (imask & (1 << 3)) {
4753 _mm256_storeu_pd(&fcs[i + 3]->dc[0], vdc0123_3);
4754 _mm_storeh_pd(&fcs[i + 3]->dc[4], _mm256_extractf128_pd(vdc4, 1));
4760 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
4762 static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs)
4764 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
4765 if (i >= batch_size)
4768 __m128d vfcrange01_0 = _mm_loadu_pd(fcs[i]->range);
4769 __m128d vfcrange23_0 = _mm_loadu_pd(&fcs[i]->range[2]);
4770 __m128d vfcrange01_1 = i + 1 < batch_size ? _mm_loadu_pd(fcs[i + 1]->range) : _mm_setzero_pd();
4771 __m128d vfcrange23_1 = i + 1 < batch_size ? _mm_loadu_pd(&fcs[i + 1]->range[2]) : _mm_setzero_pd();
4773 __m128d vfcrange0 = _mm_unpacklo_pd(vfcrange01_0, vfcrange01_1);
4774 __m128d vfcrange1 = _mm_unpackhi_pd(vfcrange01_0, vfcrange01_1);
4775 __m128d vfcrange2 = _mm_unpacklo_pd(vfcrange23_0, vfcrange23_1);
4776 __m128d vfcrange3 = _mm_unpackhi_pd(vfcrange23_0, vfcrange23_1);
4778 __m128d vfcfreq = _mm_set_pd(
4779 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
4783 __m128d vfcreso_DB = _mm_set_pd(
4784 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
4788 __m128d vmask = _mm_or_pd(
4789 _mm_or_pd(_mm_cmplt_pd(vfcfreq, vfcrange0), _mm_cmpgt_pd(vfcfreq, vfcrange1)),
4790 _mm_or_pd(_mm_cmplt_pd(vfcreso_DB, vfcrange2), _mm_cmpgt_pd(vfcreso_DB, vfcrange3))
4793 int imask = _mm_movemask_pd(vmask);
4795 if (batch_size - i < 2)
4796 imask &= (1 << (batch_size - i)) - 1;
4799 __m128d v1mmargin = _mm_set1_pd(1.0 - ext_filter_margin);
4800 __m128d v1pmargin = _mm_set1_pd(1.0 + ext_filter_margin);
4802 vfcrange0 = _mm_mul_pd(vfcfreq, v1mmargin);
4803 vfcrange1 = _mm_mul_pd(vfcfreq, v1pmargin);
4804 vfcrange2 = _mm_mul_pd(vfcreso_DB, v1mmargin);
4805 vfcrange3 = _mm_mul_pd(vfcreso_DB, v1pmargin);
4807 vfcrange01_0 = _mm_unpacklo_pd(vfcrange0, vfcrange1);
4808 vfcrange01_1 = _mm_unpackhi_pd(vfcrange0, vfcrange1);
4809 vfcrange23_0 = _mm_unpacklo_pd(vfcrange2, vfcrange3);
4810 vfcrange23_1 = _mm_unpackhi_pd(vfcrange2, vfcrange3);
4813 _mm_storeu_pd(fcs[i]->range, vfcrange01_0);
4814 _mm_storeu_pd(&fcs[i]->range[2], vfcrange23_0);
4817 if (imask & (1 << 1)) {
4818 _mm_storeu_pd(fcs[i + 1]->range, vfcrange01_1);
4819 _mm_storeu_pd(&fcs[i + 1]->range[2], vfcrange23_1);
4822 __m128d vfcdiv_flt_rate = _mm_set_pd(
4823 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
4824 fcs[i]->div_flt_rate
4827 __m128d vf = _mm_mul_pd(_mm_mul_pd(_mm_set1_pd(M_PI), vfcfreq), vfcdiv_flt_rate);
4830 __m128d vtanf = _mm_tan_pd(vf);
4832 ALIGN FLOAT_T af[2];
4833 _mm_storeu_pd(af, vf);
4834 __m128d vtanf = _mm_set_pd(tan(af[1]), tan(af[0]));
4837 __m128d v1 = _mm_set1_pd(1.0);
4838 __m128d v2 = _mm_set1_pd(2.0);
4839 __m128d vp = _mm_div_pd(v1, vtanf);
4841 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
4843 __m128d vreso_db_cf_p = _mm_set_pd(
4844 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
4848 __m128d vq = _mm_mul_pd(vreso_db_cf_p, _mm_set1_pd(SQRT_2));
4849 __m128d vp2 = _mm_mul_pd(vp, vp);
4850 __m128d vqp = _mm_mul_pd(vq, vp);
4851 __m128d vdc0 = _mm_div_pd(v1, _mm_add_pd(_mm_add_pd(v1, vqp), vp2));
4852 __m128d vdc1 = _mm_mul_pd(v2, vdc0);
4853 __m128d vdc2 = vdc0;
4854 __m128d vdc3 = _mm_mul_pd(_mm_mul_pd(v2, _mm_sub_pd(vp2, v1)), vdc0);
4855 __m128d vdc4 = _mm_mul_pd(_mm_sub_pd(_mm_sub_pd(vqp, v1), vp2), vdc0);
4857 __m128d vdc01_0 = _mm_unpacklo_pd(vdc0, vdc1);
4858 __m128d vdc01_1 = _mm_unpackhi_pd(vdc0, vdc1);
4859 __m128d vdc23_0 = _mm_unpacklo_pd(vdc2, vdc3);
4860 __m128d vdc23_1 = _mm_unpackhi_pd(vdc2, vdc3);
4863 _mm_storeu_pd(&fcs[i]->dc[0], vdc01_0);
4864 _mm_storeu_pd(&fcs[i]->dc[2], vdc23_0);
4865 _mm_storel_pd(&fcs[i]->dc[4], vdc4);
4868 if (imask & (1 << 1)) {
4869 _mm_storeu_pd(&fcs[i + 1]->dc[0], vdc01_1);
4870 _mm_storeu_pd(&fcs[i + 1]->dc[2], vdc23_1);
4871 _mm_storeh_pd(&fcs[i + 1]->dc[4], vdc4);
4879 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
4881 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
4883 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
4884 if (i >= batch_size)
4887 __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
4889 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
4890 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
4891 __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
4892 __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
4893 __m128d vdb01_4 = i + 4 < batch_size ? _mm_loadu_pd(dbs[i + 4]) : _mm_setzero_pd();
4894 __m128d vdb01_5 = i + 5 < batch_size ? _mm_loadu_pd(dbs[i + 5]) : _mm_setzero_pd();
4895 __m128d vdb01_6 = i + 6 < batch_size ? _mm_loadu_pd(dbs[i + 6]) : _mm_setzero_pd();
4896 __m128d vdb01_7 = i + 7 < batch_size ? _mm_loadu_pd(dbs[i + 7]) : _mm_setzero_pd();
4898 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
4899 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
4900 __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
4901 __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
4903 __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
4904 __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
4906 __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
4907 __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
4909 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
4910 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
4911 __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
4912 __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
4913 __m128d vdc01_4 = i + 4 < batch_size ? _mm_loadu_pd(dcs[i + 4]) : _mm_setzero_pd();
4914 __m128d vdc01_5 = i + 5 < batch_size ? _mm_loadu_pd(dcs[i + 5]) : _mm_setzero_pd();
4915 __m128d vdc01_6 = i + 6 < batch_size ? _mm_loadu_pd(dcs[i + 6]) : _mm_setzero_pd();
4916 __m128d vdc01_7 = i + 7 < batch_size ? _mm_loadu_pd(dcs[i + 7]) : _mm_setzero_pd();
4918 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
4919 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
4920 __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
4921 __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
4923 __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
4924 __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
4926 __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
4927 __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
4929 __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
4930 vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
4931 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
4933 for (int32 j = 0; j < count_max; j += 8) {
4935 vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
4937 for (int k = 1; k < 8; k++)
4938 vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, & sps[i + k][j]);
4940 __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
4941 __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
4942 __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
4943 __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
4944 __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
4945 __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
4946 __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
4947 __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
4949 __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4950 __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4951 __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4952 __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4953 __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4954 __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4955 __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4956 __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4959 vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4960 vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4961 vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4962 vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4963 vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4964 vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4965 vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4966 vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4968 for (int k = 0; k < 8; k++) {
4969 __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
4971 vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
4972 vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
4973 vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
4977 __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
4978 __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
4979 __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
4980 __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
4981 __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
4982 __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
4983 __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
4984 __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
4986 __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4987 __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4988 __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4989 __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4990 __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4991 __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4992 __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4993 __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4996 vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4997 vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
4998 vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
4999 vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5000 vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5001 vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5002 vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5003 vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5005 for (int k = 0; k < 8; k++)
5006 _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
5009 vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
5010 vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
5012 _mm_storeu_pd(dbs[i], _mm512_castpd512_pd128(vdb01_0246));
5014 if (i + 1 < batch_size)
5015 _mm_storeu_pd(dbs[i + 1], _mm512_castpd512_pd128(vdb01_1357));
5017 if (i + 2 < batch_size)
5018 _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
5020 if (i + 3 < batch_size)
5021 _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
5023 if (i + 4 < batch_size)
5024 _mm_storeu_pd(dbs[i + 4], _mm512_extractf64x2_pd(vdb01_0246, 2));
5026 if (i + 5 < batch_size)
5027 _mm_storeu_pd(dbs[i + 5], _mm512_extractf64x2_pd(vdb01_1357, 2));
5029 if (i + 6 < batch_size)
5030 _mm_storeu_pd(dbs[i + 6], _mm512_extractf64x2_pd(vdb01_0246, 3));
5032 if (i + 7 < batch_size)
5033 _mm_storeu_pd(dbs[i + 7], _mm512_extractf64x2_pd(vdb01_1357, 3));
5037 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5039 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
5041 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
5042 if (i >= batch_size)
5045 __m128i vcounts = _mm_set_epi32(
5046 i + 3 < batch_size ? counts[i + 3] : 0,
5047 i + 2 < batch_size ? counts[i + 2] : 0,
5048 i + 1 < batch_size ? counts[i + 1] : 0,
5052 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5053 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5054 __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
5055 __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
5057 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
5058 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
5060 __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13);
5061 __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13);
5063 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5064 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5065 __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
5066 __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
5068 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
5069 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
5071 __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13);
5072 __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13);
5074 __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2));
5075 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
5077 for (int32 j = 0; j < count_max; j += 4) {
5078 __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd();
5079 __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd();
5080 __m256d vsp0123_2 = i + 2 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
5081 __m256d vsp0123_3 = i + 3 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
5083 __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0);
5084 __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0);
5085 __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1);
5086 __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1);
5089 vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
5090 vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
5091 vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
5092 vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
5094 for (int k = 0; k < 4; k++) {
5095 __m256d vmask = _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
5097 vdb1 = _mm256_blendv_pd(vdb1, MM256_FMA_PD(_mm256_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5098 vdb0 = _mm256_blendv_pd(vdb0, _mm256_add_pd(vdb0, vdb1), vmask);
5099 vdb1 = _mm256_blendv_pd(vdb1, _mm256_mul_pd(vdb1, vdc0), vmask);
5103 vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]);
5104 vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
5105 vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
5106 vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
5108 vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
5109 vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
5110 vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
5111 vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
5114 _mm256_storeu_pd(&sps[i][j], vsp0123_0);
5116 if (i + 1 < batch_size && j < counts[i + 1])
5117 _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
5119 if (i + 2 < batch_size && j < counts[i + 2])
5120 _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
5122 if (i + 3 < batch_size && j < counts[i + 3])
5123 _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
5126 vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1);
5127 vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1);
5129 _mm_storeu_pd(dbs[i], _mm256_castpd256_pd128(vdb01_02));
5131 if (i + 1 < batch_size)
5132 _mm_storeu_pd(dbs[i + 1], _mm256_castpd256_pd128(vdb01_13));
5134 if (i + 2 < batch_size)
5135 _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(vdb01_02, 1));
5137 if (i + 3 < batch_size)
5138 _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(vdb01_13, 1));
5142 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5144 static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
5146 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
5147 if (i >= batch_size)
5150 __m128i vcounts = _mm_set_epi32(
5153 i + 1 < batch_size ? counts[i + 1] : 0,
5157 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5158 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5160 __m128d vdb0 = _mm_unpacklo_pd(vdb01_0, vdb01_1);
5161 __m128d vdb1 = _mm_unpackhi_pd(vdb01_0, vdb01_1);
5163 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5164 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5166 __m128d vdc0 = _mm_unpacklo_pd(vdc01_0, vdc01_1);
5167 __m128d vdc1 = _mm_unpackhi_pd(vdc01_0, vdc01_1);
5169 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, 1)));
5171 for (int32 j = 0; j < count_max; j += 2) {
5172 __m128d vsp01_0 = j < counts[i] ? _mm_loadu_pd(&sps[i][j]) : _mm_setzero_pd();
5173 __m128d vsp01_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm_loadu_pd(&sps[i + 1][j]) : _mm_setzero_pd();
5176 vsps[0] = _mm_unpacklo_pd(vsp01_0, vsp01_1);
5177 vsps[1] = _mm_unpackhi_pd(vsp01_0, vsp01_1);
5179 for (int k = 0; k < 2; k++) {
5180 __m128d vmask = _mm_castsi128_pd(_mm_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
5182 vdb1 = MM_BLENDV_PD(vdb1, MM_FMA_PD(_mm_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5183 vdb0 = MM_BLENDV_PD(vdb0, _mm_add_pd(vdb0, vdb1), vmask);
5184 vdb1 = MM_BLENDV_PD(vdb1, _mm_mul_pd(vdb1, vdc0), vmask);
5188 vsp01_0 = _mm_unpacklo_pd(vsps[0], vsps[1]);
5189 vsp01_1 = _mm_unpackhi_pd(vsps[0], vsps[1]);
5192 _mm_storeu_pd(&sps[i][j], vsp01_0);
5194 if (i + 1 < batch_size && j < counts[i + 1])
5195 _mm_storeu_pd(&sps[i + 1][j], vsp01_1);
5198 vdb01_0 = _mm_unpacklo_pd(vdb0, vdb1);
5199 vdb01_1 = _mm_unpackhi_pd(vdb0, vdb1);
5201 _mm_storeu_pd(dbs[i], vdb01_0);
5203 if (i + 1 < batch_size)
5204 _mm_storeu_pd(dbs[i + 1], vdb01_1);
5210 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5212 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients **fcs)
5214 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
5215 if (i >= batch_size)
5218 __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
5219 __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
5220 __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
5221 __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
5222 __m256d vfcrange0123_4 = i + 4 < batch_size ? _mm256_loadu_pd(fcs[i + 4]->range) : _mm256_setzero_pd();
5223 __m256d vfcrange0123_5 = i + 5 < batch_size ? _mm256_loadu_pd(fcs[i + 5]->range) : _mm256_setzero_pd();
5224 __m256d vfcrange0123_6 = i + 6 < batch_size ? _mm256_loadu_pd(fcs[i + 6]->range) : _mm256_setzero_pd();
5225 __m256d vfcrange0123_7 = i + 7 < batch_size ? _mm256_loadu_pd(fcs[i + 7]->range) : _mm256_setzero_pd();
5227 __m512d vfcrange0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_0), vfcrange0123_2, 1);
5228 __m512d vfcrange0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_1), vfcrange0123_3, 1);
5229 __m512d vfcrange0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_4), vfcrange0123_6, 1);
5230 __m512d vfcrange0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512(vfcrange0123_5), vfcrange0123_7, 1);
5232 __m512d vfcrange01_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5233 __m512d vfcrange01_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5234 __m512d vfcrange23_0246 = _mm512_shuffle_f64x2(vfcrange0123_02, vfcrange0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5235 __m512d vfcrange23_1357 = _mm512_shuffle_f64x2(vfcrange0123_13, vfcrange0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5237 __m512d vfcrange0 = _mm512_unpacklo_pd(vfcrange01_0246, vfcrange01_1357);
5238 __m512d vfcrange1 = _mm512_unpackhi_pd(vfcrange01_0246, vfcrange01_1357);
5239 __m512d vfcrange2 = _mm512_unpacklo_pd(vfcrange23_0246, vfcrange23_1357);
5240 __m512d vfcrange3 = _mm512_unpackhi_pd(vfcrange23_0246, vfcrange23_1357);
5242 __m512d vfcfreq = _mm512_set_pd(
5243 i + 7 < batch_size ? fcs[i + 7]->freq : 0.0,
5244 i + 6 < batch_size ? fcs[i + 6]->freq : 0.0,
5245 i + 5 < batch_size ? fcs[i + 5]->freq : 0.0,
5246 i + 4 < batch_size ? fcs[i + 4]->freq : 0.0,
5247 i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
5248 i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
5249 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
5253 __m512d vfcreso_DB = _mm512_set_pd(
5254 i + 7 < batch_size ? fcs[i + 7]->reso_dB : 0.0,
5255 i + 6 < batch_size ? fcs[i + 6]->reso_dB : 0.0,
5256 i + 5 < batch_size ? fcs[i + 5]->reso_dB : 0.0,
5257 i + 4 < batch_size ? fcs[i + 4]->reso_dB : 0.0,
5258 i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
5259 i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
5260 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
5264 uint8 imask = _kor_mask8(
5265 _kor_mask8(_mm512_cmp_pd_mask(vfcfreq, vfcrange0, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcfreq, vfcrange1, _CMP_GT_OS)),
5266 _kor_mask8(_mm512_cmp_pd_mask(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm512_cmp_pd_mask(vfcreso_DB, vfcrange3, _CMP_GT_OS))
5269 if (batch_size - i < 8)
5270 imask &= (1 << (batch_size - i)) - 1;
5273 __m512d v1mmargin = _mm512_set1_pd(1.0 - ext_filter_margin);
5274 __m512d v1pmargin = _mm512_set1_pd(1.0 + ext_filter_margin);
5276 vfcrange0 = _mm512_mul_pd(vfcfreq, v1mmargin);
5277 vfcrange1 = _mm512_mul_pd(vfcfreq, v1pmargin);
5278 vfcrange2 = _mm512_mul_pd(vfcreso_DB, v1mmargin);
5279 vfcrange3 = _mm512_mul_pd(vfcreso_DB, v1pmargin);
5281 vfcrange01_0246 = _mm512_unpacklo_pd(vfcrange0, vfcrange1);
5282 vfcrange01_1357 = _mm512_unpackhi_pd(vfcrange0, vfcrange1);
5283 vfcrange23_0246 = _mm512_unpacklo_pd(vfcrange2, vfcrange3);
5284 vfcrange23_1357 = _mm512_unpackhi_pd(vfcrange2, vfcrange3);
5287 __m512d vfcrange0123_04 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_0246);
5288 __m512d vfcrange0123_26 = _mm512_permutex2var_pd(vfcrange01_0246, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_0246);
5289 __m512d vfcrange0123_15 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0), vfcrange23_1357);
5290 __m512d vfcrange0123_37 = _mm512_permutex2var_pd(vfcrange01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vfcrange23_1357);
5292 __m512d vfcrange0123_04 = _mm512_mask_permutex_pd(vfcrange01_0246, 0xCC, vfcrange23_0246, (1 << 6) | (0 << 4) | 0);
5293 __m512d vfcrange0123_26 = _mm512_mask_permutex_pd(vfcrange01_0246, 0x33, vfcrange23_0246, (3 << 2) | 2);
5294 __m512d vfcrange0123_15 = _mm512_mask_permutex_pd(vfcrange01_1357, 0xCC, vfcrange23_1357, (1 << 6) | (0 << 4) | 0);
5295 __m512d vfcrange0123_37 = _mm512_mask_permutex_pd(vfcrange01_1357, 0x33, vfcrange23_1357, (3 << 2) | 2);
5299 _mm256_storeu_pd(fcs[i]->range, _mm512_castpd512_pd256(vfcrange0123_04));
5301 if (imask & (1 << 1))
5302 _mm256_storeu_pd(fcs[i + 1]->range, _mm512_castpd512_pd256(vfcrange0123_15));
5304 if (imask & (1 << 2))
5305 _mm256_storeu_pd(fcs[i + 2]->range, _mm512_castpd512_pd256(vfcrange0123_26));
5307 if (imask & (1 << 3))
5308 _mm256_storeu_pd(fcs[i + 3]->range, _mm512_castpd512_pd256(vfcrange0123_37));
5310 if (imask & (1 << 4))
5311 _mm256_storeu_pd(fcs[i + 4]->range, _mm512_extractf64x4_pd(vfcrange0123_04, 1));
5313 if (imask & (1 << 5))
5314 _mm256_storeu_pd(fcs[i + 5]->range, _mm512_extractf64x4_pd(vfcrange0123_15, 1));
5316 if (imask & (1 << 6))
5317 _mm256_storeu_pd(fcs[i + 6]->range, _mm512_extractf64x4_pd(vfcrange0123_26, 1));
5319 if (imask & (1 << 7))
5320 _mm256_storeu_pd(fcs[i + 7]->range, _mm512_extractf64x4_pd(vfcrange0123_37, 1));
5322 __m512d vfcdiv_flt_rate = _mm512_set_pd(
5323 i + 7 < batch_size ? fcs[i + 7]->div_flt_rate : fcs[i]->div_flt_rate,
5324 i + 6 < batch_size ? fcs[i + 6]->div_flt_rate : fcs[i]->div_flt_rate,
5325 i + 5 < batch_size ? fcs[i + 5]->div_flt_rate : fcs[i]->div_flt_rate,
5326 i + 4 < batch_size ? fcs[i + 4]->div_flt_rate : fcs[i]->div_flt_rate,
5327 i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
5328 i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
5329 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
5330 fcs[i]->div_flt_rate
5333 __m512d vf = _mm512_mul_pd(_mm512_mul_pd(_mm512_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
5335 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
5337 __m512d vreso_db_cf_p = _mm512_set_pd(
5338 i + 7 < batch_size ? RESO_DB_CF_P(fcs[i + 7]->reso_dB) : reso_db_cf_p,
5339 i + 6 < batch_size ? RESO_DB_CF_P(fcs[i + 6]->reso_dB) : reso_db_cf_p,
5340 i + 5 < batch_size ? RESO_DB_CF_P(fcs[i + 5]->reso_dB) : reso_db_cf_p,
5341 i + 4 < batch_size ? RESO_DB_CF_P(fcs[i + 4]->reso_dB) : reso_db_cf_p,
5342 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
5343 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
5344 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
5348 __m512d v1 = _mm512_set1_pd(1.0);
5349 __m512d v2 = _mm512_set1_pd(2.0);
5350 __m512d v0_5 = _mm512_set1_pd(0.5);
5352 __m512d vq = _mm512_sub_pd(v1, _mm512_div_pd(vf, _mm512_fmadd_pd(v2, _mm512_add_pd(vreso_db_cf_p, _mm512_div_pd(v0_5, _mm512_add_pd(v1, vf))), _mm512_sub_pd(vf, v2))));
5353 __m512d vc0 = _mm512_mul_pd(vq, vq);
5355 __m512d vcosf = _mm512_cos_pd(vf);
5357 ALIGN FLOAT_T af[8];
5358 _mm512_storeu_pd(af, vf);
5359 __m512d vcosf = _mm512_set_pd(cos(af[7]), cos(af[6]), cos(af[5]), cos(af[4]), cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
5361 __m512d vc1 = _mm512_sub_pd(_mm512_add_pd(vc0, v1), _mm512_mul_pd(_mm512_mul_pd(v2, vcosf), vq));
5363 __m512d vdc0246 = _mm512_unpacklo_pd(vc0, vc1);
5364 __m512d vdc1357 = _mm512_unpackhi_pd(vc0, vc1);
5367 _mm_storeu_pd(fcs[i]->dc, _mm512_castpd512_pd128(vdc0246));
5368 if (imask & (1 << 1))
5369 _mm_storeu_pd(fcs[i + 1]->dc, _mm512_castpd512_pd128(vdc1357));
5370 if (imask & (1 << 2))
5371 _mm_storeu_pd(fcs[i + 2]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc0246), 1));
5372 if (imask & (1 << 3))
5373 _mm_storeu_pd(fcs[i + 3]->dc, _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc1357), 1));
5374 if (imask & (1 << 4))
5375 _mm_storeu_pd(fcs[i + 4]->dc, _mm512_extractf64x2_pd(vdc0246, 2));
5376 if (imask & (1 << 5))
5377 _mm_storeu_pd(fcs[i + 5]->dc, _mm512_extractf64x2_pd(vdc1357, 2));
5378 if (imask & (1 << 6))
5379 _mm_storeu_pd(fcs[i + 6]->dc, _mm512_extractf64x2_pd(vdc0246, 3));
5380 if (imask & (1 << 7))
5381 _mm_storeu_pd(fcs[i + 7]->dc, _mm512_extractf64x2_pd(vdc1357, 3));
5386 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5388 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients** fcs)
5390 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
5391 if (i >= batch_size)
5394 __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range);
5395 __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : _mm256_setzero_pd();
5396 __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : _mm256_setzero_pd();
5397 __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : _mm256_setzero_pd();
5399 __m256d vfcrange01_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (2 << 4) | 0);
5400 __m256d vfcrange01_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (2 << 4) | 0);
5401 __m256d vfcrange23_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (3 << 4) | 1);
5402 __m256d vfcrange23_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (3 << 4) | 1);
5404 __m256d vfcrange0 = _mm256_unpacklo_pd(vfcrange01_02, vfcrange01_13);
5405 __m256d vfcrange1 = _mm256_unpackhi_pd(vfcrange01_02, vfcrange01_13);
5406 __m256d vfcrange2 = _mm256_unpacklo_pd(vfcrange23_02, vfcrange23_13);
5407 __m256d vfcrange3 = _mm256_unpackhi_pd(vfcrange23_02, vfcrange23_13);
5409 __m256d vfcfreq = _mm256_set_pd(
5410 i + 3 < batch_size ? fcs[i + 3]->freq : 0.0,
5411 i + 2 < batch_size ? fcs[i + 2]->freq : 0.0,
5412 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
5416 __m256d vfcreso_DB = _mm256_set_pd(
5417 i + 3 < batch_size ? fcs[i + 3]->reso_dB : 0.0,
5418 i + 2 < batch_size ? fcs[i + 2]->reso_dB : 0.0,
5419 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
5423 __m256d vmask = _mm256_or_pd(
5424 _mm256_or_pd(_mm256_cmp_pd(vfcfreq, vfcrange0, _CMP_LT_OS), _mm256_cmp_pd(vfcfreq, vfcrange1, _CMP_GT_OS)),
5425 _mm256_or_pd(_mm256_cmp_pd(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm256_cmp_pd(vfcreso_DB, vfcrange3, _CMP_GT_OS))
5428 int imask = _mm256_movemask_pd(vmask);
5430 if (batch_size - i < 4)
5431 imask &= (1 << (batch_size - i)) - 1;
5434 __m256d v1mmargin = _mm256_set1_pd(1.0 - ext_filter_margin);
5435 __m256d v1pmargin = _mm256_set1_pd(1.0 + ext_filter_margin);
5437 vfcrange0 = _mm256_mul_pd(vfcfreq, v1mmargin);
5438 vfcrange1 = _mm256_mul_pd(vfcfreq, v1pmargin);
5439 vfcrange2 = _mm256_mul_pd(vfcreso_DB, v1mmargin);
5440 vfcrange3 = _mm256_mul_pd(vfcreso_DB, v1pmargin);
5442 vfcrange01_02 = _mm256_unpacklo_pd(vfcrange0, vfcrange1);
5443 vfcrange01_13 = _mm256_unpackhi_pd(vfcrange0, vfcrange1);
5444 vfcrange23_02 = _mm256_unpacklo_pd(vfcrange2, vfcrange3);
5445 vfcrange23_13 = _mm256_unpackhi_pd(vfcrange2, vfcrange3);
5447 vfcrange0123_0 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (2 << 4) | 0);
5448 vfcrange0123_1 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (2 << 4) | 0);
5449 vfcrange0123_2 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (3 << 4) | 1);
5450 vfcrange0123_3 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (3 << 4) | 1);
5453 _mm256_storeu_pd(fcs[i]->range, vfcrange0123_0);
5455 if (imask & (1 << 1))
5456 _mm256_storeu_pd(fcs[i + 1]->range, vfcrange0123_1);
5458 if (imask & (1 << 2))
5459 _mm256_storeu_pd(fcs[i + 2]->range, vfcrange0123_2);
5461 if (imask & (1 << 3))
5462 _mm256_storeu_pd(fcs[i + 3]->range, vfcrange0123_3);
5464 __m256d vfcdiv_flt_rate = _mm256_set_pd(
5465 i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
5466 i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
5467 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
5468 fcs[i]->div_flt_rate
5471 __m256d vf = _mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
5473 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
5475 __m256d vreso_db_cf_p = _mm256_set_pd(
5476 i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : reso_db_cf_p,
5477 i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : reso_db_cf_p,
5478 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
5482 __m256d v1 = _mm256_set1_pd(1.0);
5483 __m256d v2 = _mm256_set1_pd(2.0);
5484 __m256d v0_5 = _mm256_set1_pd(0.5);
5486 __m256d vq = _mm256_sub_pd(v1, _mm256_div_pd(vf, MM256_FMA_PD(v2, _mm256_add_pd(vreso_db_cf_p, _mm256_div_pd(v0_5, _mm256_add_pd(v1, vf))), _mm256_sub_pd(vf, v2))));
5487 __m256d vc0 = _mm256_mul_pd(vq, vq);
5489 __m256d vcosf = _mm256_cos_pd(vf);
5491 ALIGN FLOAT_T af[4];
5492 _mm256_storeu_pd(af, vf);
5493 __m256d vcosf = _mm256_set_pd(cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
5495 __m256d vc1 = _mm256_sub_pd(_mm256_add_pd(vc0, v1), _mm256_mul_pd(_mm256_mul_pd(v2, vcosf), vq));
5497 __m256d vdc02 = _mm256_unpacklo_pd(vc0, vc1);
5498 __m256d vdc13 = _mm256_unpackhi_pd(vc0, vc1);
5501 _mm_storeu_pd(fcs[i]->dc, _mm256_castpd256_pd128(vdc02));
5502 if (imask & (1 << 1))
5503 _mm_storeu_pd(fcs[i + 1]->dc, _mm256_castpd256_pd128(vdc13));
5504 if (imask & (1 << 2))
5505 _mm_storeu_pd(fcs[i + 2]->dc, _mm256_extractf128_pd(vdc02, 1));
5506 if (imask & (1 << 3))
5507 _mm_storeu_pd(fcs[i + 3]->dc, _mm256_extractf128_pd(vdc13, 1));
5512 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5514 static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients** fcs)
5516 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
5517 if (i >= batch_size)
5520 __m128d vfcrange01_0 = _mm_loadu_pd(fcs[i]->range);
5521 __m128d vfcrange23_0 = _mm_loadu_pd(&fcs[i]->range[2]);
5522 __m128d vfcrange01_1 = i + 1 < batch_size ? _mm_loadu_pd(fcs[i + 1]->range) : _mm_setzero_pd();
5523 __m128d vfcrange23_1 = i + 1 < batch_size ? _mm_loadu_pd(&fcs[i + 1]->range[2]) : _mm_setzero_pd();
5525 __m128d vfcrange0 = _mm_unpacklo_pd(vfcrange01_0, vfcrange01_1);
5526 __m128d vfcrange1 = _mm_unpackhi_pd(vfcrange01_0, vfcrange01_1);
5527 __m128d vfcrange2 = _mm_unpacklo_pd(vfcrange23_0, vfcrange23_1);
5528 __m128d vfcrange3 = _mm_unpackhi_pd(vfcrange23_0, vfcrange23_1);
5530 __m128d vfcfreq = _mm_set_pd(
5531 i + 1 < batch_size ? fcs[i + 1]->freq : 0.0,
5535 __m128d vfcreso_DB = _mm_set_pd(
5536 i + 1 < batch_size ? fcs[i + 1]->reso_dB : 0.0,
5540 __m128d vmask = _mm_or_pd(
5541 _mm_or_pd(_mm_cmplt_pd(vfcfreq, vfcrange0), _mm_cmpgt_pd(vfcfreq, vfcrange1)),
5542 _mm_or_pd(_mm_cmplt_pd(vfcreso_DB, vfcrange2), _mm_cmpgt_pd(vfcreso_DB, vfcrange3))
5545 int imask = _mm_movemask_pd(vmask);
5547 if (batch_size - i < 2)
5548 imask &= (1 << (batch_size - i)) - 1;
5551 __m128d v1mmargin = _mm_set1_pd(1.0 - ext_filter_margin);
5552 __m128d v1pmargin = _mm_set1_pd(1.0 + ext_filter_margin);
5554 vfcrange0 = _mm_mul_pd(vfcfreq, v1mmargin);
5555 vfcrange1 = _mm_mul_pd(vfcfreq, v1pmargin);
5556 vfcrange2 = _mm_mul_pd(vfcreso_DB, v1mmargin);
5557 vfcrange3 = _mm_mul_pd(vfcreso_DB, v1pmargin);
5559 vfcrange01_0 = _mm_unpacklo_pd(vfcrange0, vfcrange1);
5560 vfcrange01_1 = _mm_unpackhi_pd(vfcrange0, vfcrange1);
5561 vfcrange23_0 = _mm_unpacklo_pd(vfcrange2, vfcrange3);
5562 vfcrange23_1 = _mm_unpackhi_pd(vfcrange2, vfcrange3);
5565 _mm_storeu_pd(fcs[i]->range, vfcrange01_0);
5566 _mm_storeu_pd(&fcs[i]->range[2], vfcrange23_0);
5569 if (imask & (1 << 1)) {
5570 _mm_storeu_pd(fcs[i + 1]->range, vfcrange01_1);
5571 _mm_storeu_pd(&fcs[i + 1]->range[2], vfcrange23_1);
5574 __m128d vfcdiv_flt_rate = _mm_set_pd(
5575 i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
5576 fcs[i]->div_flt_rate
5579 __m128d vf = _mm_mul_pd(_mm_mul_pd(_mm_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate);
5581 FLOAT_T reso_db_cf_p = RESO_DB_CF_P(fcs[i]->reso_dB);
5583 __m128d vreso_db_cf_p = _mm_set_pd(
5584 i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : reso_db_cf_p,
5588 __m128d v1 = _mm_set1_pd(1.0);
5589 __m128d v2 = _mm_set1_pd(2.0);
5590 __m128d v0_5 = _mm_set1_pd(0.5);
5592 __m128d vq = _mm_sub_pd(v1, _mm_div_pd(vf, MM_FMA_PD(v2, _mm_add_pd(vreso_db_cf_p, _mm_div_pd(v0_5, _mm_add_pd(v1, vf))), _mm_sub_pd(vf, v2))));
5593 __m128d vc0 = _mm_mul_pd(vq, vq);
5595 __m128d vcosf = _mm_cos_pd(vf);
5597 ALIGN FLOAT_T af[2];
5598 _mm_storeu_pd(af, vf);
5599 __m128d vcosf = _mm_set_pd(cos(af[1]), cos(af[0]));
5601 __m128d vc1 = _mm_sub_pd(_mm_add_pd(vc0, v1), _mm_mul_pd(_mm_mul_pd(v2, vcosf), vq));
5603 __m128d vdc0 = _mm_unpacklo_pd(vc0, vc1);
5604 __m128d vdc1 = _mm_unpackhi_pd(vc0, vc1);
5607 _mm_storeu_pd(fcs[i]->dc, vdc0);
5609 if (imask & (1 << 1))
5610 _mm_storeu_pd(fcs[i + 1]->dc, vdc1);
5617 #if (USE_X86_EXT_INTRIN >= 10) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5619 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
5621 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 8) {
5622 if (i >= batch_size)
5625 __m256i vcounts = _mm256_maskz_loadu_epi32(generate_mask8_for_count(i, batch_size), &counts[i]);
5627 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5628 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5629 __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
5630 __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
5631 __m128d vdb01_4 = i + 4 < batch_size ? _mm_loadu_pd(dbs[i + 4]) : _mm_setzero_pd();
5632 __m128d vdb01_5 = i + 5 < batch_size ? _mm_loadu_pd(dbs[i + 5]) : _mm_setzero_pd();
5633 __m128d vdb01_6 = i + 6 < batch_size ? _mm_loadu_pd(dbs[i + 6]) : _mm_setzero_pd();
5634 __m128d vdb01_7 = i + 7 < batch_size ? _mm_loadu_pd(dbs[i + 7]) : _mm_setzero_pd();
5636 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
5637 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
5638 __m256d vdb01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_4), vdb01_6, 1);
5639 __m256d vdb01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_5), vdb01_7, 1);
5641 __m512d vdb01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_02), vdb01_46, 1);
5642 __m512d vdb01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdb01_13), vdb01_57, 1);
5644 __m512d vdb0 = _mm512_unpacklo_pd(vdb01_0246, vdb01_1357);
5645 __m512d vdb1 = _mm512_unpackhi_pd(vdb01_0246, vdb01_1357);
5647 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5648 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5649 __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
5650 __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
5651 __m128d vdc01_4 = i + 4 < batch_size ? _mm_loadu_pd(dcs[i + 4]) : _mm_setzero_pd();
5652 __m128d vdc01_5 = i + 5 < batch_size ? _mm_loadu_pd(dcs[i + 5]) : _mm_setzero_pd();
5653 __m128d vdc01_6 = i + 6 < batch_size ? _mm_loadu_pd(dcs[i + 6]) : _mm_setzero_pd();
5654 __m128d vdc01_7 = i + 7 < batch_size ? _mm_loadu_pd(dcs[i + 7]) : _mm_setzero_pd();
5656 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
5657 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
5658 __m256d vdc01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_4), vdc01_6, 1);
5659 __m256d vdc01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_5), vdc01_7, 1);
5661 __m512d vdc01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_02), vdc01_46, 1);
5662 __m512d vdc01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(vdc01_13), vdc01_57, 1);
5664 __m512d vdc0 = _mm512_unpacklo_pd(vdc01_0246, vdc01_1357);
5665 __m512d vdc1 = _mm512_unpackhi_pd(vdc01_0246, vdc01_1357);
5667 __m128i vcounts_max = _mm_max_epi32(_mm256_castsi256_si128(vcounts), _mm256_extracti128_si256(vcounts, 1));
5668 vcounts_max = _mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, (3 << 2) | 2));
5669 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_max, _mm_shuffle_epi32(vcounts_max, 1)));
5671 for (int32 j = 0; j < count_max; j += 8) {
5673 vin[0] = _mm512_maskz_loadu_pd(generate_mask8_for_count(j, counts[i]), &sps[i][j]);
5675 for (int k = 1; k < 8; k++)
5676 vin[k] = _mm512_maskz_loadu_pd(i + k < batch_size ? generate_mask8_for_count(j, counts[i + k]) : 0, & sps[i + k][j]);
5678 __m512d vsp0246_01 = _mm512_unpacklo_pd(vin[0], vin[1]);
5679 __m512d vsp1357_01 = _mm512_unpackhi_pd(vin[0], vin[1]);
5680 __m512d vsp0246_23 = _mm512_unpacklo_pd(vin[2], vin[3]);
5681 __m512d vsp1357_23 = _mm512_unpackhi_pd(vin[2], vin[3]);
5682 __m512d vsp0246_45 = _mm512_unpacklo_pd(vin[4], vin[5]);
5683 __m512d vsp1357_45 = _mm512_unpackhi_pd(vin[4], vin[5]);
5684 __m512d vsp0246_67 = _mm512_unpacklo_pd(vin[6], vin[7]);
5685 __m512d vsp1357_67 = _mm512_unpackhi_pd(vin[6], vin[7]);
5687 __m512d vsp04_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5688 __m512d vsp26_0123 = _mm512_shuffle_f64x2(vsp0246_01, vsp0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5689 __m512d vsp15_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5690 __m512d vsp37_0123 = _mm512_shuffle_f64x2(vsp1357_01, vsp1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5691 __m512d vsp04_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5692 __m512d vsp26_4567 = _mm512_shuffle_f64x2(vsp0246_45, vsp0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5693 __m512d vsp15_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5694 __m512d vsp37_4567 = _mm512_shuffle_f64x2(vsp1357_45, vsp1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5697 vsps[0] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5698 vsps[4] = _mm512_shuffle_f64x2(vsp04_0123, vsp04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5699 vsps[1] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5700 vsps[5] = _mm512_shuffle_f64x2(vsp15_0123, vsp15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5701 vsps[2] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5702 vsps[6] = _mm512_shuffle_f64x2(vsp26_0123, vsp26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5703 vsps[3] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5704 vsps[7] = _mm512_shuffle_f64x2(vsp37_0123, vsp37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5706 for (int k = 0; k < 8; k++) {
5707 __mmask8 kmask = _mm256_cmplt_epi32_mask(_mm256_set1_epi32(j + k), vcounts);
5709 vdb1 = _mm512_mask3_fmadd_pd(_mm512_sub_pd(vsps[k], vdb0), vdc1, vdb1, kmask);
5710 vdb0 = _mm512_mask_add_pd(vdb0, kmask, vdb0, vdb1);
5711 vdb1 = _mm512_mask_mul_pd(vdb1, kmask, vdb1, vdc0);
5712 vsps[k] = _mm512_sub_pd(vsps[k], vdb0);
5715 __m512d vsp01_0246 = _mm512_unpacklo_pd(vsps[0], vsps[1]);
5716 __m512d vsp01_1357 = _mm512_unpackhi_pd(vsps[0], vsps[1]);
5717 __m512d vsp23_0246 = _mm512_unpacklo_pd(vsps[2], vsps[3]);
5718 __m512d vsp23_1357 = _mm512_unpackhi_pd(vsps[2], vsps[3]);
5719 __m512d vsp45_0246 = _mm512_unpacklo_pd(vsps[4], vsps[5]);
5720 __m512d vsp45_1357 = _mm512_unpackhi_pd(vsps[4], vsps[5]);
5721 __m512d vsp67_0246 = _mm512_unpacklo_pd(vsps[6], vsps[7]);
5722 __m512d vsp67_1357 = _mm512_unpackhi_pd(vsps[6], vsps[7]);
5724 __m512d vsp0123_04 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5725 __m512d vsp0123_26 = _mm512_shuffle_f64x2(vsp01_0246, vsp23_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5726 __m512d vsp0123_15 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5727 __m512d vsp0123_37 = _mm512_shuffle_f64x2(vsp01_1357, vsp23_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5728 __m512d vsp4567_04 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5729 __m512d vsp4567_26 = _mm512_shuffle_f64x2(vsp45_0246, vsp67_0246, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5730 __m512d vsp4567_15 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5731 __m512d vsp4567_37 = _mm512_shuffle_f64x2(vsp45_1357, vsp67_1357, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5734 vout[0] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5735 vout[4] = _mm512_shuffle_f64x2(vsp0123_04, vsp4567_04, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5736 vout[1] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5737 vout[5] = _mm512_shuffle_f64x2(vsp0123_15, vsp4567_15, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5738 vout[2] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5739 vout[6] = _mm512_shuffle_f64x2(vsp0123_26, vsp4567_26, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5740 vout[3] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (2 << 6) | (0 << 4) | (2 << 2) | 0);
5741 vout[7] = _mm512_shuffle_f64x2(vsp0123_37, vsp4567_37, (3 << 6) | (1 << 4) | (3 << 2) | 1);
5743 for (int k = 0; k < 8; k++)
5744 _mm512_mask_storeu_pd(&sps[i + k][j], generate_mask8_for_count(j, counts[i + k]), vout[k]);
5747 vdb01_0246 = _mm512_unpacklo_pd(vdb0, vdb1);
5748 vdb01_1357 = _mm512_unpackhi_pd(vdb0, vdb1);
5750 _mm_storeu_pd(dbs[i], _mm512_castpd512_pd128(vdb01_0246));
5752 if (i + 1 < batch_size)
5753 _mm_storeu_pd(dbs[i + 1], _mm512_castpd512_pd128(vdb01_1357));
5755 if (i + 2 < batch_size)
5756 _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_0246), 1));
5758 if (i + 3 < batch_size)
5759 _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb01_1357), 1));
5761 if (i + 4 < batch_size)
5762 _mm_storeu_pd(dbs[i + 4], _mm512_extractf64x2_pd(vdb01_0246, 2));
5764 if (i + 5 < batch_size)
5765 _mm_storeu_pd(dbs[i + 5], _mm512_extractf64x2_pd(vdb01_1357, 2));
5767 if (i + 6 < batch_size)
5768 _mm_storeu_pd(dbs[i + 6], _mm512_extractf64x2_pd(vdb01_0246, 3));
5770 if (i + 7 < batch_size)
5771 _mm_storeu_pd(dbs[i + 7], _mm512_extractf64x2_pd(vdb01_1357, 3));
5775 #elif (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5777 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
5779 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) {
5780 if (i >= batch_size)
5783 __m128i vcounts = _mm_set_epi32(
5784 i + 3 < batch_size ? counts[i + 3] : 0,
5785 i + 2 < batch_size ? counts[i + 2] : 0,
5786 i + 1 < batch_size ? counts[i + 1] : 0,
5790 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5791 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5792 __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
5793 __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
5795 __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1);
5796 __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1);
5798 __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13);
5799 __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13);
5801 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5802 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5803 __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
5804 __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
5806 __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
5807 __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
5809 __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13);
5810 __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13);
5812 __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2));
5813 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1)));
5815 for (int32 j = 0; j < count_max; j += 4) {
5816 __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd();
5817 __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd();
5818 __m256d vsp0123_2 = i + 2 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
5819 __m256d vsp0123_3 = i + 3 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
5821 __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0);
5822 __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0);
5823 __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1);
5824 __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1);
5827 vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
5828 vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
5829 vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
5830 vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
5832 for (int k = 0; k < 4; k++) {
5833 __m256d vmask = _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
5835 vdb1 = _mm256_blendv_pd(vdb1, MM256_FMA_PD(_mm256_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5836 vdb0 = _mm256_blendv_pd(vdb0, _mm256_add_pd(vdb0, vdb1), vmask);
5837 vdb1 = _mm256_blendv_pd(vdb1, _mm256_mul_pd(vdb1, vdc0), vmask);
5838 vsps[k] = _mm256_sub_pd(vsps[k], vdb0);
5841 vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]);
5842 vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
5843 vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
5844 vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
5846 vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
5847 vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
5848 vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
5849 vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
5852 _mm256_storeu_pd(&sps[i][j], vsp0123_0);
5854 if (i + 1 < batch_size && j < counts[i + 1])
5855 _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
5857 if (i + 2 < batch_size && j < counts[i + 2])
5858 _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
5860 if (i + 3 < batch_size && j < counts[i + 3])
5861 _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
5864 vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1);
5865 vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1);
5867 _mm_storeu_pd(dbs[i], _mm256_castpd256_pd128(vdb01_02));
5869 if (i + 1 < batch_size)
5870 _mm_storeu_pd(dbs[i + 1], _mm256_castpd256_pd128(vdb01_13));
5872 if (i + 2 < batch_size)
5873 _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(vdb01_02, 1));
5875 if (i + 3 < batch_size)
5876 _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(vdb01_13, 1));
5880 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
5882 static void sample_filter_HPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts)
5884 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 2) {
5885 if (i >= batch_size)
5888 __m128i vcounts = _mm_set_epi32(
5891 i + 1 < batch_size ? counts[i + 1] : 0,
5895 __m128d vdb01_0 = _mm_loadu_pd(dbs[i]);
5896 __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd();
5898 __m128d vdb0 = _mm_unpacklo_pd(vdb01_0, vdb01_1);
5899 __m128d vdb1 = _mm_unpackhi_pd(vdb01_0, vdb01_1);
5901 __m128d vdc01_0 = _mm_loadu_pd(dcs[i]);
5902 __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
5904 __m128d vdc0 = _mm_unpacklo_pd(vdc01_0, vdc01_1);
5905 __m128d vdc1 = _mm_unpackhi_pd(vdc01_0, vdc01_1);
5907 int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, 1)));
5909 for (int32 j = 0; j < count_max; j += 2) {
5910 __m128d vsp01_0 = j < counts[i] ? _mm_loadu_pd(&sps[i][j]) : _mm_setzero_pd();
5911 __m128d vsp01_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm_loadu_pd(&sps[i + 1][j]) : _mm_setzero_pd();
5914 vsps[0] = _mm_unpacklo_pd(vsp01_0, vsp01_1);
5915 vsps[1] = _mm_unpackhi_pd(vsp01_0, vsp01_1);
5917 for (int k = 0; k < 2; k++) {
5918 __m128d vmask = _mm_castsi128_pd(_mm_cvtepi32_epi64(_mm_cmplt_epi32(_mm_set1_epi32(j + k), vcounts)));
5920 vdb1 = MM_BLENDV_PD(vdb1, MM_FMA_PD(_mm_sub_pd(vsps[k], vdb0), vdc1, vdb1), vmask);
5921 vdb0 = MM_BLENDV_PD(vdb0, _mm_add_pd(vdb0, vdb1), vmask);
5922 vdb1 = MM_BLENDV_PD(vdb1, _mm_mul_pd(vdb1, vdc0), vmask);
5923 vsps[k] = _mm_sub_pd(vsps[k], vdb0);
5926 vsp01_0 = _mm_unpacklo_pd(vsps[0], vsps[1]);
5927 vsp01_1 = _mm_unpackhi_pd(vsps[0], vsps[1]);
5930 _mm_storeu_pd(&sps[i][j], vsp01_0);
5932 if (i + 1 < batch_size && j < counts[i + 1])
5933 _mm_storeu_pd(&sps[i + 1][j], vsp01_1);
5936 vdb01_0 = _mm_unpacklo_pd(vdb0, vdb1);
5937 vdb01_1 = _mm_unpackhi_pd(vdb0, vdb1);
5939 _mm_storeu_pd(dbs[i], vdb01_0);
5941 if (i + 1 < batch_size)
5942 _mm_storeu_pd(dbs[i + 1], vdb01_1);
5948 void buffer_filter_batch(int batch_size, FilterCoefficients **fcs, DATA_T **sps, int32 *counts)
5951 for (int i = 0; i < batch_size; i++) {
5953 ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: filter not initialized");
5958 for (int i = 1; i < batch_size; i++) {
5959 if (fcs[0]->type != fcs[i]->type) {
5960 ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: filter type mismatch");
5965 if (fcs[0]->type == FILTER_NONE)
5968 FILTER_T *dcs[MIX_VOICE_BATCH_SIZE];
5969 FILTER_T *dbs[MIX_VOICE_BATCH_SIZE];
5971 for (int i = 0; i < batch_size; i++) {
5972 dcs[i] = fcs[i]->dc;
5973 dbs[i] = &fcs[i]->db[FILTER_FB_L];
5976 switch (fcs[0]->type) {
5978 recalc_filter_LPF_BW_batch(batch_size, fcs);
5979 sample_filter_LPF_BW_batch(batch_size, dcs, dbs, sps, counts);
5982 case FILTER_LPF12_2:
5983 recalc_filter_LPF12_2_batch(batch_size, fcs);
5984 sample_filter_LPF12_2_batch(batch_size, dcs, dbs, sps, counts);
5987 case FILTER_HPF12_2:
5988 recalc_filter_LPF12_2_batch(batch_size, fcs);
5989 sample_filter_HPF12_2_batch(batch_size, dcs, dbs, sps, counts);
5993 ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: unsupported filter type");
5998 void voice_filter_batch(int batch_size, int *vs, DATA_T **sps, int32 *counts)
6000 if (batch_size <= 0)
6003 FilterCoefficients *fcs[MIX_VOICE_BATCH_SIZE];
6005 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++)
6006 fcs[i] = (i < batch_size ? &voice[vs[i]].fc : NULL);
6008 buffer_filter_batch(batch_size, fcs, sps, counts);
6010 for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++)
6011 fcs[i] = (i < batch_size ? &voice[vs[i]].fc2 : NULL);
6013 buffer_filter_batch(batch_size, fcs, sps, counts);
6017 #endif // MIX_VOICE_BATCH
6020 /*************** antialiasing ********************/
6024 /* bessel function */
6025 static FLOAT_T ino(FLOAT_T x)
6027 FLOAT_T y, de, e, sde;
6035 de = de * y / (FLOAT_T) i;
6038 } while (!( (e * 1.0e-08 - sde > 0) || (i++ > 25) ));
6042 /* Kaiser Window (symetric) */
6043 static void kaiser(FLOAT_T *w,int n,FLOAT_T beta)
6048 xind = (2*n - 1) * (2*n - 1);
6049 for (i =0; i<n ; i++)
6052 w[i] = ino((FLOAT_T)(beta * sqrt((double)(1. - 4 * xi * xi / xind))))
6053 / ino((FLOAT_T)beta);
6058 * fir coef in g, cuttoff frequency in fc
6060 static void designfir(FLOAT_T *g , FLOAT_T fc, FLOAT_T att)
6062 /* attenuation in db */
6064 FLOAT_T xi, omega, beta ;
6065 FLOAT_T w[LPF_FIR_ORDER2];
6067 for (i =0; i < LPF_FIR_ORDER2 ;i++)
6069 xi = (FLOAT_T) i + 0.5;
6071 g[i] = sin( (double) omega * fc) / omega;
6074 beta = (FLOAT_T) exp(log((double)0.58417 * (att - 20.96)) * 0.4) + 0.07886
6076 kaiser( w, LPF_FIR_ORDER2, beta);
6078 /* Matrix product */
6079 for (i =0; i < LPF_FIR_ORDER2 ; i++)
6084 * FIR filtering -> apply the filter given by coef[] to the data buffer
6085 * Note that we simulate leading and trailing 0 at the border of the
6088 static void filter(int16 *result,int16 *data, int32 length,FLOAT_T coef[])
6090 int32 sample,i,sample_window;
6094 /* Simulate leading 0 at the begining of the buffer */
6095 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6098 sample_window= sample - LPF_FIR_ORDER2;
6100 for (i = 0; i < LPF_FIR_ORDER ;i++)
6102 ((sample_window<0)? 0.0 : data[sample_window++]) ;
6104 /* Saturation ??? */
6105 if (sum> 32767.) { sum=32767.; peak++; }
6106 if (sum< -32768.) { sum=-32768; peak++; }
6107 result[sample] = (int16) sum;
6110 /* The core of the buffer */
6111 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6114 sample_window= sample - LPF_FIR_ORDER2;
6116 for (i = 0; i < LPF_FIR_ORDER ;i++)
6117 sum += data[sample_window++] * coef[i];
6119 /* Saturation ??? */
6120 if (sum> 32767.) { sum=32767.; peak++; }
6121 if (sum< -32768.) { sum=-32768; peak++; }
6122 result[sample] = (int16) sum;
6125 /* Simulate 0 at the end of the buffer */
6126 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6129 sample_window= sample - LPF_FIR_ORDER2;
6131 for (i = 0; i < LPF_FIR_ORDER ;i++)
6133 ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6135 /* Saturation ??? */
6136 if (sum> 32767.) { sum=32767.; peak++; }
6137 if (sum< -32768.) { sum=-32768; peak++; }
6138 result[sample] = (int16) sum;
6142 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Saturation %2.3f %%.", 100.0*peak/ (FLOAT_T) length);
6145 static void filter_int8(int8 *result,int8 *data, int32 length,FLOAT_T coef[])
6147 int32 sample,i,sample_window;
6151 /* Simulate leading 0 at the begining of the buffer */
6152 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6155 sample_window= sample - LPF_FIR_ORDER2;
6157 for (i = 0; i < LPF_FIR_ORDER ;i++)
6159 ((sample_window<0)? 0.0 : data[sample_window++]) ;
6161 /* Saturation ??? */
6162 if (sum> 127.) { sum=127.; peak++; }
6163 if (sum< -128.) { sum=-128; peak++; }
6164 result[sample] = (int8) sum;
6167 /* The core of the buffer */
6168 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6171 sample_window= sample - LPF_FIR_ORDER2;
6173 for (i = 0; i < LPF_FIR_ORDER ;i++)
6174 sum += data[sample_window++] * coef[i];
6176 /* Saturation ??? */
6177 if (sum> 127.) { sum=127.; peak++; }
6178 if (sum< -128.) { sum=-128; peak++; }
6179 result[sample] = (int8) sum;
6182 /* Simulate 0 at the end of the buffer */
6183 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6186 sample_window= sample - LPF_FIR_ORDER2;
6188 for (i = 0; i < LPF_FIR_ORDER ;i++)
6190 ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6192 /* Saturation ??? */
6193 if (sum> 127.) { sum=127.; peak++; }
6194 if (sum< -128.) { sum=-128; peak++; }
6195 result[sample] = (int8) sum;
6199 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Saturation %2.3f %%.", 100.0*peak/ (FLOAT_T) length);
6202 static void filter_int32(int32 *result,int32 *data, int32 length,FLOAT_T coef[])
6204 int32 sample,i,sample_window;
6208 /* Simulate leading 0 at the begining of the buffer */
6209 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6212 sample_window= sample - LPF_FIR_ORDER2;
6214 for (i = 0; i < LPF_FIR_ORDER ;i++)
6216 ((sample_window<0)? 0.0 : data[sample_window++]) ;
6218 /* Saturation ??? */
6219 if (sum> 2147483647.) { sum=2147483647.; peak++; }
6220 if (sum< -2147483648.) { sum=-2147483648.; peak++; }
6221 result[sample] = (int32) sum;
6224 /* The core of the buffer */
6225 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6228 sample_window= sample - LPF_FIR_ORDER2;
6230 for (i = 0; i < LPF_FIR_ORDER ;i++)
6231 sum += data[sample_window++] * coef[i];
6233 /* Saturation ??? */
6234 if (sum> 2147483647.) { sum=2147483647.; peak++; }
6235 if (sum< -2147483648.) { sum=-2147483648.; peak++; }
6236 result[sample] = (int32) sum;
6239 /* Simulate 0 at the end of the buffer */
6240 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6243 sample_window= sample - LPF_FIR_ORDER2;
6245 for (i = 0; i < LPF_FIR_ORDER ;i++)
6247 ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6249 /* Saturation ??? */
6250 if (sum> 2147483647.) { sum=2147483647.; peak++; }
6251 if (sum< -2147483648.) { sum=-2147483648.; peak++; }
6252 result[sample] = (int32) sum;
6256 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Saturation %2.3f %%.", 100.0*peak/ (FLOAT_T) length);
6259 static void filter_float(float *result,float *data, int32 length,FLOAT_T coef[])
6261 int32 sample,i,sample_window;
6264 /* Simulate leading 0 at the begining of the buffer */
6265 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6268 sample_window= sample - LPF_FIR_ORDER2;
6270 for (i = 0; i < LPF_FIR_ORDER ;i++)
6271 sum += coef[i] * ((sample_window<0)? 0.0 : data[sample_window++]) ;
6272 result[sample] = (float)sum;
6275 /* The core of the buffer */
6276 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6279 sample_window= sample - LPF_FIR_ORDER2;
6281 for (i = 0; i < LPF_FIR_ORDER ;i++)
6282 sum += data[sample_window++] * coef[i];
6283 result[sample] = (float)sum;
6286 /* Simulate 0 at the end of the buffer */
6287 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6290 sample_window= sample - LPF_FIR_ORDER2;
6292 for (i = 0; i < LPF_FIR_ORDER ;i++)
6293 sum += coef[i] * ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6294 result[sample] = (float)sum;
6298 static void filter_double(double *result,double *data, int32 length,FLOAT_T coef[])
6300 int32 sample,i,sample_window;
6303 /* Simulate leading 0 at the begining of the buffer */
6304 for (sample = 0; sample < LPF_FIR_ORDER2 ; sample++ )
6307 sample_window= sample - LPF_FIR_ORDER2;
6309 for (i = 0; i < LPF_FIR_ORDER ;i++)
6310 sum += coef[i] * ((sample_window<0)? 0.0 : data[sample_window++]) ;
6311 result[sample] = (double)sum;
6314 /* The core of the buffer */
6315 for (sample = LPF_FIR_ORDER2; sample < length - LPF_FIR_ORDER + LPF_FIR_ORDER2 ; sample++ )
6318 sample_window= sample - LPF_FIR_ORDER2;
6320 for (i = 0; i < LPF_FIR_ORDER ;i++)
6321 sum += data[sample_window++] * coef[i];
6322 result[sample] = (double)sum;
6325 /* Simulate 0 at the end of the buffer */
6326 for (sample = length - LPF_FIR_ORDER + LPF_FIR_ORDER2; sample < length ; sample++ )
6329 sample_window= sample - LPF_FIR_ORDER2;
6331 for (i = 0; i < LPF_FIR_ORDER ;i++)
6332 sum += coef[i] * ((sample_window>=length)? 0.0 : data[sample_window++]) ;
6333 result[sample] = (double)sum;
6336 /***********************************************************************/
6337 /* Prevent aliasing by filtering any freq above the output_rate */
6339 /* I don't worry about looping point -> they will remain soft if they */
6341 /***********************************************************************/
6342 void antialiasing(int16 *data, int32 data_length, int32 sample_rate, int32 output_rate)
6347 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6348 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6349 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6352 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6355 /* No oversampling */
6356 if (output_rate>=sample_rate)
6359 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6360 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6363 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6365 /* Make the filter symetric */
6366 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6367 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6369 /* We apply the filter we have designed on a copy of the patch */
6373 bytes = sizeof(int16) * data_length;
6374 temp = (int16 *)safe_malloc(bytes);
6375 memcpy(temp, data, bytes);
6376 filter(data, temp, data_length, fir_symetric);
6380 void antialiasing_int8(int8 *data, int32 data_length, int32 sample_rate, int32 output_rate)
6385 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6386 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6387 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6390 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6393 /* No oversampling */
6394 if (output_rate>=sample_rate)
6397 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6398 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6401 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6403 /* Make the filter symetric */
6404 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6405 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6407 /* We apply the filter we have designed on a copy of the patch */
6411 bytes = sizeof(int8) * data_length;
6412 temp = (int8 *)safe_malloc(bytes);
6413 memcpy(temp, data, bytes);
6414 filter_int8(data, temp, data_length, fir_symetric);
6418 void antialiasing_int32(int32 *data, int32 data_length, int32 sample_rate, int32 output_rate)
6423 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6424 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6425 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6428 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6431 /* No oversampling */
6432 if (output_rate>=sample_rate)
6435 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6436 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6439 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6441 /* Make the filter symetric */
6442 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6443 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6445 /* We apply the filter we have designed on a copy of the patch */
6449 bytes = sizeof(int32) * data_length;
6450 temp = (int32 *)safe_malloc(bytes);
6451 memcpy(temp, data, bytes);
6452 filter_int32(data, temp, data_length, fir_symetric);
6456 void antialiasing_float(float *data, int32 data_length, int32 sample_rate, int32 output_rate)
6461 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6462 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6463 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6466 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6469 /* No oversampling */
6470 if (output_rate>=sample_rate)
6473 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6474 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6477 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6479 /* Make the filter symetric */
6480 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6481 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6483 /* We apply the filter we have designed on a copy of the patch */
6487 bytes = sizeof(float) * data_length;
6488 temp = (float *)safe_malloc(bytes);
6489 memcpy(temp, data, bytes);
6490 filter_float(data, temp, data_length, fir_symetric);
6495 void antialiasing_double(double *data, int32 data_length, int32 sample_rate, int32 output_rate)
6500 FLOAT_T fir_symetric[LPF_FIR_ORDER];
6501 FLOAT_T fir_coef[LPF_FIR_ORDER2];
6502 FLOAT_T freq_cut; /* cutoff frequency [0..1.0] FREQ_CUT/SAMP_FREQ*/
6504 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: Fsample=%iKHz",
6507 /* No oversampling */
6508 if (output_rate>=sample_rate)
6511 freq_cut= (FLOAT_T)output_rate / (FLOAT_T)sample_rate;
6512 ctl->cmsg(CMSG_INFO, VERB_NOISY, "Antialiasing: cutoff=%f%%",
6515 designfir(fir_coef,freq_cut, LPF_FIR_ANTIALIASING_ATT);
6517 /* Make the filter symetric */
6518 for (i = 0 ; i<LPF_FIR_ORDER2 ;i++)
6519 fir_symetric[LPF_FIR_ORDER-1 - i] = fir_symetric[i] = fir_coef[LPF_FIR_ORDER2-1 - i];
6521 /* We apply the filter we have designed on a copy of the patch */
6525 bytes = sizeof(double) * data_length;
6526 temp = (double *)safe_malloc(bytes);
6527 memcpy(temp, data, bytes);
6528 filter_double(data, temp, data_length, fir_symetric);
6534 /*************** fir_eq ********************/
6536 void init_fir_eq(FIR_EQ *fc)
6538 int32 i, j, k, l, count1 = 0, count2 = 0, flg = 0, size_2;
6539 double amp[FIR_EQ_BAND_MAX*16], bounds[FIR_EQ_BAND_MAX*16][2]; // max_band*16 , [0]:f [1]:g
6540 double fL, gL, fR, gR, log_fL, log_fR, ft, gt, kT, div_kT2, hk, gain, f0, f1, w0, w1;
6541 const double div_2pi = 1.0 / (2 * M_PI);
6544 memset(fc, 0, sizeof(FIR_EQ));
6547 fc->st = 1; // stereo
6554 fc->freq[4] = 12000;
6555 fc->freq[5] = 20000;
6563 { // calc window size
6564 int32 t = play_mode->rate / 100; // sr/ 100Hz
6573 else if(fc->band > FIR_EQ_BAND_MAX)
6574 fc->band = FIR_EQ_BAND_MAX;
6575 if(fc->band != fc->band_p)
6577 fc->band_p = fc->band;
6580 else if(fc->bit > 12)
6582 if(fc->bit != fc->bit_p)
6584 fc->bit_p = fc->bit;
6585 fc->size = 1 << fc->bit;
6586 for(i = 0; i < FIR_EQ_SIZE_MAX; ++i){
6589 for(i = 0; i < fc->band; ++i){
6590 if(fc->gain[0] != 0)
6596 size_2 = fc->size / 2;
6597 for(i = 0; i < fc->band; ++i){
6598 amp[i] = pow(10, fc->gain[i] * DIV_20);
6600 bounds[0][1] = amp[0];
6601 for(i = 0; i < (fc->band - 2); i++){
6604 fR = fc->freq[i + 1];
6608 for(j = 0; j < 16; ++j){
6609 ft = ((double)j + 0.5) * DIV_16;
6610 bounds[count1][0] = exp(log_fL * (1 - ft) + log_fR * ft);
6611 gt = (double)j * DIV_16;
6612 bounds[count1][1] = gL * (1.0 - gt) + gR * gt;
6616 for(k = 0; k < size_2; ++k){
6617 kT = k * div_playmode_rate;
6621 while(count2 != count1){
6622 gain = bounds[count2][1];
6623 f0 = bounds[count2][0];
6625 f1 = count2 == count1 ? play_mode->rate * DIV_2 : bounds[count2][0];
6629 hk += gain * (w1 - w0 + (-w0) - (-w1));
6631 hk += gain * (sin(w1 * kT) - sin(w0 * kT)) * div_kT2; // * 2 / kT
6634 hk *= div_playmode_rate * div_2pi; // / (2 * M_PI);
6635 fc->dc[size_2 - 1 - k] = hk;
6636 fc->dc[size_2 - 1 + k] = hk;
6640 memset(fc->buff, 0, sizeof(fc->buff));
6646 void apply_fir_eq(FIR_EQ *fc, DATA_T *buf, int32 count)
6649 const int32 mask = FIR_EQ_SIZE_MAX - 1;
6653 #if (USE_X86_EXT_INTRIN >= 3) && defined(FLOAT_T_DOUBLE) && defined(DATA_T_DOUBLE)
6654 if(fc->st){ // stereo
6655 __m128d vout[2], tmp[2]; // DATA_T out[2];
6656 for(i = 0; i < count; i += 2){
6657 int32 sbuff = fc->count + FIR_EQ_SIZE_MAX;
6658 fc->buff[0][fc->count] = buf[i];
6659 fc->buff[0][sbuff] = buf[i]; // for SIMD
6660 fc->buff[1][fc->count] = buf[i + 1];
6661 fc->buff[1][sbuff] = buf[i + 1]; // for SIMD
6662 fc->count = (fc->count + 1) & mask;
6663 offset = (fc->count - fc->size) & mask;
6664 vout[0] = _mm_setzero_pd(); // out[0] = 0;
6665 vout[1] = _mm_setzero_pd(); // out[1] = 0;
6666 for(j = 0; j < fc->size; j += 2){
6667 int32 ofs = offset + j;
6668 __m128d vdc = _mm_loadu_pd(&fc->dc[j]);
6669 vout[0] = MM_FMA_PD(vdc, _mm_loadu_pd(&fc->buff[0][ofs]), vout[0]); // out[0] += fc->dc[j] * fc->buff[0][ofs];
6670 vout[1] = MM_FMA_PD(vdc, _mm_loadu_pd(&fc->buff[1][ofs]), vout[1]); // out[1] += fc->dc[j] * fc->buff[1][ofs];
6672 // vout[0](L0,L1) vout[1](R0,R1)
6673 tmp[0] = _mm_unpacklo_pd(vout[0], vout[1]); // (L0,R0)
6674 tmp[1] = _mm_unpackhi_pd(vout[0], vout[1]); // (L1,R1)
6675 tmp[0] = _mm_add_pd(tmp[0], tmp[1]); // (L0+L1,R0+R1)
6676 _mm_store_pd(&buf[i], tmp[0]); // buf[i] = out[0]; buf[i + 1] = out[1];
6679 __m128d vout; //DATA_T out;
6680 for(i = 0; i < count; i++){
6681 int32 sbuff = fc->count + FIR_EQ_SIZE_MAX;
6682 fc->buff[0][fc->count] = buf[i];
6683 fc->buff[0][sbuff] = buf[i]; // for SIMD
6684 fc->count = (fc->count + 1) & mask;
6685 offset = (fc->count - fc->size) & mask;
6686 vout = _mm_setzero_pd(); // out = 0;
6687 for(j = 0; j < fc->size; j += 2){
6688 int32 ofs = (offset + j) & mask;
6689 __m128d vdc = _mm_loadu_pd(&fc->dc[j]);
6690 vout = MM_FMA_PD(vdc, _mm_loadu_pd(&fc->buff[0][ofs]), vout); // out += fc->dc[j] * fc->buff[0][ofs];
6692 vout= _mm_add_pd(vout, _mm_shuffle_pd(vout, vout, 0x1)); // v0=v0+v1 v1=v1+v0
6693 _mm_store_sd(&buf[i], vout); // buf[i] = out;
6697 if(fc->st){ // stereo
6699 for(i = 0; i < count; i += 2){
6700 fc->buff[0][fc->count] = buf[i];
6701 fc->buff[1][fc->count] = buf[i + 1];
6702 fc->count = (fc->count + 1) & mask;
6703 offset = fc->count - fc->size;
6704 //offset &= fc->mask;
6705 //buf[i] = fc->buff[0][offset];
6706 //buf[i + 1] = fc->buff[1][offset];
6707 out[0] = 0; out[1] = 0;
6708 for(j = 0; j < fc->size; ++j){
6709 int32 ofs = (offset + j) & mask;
6710 out[0] += fc->dc[j] * fc->buff[0][ofs];
6711 out[1] += fc->dc[j] * fc->buff[1][ofs];
6713 buf[i] = out[0]; buf[i + 1] = out[1];
6717 for(i = 0; i < count; i++){
6718 fc->buff[0][fc->count] = buf[i];
6719 fc->count = (fc->count + 1) & mask;
6720 offset = fc->count - fc->size;
6722 for(j = 0; j < fc->size; ++j){
6723 int32 ofs = (offset + j) & mask;
6724 out += fc->dc[j] * fc->buff[0][ofs];