OSDN Git Service

[flac] Update FLAC to 1.3.3
[timidity41/timidity41.git] / FLAC / src / lpc_intrin_avx2.c
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2016  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36
37 #include "private/cpu.h"
38
39 #ifndef FLAC__INTEGER_ONLY_LIBRARY
40 #ifndef FLAC__NO_ASM
41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42 #include "private/lpc.h"
43 #ifdef FLAC__AVX2_SUPPORTED
44
45 #include "FLAC/assert.h"
46 #include "FLAC/format.h"
47
48 #include <immintrin.h> /* AVX2 */
49
50 FLAC__SSE_TARGET("avx2")
51 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
52 {
53         int i;
54         FLAC__int32 sum;
55         const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
56
57         FLAC__ASSERT(order > 0);
58         FLAC__ASSERT(order <= 32);
59
60         if(order <= 12) {
61                 if(order > 8) {
62                         if(order > 10) {
63                                 if(order == 12) {
64                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
65                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
66                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
67                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
68                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
69                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
70                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
71                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
72                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
73                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
74                                         q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
75                                         q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
76                                         q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
77
78                                         for(i = 0; i < (int)data_len-7; i+=8) {
79                                                 __m256i summ, mull;
80                                                 summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
81                                                 mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
82                                                 mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
83                                                 mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
84                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
85                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
86                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
87                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
88                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
89                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
90                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
91                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
92                                                 summ = _mm256_sra_epi32(summ, cnt);
93                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
94                                         }
95                                 }
96                                 else { /* order == 11 */
97                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
98                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
99                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
100                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
101                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
102                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
103                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
104                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
105                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
106                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
107                                         q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
108                                         q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
109
110                                         for(i = 0; i < (int)data_len-7; i+=8) {
111                                                 __m256i summ, mull;
112                                                 summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
113                                                 mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
114                                                 mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
115                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
116                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
117                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
118                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
119                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
120                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
121                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
122                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
123                                                 summ = _mm256_sra_epi32(summ, cnt);
124                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
125                                         }
126                                 }
127                         }
128                         else {
129                                 if(order == 10) {
130                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
131                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
132                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
133                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
134                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
135                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
136                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
137                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
138                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
139                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
140                                         q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
141
142                                         for(i = 0; i < (int)data_len-7; i+=8) {
143                                                 __m256i summ, mull;
144                                                 summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
145                                                 mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
146                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
147                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
148                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
149                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
150                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
151                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
152                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
153                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
154                                                 summ = _mm256_sra_epi32(summ, cnt);
155                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
156                                         }
157                                 }
158                                 else { /* order == 9 */
159                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
160                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
161                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
162                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
163                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
164                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
165                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
166                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
167                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
168                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
169
170                                         for(i = 0; i < (int)data_len-7; i+=8) {
171                                                 __m256i summ, mull;
172                                                 summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
173                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
174                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
175                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
176                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
177                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
178                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
179                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
180                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
181                                                 summ = _mm256_sra_epi32(summ, cnt);
182                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
183                                         }
184                                 }
185                         }
186                 }
187                 else if(order > 4) {
188                         if(order > 6) {
189                                 if(order == 8) {
190                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7;
191                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
192                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
193                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
194                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
195                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
196                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
197                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
198                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
199
200                                         for(i = 0; i < (int)data_len-7; i+=8) {
201                                                 __m256i summ, mull;
202                                                 summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
203                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
204                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
205                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
206                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
207                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
208                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
209                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
210                                                 summ = _mm256_sra_epi32(summ, cnt);
211                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
212                                         }
213                                 }
214                                 else { /* order == 7 */
215                                         __m256i q0, q1, q2, q3, q4, q5, q6;
216                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
217                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
218                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
219                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
220                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
221                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
222                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
223
224                                         for(i = 0; i < (int)data_len-7; i+=8) {
225                                                 __m256i summ, mull;
226                                                 summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
227                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
228                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
229                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
230                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
231                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
232                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
233                                                 summ = _mm256_sra_epi32(summ, cnt);
234                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
235                                         }
236                                 }
237                         }
238                         else {
239                                 if(order == 6) {
240                                         __m256i q0, q1, q2, q3, q4, q5;
241                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
242                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
243                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
244                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
245                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
246                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
247
248                                         for(i = 0; i < (int)data_len-7; i+=8) {
249                                                 __m256i summ, mull;
250                                                 summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
251                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
252                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
253                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
254                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
255                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
256                                                 summ = _mm256_sra_epi32(summ, cnt);
257                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
258                                         }
259                                 }
260                                 else { /* order == 5 */
261                                         __m256i q0, q1, q2, q3, q4;
262                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
263                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
264                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
265                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
266                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
267
268                                         for(i = 0; i < (int)data_len-7; i+=8) {
269                                                 __m256i summ, mull;
270                                                 summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
271                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
272                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
273                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
274                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
275                                                 summ = _mm256_sra_epi32(summ, cnt);
276                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
277                                         }
278                                 }
279                         }
280                 }
281                 else {
282                         if(order > 2) {
283                                 if(order == 4) {
284                                         __m256i q0, q1, q2, q3;
285                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
286                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
287                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
288                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
289
290                                         for(i = 0; i < (int)data_len-7; i+=8) {
291                                                 __m256i summ, mull;
292                                                 summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
293                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
294                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
295                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
296                                                 summ = _mm256_sra_epi32(summ, cnt);
297                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
298                                         }
299                                 }
300                                 else { /* order == 3 */
301                                         __m256i q0, q1, q2;
302                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
303                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
304                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
305
306                                         for(i = 0; i < (int)data_len-7; i+=8) {
307                                                 __m256i summ, mull;
308                                                 summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
309                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
310                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
311                                                 summ = _mm256_sra_epi32(summ, cnt);
312                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
313                                         }
314                                 }
315                         }
316                         else {
317                                 if(order == 2) {
318                                         __m256i q0, q1;
319                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
320                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
321
322                                         for(i = 0; i < (int)data_len-7; i+=8) {
323                                                 __m256i summ, mull;
324                                                 summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
325                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
326                                                 summ = _mm256_sra_epi32(summ, cnt);
327                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
328                                         }
329                                 }
330                                 else { /* order == 1 */
331                                         __m256i q0;
332                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
333
334                                         for(i = 0; i < (int)data_len-7; i+=8) {
335                                                 __m256i summ;
336                                                 summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
337                                                 summ = _mm256_sra_epi32(summ, cnt);
338                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
339                                         }
340                                 }
341                         }
342                 }
343                 for(; i < (int)data_len; i++) {
344                         sum = 0;
345                         switch(order) {
346                                 case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
347                                 case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
348                                 case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
349                                 case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
350                                 case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
351                                 case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
352                                 case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
353                                 case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
354                                 case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
355                                 case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
356                                 case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
357                                 case 1:  sum += qlp_coeff[ 0] * data[i- 1];
358                         }
359                         residual[i] = data[i] - (sum >> lp_quantization);
360                 }
361         }
362         else { /* order > 12 */
363                 for(i = 0; i < (int)data_len; i++) {
364                         sum = 0;
365                         switch(order) {
366                                 case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
367                                 case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
368                                 case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
369                                 case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
370                                 case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
371                                 case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
372                                 case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
373                                 case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
374                                 case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
375                                 case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
376                                 case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
377                                 case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
378                                 case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
379                                 case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
380                                 case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
381                                 case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
382                                 case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
383                                 case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
384                                 case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
385                                 case 13: sum += qlp_coeff[12] * data[i-13];
386                                          sum += qlp_coeff[11] * data[i-12];
387                                          sum += qlp_coeff[10] * data[i-11];
388                                          sum += qlp_coeff[ 9] * data[i-10];
389                                          sum += qlp_coeff[ 8] * data[i- 9];
390                                          sum += qlp_coeff[ 7] * data[i- 8];
391                                          sum += qlp_coeff[ 6] * data[i- 7];
392                                          sum += qlp_coeff[ 5] * data[i- 6];
393                                          sum += qlp_coeff[ 4] * data[i- 5];
394                                          sum += qlp_coeff[ 3] * data[i- 4];
395                                          sum += qlp_coeff[ 2] * data[i- 3];
396                                          sum += qlp_coeff[ 1] * data[i- 2];
397                                          sum += qlp_coeff[ 0] * data[i- 1];
398                         }
399                         residual[i] = data[i] - (sum >> lp_quantization);
400                 }
401         }
402         _mm256_zeroupper();
403 }
404
405 FLAC__SSE_TARGET("avx2")
406 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
407 {
408         int i;
409         FLAC__int32 sum;
410         const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
411
412         FLAC__ASSERT(order > 0);
413         FLAC__ASSERT(order <= 32);
414
415         if(order <= 12) {
416                 if(order > 8) {
417                         if(order > 10) {
418                                 if(order == 12) {
419                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
420                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
421                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
422                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
423                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
424                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
425                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
426                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
427                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
428                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
429                                         q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
430                                         q10 = _mm256_set1_epi32(qlp_coeff[10]);
431                                         q11 = _mm256_set1_epi32(qlp_coeff[11]);
432
433                                         for(i = 0; i < (int)data_len-7; i+=8) {
434                                                 __m256i summ, mull;
435                                                 summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
436                                                 mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
437                                                 mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
438                                                 mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
439                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
440                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
441                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
442                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
443                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
444                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
445                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
446                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
447                                                 summ = _mm256_sra_epi32(summ, cnt);
448                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
449                                         }
450                                 }
451                                 else { /* order == 11 */
452                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
453                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
454                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
455                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
456                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
457                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
458                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
459                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
460                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
461                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
462                                         q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
463                                         q10 = _mm256_set1_epi32(qlp_coeff[10]);
464
465                                         for(i = 0; i < (int)data_len-7; i+=8) {
466                                                 __m256i summ, mull;
467                                                 summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
468                                                 mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
469                                                 mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
470                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
471                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
472                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
473                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
474                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
475                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
476                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
477                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
478                                                 summ = _mm256_sra_epi32(summ, cnt);
479                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
480                                         }
481                                 }
482                         }
483                         else {
484                                 if(order == 10) {
485                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
486                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
487                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
488                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
489                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
490                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
491                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
492                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
493                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
494                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
495                                         q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
496
497                                         for(i = 0; i < (int)data_len-7; i+=8) {
498                                                 __m256i summ, mull;
499                                                 summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
500                                                 mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
501                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
502                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
503                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
504                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
505                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
506                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
507                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
508                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
509                                                 summ = _mm256_sra_epi32(summ, cnt);
510                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
511                                         }
512                                 }
513                                 else { /* order == 9 */
514                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
515                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
516                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
517                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
518                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
519                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
520                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
521                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
522                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
523                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
524
525                                         for(i = 0; i < (int)data_len-7; i+=8) {
526                                                 __m256i summ, mull;
527                                                 summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
528                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
529                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
530                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
531                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
532                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
533                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
534                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
535                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
536                                                 summ = _mm256_sra_epi32(summ, cnt);
537                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
538                                         }
539                                 }
540                         }
541                 }
542                 else if(order > 4) {
543                         if(order > 6) {
544                                 if(order == 8) {
545                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7;
546                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
547                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
548                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
549                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
550                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
551                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
552                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
553                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
554
555                                         for(i = 0; i < (int)data_len-7; i+=8) {
556                                                 __m256i summ, mull;
557                                                 summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
558                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
559                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
560                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
561                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
562                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
563                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
564                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
565                                                 summ = _mm256_sra_epi32(summ, cnt);
566                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
567                                         }
568                                 }
569                                 else { /* order == 7 */
570                                         __m256i q0, q1, q2, q3, q4, q5, q6;
571                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
572                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
573                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
574                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
575                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
576                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
577                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
578
579                                         for(i = 0; i < (int)data_len-7; i+=8) {
580                                                 __m256i summ, mull;
581                                                 summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
582                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
583                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
584                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
585                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
586                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
587                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
588                                                 summ = _mm256_sra_epi32(summ, cnt);
589                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
590                                         }
591                                 }
592                         }
593                         else {
594                                 if(order == 6) {
595                                         __m256i q0, q1, q2, q3, q4, q5;
596                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
597                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
598                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
599                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
600                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
601                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
602
603                                         for(i = 0; i < (int)data_len-7; i+=8) {
604                                                 __m256i summ, mull;
605                                                 summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
606                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
607                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
608                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
609                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
610                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
611                                                 summ = _mm256_sra_epi32(summ, cnt);
612                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
613                                         }
614                                 }
615                                 else { /* order == 5 */
616                                         __m256i q0, q1, q2, q3, q4;
617                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
618                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
619                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
620                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
621                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
622
623                                         for(i = 0; i < (int)data_len-7; i+=8) {
624                                                 __m256i summ, mull;
625                                                 summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
626                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
627                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
628                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
629                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
630                                                 summ = _mm256_sra_epi32(summ, cnt);
631                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
632                                         }
633                                 }
634                         }
635                 }
636                 else {
637                         if(order > 2) {
638                                 if(order == 4) {
639                                         __m256i q0, q1, q2, q3;
640                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
641                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
642                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
643                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
644
645                                         for(i = 0; i < (int)data_len-7; i+=8) {
646                                                 __m256i summ, mull;
647                                                 summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
648                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
649                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
650                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
651                                                 summ = _mm256_sra_epi32(summ, cnt);
652                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
653                                         }
654                                 }
655                                 else { /* order == 3 */
656                                         __m256i q0, q1, q2;
657                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
658                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
659                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
660
661                                         for(i = 0; i < (int)data_len-7; i+=8) {
662                                                 __m256i summ, mull;
663                                                 summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
664                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
665                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
666                                                 summ = _mm256_sra_epi32(summ, cnt);
667                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
668                                         }
669                                 }
670                         }
671                         else {
672                                 if(order == 2) {
673                                         __m256i q0, q1;
674                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
675                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
676
677                                         for(i = 0; i < (int)data_len-7; i+=8) {
678                                                 __m256i summ, mull;
679                                                 summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
680                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
681                                                 summ = _mm256_sra_epi32(summ, cnt);
682                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
683                                         }
684                                 }
685                                 else { /* order == 1 */
686                                         __m256i q0;
687                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
688
689                                         for(i = 0; i < (int)data_len-7; i+=8) {
690                                                 __m256i summ;
691                                                 summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
692                                                 summ = _mm256_sra_epi32(summ, cnt);
693                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
694                                         }
695                                 }
696                         }
697                 }
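		/* Scalar tail: compute the remaining data_len % 8 residuals (or all of them when data_len < 8, in which case the vector loop above never runs). */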
698                 for(; i < (int)data_len; i++) {
699                         sum = 0;
700                         switch(order) {
701                                 case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
702                                 case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
703                                 case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
704                                 case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
705                                 case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
706                                 case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
707                                 case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
708                                 case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
709                                 case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
710                                 case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
711                                 case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
712                                 case 1:  sum += qlp_coeff[ 0] * data[i- 1];
713                         }
714                         residual[i] = data[i] - (sum >> lp_quantization);
715                 }
716         }
717         else { /* order > 12 */
718                 for(i = 0; i < (int)data_len; i++) {
719                         sum = 0;
720                         switch(order) {
721                                 case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
722                                 case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
723                                 case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
724                                 case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
725                                 case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
726                                 case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
727                                 case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
728                                 case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
729                                 case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
730                                 case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
731                                 case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
732                                 case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
733                                 case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
734                                 case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
735                                 case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
736                                 case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
737                                 case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
738                                 case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
739                                 case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
740                                 case 13: sum += qlp_coeff[12] * data[i-13];
741                                          sum += qlp_coeff[11] * data[i-12];
742                                          sum += qlp_coeff[10] * data[i-11];
743                                          sum += qlp_coeff[ 9] * data[i-10];
744                                          sum += qlp_coeff[ 8] * data[i- 9];
745                                          sum += qlp_coeff[ 7] * data[i- 8];
746                                          sum += qlp_coeff[ 6] * data[i- 7];
747                                          sum += qlp_coeff[ 5] * data[i- 6];
748                                          sum += qlp_coeff[ 4] * data[i- 5];
749                                          sum += qlp_coeff[ 3] * data[i- 4];
750                                          sum += qlp_coeff[ 2] * data[i- 3];
751                                          sum += qlp_coeff[ 1] * data[i- 2];
752                                          sum += qlp_coeff[ 0] * data[i- 1];
753                         }
754                         residual[i] = data[i] - (sum >> lp_quantization);
755                 }
756         }
757         _mm256_zeroupper(); /* clear the upper halves of the YMM registers to avoid AVX/SSE transition penalties */
758 }
759
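/* Permutation indices for _mm256_permutevar8x32_epi32(): dwords 0,2,4,6 (the low 32-bit halves of the four 64-bit sums) are gathered into the lower 128-bit lane, so the four predictions can be subtracted and stored with 128-bit operations. */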
760 static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
761
762 FLAC__SSE_TARGET("avx2")
763 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
764 {
765         int i;
766         FLAC__int64 sum;
767         const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
768         const __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
769
770         FLAC__ASSERT(order > 0);
771         FLAC__ASSERT(order <= 32);
772         FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
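	/* Because lp_quantization <= 32, the low 32 bits of a logical and an arithmetic 64-bit right shift are identical, and only those low 32 bits are kept by the pack permutation below. */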
773
774         if(order <= 12) {
775                 if(order > 8) {
776                         if(order > 10) {
777                                 if(order == 12) {
778                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
779                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
780                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
781                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
782                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
783                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
784                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
785                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
786                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
787                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
788                                         q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
789                                         q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
790                                         q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
791
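					/* Four samples per iteration: each 32-bit sample is widened into a 64-bit lane, and _mm256_mul_epi32() multiplies the signed low 32 bits of each lane, yielding exact 64-bit products. */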
792                                         for(i = 0; i < (int)data_len-3; i+=4) {
793                                                 __m256i summ, mull;
794                                                 summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
795                                                 mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
796                                                 mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
797                                                 mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
798                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
799                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
800                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
801                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
802                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
803                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
804                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
805                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
806                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
807                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
808                                         }
809                                 }
810                                 else { /* order == 11 */
811                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
812                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
813                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
814                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
815                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
816                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
817                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
818                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
819                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
820                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
821                                         q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
822                                         q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
823
824                                         for(i = 0; i < (int)data_len-3; i+=4) {
825                                                 __m256i summ, mull;
826                                                 summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
827                                                 mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
828                                                 mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
829                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
830                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
831                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
832                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
833                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
834                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
835                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
836                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
837                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
838                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
839                                         }
840                                 }
841                         }
842                         else {
843                                 if(order == 10) {
844                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
845                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
846                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
847                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
848                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
849                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
850                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
851                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
852                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
853                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
854                                         q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
855
856                                         for(i = 0; i < (int)data_len-3; i+=4) {
857                                                 __m256i summ, mull;
858                                                 summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
859                                                 mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
860                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
861                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
862                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
863                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
864                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
865                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
866                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
867                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
868                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
869                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
870                                         }
871                                 }
872                                 else { /* order == 9 */
873                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
874                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
875                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
876                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
877                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
878                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
879                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
880                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
881                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
882                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
883
884                                         for(i = 0; i < (int)data_len-3; i+=4) {
885                                                 __m256i summ, mull;
886                                                 summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
887                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
888                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
889                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
890                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
891                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
892                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
893                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
894                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
895                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
896                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
897                                         }
898                                 }
899                         }
900                 }
901                 else if(order > 4) {
902                         if(order > 6) {
903                                 if(order == 8) {
904                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7;
905                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
906                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
907                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
908                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
909                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
910                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
911                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
912                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
913
914                                         for(i = 0; i < (int)data_len-3; i+=4) {
915                                                 __m256i summ, mull;
916                                                 summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
917                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
918                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
919                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
920                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
921                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
922                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
923                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
924                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
925                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
926                                         }
927                                 }
928                                 else { /* order == 7 */
929                                         __m256i q0, q1, q2, q3, q4, q5, q6;
930                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
931                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
932                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
933                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
934                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
935                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
936                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
937
938                                         for(i = 0; i < (int)data_len-3; i+=4) {
939                                                 __m256i summ, mull;
940                                                 summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
941                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
942                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
943                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
944                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
945                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
946                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
947                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
948                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
949                                         }
950                                 }
951                         }
952                         else {
953                                 if(order == 6) {
954                                         __m256i q0, q1, q2, q3, q4, q5;
955                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
956                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
957                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
958                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
959                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
960                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
961
962                                         for(i = 0; i < (int)data_len-3; i+=4) {
963                                                 __m256i summ, mull;
964                                                 summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
965                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
966                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
967                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
968                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
969                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
970                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
971                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
972                                         }
973                                 }
974                                 else { /* order == 5 */
975                                         __m256i q0, q1, q2, q3, q4;
976                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
977                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
978                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
979                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
980                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
981
982                                         for(i = 0; i < (int)data_len-3; i+=4) {
983                                                 __m256i summ, mull;
984                                                 summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
985                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
986                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
987                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
988                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
989                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
990                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
991                                         }
992                                 }
993                         }
994                 }
995                 else {
996                         if(order > 2) {
997                                 if(order == 4) {
998                                         __m256i q0, q1, q2, q3;
999                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1000                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1001                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
1002                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
1003
1004                                         for(i = 0; i < (int)data_len-3; i+=4) {
1005                                                 __m256i summ, mull;
1006                                                 summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
1007                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
1008                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
1009                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1010                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1011                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1012                                         }
1013                                 }
1014                                 else { /* order == 3 */
1015                                         __m256i q0, q1, q2;
1016                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1017                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1018                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
1019
1020                                         for(i = 0; i < (int)data_len-3; i+=4) {
1021                                                 __m256i summ, mull;
1022                                                 summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
1023                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
1024                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1025                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1026                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1027                                         }
1028                                 }
1029                         }
1030                         else {
1031                                 if(order == 2) {
1032                                         __m256i q0, q1;
1033                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1034                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1035
1036                                         for(i = 0; i < (int)data_len-3; i+=4) {
1037                                                 __m256i summ, mull;
1038                                                 summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
1039                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1040                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1041                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1042                                         }
1043                                 }
1044                                 else { /* order == 1 */
1045                                         __m256i q0;
1046                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1047
1048                                         for(i = 0; i < (int)data_len-3; i+=4) {
1049                                                 __m256i summ;
1050                                                 summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
1051                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1052                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1053                                         }
1054                                 }
1055                         }
1056                 }
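		/* Scalar tail: compute the remaining data_len % 4 residuals (or all of them when data_len < 4), accumulating in 64 bits. */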
1057                 for(; i < (int)data_len; i++) {
1058                         sum = 0;
1059                         switch(order) {
1060                                 case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; /* Falls through. */
1061                                 case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; /* Falls through. */
1062                                 case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; /* Falls through. */
1063                                 case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; /* Falls through. */
1064                                 case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; /* Falls through. */
1065                                 case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; /* Falls through. */
1066                                 case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; /* Falls through. */
1067                                 case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; /* Falls through. */
1068                                 case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; /* Falls through. */
1069                                 case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; /* Falls through. */
1070                                 case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; /* Falls through. */
1071                                 case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1072                         }
1073                         residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1074                 }
1075         }
1076         else { /* order > 12 */
1077                 for(i = 0; i < (int)data_len; i++) {
1078                         sum = 0;
1079                         switch(order) {
1080                                 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; /* Falls through. */
1081                                 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; /* Falls through. */
1082                                 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; /* Falls through. */
1083                                 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; /* Falls through. */
1084                                 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; /* Falls through. */
1085                                 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; /* Falls through. */
1086                                 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; /* Falls through. */
1087                                 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; /* Falls through. */
1088                                 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; /* Falls through. */
1089                                 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; /* Falls through. */
1090                                 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; /* Falls through. */
1091                                 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; /* Falls through. */
1092                                 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; /* Falls through. */
1093                                 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; /* Falls through. */
1094                                 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; /* Falls through. */
1095                                 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; /* Falls through. */
1096                                 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; /* Falls through. */
1097                                 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; /* Falls through. */
1098                                 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; /* Falls through. */
1099                                 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
1100                                          sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1101                                          sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1102                                          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1103                                          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1104                                          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1105                                          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1106                                          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1107                                          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1108                                          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1109                                          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1110                                          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1111                                          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1112                         }
1113                         residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1114                 }
1115         }
1116         _mm256_zeroupper();
1117 }
1118
1119 #endif /* FLAC__AVX2_SUPPORTED */
1120 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1121 #endif /* FLAC__NO_ASM */
1122 #endif /* FLAC__INTEGER_ONLY_LIBRARY */