libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 199 {
 200   {   0,  1,  0,  1,  0,  1,  0,  1,},
 201   {   1,  0,  1,  0,  1,  0,  1,  0,},
 202   {   0,  1,  0,  1,  0,  1,  0,  1,},
 203   {   1,  0,  1,  0,  1,  0,  1,  0,},
 204   {   0,  1,  0,  1,  0,  1,  0,  1,},
 205   {   1,  0,  1,  0,  1,  0,  1,  0,},
 206   {   0,  1,  0,  1,  0,  1,  0,  1,},
 207   {   1,  0,  1,  0,  1,  0,  1,  0,},
 208 },{
 209   {   1,  2,  1,  2,  1,  2,  1,  2,},
 210   {   3,  0,  3,  0,  3,  0,  3,  0,},
 211   {   1,  2,  1,  2,  1,  2,  1,  2,},
 212   {   3,  0,  3,  0,  3,  0,  3,  0,},
 213   {   1,  2,  1,  2,  1,  2,  1,  2,},
 214   {   3,  0,  3,  0,  3,  0,  3,  0,},
 215   {   1,  2,  1,  2,  1,  2,  1,  2,},
 216   {   3,  0,  3,  0,  3,  0,  3,  0,},
 217 },{
 218   {   2,  4,  3,  5,  2,  4,  3,  5,},
 219   {   6,  0,  7,  1,  6,  0,  7,  1,},
 220   {   3,  5,  2,  4,  3,  5,  2,  4,},
 221   {   7,  1,  6,  0,  7,  1,  6,  0,},
 222   {   2,  4,  3,  5,  2,  4,  3,  5,},
 223   {   6,  0,  7,  1,  6,  0,  7,  1,},
 224   {   3,  5,  2,  4,  3,  5,  2,  4,},
 225   {   7,  1,  6,  0,  7,  1,  6,  0,},
 226 },{
 227   {   4,  8,  7, 11,  4,  8,  7, 11,},
 228   {  12,  0, 15,  3, 12,  0, 15,  3,},
 229   {   6, 10,  5,  9,  6, 10,  5,  9,},
 230   {  14,  2, 13,  1, 14,  2, 13,  1,},
 231   {   4,  8,  7, 11,  4,  8,  7, 11,},
 232   {  12,  0, 15,  3, 12,  0, 15,  3,},
 233   {   6, 10,  5,  9,  6, 10,  5,  9,},
 234   {  14,  2, 13,  1, 14,  2, 13,  1,},
 235 },{
 236   {   9, 17, 15, 23,  8, 16, 14, 22,},
 237   {  25,  1, 31,  7, 24,  0, 30,  6,},
 238   {  13, 21, 11, 19, 12, 20, 10, 18,},
 239   {  29,  5, 27,  3, 28,  4, 26,  2,},
 240   {   8, 16, 14, 22,  9, 17, 15, 23,},
 241   {  24,  0, 30,  6, 25,  1, 31,  7,},
 242   {  12, 20, 10, 18, 13, 21, 11, 19,},
 243   {  28,  4, 26,  2, 29,  5, 27,  3,},
 244 },{
 245   {  18, 34, 30, 46, 17, 33, 29, 45,},
 246   {  50,  2, 62, 14, 49,  1, 61, 13,},
 247   {  26, 42, 22, 38, 25, 41, 21, 37,},
 248   {  58, 10, 54,  6, 57,  9, 53,  5,},
 249   {  16, 32, 28, 44, 19, 35, 31, 47,},
 250   {  48,  0, 60, 12, 51,  3, 63, 15,},
 251   {  24, 40, 20, 36, 27, 43, 23, 39,},
 252   {  56,  8, 52,  4, 59, 11, 55,  7,},
 253 },{
 254   {  18, 34, 30, 46, 17, 33, 29, 45,},
 255   {  50,  2, 62, 14, 49,  1, 61, 13,},
 256   {  26, 42, 22, 38, 25, 41, 21, 37,},
 257   {  58, 10, 54,  6, 57,  9, 53,  5,},
 258   {  16, 32, 28, 44, 19, 35, 31, 47,},
 259   {  48,  0, 60, 12, 51,  3, 63, 15,},
 260   {  24, 40, 20, 36, 27, 43, 23, 39,},
 261   {  56,  8, 52,  4, 59, 11, 55,  7,},
 262 },{
 263   {  36, 68, 60, 92, 34, 66, 58, 90,},
 264   { 100,  4,124, 28, 98,  2,122, 26,},
 265   {  52, 84, 44, 76, 50, 82, 42, 74,},
 266   { 116, 20,108, 12,114, 18,106, 10,},
 267   {  32, 64, 56, 88, 38, 70, 62, 94,},
 268   {  96,  0,120, 24,102,  6,126, 30,},
 269   {  48, 80, 40, 72, 54, 86, 46, 78,},
 270   { 112, 16,104,  8,118, 22,110, 14,},
 271 }};
 272
 273 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 274
 275 const uint16_t dither_scale[15][16]={
 276 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 277 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 278 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 279 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 280 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 281 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 282 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 283 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 284 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 285 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 286 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 287 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 288 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 289 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 290 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 291 };
 292
 293 static av_always_inline void
 294 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 295                       int lumFilterSize, const int16_t *chrFilter,
 296                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 297                       int chrFilterSize, const int32_t **alpSrc,
 298                       uint16_t *dest[4], int dstW, int chrDstW,
 299                       int big_endian, int output_bits)
 300 {
 301     //FIXME Optimize (just quickly written not optimized..)
 302     int i;
 303     int dword= output_bits == 16;
 304     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 305              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 306     int shift = 11 + 4*dword + 16 - output_bits - 1;
 307
 308 #define output_pixel(pos, val) \
 309     if (big_endian) { \
 310         AV_WB16(pos, av_clip_uint16(val >> shift)); \
 311     } else { \
 312         AV_WL16(pos, av_clip_uint16(val >> shift)); \
 313     }
 314     for (i = 0; i < dstW; i++) {
 315         int val = 1 << (26-output_bits + 4*dword - 1);
 316         int j;
 317
 318         for (j = 0; j < lumFilterSize; j++)
 319             val += ((dword ? lumSrc[j][i] : ((int16_t**)lumSrc)[j][i]) * lumFilter[j])>>1;
 320
 321         output_pixel(&yDest[i], val);
 322     }
 323
 324     if (uDest) {
 325         for (i = 0; i < chrDstW; i++) {
 326             int u = 1 << (26-output_bits + 4*dword - 1);
 327             int v = 1 << (26-output_bits + 4*dword - 1);
 328             int j;
 329
 330             for (j = 0; j < chrFilterSize; j++) {
 331                 u += ((dword ? chrUSrc[j][i] : ((int16_t**)chrUSrc)[j][i]) * chrFilter[j]) >> 1;
 332                 v += ((dword ? chrVSrc[j][i] : ((int16_t**)chrVSrc)[j][i]) * chrFilter[j]) >> 1;
 333             }
 334
 335             output_pixel(&uDest[i], u);
 336             output_pixel(&vDest[i], v);
 337         }
 338     }
 339
 340     if (CONFIG_SWSCALE_ALPHA && aDest) {
 341         for (i = 0; i < dstW; i++) {
 342             int val = 1 << (26-output_bits + 4*dword - 1);
 343             int j;
 344
 345             for (j = 0; j < lumFilterSize; j++)
 346                 val += ((dword ? alpSrc[j][i] : ((int16_t**)alpSrc)[j][i]) * lumFilter[j]) >> 1;
 347
 348             output_pixel(&aDest[i], val);
 349         }
 350     }
 351 #undef output_pixel
 352 }
 353
 354 static av_always_inline void
 355 yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 356                       int lumFilterSize, const int16_t *chrFilter,
 357                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 358                       int chrFilterSize, const int16_t **alpSrc,
 359                       uint16_t *dest[4], int dstW, int chrDstW,
 360                       int big_endian, int output_bits)
 361 {
 362     //FIXME Optimize (just quickly written not optimized..)
 363     int i;
 364     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 365              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 366     int shift = 11 + 16 - output_bits - 1;
 367
 368 #define output_pixel(pos, val) \
 369     if (big_endian) { \
 370         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 371     } else { \
 372         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 373     }
 374     for (i = 0; i < dstW; i++) {
 375         int val = 1 << (26-output_bits - 1);
 376         int j;
 377
 378         for (j = 0; j < lumFilterSize; j++)
 379             val += (lumSrc[j][i] * lumFilter[j]) >> 1;
 380
 381         output_pixel(&yDest[i], val);
 382     }
 383
 384     if (uDest) {
 385         for (i = 0; i < chrDstW; i++) {
 386             int u = 1 << (26-output_bits - 1);
 387             int v = 1 << (26-output_bits - 1);
 388             int j;
 389
 390             for (j = 0; j < chrFilterSize; j++) {
 391                 u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
 392                 v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
 393             }
 394
 395             output_pixel(&uDest[i], u);
 396             output_pixel(&vDest[i], v);
 397         }
 398     }
 399
 400     if (CONFIG_SWSCALE_ALPHA && aDest) {
 401         for (i = 0; i < dstW; i++) {
 402             int val = 1 << (26-output_bits - 1);
 403             int j;
 404
 405             for (j = 0; j < lumFilterSize; j++)
 406                 val += (alpSrc[j][i] * lumFilter[j]) >> 1;
 407
 408             output_pixel(&aDest[i], val);
 409         }
 410     }
 411 #undef output_pixel
 412 }
 413
 414 #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
 415 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 416                               const int16_t **_lumSrc, int lumFilterSize, \
 417                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 418                               const int16_t **_chrVSrc, \
 419                               int chrFilterSize, const int16_t **_alpSrc, \
 420                               uint8_t *_dest[4], int dstW, int chrDstW) \
 421 { \
 422     const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
 423                   **chrUSrc = (const typeX_t **) _chrUSrc, \
 424                   **chrVSrc = (const typeX_t **) _chrVSrc, \
 425                   **alpSrc  = (const typeX_t **) _alpSrc; \
 426     yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
 427                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 428                          alpSrc, (uint16_t **) _dest, \
 429                          dstW, chrDstW, is_be, bits); \
 430 }
 431 yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
 432 yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
 433 yuv2NBPS(10, BE, 1, yuv2yuvX10_c_template, int16_t);
 434 yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
 435 yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
 436 yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);
 437
 438 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 439                        const int16_t **lumSrc, int lumFilterSize,
 440                        const int16_t *chrFilter, const int16_t **chrUSrc,
 441                        const int16_t **chrVSrc,
 442                        int chrFilterSize, const int16_t **alpSrc,
 443                        uint8_t *dest[4], int dstW, int chrDstW)
 444 {
 445     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 446             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 447     int i;
 448     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 449
 450     //FIXME Optimize (just quickly written not optimized..)
 451     for (i=0; i<dstW; i++) {
 452         int val = lumDither[i & 7] << 12;
 453         int j;
 454         for (j=0; j<lumFilterSize; j++)
 455             val += lumSrc[j][i] * lumFilter[j];
 456
 457         yDest[i]= av_clip_uint8(val>>19);
 458     }
 459
 460     if (uDest)
 461         for (i=0; i<chrDstW; i++) {
 462             int u = chrDither[i & 7] << 12;
 463             int v = chrDither[(i + 3) & 7] << 12;
 464             int j;
 465             for (j=0; j<chrFilterSize; j++) {
 466                 u += chrUSrc[j][i] * chrFilter[j];
 467                 v += chrVSrc[j][i] * chrFilter[j];
 468             }
 469
 470             uDest[i]= av_clip_uint8(u>>19);
 471             vDest[i]= av_clip_uint8(v>>19);
 472         }
 473
 474     if (CONFIG_SWSCALE_ALPHA && aDest)
 475         for (i=0; i<dstW; i++) {
 476             int val = lumDither[i & 7] << 12;
 477             int j;
 478             for (j=0; j<lumFilterSize; j++)
 479                 val += alpSrc[j][i] * lumFilter[j];
 480
 481             aDest[i]= av_clip_uint8(val>>19);
 482         }
 483 }
 484
 485 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 486                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 487                        const int16_t *alpSrc,
 488                        uint8_t *dest[4], int dstW, int chrDstW)
 489 {
 490     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 491             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 492     int i;
 493     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 494
 495     for (i=0; i<dstW; i++) {
 496         int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
 497         yDest[i]= av_clip_uint8(val);
 498     }
 499
 500     if (uDest)
 501         for (i=0; i<chrDstW; i++) {
 502             int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
 503             int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
 504             uDest[i]= av_clip_uint8(u);
 505             vDest[i]= av_clip_uint8(v);
 506         }
 507
 508     if (CONFIG_SWSCALE_ALPHA && aDest)
 509         for (i=0; i<dstW; i++) {
 510             int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
 511             aDest[i]= av_clip_uint8(val);
 512         }
 513 }
 514
 515 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 516                         const int16_t **lumSrc, int lumFilterSize,
 517                         const int16_t *chrFilter, const int16_t **chrUSrc,
 518                         const int16_t **chrVSrc, int chrFilterSize,
 519                         const int16_t **alpSrc, uint8_t *dest[4],
 520                         int dstW, int chrDstW)
 521 {
 522     uint8_t *yDest = dest[0], *uDest = dest[1];
 523     enum PixelFormat dstFormat = c->dstFormat;
 524     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 525
 526     //FIXME Optimize (just quickly written not optimized..)
 527     int i;
 528     for (i=0; i<dstW; i++) {
 529         int val = lumDither[i & 7] << 12;
 530         int j;
 531         for (j=0; j<lumFilterSize; j++)
 532             val += lumSrc[j][i] * lumFilter[j];
 533
 534         yDest[i]= av_clip_uint8(val>>19);
 535     }
 536
 537     if (!uDest)
 538         return;
 539
 540     if (dstFormat == PIX_FMT_NV12)
 541         for (i=0; i<chrDstW; i++) {
 542             int u = chrDither[i & 7] << 12;
 543             int v = chrDither[(i + 3) & 7] << 12;
 544             int j;
 545             for (j=0; j<chrFilterSize; j++) {
 546                 u += chrUSrc[j][i] * chrFilter[j];
 547                 v += chrVSrc[j][i] * chrFilter[j];
 548             }
 549
 550             uDest[2*i]= av_clip_uint8(u>>19);
 551             uDest[2*i+1]= av_clip_uint8(v>>19);
 552         }
 553     else
 554         for (i=0; i<chrDstW; i++) {
 555             int u = chrDither[i & 7] << 12;
 556             int v = chrDither[(i + 3) & 7] << 12;
 557             int j;
 558             for (j=0; j<chrFilterSize; j++) {
 559                 u += chrUSrc[j][i] * chrFilter[j];
 560                 v += chrVSrc[j][i] * chrFilter[j];
 561             }
 562
 563             uDest[2*i]= av_clip_uint8(v>>19);
 564             uDest[2*i+1]= av_clip_uint8(u>>19);
 565         }
 566 }
 567
 568 #define output_pixel(pos, val) \
 569         if (target == PIX_FMT_GRAY16BE) { \
 570             AV_WB16(pos, val); \
 571         } else { \
 572             AV_WL16(pos, val); \
 573         }
 574
 575 static av_always_inline void
 576 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 577                         const int32_t **lumSrc, int lumFilterSize,
 578                         const int16_t *chrFilter, const int32_t **chrUSrc,
 579                         const int32_t **chrVSrc, int chrFilterSize,
 580                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 581                         int y, enum PixelFormat target)
 582 {
 583     int i;
 584
 585     for (i = 0; i < (dstW >> 1); i++) {
 586         int j;
 587         int Y1 = 1 << 14;
 588         int Y2 = 1 << 14;
 589
 590         for (j = 0; j < lumFilterSize; j++) {
 591             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 592             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 593         }
 594         Y1 >>= 15;
 595         Y2 >>= 15;
 596         if ((Y1 | Y2) & 0x10000) {
 597             Y1 = av_clip_uint16(Y1);
 598             Y2 = av_clip_uint16(Y2);
 599         }
 600         output_pixel(&dest[i * 2 + 0], Y1);
 601         output_pixel(&dest[i * 2 + 1], Y2);
 602     }
 603 }
 604
 605 static av_always_inline void
 606 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 607                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 608                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 609                         int yalpha, int uvalpha, int y,
 610                         enum PixelFormat target)
 611 {
 612     int  yalpha1 = 4095 - yalpha;
 613     int i;
 614     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 615
 616     for (i = 0; i < (dstW >> 1); i++) {
 617         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 618         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 619
 620         output_pixel(&dest[i * 2 + 0], Y1);
 621         output_pixel(&dest[i * 2 + 1], Y2);
 622     }
 623 }
 624
 625 static av_always_inline void
 626 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 627                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 628                         const int32_t *abuf0, uint16_t *dest, int dstW,
 629                         int uvalpha, int y, enum PixelFormat target)
 630 {
 631     int i;
 632
 633     for (i = 0; i < (dstW >> 1); i++) {
 634         int Y1 = (buf0[i * 2    ]+4)>>3;
 635         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 636
 637         output_pixel(&dest[i * 2 + 0], Y1);
 638         output_pixel(&dest[i * 2 + 1], Y2);
 639     }
 640 }
 641
 642 #undef output_pixel
 643
 644 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 645 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 646                         const int16_t **_lumSrc, int lumFilterSize, \
 647                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 648                         const int16_t **_chrVSrc, int chrFilterSize, \
 649                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 650                         int y) \
 651 { \
 652     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 653                   **chrUSrc = (const int32_t **) _chrUSrc, \
 654                   **chrVSrc = (const int32_t **) _chrVSrc, \
 655                   **alpSrc  = (const int32_t **) _alpSrc; \
 656     uint16_t *dest = (uint16_t *) _dest; \
 657     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 658                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 659                           alpSrc, dest, dstW, y, fmt); \
 660 } \
 661  \
 662 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 663                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 664                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 665                         int yalpha, int uvalpha, int y) \
 666 { \
 667     const int32_t **buf  = (const int32_t **) _buf, \
 668                   **ubuf = (const int32_t **) _ubuf, \
 669                   **vbuf = (const int32_t **) _vbuf, \
 670                   **abuf = (const int32_t **) _abuf; \
 671     uint16_t *dest = (uint16_t *) _dest; \
 672     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 673                           dest, dstW, yalpha, uvalpha, y, fmt); \
 674 } \
 675  \
 676 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 677                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 678                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 679                         int uvalpha, int y) \
 680 { \
 681     const int32_t *buf0  = (const int32_t *)  _buf0, \
 682                  **ubuf  = (const int32_t **) _ubuf, \
 683                  **vbuf  = (const int32_t **) _vbuf, \
 684                   *abuf0 = (const int32_t *)  _abuf0; \
 685     uint16_t *dest = (uint16_t *) _dest; \
 686     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 687                                   dstW, uvalpha, y, fmt); \
 688 }
 689
 690 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 691 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 692
 693 #define output_pixel(pos, acc) \
 694     if (target == PIX_FMT_MONOBLACK) { \
 695         pos = acc; \
 696     } else { \
 697         pos = ~acc; \
 698     }
 699
 700 static av_always_inline void
 701 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 702                       const int16_t **lumSrc, int lumFilterSize,
 703                       const int16_t *chrFilter, const int16_t **chrUSrc,
 704                       const int16_t **chrVSrc, int chrFilterSize,
 705                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 706                       int y, enum PixelFormat target)
 707 {
 708     const uint8_t * const d128=dither_8x8_220[y&7];
 709     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 710     int i;
 711     int acc = 0;
 712
 713     for (i = 0; i < dstW - 1; i += 2) {
 714         int j;
 715         int Y1 = 1 << 18;
 716         int Y2 = 1 << 18;
 717
 718         for (j = 0; j < lumFilterSize; j++) {
 719             Y1 += lumSrc[j][i]   * lumFilter[j];
 720             Y2 += lumSrc[j][i+1] * lumFilter[j];
 721         }
 722         Y1 >>= 19;
 723         Y2 >>= 19;
 724         if ((Y1 | Y2) & 0x100) {
 725             Y1 = av_clip_uint8(Y1);
 726             Y2 = av_clip_uint8(Y2);
 727         }
 728         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 729         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 730         if ((i & 7) == 6) {
 731             output_pixel(*dest++, acc);
 732         }
 733     }
 734 }
 735
 736 static av_always_inline void
 737 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 738                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 739                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 740                       int yalpha, int uvalpha, int y,
 741                       enum PixelFormat target)
 742 {
 743     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 744     const uint8_t * const d128 = dither_8x8_220[y & 7];
 745     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 746     int  yalpha1 = 4095 - yalpha;
 747     int i;
 748
 749     for (i = 0; i < dstW - 7; i += 8) {
 750         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 751         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 752         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 753         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 754         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 755         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 756         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 757         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 758         output_pixel(*dest++, acc);
 759     }
 760 }
 761
 762 static av_always_inline void
 763 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 764                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 765                       const int16_t *abuf0, uint8_t *dest, int dstW,
 766                       int uvalpha, int y, enum PixelFormat target)
 767 {
 768     const uint8_t * const d128 = dither_8x8_220[y & 7];
 769     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 770     int i;
 771
 772     for (i = 0; i < dstW - 7; i += 8) {
 773         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 774         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 775         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 776         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 777         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 778         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 779         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 780         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 781         output_pixel(*dest++, acc);
 782     }
 783 }
 784
 785 #undef output_pixel
 786
 787 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 788 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 789                                 const int16_t **lumSrc, int lumFilterSize, \
 790                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 791                                 const int16_t **chrVSrc, int chrFilterSize, \
 792                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 793                                 int y) \
 794 { \
 795     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 796                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 797                                   alpSrc, dest, dstW, y, fmt); \
 798 } \
 799  \
 800 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 801                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 802                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 803                                 int yalpha, int uvalpha, int y) \
 804 { \
 805     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 806                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 807 } \
 808  \
 809 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 810                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 811                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 812                                 int uvalpha, int y) \
 813 { \
 814     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 815                                   abuf0, dest, dstW, uvalpha, \
 816                                   y, fmt); \
 817 }
 818
 819 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 820 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 821
 822 #define output_pixels(pos, Y1, U, Y2, V) \
 823     if (target == PIX_FMT_YUYV422) { \
 824         dest[pos + 0] = Y1; \
 825         dest[pos + 1] = U;  \
 826         dest[pos + 2] = Y2; \
 827         dest[pos + 3] = V;  \
 828     } else { \
 829         dest[pos + 0] = U;  \
 830         dest[pos + 1] = Y1; \
 831         dest[pos + 2] = V;  \
 832         dest[pos + 3] = Y2; \
 833     }
 834
 835 static av_always_inline void
 836 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 837                      const int16_t **lumSrc, int lumFilterSize,
 838                      const int16_t *chrFilter, const int16_t **chrUSrc,
 839                      const int16_t **chrVSrc, int chrFilterSize,
 840                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 841                      int y, enum PixelFormat target)
 842 {
 843     int i;
 844
 845     for (i = 0; i < (dstW >> 1); i++) {
 846         int j;
 847         int Y1 = 1 << 18;
 848         int Y2 = 1 << 18;
 849         int U  = 1 << 18;
 850         int V  = 1 << 18;
 851
 852         for (j = 0; j < lumFilterSize; j++) {
 853             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 854             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 855         }
 856         for (j = 0; j < chrFilterSize; j++) {
 857             U += chrUSrc[j][i] * chrFilter[j];
 858             V += chrVSrc[j][i] * chrFilter[j];
 859         }
 860         Y1 >>= 19;
 861         Y2 >>= 19;
 862         U  >>= 19;
 863         V  >>= 19;
 864         if ((Y1 | Y2 | U | V) & 0x100) {
 865             Y1 = av_clip_uint8(Y1);
 866             Y2 = av_clip_uint8(Y2);
 867             U  = av_clip_uint8(U);
 868             V  = av_clip_uint8(V);
 869         }
 870         output_pixels(4*i, Y1, U, Y2, V);
 871     }
 872 }
 873
 874 static av_always_inline void
 875 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 876                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 877                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 878                      int yalpha, int uvalpha, int y,
 879                      enum PixelFormat target)
 880 {
 881     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 882                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 883                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 884     int  yalpha1 = 4095 - yalpha;
 885     int uvalpha1 = 4095 - uvalpha;
 886     int i;
 887
 888     for (i = 0; i < (dstW >> 1); i++) {
 889         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 890         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 891         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 892         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 893
 894         output_pixels(i * 4, Y1, U, Y2, V);
 895     }
 896 }
 897
 898 static av_always_inline void
 899 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 900                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 901                      const int16_t *abuf0, uint8_t *dest, int dstW,
 902                      int uvalpha, int y, enum PixelFormat target)
 903 {
 904     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 905                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 906     int i;
 907
 908     if (uvalpha < 2048) {
 909         for (i = 0; i < (dstW >> 1); i++) {
 910             int Y1 = buf0[i * 2]     >> 7;
 911             int Y2 = buf0[i * 2 + 1] >> 7;
 912             int U  = ubuf1[i]        >> 7;
 913             int V  = vbuf1[i]        >> 7;
 914
 915             output_pixels(i * 4, Y1, U, Y2, V);
 916         }
 917     } else {
 918         for (i = 0; i < (dstW >> 1); i++) {
 919             int Y1 =  buf0[i * 2]          >> 7;
 920             int Y2 =  buf0[i * 2 + 1]      >> 7;
 921             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 922             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 923
 924             output_pixels(i * 4, Y1, U, Y2, V);
 925         }
 926     }
 927 }
 928
 929 #undef output_pixels
 930
 931 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 932 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 933
 934 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 935 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 936 #define output_pixel(pos, val) \
 937     if (isBE(target)) { \
 938         AV_WB16(pos, val); \
 939     } else { \
 940         AV_WL16(pos, val); \
 941     }
 942
 943 static av_always_inline void
 944 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 945                        const int32_t **lumSrc, int lumFilterSize,
 946                        const int16_t *chrFilter, const int32_t **chrUSrc,
 947                        const int32_t **chrVSrc, int chrFilterSize,
 948                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 949                        int y, enum PixelFormat target)
 950 {
 951     int i;
 952
 953     for (i = 0; i < (dstW >> 1); i++) {
 954         int j;
 955         int Y1 = 0;
 956         int Y2 = 0;
 957         int U  = -128 << 23; // 19
 958         int V  = -128 << 23;
 959         int R, G, B;
 960
 961         for (j = 0; j < lumFilterSize; j++) {
 962             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 963             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 964         }
 965         for (j = 0; j < chrFilterSize; j++) {
 966             U += chrUSrc[j][i] * chrFilter[j];
 967             V += chrVSrc[j][i] * chrFilter[j];
 968         }
 969
 970         // 8bit: 12+15=27; 16-bit: 12+19=31
 971         Y1 >>= 14; // 10
 972         Y2 >>= 14;
 973         U  >>= 14;
 974         V  >>= 14;
 975
 976         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 977         Y1 -= c->yuv2rgb_y_offset;
 978         Y2 -= c->yuv2rgb_y_offset;
 979         Y1 *= c->yuv2rgb_y_coeff;
 980         Y2 *= c->yuv2rgb_y_coeff;
 981         Y1 += 1 << 13; // 21
 982         Y2 += 1 << 13;
 983         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 984
 985         R = V * c->yuv2rgb_v2r_coeff;
 986         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 987         B =                            U * c->yuv2rgb_u2b_coeff;
 988
 989         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 990         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 991         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 992         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 993         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 994         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 995         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 996         dest += 6;
 997     }
 998 }
 999
1000 static av_always_inline void
1001 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
1002                        const int32_t *ubuf[2], const int32_t *vbuf[2],
1003                        const int32_t *abuf[2], uint16_t *dest, int dstW,
1004                        int yalpha, int uvalpha, int y,
1005                        enum PixelFormat target)
1006 {
1007     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
1008                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1009                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1010     int  yalpha1 = 4095 - yalpha;
1011     int uvalpha1 = 4095 - uvalpha;
1012     int i;
1013
1014     for (i = 0; i < (dstW >> 1); i++) {
1015         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
1016         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
1017         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
1018         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
1019         int R, G, B;
1020
1021         Y1 -= c->yuv2rgb_y_offset;
1022         Y2 -= c->yuv2rgb_y_offset;
1023         Y1 *= c->yuv2rgb_y_coeff;
1024         Y2 *= c->yuv2rgb_y_coeff;
1025         Y1 += 1 << 13;
1026         Y2 += 1 << 13;
1027
1028         R = V * c->yuv2rgb_v2r_coeff;
1029         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1030         B =                            U * c->yuv2rgb_u2b_coeff;
1031
1032         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1033         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1034         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1035         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1036         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1037         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1038         dest += 6;
1039     }
1040 }
1041
1042 static av_always_inline void
1043 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
1044                        const int32_t *ubuf[2], const int32_t *vbuf[2],
1045                        const int32_t *abuf0, uint16_t *dest, int dstW,
1046                        int uvalpha, int y, enum PixelFormat target)
1047 {
1048     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1049                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1050     int i;
1051
1052     if (uvalpha < 2048) {
1053         for (i = 0; i < (dstW >> 1); i++) {
1054             int Y1 = (buf0[i * 2]    ) >> 2;
1055             int Y2 = (buf0[i * 2 + 1]) >> 2;
1056             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
1057             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
1058             int R, G, B;
1059
1060             Y1 -= c->yuv2rgb_y_offset;
1061             Y2 -= c->yuv2rgb_y_offset;
1062             Y1 *= c->yuv2rgb_y_coeff;
1063             Y2 *= c->yuv2rgb_y_coeff;
1064             Y1 += 1 << 13;
1065             Y2 += 1 << 13;
1066
1067             R = V * c->yuv2rgb_v2r_coeff;
1068             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1069             B =                            U * c->yuv2rgb_u2b_coeff;
1070
1071             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1072             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1073             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1074             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1075             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1076             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1077             dest += 6;
1078         }
1079     } else {
1080         for (i = 0; i < (dstW >> 1); i++) {
1081             int Y1 = (buf0[i * 2]    ) >> 2;
1082             int Y2 = (buf0[i * 2 + 1]) >> 2;
1083             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
1084             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
1085             int R, G, B;
1086
1087             Y1 -= c->yuv2rgb_y_offset;
1088             Y2 -= c->yuv2rgb_y_offset;
1089             Y1 *= c->yuv2rgb_y_coeff;
1090             Y2 *= c->yuv2rgb_y_coeff;
1091             Y1 += 1 << 13;
1092             Y2 += 1 << 13;
1093
1094             R = V * c->yuv2rgb_v2r_coeff;
1095             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1096             B =                            U * c->yuv2rgb_u2b_coeff;
1097
1098             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1099             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1100             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1101             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1102             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1103             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1104             dest += 6;
1105         }
1106     }
1107 }
1108
/*
 * Drop the helper macros of the 48-bit writers. The selectors are spelled
 * R_B / B_R (upper case), so those are the names that must be undefined.
 */
#undef output_pixel
#undef R_B
#undef B_R

/* Entry points for the four 48-bit packed RGB/BGR output formats. */
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
1117
1118 static av_always_inline void
1119 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1120               int U, int V, int A1, int A2,
1121               const void *_r, const void *_g, const void *_b, int y,
1122               enum PixelFormat target, int hasAlpha)
1123 {
1124     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1125         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1126         uint32_t *dest = (uint32_t *) _dest;
1127         const uint32_t *r = (const uint32_t *) _r;
1128         const uint32_t *g = (const uint32_t *) _g;
1129         const uint32_t *b = (const uint32_t *) _b;
1130
1131 #if CONFIG_SMALL
1132         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1133
1134         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1135         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1136 #else
1137         if (hasAlpha) {
1138             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1139
1140             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1141             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1142         } else {
1143             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1144             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1145         }
1146 #endif
1147     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1148         uint8_t *dest = (uint8_t *) _dest;
1149         const uint8_t *r = (const uint8_t *) _r;
1150         const uint8_t *g = (const uint8_t *) _g;
1151         const uint8_t *b = (const uint8_t *) _b;
1152
1153 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1154 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1155
1156         dest[i * 6 + 0] = r_b[Y1];
1157         dest[i * 6 + 1] =   g[Y1];
1158         dest[i * 6 + 2] = b_r[Y1];
1159         dest[i * 6 + 3] = r_b[Y2];
1160         dest[i * 6 + 4] =   g[Y2];
1161         dest[i * 6 + 5] = b_r[Y2];
1162 #undef r_b
1163 #undef b_r
1164     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1165                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1166                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1167         uint16_t *dest = (uint16_t *) _dest;
1168         const uint16_t *r = (const uint16_t *) _r;
1169         const uint16_t *g = (const uint16_t *) _g;
1170         const uint16_t *b = (const uint16_t *) _b;
1171         int dr1, dg1, db1, dr2, dg2, db2;
1172
1173         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1174             dr1 = dither_2x2_8[ y & 1     ][0];
1175             dg1 = dither_2x2_4[ y & 1     ][0];
1176             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1177             dr2 = dither_2x2_8[ y & 1     ][1];
1178             dg2 = dither_2x2_4[ y & 1     ][1];
1179             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1180         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1181             dr1 = dither_2x2_8[ y & 1     ][0];
1182             dg1 = dither_2x2_8[ y & 1     ][1];
1183             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1184             dr2 = dither_2x2_8[ y & 1     ][1];
1185             dg2 = dither_2x2_8[ y & 1     ][0];
1186             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1187         } else {
1188             dr1 = dither_4x4_16[ y & 3     ][0];
1189             dg1 = dither_4x4_16[ y & 3     ][1];
1190             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1191             dr2 = dither_4x4_16[ y & 3     ][1];
1192             dg2 = dither_4x4_16[ y & 3     ][0];
1193             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1194         }
1195
1196         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1197         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1198     } else /* 8/4-bit */ {
1199         uint8_t *dest = (uint8_t *) _dest;
1200         const uint8_t *r = (const uint8_t *) _r;
1201         const uint8_t *g = (const uint8_t *) _g;
1202         const uint8_t *b = (const uint8_t *) _b;
1203         int dr1, dg1, db1, dr2, dg2, db2;
1204
1205         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1206             const uint8_t * const d64 = dither_8x8_73[y & 7];
1207             const uint8_t * const d32 = dither_8x8_32[y & 7];
1208             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1209             db1 =       d64[(i * 2 + 0) & 7];
1210             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1211             db2 =       d64[(i * 2 + 1) & 7];
1212         } else {
1213             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1214             const uint8_t * const d128 = dither_8x8_220[y & 7];
1215             dr1 = db1 = d128[(i * 2 + 0) & 7];
1216             dg1 =        d64[(i * 2 + 0) & 7];
1217             dr2 = db2 = d128[(i * 2 + 1) & 7];
1218             dg2 =        d64[(i * 2 + 1) & 7];
1219         }
1220
1221         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1222             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1223                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1224         } else {
1225             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1226             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1227         }
1228     }
1229 }
1230
1231 static av_always_inline void
1232 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1233                      const int16_t **lumSrc, int lumFilterSize,
1234                      const int16_t *chrFilter, const int16_t **chrUSrc,
1235                      const int16_t **chrVSrc, int chrFilterSize,
1236                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1237                      int y, enum PixelFormat target, int hasAlpha)
1238 {
1239     int i;
1240
1241     for (i = 0; i < (dstW >> 1); i++) {
1242         int j;
1243         int Y1 = 1 << 18;
1244         int Y2 = 1 << 18;
1245         int U  = 1 << 18;
1246         int V  = 1 << 18;
1247         int av_unused A1, A2;
1248         const void *r, *g, *b;
1249
1250         for (j = 0; j < lumFilterSize; j++) {
1251             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1252             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1253         }
1254         for (j = 0; j < chrFilterSize; j++) {
1255             U += chrUSrc[j][i] * chrFilter[j];
1256             V += chrVSrc[j][i] * chrFilter[j];
1257         }
1258         Y1 >>= 19;
1259         Y2 >>= 19;
1260         U  >>= 19;
1261         V  >>= 19;
1262         if ((Y1 | Y2 | U | V) & 0x100) {
1263             Y1 = av_clip_uint8(Y1);
1264             Y2 = av_clip_uint8(Y2);
1265             U  = av_clip_uint8(U);
1266             V  = av_clip_uint8(V);
1267         }
1268         if (hasAlpha) {
1269             A1 = 1 << 18;
1270             A2 = 1 << 18;
1271             for (j = 0; j < lumFilterSize; j++) {
1272                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1273                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1274             }
1275             A1 >>= 19;
1276             A2 >>= 19;
1277             if ((A1 | A2) & 0x100) {
1278                 A1 = av_clip_uint8(A1);
1279                 A2 = av_clip_uint8(A2);
1280             }
1281         }
1282
1283         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1284         r =  c->table_rV[V];
1285         g = (c->table_gU[U] + c->table_gV[V]);
1286         b =  c->table_bU[U];
1287
1288         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1289                       r, g, b, y, target, hasAlpha);
1290     }
1291 }
1292
1293 static av_always_inline void
1294 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1295                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1296                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1297                      int yalpha, int uvalpha, int y,
1298                      enum PixelFormat target, int hasAlpha)
1299 {
1300     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1301                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1302                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1303                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1304                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1305     int  yalpha1 = 4095 - yalpha;
1306     int uvalpha1 = 4095 - uvalpha;
1307     int i;
1308
1309     for (i = 0; i < (dstW >> 1); i++) {
1310         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1311         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1312         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1313         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1314         int A1, A2;
1315         const void *r =  c->table_rV[V],
1316                    *g = (c->table_gU[U] + c->table_gV[V]),
1317                    *b =  c->table_bU[U];
1318
1319         if (hasAlpha) {
1320             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1321             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1322         }
1323
1324         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1325                       r, g, b, y, target, hasAlpha);
1326     }
1327 }
1328
1329 static av_always_inline void
1330 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1331                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1332                      const int16_t *abuf0, uint8_t *dest, int dstW,
1333                      int uvalpha, int y, enum PixelFormat target,
1334                      int hasAlpha)
1335 {
1336     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1337                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1338     int i;
1339
1340     if (uvalpha < 2048) {
1341         for (i = 0; i < (dstW >> 1); i++) {
1342             int Y1 = buf0[i * 2]     >> 7;
1343             int Y2 = buf0[i * 2 + 1] >> 7;
1344             int U  = ubuf1[i]        >> 7;
1345             int V  = vbuf1[i]        >> 7;
1346             int A1, A2;
1347             const void *r =  c->table_rV[V],
1348                        *g = (c->table_gU[U] + c->table_gV[V]),
1349                        *b =  c->table_bU[U];
1350
1351             if (hasAlpha) {
1352                 A1 = abuf0[i * 2    ] >> 7;
1353                 A2 = abuf0[i * 2 + 1] >> 7;
1354             }
1355
1356             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1357                           r, g, b, y, target, hasAlpha);
1358         }
1359     } else {
1360         for (i = 0; i < (dstW >> 1); i++) {
1361             int Y1 =  buf0[i * 2]          >> 7;
1362             int Y2 =  buf0[i * 2 + 1]      >> 7;
1363             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1364             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1365             int A1, A2;
1366             const void *r =  c->table_rV[V],
1367                        *g = (c->table_gU[U] + c->table_gV[V]),
1368                        *b =  c->table_bU[U];
1369
1370             if (hasAlpha) {
1371                 A1 = abuf0[i * 2    ] >> 7;
1372                 A2 = abuf0[i * 2 + 1] >> 7;
1373             }
1374
1375             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1376                           r, g, b, y, target, hasAlpha);
1377         }
1378     }
1379 }
1380
1381 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1382 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1383                                 const int16_t **lumSrc, int lumFilterSize, \
1384                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1385                                 const int16_t **chrVSrc, int chrFilterSize, \
1386                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1387                                 int y) \
1388 { \
1389     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1390                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1391                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1392 }
1393 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1394 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1395 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1396                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1397                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1398                                 int yalpha, int uvalpha, int y) \
1399 { \
1400     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1401                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1402 } \
1403  \
1404 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1405                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1406                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1407                                 int uvalpha, int y) \
1408 { \
1409     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1410                                   dstW, uvalpha, y, fmt, hasAlpha); \
1411 }
1412
1413 #if CONFIG_SMALL
1414 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1415 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1416 #else
1417 #if CONFIG_SWSCALE_ALPHA
1418 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1419 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1420 #endif
1421 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1422 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1423 #endif
1424 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1425 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1426 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1427 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1428 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1429 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1430 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1431 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1432
1433 static av_always_inline void
1434 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1435                           const int16_t **lumSrc, int lumFilterSize,
1436                           const int16_t *chrFilter, const int16_t **chrUSrc,
1437                           const int16_t **chrVSrc, int chrFilterSize,
1438                           const int16_t **alpSrc, uint8_t *dest,
1439                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1440 {
1441     int i;
1442     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1443
1444     for (i = 0; i < dstW; i++) {
1445         int j;
1446         int Y = 1<<9;
1447         int U = (1<<9)-(128 << 19);
1448         int V = (1<<9)-(128 << 19);
1449         int av_unused A;
1450         int R, G, B;
1451
1452         for (j = 0; j < lumFilterSize; j++) {
1453             Y += lumSrc[j][i] * lumFilter[j];
1454         }
1455         for (j = 0; j < chrFilterSize; j++) {
1456             U += chrUSrc[j][i] * chrFilter[j];
1457             V += chrVSrc[j][i] * chrFilter[j];
1458         }
1459         Y >>= 10;
1460         U >>= 10;
1461         V >>= 10;
1462         if (hasAlpha) {
1463             A = 1 << 18;
1464             for (j = 0; j < lumFilterSize; j++) {
1465                 A += alpSrc[j][i] * lumFilter[j];
1466             }
1467             A >>= 19;
1468             if (A & 0x100)
1469                 A = av_clip_uint8(A);
1470         }
1471         Y -= c->yuv2rgb_y_offset;
1472         Y *= c->yuv2rgb_y_coeff;
1473         Y += 1 << 21;
1474         R = Y + V*c->yuv2rgb_v2r_coeff;
1475         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1476         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1477         if ((R | G | B) & 0xC0000000) {
1478             R = av_clip_uintp2(R, 30);
1479             G = av_clip_uintp2(G, 30);
1480             B = av_clip_uintp2(B, 30);
1481         }
1482
1483         switch(target) {
1484         case PIX_FMT_ARGB:
1485             dest[0] = hasAlpha ? A : 255;
1486             dest[1] = R >> 22;
1487             dest[2] = G >> 22;
1488             dest[3] = B >> 22;
1489             break;
1490         case PIX_FMT_RGB24:
1491             dest[0] = R >> 22;
1492             dest[1] = G >> 22;
1493             dest[2] = B >> 22;
1494             break;
1495         case PIX_FMT_RGBA:
1496             dest[0] = R >> 22;
1497             dest[1] = G >> 22;
1498             dest[2] = B >> 22;
1499             dest[3] = hasAlpha ? A : 255;
1500             break;
1501         case PIX_FMT_ABGR:
1502             dest[0] = hasAlpha ? A : 255;
1503             dest[1] = B >> 22;
1504             dest[2] = G >> 22;
1505             dest[3] = R >> 22;
1506             break;
1507         case PIX_FMT_BGR24:
1508             dest[0] = B >> 22;
1509             dest[1] = G >> 22;
1510             dest[2] = R >> 22;
1511             break;
1512         case PIX_FMT_BGRA:
1513             dest[0] = B >> 22;
1514             dest[1] = G >> 22;
1515             dest[2] = R >> 22;
1516             dest[3] = hasAlpha ? A : 255;
1517             break;
1518         }
1519         dest += step;
1520     }
1521 }
1522
/*
 * Instantiate the full-chroma packed RGB output wrappers.
 *
 * YUV2RGBWRAPPERX is defined outside this part of the file; presumably its
 * last two arguments are forwarded as the target format and hasAlpha
 * parameters of the template function that ends just above -- TODO confirm
 * against the macro definition.
 *
 * With CONFIG_SMALL one wrapper per 32-bit layout is generated and the alpha
 * argument is the runtime expression CONFIG_SWSCALE_ALPHA && c->alpPixBuf.
 * Otherwise two families are generated: the "a" names with a constant 1
 * (only when CONFIG_SWSCALE_ALPHA is set) and the "x" names with a constant 0.
 * The 24-bit variants always pass 0.
 */
1523 #if CONFIG_SMALL
1524 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1525 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1526 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1527 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1528 #else
1529 #if CONFIG_SWSCALE_ALPHA
/* variants with the alpha argument fixed to 1 */
1530 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1531 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1532 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1533 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1534 #endif
/* variants with the alpha argument fixed to 0 */
1535 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1536 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1537 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1538 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1539 #endif
/* 24-bit targets, alpha argument always 0 */
1540 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1541 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1542
1543 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1544                                        int width, int height,
1545                                        int y, uint8_t val)
1546 {
1547     int i;
1548     uint8_t *ptr = plane + stride*y;
1549     for (i=0; i<height; i++) {
1550         memset(ptr, val, width);
1551         ptr += stride;
1552     }
1553 }
1554
/*
 * Helper macros for the 48-bit RGB/BGR input templates below. All three
 * expect a variable named origin to be in scope at the point of use.
 *
 * input_pixel(pos): read one 16-bit sample at pos, big- or little-endian
 * according to isBE(origin).
 */
1555 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1556
/*
 * r / b: select the red and blue sample from the locals r_b (first sample
 * of the pixel) and b_r (third sample); the two are swapped when origin is
 * one of the BGR48 formats.
 */
1557 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1558 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1559
1560 static av_always_inline void
1561 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1562                     enum PixelFormat origin)
1563 {
1564     int i;
1565     for (i = 0; i < width; i++) {
1566         unsigned int r_b = input_pixel(&src[i*3+0]);
1567         unsigned int   g = input_pixel(&src[i*3+1]);
1568         unsigned int b_r = input_pixel(&src[i*3+2]);
1569
1570         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1571     }
1572 }
1573
1574 static av_always_inline void
1575 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1576                     const uint16_t *src1, const uint16_t *src2,
1577                     int width, enum PixelFormat origin)
1578 {
1579     int i;
1580     assert(src1==src2);
1581     for (i = 0; i < width; i++) {
1582         int r_b = input_pixel(&src1[i*3+0]);
1583         int   g = input_pixel(&src1[i*3+1]);
1584         int b_r = input_pixel(&src1[i*3+2]);
1585
1586         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1587         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1588     }
1589 }
1590
1591 static av_always_inline void
1592 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1593                           const uint16_t *src1, const uint16_t *src2,
1594                           int width, enum PixelFormat origin)
1595 {
1596     int i;
1597     assert(src1==src2);
1598     for (i = 0; i < width; i++) {
1599         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1600         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1601         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1602
1603         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1604         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1605     }
1606 }
1607
1608 #undef r
1609 #undef b
1610 #undef input_pixel
1611
/*
 * rgb48funcs(pattern, BE_LE, origin) generates three static functions for
 * one 48-bit input format:
 *   <pattern>48<BE_LE>ToY_c, <pattern>48<BE_LE>ToUV_c and
 *   <pattern>48<BE_LE>ToUV_half_c.
 * Each one casts its byte pointers to uint16_t pointers and forwards to the
 * matching rgb48To*_c_template with origin as the format argument. The
 * trailing uint32_t *unused parameter is ignored.
 */
1612 #define rgb48funcs(pattern, BE_LE, origin) \
1613 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1614                                     int width, uint32_t *unused) \
1615 { \
1616     const uint16_t *src = (const uint16_t *) _src; \
1617     uint16_t *dst = (uint16_t *) _dst; \
1618     rgb48ToY_c_template(dst, src, width, origin); \
1619 } \
1620  \
1621 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1622                                     const uint8_t *_src1, const uint8_t *_src2, \
1623                                     int width, uint32_t *unused) \
1624 { \
1625     const uint16_t *src1 = (const uint16_t *) _src1, \
1626                    *src2 = (const uint16_t *) _src2; \
1627     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1628     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1629 } \
1630  \
1631 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1632                                     const uint8_t *_src1, const uint8_t *_src2, \
1633                                     int width, uint32_t *unused) \
1634 { \
1635     const uint16_t *src1 = (const uint16_t *) _src1, \
1636                    *src2 = (const uint16_t *) _src2; \
1637     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1638     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1639 }
1640
/* one instantiation per channel order and endianness */
1641 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1642 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1643 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1644 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1645
/*
 * input_pixel(i) for the 16/32-bit packed RGB templates below: fetch pixel
 * number i from the byte pointer src (which, like origin, must be in scope
 * at the point of use). For the four 32-bit formats listed it reads a
 * 32-bit word with AV_RN32A; for everything else it reads a 16-bit word,
 * big- or little-endian according to isBE(origin).
 */
1646 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1647                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1648                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1649
1650 static av_always_inline void
1651 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1652                        int width, enum PixelFormat origin,
1653                        int shr,   int shg,   int shb, int shp,
1654                        int maskr, int maskg, int maskb,
1655                        int rsh,   int gsh,   int bsh, int S)
1656 {
1657     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1658               rnd = (32<<((S)-1)) + (1<<(S-7));
1659     int i;
1660
1661     for (i = 0; i < width; i++) {
1662         int px = input_pixel(i) >> shp;
1663         int b = (px & maskb) >> shb;
1664         int g = (px & maskg) >> shg;
1665         int r = (px & maskr) >> shr;
1666
1667         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1668     }
1669 }
1670
1671 static av_always_inline void
1672 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1673                         const uint8_t *src, int width,
1674                         enum PixelFormat origin,
1675                         int shr,   int shg,   int shb, int shp,
1676                         int maskr, int maskg, int maskb,
1677                         int rsh,   int gsh,   int bsh, int S)
1678 {
1679     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1680               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1681               rnd = (256<<((S)-1)) + (1<<(S-7));
1682     int i;
1683
1684     for (i = 0; i < width; i++) {
1685         int px = input_pixel(i) >> shp;
1686         int b = (px & maskb) >> shb;
1687         int g = (px & maskg) >> shg;
1688         int r = (px & maskr) >> shr;
1689
1690         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1691         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1692     }
1693 }
1694
/**
 * Convert one line of packed 16- or 32-bit RGB to U and V samples at half
 * horizontal resolution: each output sample is derived from the sum of two
 * adjacent input pixels (2*i and 2*i+1).
 *
 * The two pixels are added while still packed. To keep channels apart, the
 * bits outside maskr|maskb (maskgx) are summed separately into g and the
 * remainder into rb. The final right shift is one larger than in the
 * per-pixel template ((S)-6+1 instead of (S)-6), which compensates for the
 * two-pixel sum.
 */
1695 static av_always_inline void
1696 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1697                              const uint8_t *src, int width,
1698                              enum PixelFormat origin,
1699                              int shr,   int shg,   int shb, int shp,
1700                              int maskr, int maskg, int maskb,
1701                              int rsh,   int gsh,   int bsh, int S)
1702 {
    /* coefficients pre-shifted per channel; rnd is computed in unsigned
     * arithmetic, stored in an int and cast back to unsigned where used */
1703     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1704               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1705               rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
1706     int i;
1707
    /* widen every channel mask by one bit; presumably this makes room for
     * the carry produced by adding two pixels -- TODO confirm */
1708     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1709     for (i = 0; i < width; i++) {
1710         int px0 = input_pixel(2 * i + 0) >> shp;
1711         int px1 = input_pixel(2 * i + 1) >> shp;
        /* g: sum of the bits outside the red/blue masks; rb: sum of the rest */
1712         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1713         int rb = px0 + px1 - g;
1714
1715         b = (rb & maskb) >> shb;
        /* with a pixel shift or a 565 format g is only shifted; in all
         * other cases it is additionally masked with the widened maskg */
1716         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1717             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1718             g >>= shg;
1719         } else {
1720             g = (g  & maskg) >> shg;
1721         }
1722         r = (rb & maskr) >> shr;
1723
1724         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1725         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1726     }
1727 }
1728
1729 #undef input_pixel
1730
/*
 * rgb16_32_wrapper(fmt, name, ...) generates three static functions,
 * name##ToY_c, name##ToUV_c and name##ToUV_half_c, for one packed 16- or
 * 32-bit RGB input format. Each forwards to the matching rgb16_32To*
 * template with fmt as the origin and the remaining macro arguments as the
 * channel shifts, channel masks, coefficient shifts and S. The dummy and
 * unused parameters of the generated functions are ignored.
 */
1731 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1732                          maskg, maskb, rsh, gsh, bsh, S) \
1733 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1734                           int width, uint32_t *unused) \
1735 { \
1736     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1737                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1738 } \
1739  \
1740 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1741                            const uint8_t *src, const uint8_t *dummy, \
1742                            int width, uint32_t *unused) \
1743 { \
1744     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1745                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1746 } \
1747  \
1748 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1749                                 const uint8_t *src, const uint8_t *dummy, \
1750                                 int width, uint32_t *unused) \
1751 { \
1752     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1753                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1754 }
1755
/*
 * Argument columns: fmt, name, shr, shg, shb, shp, maskr, maskg, maskb,
 * rsh, gsh, bsh, S. The 32-bit formats pass the channel byte positions via
 * shr/shb (the *_1 variants additionally pass shp = 8); the 15/16-bit
 * formats leave the pixel shifts at 0 and pass rsh/gsh/bsh instead.
 */
1756 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1757 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1758 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1759 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1760 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1761 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1762 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1763 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1764 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1765 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1766 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1767 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1768
/**
 * Extract the alpha channel from a line of 4-byte pixels whose first byte
 * is alpha, scaling each 8-bit value up by 6 bits.
 *
 * The last parameter is not used.
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *alpha = src;
    int x;

    for (x = 0; x < width; x++)
        dst[x] = alpha[4 * x] << 6;
}
1776
/**
 * Extract the alpha channel from a line of 4-byte pixels whose fourth byte
 * is alpha, scaling each 8-bit value up by 6 bits.
 *
 * The last parameter is not used.
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *alpha = src + 3;
    int x;

    for (x = 0; x < width; x++)
        dst[x] = alpha[4 * x] << 6;
}
1784
/**
 * Look up every palette index of a line in pal and store bits 24..31 of
 * the palette entry, scaled up by 6 bits.
 */
static void palToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];
        const uint32_t entry = pal[index];

        dst[x] = (entry >> 24) << 6;
    }
}
1794
/**
 * Look up every palette index of a line in pal and store the low 8 bits
 * of the palette entry, scaled up by 6 bits.
 */
static void palToY_c(int16_t *dst, const uint8_t *src, long width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];
        const uint32_t entry = pal[index];

        dst[x] = (entry & 0xFF) << 6;
    }
}
1804
/**
 * Look up every palette index of a line in pal and store bits 8..15 of the
 * entry in dstU and bits 16..23 in dstV, each scaled up by 6 bits.
 *
 * src1 and src2 must be the same pointer (asserted); only src1 is read.
 */
static void palToUV_c(uint16_t *dstU, int16_t *dstV,
                           const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *pal)
{
    int x;

    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const int entry = pal[src1[x]];
        const int u = (entry >>  8) & 0xFF;
        const int v = (entry >> 16) & 0xFF;

        dstU[x] = u << 6;
        dstV[x] = v << 6;
    }
}
1818
/**
 * Expand a line of 1 bit-per-pixel input, most significant bit first, with
 * inverted polarity: a 0 bit produces 16383 and a 1 bit produces 0.
 *
 * width is in pixels; a final partial byte is consumed when width is not a
 * multiple of 8. The last parameter is not used.
 */
static void monowhite2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int i, j;

    for (i = 0; i < width/8; i++) {
        /* Invert inside the low 8 bits and keep the value unsigned: a plain
         * ~src[i] is a negative int, and right-shifting a negative value is
         * implementation-defined. */
        const unsigned d = src[i] ^ 0xFFu;
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d >> (7-j)) & 1) * 16383;
    }
    if (width & 7) {
        /* trailing partial byte: only the top (width & 7) bits are used */
        const unsigned d = src[i] ^ 0xFFu;
        for (j = 0; j < (width & 7); j++)
            dst[8*i+j] = ((d >> (7-j)) & 1) * 16383;
    }
}
1833
/**
 * Expand a line of 1 bit-per-pixel input, most significant bit first:
 * a 1 bit produces 16383 and a 0 bit produces 0.
 *
 * width is in pixels; a final partial byte is consumed when width is not a
 * multiple of 8. The last parameter is not used.
 */
static void monoblack2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const int whole_bytes = width / 8;
    const int tail_bits   = width & 7;
    int byte_idx, bit;

    for (byte_idx = 0; byte_idx < whole_bytes; byte_idx++) {
        const int bits = src[byte_idx];
        int16_t *out = dst + 8 * byte_idx;

        for (bit = 0; bit < 8; bit++)
            out[bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
    if (tail_bits) {
        const int bits = src[byte_idx];
        int16_t *out = dst + 8 * byte_idx;

        for (bit = 0; bit < tail_bits; bit++)
            out[bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
}
1848
1849 //FIXME yuy2* can read up to 7 samples too many
1850
/**
 * Copy every even-indexed byte of src (bytes 0, 2, 4, ...) to dst,
 * width bytes in total. The last parameter is not used.
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *pair = src + 2 * x;
        dst[x] = pair[0];
    }
}
1858
/**
 * Split the chroma out of a line made of 4-byte groups: byte 1 of each
 * group goes to dstU and byte 3 to dstV, width groups in total.
 *
 * Only src1 is read; src1 and src2 are asserted to be equal after the
 * copy. The last parameter is not used.
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4 * x;

        dstU[x] = group[1];
        dstV[x] = group[3];
    }
    assert(src1 == src2);
}
1869
/**
 * Copy a line of 16-bit samples from _src to _dst, passing every sample
 * through av_bswap16(). width counts 16-bit samples; the last parameter
 * is not used.
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in = (const uint16_t *) _src;
    uint16_t *out = (uint16_t *) _dst;
    int x;

    for (x = 0; x < width; x++)
        out[x] = av_bswap16(in[x]);
}
1879
/**
 * Copy two lines of 16-bit samples (_src1 to _dstU, _src2 to _dstV),
 * passing every sample through av_bswap16(). width counts 16-bit samples;
 * the last parameter is not used.
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *in_u = (const uint16_t *) _src1;
    const uint16_t *in_v = (const uint16_t *) _src2;
    uint16_t *out_u = (uint16_t *) _dstU;
    uint16_t *out_v = (uint16_t *) _dstV;
    int x;

    for (x = 0; x < width; x++) {
        out_u[x] = av_bswap16(in_u[x]);
        out_v[x] = av_bswap16(in_v[x]);
    }
}
1892
1893 /* This is almost identical to the previous, and exists only because
1894  * (yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Copy every odd-indexed byte of src (bytes 1, 3, 5, ...) to dst,
 * width bytes in total. The last parameter is not used.
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *pair = src + 2 * x;
        dst[x] = pair[1];
    }
}
1902
/**
 * Split the chroma out of a line made of 4-byte groups: byte 0 of each
 * group goes to dstU and byte 2 to dstV, width groups in total.
 *
 * Only src1 is read; src1 and src2 are asserted to be equal after the
 * copy. The last parameter is not used.
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4 * x;

        dstU[x] = group[0];
        dstV[x] = group[2];
    }
    assert(src1 == src2);
}
1913
1914 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1915                                         const uint8_t *src, int width)
1916 {
1917     int i;
1918     for (i = 0; i < width; i++) {
1919         dst1[i] = src[2*i+0];
1920         dst2[i] = src[2*i+1];
1921     }
1922 }
1923
/**
 * De-interleave a line of byte pairs read from src1 via nvXXtoUV_c(): the
 * first byte of every pair goes to dstU and the second to dstV.
 * src2 and unused are ignored.
 */
1924 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1925                        const uint8_t *src1, const uint8_t *src2,
1926                        int width, uint32_t *unused)
1927 {
1928     nvXXtoUV_c(dstU, dstV, src1, width);
1929 }
1930
/**
 * De-interleave a line of byte pairs read from src1 via nvXXtoUV_c() with
 * the destinations swapped: the first byte of every pair goes to dstV and
 * the second to dstU. src2 and unused are ignored.
 */
1931 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1932                        const uint8_t *src1, const uint8_t *src2,
1933                        int width, uint32_t *unused)
1934 {
    /* note the swapped destination order compared to nv12ToUV_c */
1935     nvXXtoUV_c(dstV, dstU, src1, width);
1936 }
1937
/* Read one 16-bit sample at pos, big- or little-endian according to
 * isBE(origin); requires a variable named origin at the point of use.
 * NOTE(review): the 24-bit converters and scalers that follow in this part
 * of the file do not use this macro and no matching #undef is visible
 * nearby -- confirm where it is consumed. */
1938 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1939
1940 static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
1941                        int width, uint32_t *unused)
1942 {
1943     int i;
1944     for (i=0; i<width; i++) {
1945         int b= src[i*3+0];
1946         int g= src[i*3+1];
1947         int r= src[i*3+2];
1948
1949         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1950     }
1951 }
1952
1953 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1954                         const uint8_t *src2, int width, uint32_t *unused)
1955 {
1956     int i;
1957     for (i=0; i<width; i++) {
1958         int b= src1[3*i + 0];
1959         int g= src1[3*i + 1];
1960         int r= src1[3*i + 2];
1961
1962         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1963         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1964     }
1965     assert(src1 == src2);
1966 }
1967
1968 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1969                              const uint8_t *src2, int width, uint32_t *unused)
1970 {
1971     int i;
1972     for (i=0; i<width; i++) {
1973         int b= src1[6*i + 0] + src1[6*i + 3];
1974         int g= src1[6*i + 1] + src1[6*i + 4];
1975         int r= src1[6*i + 2] + src1[6*i + 5];
1976
1977         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1978         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1979     }
1980     assert(src1 == src2);
1981 }
1982
1983 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, int width,
1984                        uint32_t *unused)
1985 {
1986     int i;
1987     for (i=0; i<width; i++) {
1988         int r= src[i*3+0];
1989         int g= src[i*3+1];
1990         int b= src[i*3+2];
1991
1992         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1993     }
1994 }
1995
1996 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1997                         const uint8_t *src2, int width, uint32_t *unused)
1998 {
1999     int i;
2000     assert(src1==src2);
2001     for (i=0; i<width; i++) {
2002         int r= src1[3*i + 0];
2003         int g= src1[3*i + 1];
2004         int b= src1[3*i + 2];
2005
2006         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
2007         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
2008     }
2009 }
2010
2011 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
2012                                     const uint8_t *src2, int width, uint32_t *unused)
2013 {
2014     int i;
2015     assert(src1==src2);
2016     for (i=0; i<width; i++) {
2017         int r= src1[6*i + 0] + src1[6*i + 3];
2018         int g= src1[6*i + 1] + src1[6*i + 4];
2019         int b= src1[6*i + 2] + src1[6*i + 5];
2020
2021         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
2022         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
2023     }
2024 }
2025
2026 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
2027                            const int16_t *filter,
2028                            const int16_t *filterPos, int filterSize)
2029 {
2030     int i;
2031     int32_t *dst = (int32_t *) _dst;
2032     const uint16_t *src = (const uint16_t *) _src;
2033     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2034     int sh = bits - 4;
2035
2036     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2037         sh= 9;
2038
2039     for (i = 0; i < dstW; i++) {
2040         int j;
2041         int srcPos = filterPos[i];
2042         int val = 0;
2043
2044         for (j = 0; j < filterSize; j++) {
2045             val += src[srcPos + j] * filter[filterSize * i + j];
2046         }
2047         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2048         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
2049     }
2050 }
2051
2052 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2053                            const int16_t *filter,
2054                            const int16_t *filterPos, int filterSize)
2055 {
2056     int i;
2057     const uint16_t *src = (const uint16_t *) _src;
2058     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2059
2060     for (i = 0; i < dstW; i++) {
2061         int j;
2062         int srcPos = filterPos[i];
2063         int val = 0;
2064
2065         for (j = 0; j < filterSize; j++) {
2066             val += src[srcPos + j] * filter[filterSize * i + j];
2067         }
2068         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2069         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2070     }
2071 }
2072
2073 // bilinear / bicubic scaling
2074 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2075                           const int16_t *filter, const int16_t *filterPos,
2076                           int filterSize)
2077 {
2078     int i;
2079     for (i=0; i<dstW; i++) {
2080         int j;
2081         int srcPos= filterPos[i];
2082         int val=0;
2083         for (j=0; j<filterSize; j++) {
2084             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2085         }
2086         //filter += hFilterSize;
2087         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2088         //dst[i] = val>>7;
2089     }
2090 }
2091
/**
 * Horizontally filter a line of native-endian 16-bit samples into 16-bit
 * output.
 *
 * For output i the filterSize taps starting at filter[filterSize*i] are
 * applied to the source samples starting at filterPos[i]. The sum is
 * shifted right by shift and clamped from above to (1<<15)-1. srcW and
 * xInc are not used.
 */
static inline void hScale16N_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const uint16_t *window = src + filterPos[x];
        const int16_t  *taps   = filter + filterSize * x;
        int acc = 0;
        int k;

        for (k = 0; k < filterSize; k++)
            acc += ((int)window[k]) * taps[k];

        /* the cubic filter can overshoot, hence the upper clamp */
        dst[x] = FFMIN(acc >> shift, (1 << 15) - 1);
    }
}
2106
/**
 * Horizontally filter a line of 16-bit samples into 16-bit output, passing
 * every source sample through av_bswap16() before it is multiplied.
 *
 * For output i the filterSize taps starting at filter[filterSize*i] are
 * applied to the source samples starting at filterPos[i]. The sum is
 * shifted right by shift and clamped from above to (1<<15)-1. srcW and
 * xInc are not used.
 */
static inline void hScale16NX_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const uint16_t *window = src + filterPos[x];
        const int16_t  *taps   = filter + filterSize * x;
        int acc = 0;
        int k;

        for (k = 0; k < filterSize; k++)
            acc += ((int)av_bswap16(window[k])) * taps[k];

        /* the cubic filter can overshoot, hence the upper clamp */
        dst[x] = FFMIN(acc >> shift, (1 << 15) - 1);
    }
}
2120
2121 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2122                           const int16_t *filter, const int16_t *filterPos,
2123                           int filterSize)
2124 {
2125     int i;
2126     int32_t *dst = (int32_t *) _dst;
2127     for (i=0; i<dstW; i++) {
2128         int j;
2129         int srcPos= filterPos[i];
2130         int val=0;
2131         for (j=0; j<filterSize; j++) {
2132             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2133         }
2134         //filter += hFilterSize;
2135         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2136         //dst[i] = val>>7;
2137     }
2138 }
2139
2140 //FIXME all pal and rgb srcFormats could do this conversion as well
2141 //FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place chroma range conversion on 16-bit U and V lines: each sample is
 * first limited to at most 30775, then mapped to
 * (x*4663 - 9289992) >> 12.
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int u = FFMIN(dstU[x], 30775);
        dstU[x] = (u * 4663 - 9289992) >> 12; //-264

        {
            const int v = FFMIN(dstV[x], 30775);
            dstV[x] = (v * 4663 - 9289992) >> 12; //-264
        }
    }
}
/**
 * In-place chroma range conversion on 16-bit U and V lines: each sample x
 * is replaced by (x*1799 + 4081085) >> 11.
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    const int scale  = 1799;
    const int offset = 4081085;
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = (dstU[x] * scale + offset) >> 11; //1469
        dstV[x] = (dstV[x] * scale + offset) >> 11; //1469
    }
}
/**
 * In-place luma range conversion on a 16-bit line: each sample is first
 * limited to at most 30189, then mapped to (x*19077 - 39057361) >> 14.
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int y = FFMIN(dst[x], 30189);
        dst[x] = (y * 19077 - 39057361) >> 14;
    }
}
/**
 * In-place luma range conversion on a 16-bit line: each sample x is
 * replaced by (x*14071 + 33561947) >> 14.
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    const int scale  = 14071;
    const int offset = 33561947;
    int x;

    for (x = 0; x < width; x++) {
        const int y = dst[x];
        dst[x] = (y * scale + offset) >> 14;
    }
}
2170
/**
 * In-place chroma range conversion on U and V lines that are accessed as
 * int32_t: each sample is first limited to at most 30775<<4, then mapped
 * to (x*4663 - (9289992<<4)) >> 12.
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u_line = (int32_t *) _dstU;
    int32_t *v_line = (int32_t *) _dstV;
    int x;

    for (x = 0; x < width; x++) {
        u_line[x] = (FFMIN(u_line[x], 30775<<4) * 4663 - (9289992<<4)) >> 12; //-264
        v_line[x] = (FFMIN(v_line[x], 30775<<4) * 4663 - (9289992<<4)) >> 12; //-264
    }
}
/**
 * In-place chroma range conversion on U and V lines that are accessed as
 * int32_t: each sample x is replaced by (x*1799 + (4081085<<4)) >> 11.
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u_line = (int32_t *) _dstU;
    int32_t *v_line = (int32_t *) _dstV;
    const int scale  = 1799;
    const int offset = 4081085 << 4;
    int x;

    for (x = 0; x < width; x++) {
        u_line[x] = (u_line[x] * scale + offset) >> 11; //1469
        v_line[x] = (v_line[x] * scale + offset) >> 11; //1469
    }
}
/**
 * In-place luma range conversion on a line that is accessed as int32_t:
 * each sample is first limited to at most 30189<<4, then mapped to
 * (x*4769 - (39057361<<2)) >> 12.
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int32_t *line = (int32_t *) _dst;
    int x;

    for (x = 0; x < width; x++) {
        const int y = FFMIN(line[x], 30189<<4);
        line[x] = (y * 4769 - (39057361<<2)) >> 12;
    }
}
/**
 * In-place luma range conversion on a line that is accessed as int32_t:
 * each sample x is replaced by
 * (x*(14071/4) + (33561947<<4)/4) >> 12.
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int32_t *line = (int32_t *) _dst;
    const int scale  = 14071 / 4;
    const int offset = (33561947 << 4) / 4;
    int x;

    for (x = 0; x < width; x++)
        line[x] = (line[x] * scale + offset) >> 12;
}
2205
2206 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2207                            const uint8_t *src, int srcW, int xInc)
2208 {
2209     int i;
2210     unsigned int xpos=0;
2211     for (i=0;i<dstWidth;i++) {
2212         register unsigned int xx=xpos>>16;
2213         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2214         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2215         xpos+=xInc;
2216     }
2217     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2218         dst[i] = src[srcW-1]*128;
2219 }
2220
2221 // *** horizontal scale Y line to temp buffer
2222 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2223                                      const uint8_t *src, int srcW, int xInc,
2224                                      const int16_t *hLumFilter,
2225                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2226                                      uint8_t *formatConvBuffer,
2227                                      uint32_t *pal, int isAlpha)
2228 {
2229     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2230     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2231
2232     if (toYV12) {
2233         toYV12(formatConvBuffer, src, srcW, pal);
2234         src= formatConvBuffer;
2235     }
2236
2237     if (c->hScale16) {
2238         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2239         c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift);
2240     } else if (!c->hyscale_fast) {
2241         c->hScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2242     } else { // fast bilinear upscale / crap downscale
2243         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2244     }
2245
2246     if (convertRange)
2247         convertRange(dst, dstWidth);
2248 }
2249
2250 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2251                            int dstWidth, const uint8_t *src1,
2252                            const uint8_t *src2, int srcW, int xInc)
2253 {
2254     int i;
2255     unsigned int xpos=0;
2256     for (i=0;i<dstWidth;i++) {
2257         register unsigned int xx=xpos>>16;
2258         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2259         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2260         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2261         xpos+=xInc;
2262     }
2263     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2264         dst1[i] = src1[srcW-1]*128;
2265         dst2[i] = src2[srcW-1]*128;
2266     }
2267 }
2268
2269 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2270                                      const uint8_t *src1, const uint8_t *src2,
2271                                      int srcW, int xInc, const int16_t *hChrFilter,
2272                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2273                                      uint8_t *formatConvBuffer, uint32_t *pal)
2274 {
2275     if (c->chrToYV12) {
2276         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2277         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2278         src1= formatConvBuffer;
2279         src2= buf2;
2280     }
2281
2282     if (c->hScale16) {
2283         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2284         c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2285         c->hScale16(dst2, dstWidth, (const uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2286     } else if (!c->hcscale_fast) {
2287         c->hScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2288         c->hScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2289     } else { // fast bilinear upscale / crap downscale
2290         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2291     }
2292
2293     if (c->chrConvertRange)
2294         c->chrConvertRange(dst1, dst2, dstWidth);
2295 }
2296
2297 static av_always_inline void
2298 find_c_packed_planar_out_funcs(SwsContext *c,
2299                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2300                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2301                                yuv2packedX_fn *yuv2packedX)
2302 {
2303     enum PixelFormat dstFormat = c->dstFormat;
2304
2305     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2306         *yuv2yuvX     = yuv2nv12X_c;
2307     } else if (is16BPS(dstFormat)) {
2308         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2309     } else if (is9_OR_10BPS(dstFormat)) {
2310         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2311             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2312         } else {
2313             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2314         }
2315     } else {
2316         *yuv2yuv1     = yuv2yuv1_c;
2317         *yuv2yuvX     = yuv2yuvX_c;
2318     }
2319     if(c->flags & SWS_FULL_CHR_H_INT) {
2320         switch (dstFormat) {
2321             case PIX_FMT_RGBA:
2322 #if CONFIG_SMALL
2323                 *yuv2packedX = yuv2rgba32_full_X_c;
2324 #else
2325 #if CONFIG_SWSCALE_ALPHA
2326                 if (c->alpPixBuf) {
2327                     *yuv2packedX = yuv2rgba32_full_X_c;
2328                 } else
2329 #endif /* CONFIG_SWSCALE_ALPHA */
2330                 {
2331                     *yuv2packedX = yuv2rgbx32_full_X_c;
2332                 }
2333 #endif /* !CONFIG_SMALL */
2334                 break;
2335             case PIX_FMT_ARGB:
2336 #if CONFIG_SMALL
2337                 *yuv2packedX = yuv2argb32_full_X_c;
2338 #else
2339 #if CONFIG_SWSCALE_ALPHA
2340                 if (c->alpPixBuf) {
2341                     *yuv2packedX = yuv2argb32_full_X_c;
2342                 } else
2343 #endif /* CONFIG_SWSCALE_ALPHA */
2344                 {
2345                     *yuv2packedX = yuv2xrgb32_full_X_c;
2346                 }
2347 #endif /* !CONFIG_SMALL */
2348                 break;
2349             case PIX_FMT_BGRA:
2350 #if CONFIG_SMALL
2351                 *yuv2packedX = yuv2bgra32_full_X_c;
2352 #else
2353 #if CONFIG_SWSCALE_ALPHA
2354                 if (c->alpPixBuf) {
2355                     *yuv2packedX = yuv2bgra32_full_X_c;
2356                 } else
2357 #endif /* CONFIG_SWSCALE_ALPHA */
2358                 {
2359                     *yuv2packedX = yuv2bgrx32_full_X_c;
2360                 }
2361 #endif /* !CONFIG_SMALL */
2362                 break;
2363             case PIX_FMT_ABGR:
2364 #if CONFIG_SMALL
2365                 *yuv2packedX = yuv2abgr32_full_X_c;
2366 #else
2367 #if CONFIG_SWSCALE_ALPHA
2368                 if (c->alpPixBuf) {
2369                     *yuv2packedX = yuv2abgr32_full_X_c;
2370                 } else
2371 #endif /* CONFIG_SWSCALE_ALPHA */
2372                 {
2373                     *yuv2packedX = yuv2xbgr32_full_X_c;
2374                 }
2375 #endif /* !CONFIG_SMALL */
2376                 break;
2377             case PIX_FMT_RGB24:
2378             *yuv2packedX = yuv2rgb24_full_X_c;
2379             break;
2380         case PIX_FMT_BGR24:
2381             *yuv2packedX = yuv2bgr24_full_X_c;
2382             break;
2383         }
2384         if(!*yuv2packedX)
2385             goto YUV_PACKED;
2386     } else {
2387         YUV_PACKED:
2388         switch (dstFormat) {
2389         case PIX_FMT_GRAY16BE:
2390             *yuv2packed1 = yuv2gray16BE_1_c;
2391             *yuv2packed2 = yuv2gray16BE_2_c;
2392             *yuv2packedX = yuv2gray16BE_X_c;
2393             break;
2394         case PIX_FMT_GRAY16LE:
2395             *yuv2packed1 = yuv2gray16LE_1_c;
2396             *yuv2packed2 = yuv2gray16LE_2_c;
2397             *yuv2packedX = yuv2gray16LE_X_c;
2398             break;
2399         case PIX_FMT_MONOWHITE:
2400             *yuv2packed1 = yuv2monowhite_1_c;
2401             *yuv2packed2 = yuv2monowhite_2_c;
2402             *yuv2packedX = yuv2monowhite_X_c;
2403             break;
2404         case PIX_FMT_MONOBLACK:
2405             *yuv2packed1 = yuv2monoblack_1_c;
2406             *yuv2packed2 = yuv2monoblack_2_c;
2407             *yuv2packedX = yuv2monoblack_X_c;
2408             break;
2409         case PIX_FMT_YUYV422:
2410             *yuv2packed1 = yuv2yuyv422_1_c;
2411             *yuv2packed2 = yuv2yuyv422_2_c;
2412             *yuv2packedX = yuv2yuyv422_X_c;
2413             break;
2414         case PIX_FMT_UYVY422:
2415             *yuv2packed1 = yuv2uyvy422_1_c;
2416             *yuv2packed2 = yuv2uyvy422_2_c;
2417             *yuv2packedX = yuv2uyvy422_X_c;
2418             break;
2419         case PIX_FMT_RGB48LE:
2420             *yuv2packed1 = yuv2rgb48le_1_c;
2421             *yuv2packed2 = yuv2rgb48le_2_c;
2422             *yuv2packedX = yuv2rgb48le_X_c;
2423             break;
2424         case PIX_FMT_RGB48BE:
2425             *yuv2packed1 = yuv2rgb48be_1_c;
2426             *yuv2packed2 = yuv2rgb48be_2_c;
2427             *yuv2packedX = yuv2rgb48be_X_c;
2428             break;
2429         case PIX_FMT_BGR48LE:
2430             *yuv2packed1 = yuv2bgr48le_1_c;
2431             *yuv2packed2 = yuv2bgr48le_2_c;
2432             *yuv2packedX = yuv2bgr48le_X_c;
2433             break;
2434         case PIX_FMT_BGR48BE:
2435             *yuv2packed1 = yuv2bgr48be_1_c;
2436             *yuv2packed2 = yuv2bgr48be_2_c;
2437             *yuv2packedX = yuv2bgr48be_X_c;
2438             break;
2439         case PIX_FMT_RGB32:
2440         case PIX_FMT_BGR32:
2441 #if CONFIG_SMALL
2442             *yuv2packed1 = yuv2rgb32_1_c;
2443             *yuv2packed2 = yuv2rgb32_2_c;
2444             *yuv2packedX = yuv2rgb32_X_c;
2445 #else
2446 #if CONFIG_SWSCALE_ALPHA
2447                 if (c->alpPixBuf) {
2448                     *yuv2packed1 = yuv2rgba32_1_c;
2449                     *yuv2packed2 = yuv2rgba32_2_c;
2450                     *yuv2packedX = yuv2rgba32_X_c;
2451                 } else
2452 #endif /* CONFIG_SWSCALE_ALPHA */
2453                 {
2454                     *yuv2packed1 = yuv2rgbx32_1_c;
2455                     *yuv2packed2 = yuv2rgbx32_2_c;
2456                     *yuv2packedX = yuv2rgbx32_X_c;
2457                 }
2458 #endif /* !CONFIG_SMALL */
2459             break;
2460         case PIX_FMT_RGB32_1:
2461         case PIX_FMT_BGR32_1:
2462 #if CONFIG_SMALL
2463                 *yuv2packed1 = yuv2rgb32_1_1_c;
2464                 *yuv2packed2 = yuv2rgb32_1_2_c;
2465                 *yuv2packedX = yuv2rgb32_1_X_c;
2466 #else
2467 #if CONFIG_SWSCALE_ALPHA
2468                 if (c->alpPixBuf) {
2469                     *yuv2packed1 = yuv2rgba32_1_1_c;
2470                     *yuv2packed2 = yuv2rgba32_1_2_c;
2471                     *yuv2packedX = yuv2rgba32_1_X_c;
2472                 } else
2473 #endif /* CONFIG_SWSCALE_ALPHA */
2474                 {
2475                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2476                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2477                     *yuv2packedX = yuv2rgbx32_1_X_c;
2478                 }
2479 #endif /* !CONFIG_SMALL */
2480                 break;
2481         case PIX_FMT_RGB24:
2482             *yuv2packed1 = yuv2rgb24_1_c;
2483             *yuv2packed2 = yuv2rgb24_2_c;
2484             *yuv2packedX = yuv2rgb24_X_c;
2485             break;
2486         case PIX_FMT_BGR24:
2487             *yuv2packed1 = yuv2bgr24_1_c;
2488             *yuv2packed2 = yuv2bgr24_2_c;
2489             *yuv2packedX = yuv2bgr24_X_c;
2490             break;
2491         case PIX_FMT_RGB565LE:
2492         case PIX_FMT_RGB565BE:
2493         case PIX_FMT_BGR565LE:
2494         case PIX_FMT_BGR565BE:
2495             *yuv2packed1 = yuv2rgb16_1_c;
2496             *yuv2packed2 = yuv2rgb16_2_c;
2497             *yuv2packedX = yuv2rgb16_X_c;
2498             break;
2499         case PIX_FMT_RGB555LE:
2500         case PIX_FMT_RGB555BE:
2501         case PIX_FMT_BGR555LE:
2502         case PIX_FMT_BGR555BE:
2503             *yuv2packed1 = yuv2rgb15_1_c;
2504             *yuv2packed2 = yuv2rgb15_2_c;
2505             *yuv2packedX = yuv2rgb15_X_c;
2506             break;
2507         case PIX_FMT_RGB444LE:
2508         case PIX_FMT_RGB444BE:
2509         case PIX_FMT_BGR444LE:
2510         case PIX_FMT_BGR444BE:
2511             *yuv2packed1 = yuv2rgb12_1_c;
2512             *yuv2packed2 = yuv2rgb12_2_c;
2513             *yuv2packedX = yuv2rgb12_X_c;
2514             break;
2515         case PIX_FMT_RGB8:
2516         case PIX_FMT_BGR8:
2517             *yuv2packed1 = yuv2rgb8_1_c;
2518             *yuv2packed2 = yuv2rgb8_2_c;
2519             *yuv2packedX = yuv2rgb8_X_c;
2520             break;
2521         case PIX_FMT_RGB4:
2522         case PIX_FMT_BGR4:
2523             *yuv2packed1 = yuv2rgb4_1_c;
2524             *yuv2packed2 = yuv2rgb4_2_c;
2525             *yuv2packedX = yuv2rgb4_X_c;
2526             break;
2527         case PIX_FMT_RGB4_BYTE:
2528         case PIX_FMT_BGR4_BYTE:
2529             *yuv2packed1 = yuv2rgb4b_1_c;
2530             *yuv2packed2 = yuv2rgb4b_2_c;
2531             *yuv2packedX = yuv2rgb4b_X_c;
2532             break;
2533         }
2534     }
2535 }
2536
/* Set to 1 to have DEBUG_BUFFERS() emit its messages. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Log through av_log() at AV_LOG_DEBUG level on the context named `c` in the
 * calling scope, but only when DEBUG_SWSCALE_BUFFERS is non-zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to one
 * complete statement: used as `if (x) DEBUG_BUFFERS(...); else ...`, the
 * `else` pairs with the caller's `if` rather than with the `if` hidden inside
 * the macro.
 */
#define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
2539
/**
 * Scale one slice of the source picture (generic C driver loop).
 *
 * Source lines are first scaled horizontally into the per-plane line buffers
 * (lumPixBuf, chrUPixBuf/chrVPixBuf and, when present, alpPixBuf). A
 * destination line is then produced by one of the vertical/output functions
 * as soon as every source line it needs is buffered. If the current slice
 * does not contain enough lines for the next destination line, the lines
 * that are available are buffered and the loop stops. Progress (dstY, buffer
 * indices, last buffered source lines) is written back to the context so the
 * next call continues from there.
 *
 * Side effects on the arguments: for packed source formats all four entries
 * of src[] and srcStride[] are set to entry 0, and srcStride[1]/srcStride[2]
 * are shifted left by c->vChrDrop.
 *
 * @param c          scaler context
 * @param src        source plane pointers; each points at line srcSliceY
 * @param srcStride  source strides, indexed like src
 * @param srcSliceY  index of the first source line in this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers; each points at line 0
 * @param dstStride  destination strides, indexed like dst
 * @return number of destination lines written by this call
 */
static int swScale(SwsContext *c, const uint8_t* src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **chrVPixBuf= c->chrVPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* Slice position and height in chroma lines; the height is rounded up. */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* Per-line dither tables are only used for >8-bit (N-bit or 16-bit) sources. */
    int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
    /* Local copies of the output functions; they may be replaced by the C
       versions for the last destination lines (see the dstY >= dstH-2 check). */
    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* Packed input: every "plane" is read from the single packed plane 0.
       Note that this overwrites the caller's src[] and srcStride[] entries. */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* Enlarge the chroma strides by 2^vChrDrop (also modifies the caller's array). */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    /* Warn (once per process, and only with SWS_PRINT_INFO) when any
       destination stride is not a multiple of 8. */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    /* A slice starting at source line 0 begins a new picture: reset the
       output position and mark the line buffers as empty. */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    /* Without dithering, both dither pointers use the constant ff_sws_pb_64 table. */
    if (!should_dither) {
        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
    }
    /* Remember where this call started so the number of written lines can be returned. */
    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        const int chrDstY= dstY>>c->chrDstVSubSample;
        /* Destination line pointers for this row; the alpha entry is NULL
           when there is no alpha line buffer. */
        uint8_t *dest[4] = {
            dst[0] + dstStride[0] * dstY,
            dst[1] + dstStride[1] * chrDstY,
            dst[2] + dstStride[2] * chrDstY,
            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
        };

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* First luma input line of the last destination row that shares this
           chroma row (row index clamped to dstH-1). */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        /* Source lines before the first needed one are skipped, not scaled. */
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        /* The first needed line must still be among the last v*BufSize buffered lines. */
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        /* Not enough input: buffer everything up to the end of this slice
           and leave the loop below without producing output. */
        if (!enough_lines) {
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        /* Luma (and alpha, if buffered): scale each missing source line into
           the next line-buffer slot. */
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
                    formatConvBuffer,
                    pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
                        formatConvBuffer,
                        pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        /* Chroma: same scheme; the buffer bookkeeping advances even when
           c->needs_hcscale is unset and no chroma scaling is done. */
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
                          chrDstW, src1, src2, chrSrcW, chrXInc,
                          hChrFilter, hChrFilterPos, hChrFilterSize,
                          formatConvBuffer, pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        /* NOTE(review): indices up to 2*v*BufSize-1 are used above, so the
           pointer arrays presumably hold each buffer twice -- confirm
           against the allocation code. */
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
#endif
        /* Select the dither rows for this output line. */
        if (should_dither) {
            c->chrDither8 = dither_8x8_128[chrDstY & 7];
            c->lumDither8 = dither_8x8_128[dstY & 7];
        }
        /* For the last two destination lines, re-select the C output
           functions into the local pointers. */
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
                                           &yuv2packed1, &yuv2packed2,
                                           &yuv2packedX);
        }

        {
            /* Pointers to the buffer slot holding the first source line
               needed for this output line, per plane. */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;

            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                /* Chroma is written only on the first row of each subsampled
                   group of rows, and never for gray output. */
                if ((dstY&chrSkipMask) || isGray(dstFormat))
                    dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
                /* The branch is chosen from the context's function pointer,
                   but the call goes through the local copy. */
                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
                             dest, dstW, chrDstW);
                } else { //General YV12
                    yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
                             lumSrcPtr, vLumFilterSize,
                             vChrFilter + chrDstY * vChrFilterSize,
                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                             alpSrcPtr, dest, dstW, chrDstW);
                }
            } else {
                /* Packed output: all filter taps must lie inside the
                   (doubled) pointer arrays. */
                assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    /* Second chroma tap coefficient of this output line. */
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? *alpSrcPtr : NULL,
                                dest[0], dstW, chrAlpha, dstY);
                } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha = vLumFilter[2 * dstY + 1];
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    /* Store the first-tap coefficients, duplicated into both
                       16-bit halves, in the MMX filter tables. */
                    lumMmxFilter[2] =
                    lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
                    chrMmxFilter[2] =
                    chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
                    yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? alpSrcPtr : NULL,
                                dest[0], dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
                                lumSrcPtr, vLumFilterSize,
                                vChrFilter + dstY * vChrFilterSize,
                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest[0], dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P output without an alpha line buffer: fill the alpha plane
       rows produced by this call with 255. */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX2
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");
#endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2807
2808 static av_cold void sws_init_swScale_c(SwsContext *c)
2809 {
2810     enum PixelFormat srcFormat = c->srcFormat;
2811
2812     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2813                                    &c->yuv2packed1, &c->yuv2packed2,
2814                                    &c->yuv2packedX);
2815
2816     c->chrToYV12 = NULL;
2817     switch(srcFormat) {
2818         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2819         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2820         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2821         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2822         case PIX_FMT_RGB8     :
2823         case PIX_FMT_BGR8     :
2824         case PIX_FMT_PAL8     :
2825         case PIX_FMT_BGR4_BYTE:
2826         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2827         case PIX_FMT_YUV444P9BE:
2828         case PIX_FMT_YUV420P9BE:
2829         case PIX_FMT_YUV444P10BE:
2830         case PIX_FMT_YUV422P10BE:
2831         case PIX_FMT_YUV420P10BE: c->hScale16= HAVE_BIGENDIAN ? NULL : hScale16NX_c; break;
2832         case PIX_FMT_YUV444P9LE:
2833         case PIX_FMT_YUV420P9LE:
2834         case PIX_FMT_YUV422P10LE:
2835         case PIX_FMT_YUV420P10LE:
2836         case PIX_FMT_YUV444P10LE: c->hScale16= HAVE_BIGENDIAN ? hScale16NX_c : NULL; break;
2837 #if HAVE_BIGENDIAN
2838         case PIX_FMT_YUV420P16LE:
2839         case PIX_FMT_YUV422P16LE:
2840         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2841 #else
2842         case PIX_FMT_YUV420P16BE:
2843         case PIX_FMT_YUV422P16BE:
2844         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2845 #endif
2846     }
2847     if (c->chrSrcHSubSample) {
2848         switch(srcFormat) {
2849         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2850         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2851         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2852         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2853         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2854         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2855         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2856         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2857         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2858         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2859         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2860         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2861         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2862         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2863         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2864         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2865         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2866         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2867         }
2868     } else {
2869         switch(srcFormat) {
2870         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2871         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2872         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2873         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2874         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2875         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2876         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2877         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2878         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2879         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2880         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2881         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2882         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2883         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2884         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2885         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2886         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2887         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2888         }
2889     }
2890
2891     c->lumToYV12 = NULL;
2892     c->alpToYV12 = NULL;
2893     switch (srcFormat) {
2894 #if HAVE_BIGENDIAN
2895     case PIX_FMT_YUV420P16LE:
2896     case PIX_FMT_YUV422P16LE:
2897     case PIX_FMT_YUV444P16LE:
2898     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2899 #else
2900     case PIX_FMT_YUV420P16BE:
2901     case PIX_FMT_YUV422P16BE:
2902     case PIX_FMT_YUV444P16BE:
2903     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2904 #endif
2905     case PIX_FMT_YUYV422  :
2906     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2907     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2908     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2909     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2910     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2911     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2912     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2913     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2914     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2915     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2916     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2917     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2918     case PIX_FMT_RGB8     :
2919     case PIX_FMT_BGR8     :
2920     case PIX_FMT_PAL8     :
2921     case PIX_FMT_BGR4_BYTE:
2922     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2923     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2924     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2925     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2926     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2927     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2928     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2929     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2930     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2931     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2932     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2933     }
2934     if (c->alpPixBuf) {
2935         switch (srcFormat) {
2936         case PIX_FMT_BGRA:
2937         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2938         case PIX_FMT_ABGR:
2939         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2940         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2941         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
2942         }
2943     }
2944
2945
2946     if (c->srcBpc == 8) {
2947         if (c->dstBpc <= 10) {
2948             if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2949             || c->srcFormat == PIX_FMT_PAL8)
2950                 c->hScale16= hScale16N_c;
2951             c->hScale       = hScale8To15_c;
2952             if (c->flags & SWS_FAST_BILINEAR) {
2953                 c->hyscale_fast = hyscale_fast_c;
2954                 c->hcscale_fast = hcscale_fast_c;
2955             }
2956         } else {
2957             c->hScale = hScale8To19_c;
2958             av_assert0(c->hScale16 != hScale16N_c && c->hScale16 != hScale16NX_c);
2959         }
2960     } else {
2961         if(c->dstBpc > 10){
2962             if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2963             || c->srcFormat == PIX_FMT_PAL8)
2964                 c->hScale16= hScale16N_c;
2965             if(c->hScale16 == hScale16NX_c && !isAnyRGB(c->srcFormat)){
2966                 c->chrToYV12 = bswap16UV_c;
2967                 c->lumToYV12 = bswap16Y_c;
2968             }
2969             c->hScale16 = NULL;
2970         }
2971         c->hScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
2972     }
2973
2974     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2975         if (c->dstBpc <= 10) {
2976             if (c->srcRange) {
2977                 c->lumConvertRange = lumRangeFromJpeg_c;
2978                 c->chrConvertRange = chrRangeFromJpeg_c;
2979             } else {
2980                 c->lumConvertRange = lumRangeToJpeg_c;
2981                 c->chrConvertRange = chrRangeToJpeg_c;
2982             }
2983         } else {
2984             if (c->srcRange) {
2985                 c->lumConvertRange = lumRangeFromJpeg16_c;
2986                 c->chrConvertRange = chrRangeFromJpeg16_c;
2987             } else {
2988                 c->lumConvertRange = lumRangeToJpeg16_c;
2989                 c->chrConvertRange = chrRangeToJpeg16_c;
2990             }
2991         }
2992     }
2993
2994     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2995           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2996         c->needs_hcscale = 1;
2997 }
2998
2999 SwsFunc ff_getSwsFunc(SwsContext *c)
3000 {
3001     sws_init_swScale_c(c);
3002
3003     if (HAVE_MMX)
3004         ff_sws_init_swScale_mmx(c);
3005     if (HAVE_ALTIVEC)
3006         ff_sws_init_swScale_altivec(c);
3007
3008     return swScale;
3009 }