2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* NOTE(review): this chunk is truncated — the embedded original line numbers jump
 * (e.g. the #else/#endif lines of these conditionals are missing from view), and
 * each line carries a leftover line-number prefix. Code left byte-identical.
 *
 * PREFETCH: software-prefetch instruction on MMX2-capable builds, a comment/no-op
 * otherwise. MOVNTQ: non-temporal (cache-bypassing) store on MMX2, plain movq
 * otherwise. The REAL_/plain pair exists so macro arguments are expanded before
 * being stringized with #a/#b. */
25 #if COMPILE_TEMPLATE_MMX2
26 #define PREFETCH "prefetchnta"
28 #define PREFETCH " # nop"
31 #if COMPILE_TEMPLATE_MMX2
32 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
36 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/* Inline-asm body of the vertical multi-tap filter for one planar output
 * (luma, chroma or alpha). Walks the {srcPtr, coeff} list stored at
 * 'offset' inside SwsContext (%0 = &c->redDither base), accumulating
 * pmulhw products of 16-bit source lines into mm3/mm4, which are seeded
 * from the dither16 table. Result is >>3, packed to bytes and stored with
 * MOVNTQ. A NULL srcPtr (the 'test' on REG_S) terminates the filter loop.
 * NOTE(review): interior lines (labels, __asm__ volatile wrapper, clobber
 * list) are missing from this truncated view — the trailing lines reload
 * the dither seeds for the next output pixel group. */
38 #define YSCALEYUV2YV12X(offset, dest, end, pos) \
40 "movq "DITHER16"+0(%0), %%mm3 \n\t"\
41 "movq "DITHER16"+8(%0), %%mm4 \n\t"\
42 "lea " offset "(%0), %%"REG_d" \n\t"\
43 "mov (%%"REG_d"), %%"REG_S" \n\t"\
44 ".p2align 4 \n\t" /* FIXME Unroll? */\
46 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
47 "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
48 "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
49 "add $16, %%"REG_d" \n\t"\
50 "mov (%%"REG_d"), %%"REG_S" \n\t"\
51 "test %%"REG_S", %%"REG_S" \n\t"\
52 "pmulhw %%mm0, %%mm2 \n\t"\
53 "pmulhw %%mm0, %%mm5 \n\t"\
54 "paddw %%mm2, %%mm3 \n\t"\
55 "paddw %%mm5, %%mm4 \n\t"\
57 "psraw $3, %%mm3 \n\t"\
58 "psraw $3, %%mm4 \n\t"\
59 "packuswb %%mm4, %%mm3 \n\t"\
60 MOVNTQ(%%mm3, (%1, %3))\
63 "movq "DITHER16"+0(%0), %%mm3 \n\t"\
64 "movq "DITHER16"+8(%0), %%mm4 \n\t"\
65 "lea " offset "(%0), %%"REG_d" \n\t"\
66 "mov (%%"REG_d"), %%"REG_S" \n\t"\
68 :: "r" (&c->redDither),\
69 "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
/* Vertically scale+filter planar YV12 output (Y, U, V and optionally A)
 * using the fast (less accurate) YSCALEYUV2YV12X asm. Before each plane
 * the 8-entry dither16 table in the context is refilled from the caller's
 * dither bytes (>>4 to match the 4-bit headroom the asm expects); the V
 * plane uses a rotated dither pattern ((i+3)&7) and is addressed as
 * vDest - uv_off with the loop counter biased by uv_off so U and V can
 * share one loop index. Alpha is emitted only when compiled with alpha
 * support and an aDest was supplied.
 * NOTE(review): the function's braces/declarations are missing from this
 * truncated view; code left byte-identical. */
73 static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
74 const int16_t **lumSrc, int lumFilterSize,
75 const int16_t *chrFilter, const int16_t **chrUSrc,
76 const int16_t **chrVSrc,
77 int chrFilterSize, const int16_t **alpSrc,
78 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
79 uint8_t *aDest, int dstW, int chrDstW,
80 const uint8_t *lumDither, const uint8_t *chrDither)
84 x86_reg uv_off = c->uv_off;
85 for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
86 YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
87 for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
88 YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
90 for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
91 if (CONFIG_SWSCALE_ALPHA && aDest) {
92 YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
95 YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
/* Accurate variant of the planar vertical filter: processes filter taps in
 * PAIRS (APCK_* packed layout), interleaving two source lines with
 * punpcklwd/punpckhwd and using pmaddwd so accumulation happens in 32-bit
 * precision (mm4..mm7, seeded from dither32). Final >>19 then packssdw/
 * packuswb down to bytes, stored via MOVNTQ. Loop ends when the next
 * source pointer read from the APCK list is NULL.
 * NOTE(review): the pxor lines immediately clobber the dither32 seeds
 * loaded just above — presumably dead code or a truncation artifact;
 * left byte-identical, flag for upstream comparison. */
98 #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
100 "lea " offset "(%0), %%"REG_d" \n\t"\
101 "movq "DITHER32"+0(%0), %%mm4 \n\t"\
102 "movq "DITHER32"+8(%0), %%mm5 \n\t"\
103 "movq "DITHER32"+16(%0), %%mm6 \n\t"\
104 "movq "DITHER32"+24(%0), %%mm7 \n\t"\
105 "pxor %%mm4, %%mm4 \n\t"\
106 "pxor %%mm5, %%mm5 \n\t"\
107 "pxor %%mm6, %%mm6 \n\t"\
108 "pxor %%mm7, %%mm7 \n\t"\
109 "mov (%%"REG_d"), %%"REG_S" \n\t"\
112 "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
113 "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
114 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
115 "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
116 "movq %%mm0, %%mm3 \n\t"\
117 "punpcklwd %%mm1, %%mm0 \n\t"\
118 "punpckhwd %%mm1, %%mm3 \n\t"\
119 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
120 "pmaddwd %%mm1, %%mm0 \n\t"\
121 "pmaddwd %%mm1, %%mm3 \n\t"\
122 "paddd %%mm0, %%mm4 \n\t"\
123 "paddd %%mm3, %%mm5 \n\t"\
124 "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
125 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
126 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
127 "test %%"REG_S", %%"REG_S" \n\t"\
128 "movq %%mm2, %%mm0 \n\t"\
129 "punpcklwd %%mm3, %%mm2 \n\t"\
130 "punpckhwd %%mm3, %%mm0 \n\t"\
131 "pmaddwd %%mm1, %%mm2 \n\t"\
132 "pmaddwd %%mm1, %%mm0 \n\t"\
133 "paddd %%mm2, %%mm6 \n\t"\
134 "paddd %%mm0, %%mm7 \n\t"\
136 "psrad $19, %%mm4 \n\t"\
137 "psrad $19, %%mm5 \n\t"\
138 "psrad $19, %%mm6 \n\t"\
139 "psrad $19, %%mm7 \n\t"\
140 "packssdw %%mm5, %%mm4 \n\t"\
141 "packssdw %%mm7, %%mm6 \n\t"\
142 "packuswb %%mm6, %%mm4 \n\t"\
143 MOVNTQ(%%mm4, (%1, %3))\
146 "lea " offset "(%0), %%"REG_d" \n\t"\
147 "movq "DITHER32"+0(%0), %%mm4 \n\t"\
148 "movq "DITHER32"+8(%0), %%mm5 \n\t"\
149 "movq "DITHER32"+16(%0), %%mm6 \n\t"\
150 "movq "DITHER32"+24(%0), %%mm7 \n\t"\
151 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 :: "r" (&c->redDither),\
154 "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
155 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Accurate-rounding twin of yuv2yuvX: same plane order and uv_off biasing
 * trick for V, but seeds the 32-bit dither32 table (<<12 to position the
 * dither in the wider accumulator) and invokes the pmaddwd-based
 * YSCALEYUV2YV12X_ACCURATE asm instead of the fast path.
 * NOTE(review): braces and the 'int i;' declaration are missing from this
 * truncated view; code left byte-identical. */
158 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
159 const int16_t **lumSrc, int lumFilterSize,
160 const int16_t *chrFilter, const int16_t **chrUSrc,
161 const int16_t **chrVSrc,
162 int chrFilterSize, const int16_t **alpSrc,
163 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
164 uint8_t *aDest, int dstW, int chrDstW,
165 const uint8_t *lumDither, const uint8_t *chrDither)
169 x86_reg uv_off = c->uv_off;
170 for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
171 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
172 for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
173 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
175 for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
176 if (CONFIG_SWSCALE_ALPHA && aDest) {
177 YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
180 YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
/* 1:1 vertical copy (no filtering): converts the four 16-bit planes
 * (A, Y, U, V) straight to 8-bit output with a simple >>7 and pack.
 * src/dst/counter arrays are indexed by a plane loop 'p' whose control
 * lines are missing from this truncated view; pointers are pre-advanced
 * by the width and the asm counts a negative index up to zero (standard
 * negative-offset loop idiom).
 * NOTE(review): asm wrapper, loop labels and the per-plane guard
 * (skipping NULL dst planes) are not visible here — confirm against the
 * upstream file. */
183 static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
184 const int16_t *chrUSrc, const int16_t *chrVSrc,
185 const int16_t *alpSrc,
186 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
187 uint8_t *aDest, int dstW, int chrDstW,
188 const uint8_t *lumDither, const uint8_t *chrDither)
191 const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
192 uint8_t *dst[4]= { aDest, dest, uDest, vDest };
193 x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
198 "mov %2, %%"REG_a" \n\t"
199 ".p2align 4 \n\t" /* FIXME Unroll? */
201 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
202 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
203 "psraw $7, %%mm0 \n\t"
204 "psraw $7, %%mm1 \n\t"
205 "packuswb %%mm1, %%mm0 \n\t"
206 MOVNTQ(%%mm0, (%1, %%REGa))
207 "add $8, %%"REG_a" \n\t"
209 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/* Accurate-rounding variant of yuv2yuv1: identical plane-copy structure,
 * but first builds a 16-entry dither row in c->dither16 (luma bytes for
 * the first two lanes, chroma bytes after) and adds it (paddsw via
 * mm6/mm7) before the >>7 and pack, so truncation error is dithered.
 * NOTE(review): the plane loop 'p' and asm wrapper lines are missing from
 * this truncated view; code left byte-identical. */
217 static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
218 const int16_t *chrUSrc, const int16_t *chrVSrc,
219 const int16_t *alpSrc,
220 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
221 uint8_t *aDest, int dstW, int chrDstW,
222 const uint8_t *lumDither, const uint8_t *chrDither)
225 const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
226 uint8_t *dst[4]= { aDest, dest, uDest, vDest };
227 x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
232 for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
234 "mov %2, %%"REG_a" \n\t"
235 "movq 0(%3), %%mm6 \n\t"
236 "movq 8(%3), %%mm7 \n\t"
237 ".p2align 4 \n\t" /* FIXME Unroll? */
239 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
240 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
241 "paddsw %%mm6, %%mm0 \n\t"
242 "paddsw %%mm7, %%mm1 \n\t"
243 "psraw $7, %%mm0 \n\t"
244 "psraw $7, %%mm1 \n\t"
245 "packuswb %%mm1, %%mm0 \n\t"
246 MOVNTQ(%%mm0, (%1, %%REGa))
247 "add $8, %%"REG_a" \n\t"
249 :: "r" (src[p]), "r" (dst[p] + counter[p]),
250 "g" (-counter[p]), "r"(c->dither16)
/* Building blocks for the packed-pixel (RGB/YUY2) vertical scalers.
 * _UV: runs the chroma filter list; U is read at (REG_S + REG_a), V at the
 *      same index after adding the uv plane offset (%6), accumulating into
 *      mm3 (U) / mm4 (V) seeded with the vrounder constant.
 * _YA: same pattern for two groups of 8 luma (or alpha) samples into
 *      dst1/dst2; register choice is parameterized so it can be reused for
 *      the alpha channel.
 * YSCALEYUV2PACKEDX chains UV then luma with the standard register set.
 * _END closes the asm statement: inputs are the context base, three dummy
 * slots, dest, width and uv offset; REG_a/REG_d/REG_S are clobbered.
 * NOTE(review): loop labels and jump instructions between these lines are
 * missing from this truncated view. */
257 #define YSCALEYUV2PACKEDX_UV \
259 "xor %%"REG_a", %%"REG_a" \n\t"\
263 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
264 "mov (%%"REG_d"), %%"REG_S" \n\t"\
265 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
266 "movq %%mm3, %%mm4 \n\t"\
269 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
270 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
271 "add %6, %%"REG_S" \n\t" \
272 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
273 "add $16, %%"REG_d" \n\t"\
274 "mov (%%"REG_d"), %%"REG_S" \n\t"\
275 "pmulhw %%mm0, %%mm2 \n\t"\
276 "pmulhw %%mm0, %%mm5 \n\t"\
277 "paddw %%mm2, %%mm3 \n\t"\
278 "paddw %%mm5, %%mm4 \n\t"\
279 "test %%"REG_S", %%"REG_S" \n\t"\
282 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
283 "lea "offset"(%0), %%"REG_d" \n\t"\
284 "mov (%%"REG_d"), %%"REG_S" \n\t"\
285 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
286 "movq "#dst1", "#dst2" \n\t"\
289 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
290 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
291 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
292 "add $16, %%"REG_d" \n\t"\
293 "mov (%%"REG_d"), %%"REG_S" \n\t"\
294 "pmulhw "#coeff", "#src1" \n\t"\
295 "pmulhw "#coeff", "#src2" \n\t"\
296 "paddw "#src1", "#dst1" \n\t"\
297 "paddw "#src2", "#dst2" \n\t"\
298 "test %%"REG_S", %%"REG_S" \n\t"\
301 #define YSCALEYUV2PACKEDX \
302 YSCALEYUV2PACKEDX_UV \
303 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
305 #define YSCALEYUV2PACKEDX_END \
306 :: "r" (&c->redDither), \
307 "m" (dummy), "m" (dummy), "m" (dummy),\
308 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
309 : "%"REG_a, "%"REG_d, "%"REG_S \
/* Accurate-rounding building blocks for the packed-pixel scalers: the same
 * two-taps-at-a-time pmaddwd scheme as YSCALEYUV2YV12X_ACCURATE, but for
 * interleaved output.
 * _UV accumulates U in mm4/mm5 and V in mm6/mm7 (V read via the %6 plane
 * offset), shifts >>16, packs, adds the vrounder and parks the results in
 * the context's U_TEMP/V_TEMP scratch slots because all eight MMX
 * registers are needed for the luma pass.
 * _YA does the same for 16 luma/alpha samples into mm1/mm7, then reloads
 * U/V from the scratch slots so the RGB conversion macro finds its inputs
 * in the usual registers (mm3=U, mm4=V).
 * NOTE(review): loop labels/branches between these lines are missing from
 * this truncated view. */
312 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
314 "xor %%"REG_a", %%"REG_a" \n\t"\
318 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
319 "mov (%%"REG_d"), %%"REG_S" \n\t"\
320 "pxor %%mm4, %%mm4 \n\t"\
321 "pxor %%mm5, %%mm5 \n\t"\
322 "pxor %%mm6, %%mm6 \n\t"\
323 "pxor %%mm7, %%mm7 \n\t"\
326 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
327 "add %6, %%"REG_S" \n\t" \
328 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
329 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
331 "movq %%mm0, %%mm3 \n\t"\
332 "punpcklwd %%mm1, %%mm0 \n\t"\
333 "punpckhwd %%mm1, %%mm3 \n\t"\
334 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
335 "pmaddwd %%mm1, %%mm0 \n\t"\
336 "pmaddwd %%mm1, %%mm3 \n\t"\
337 "paddd %%mm0, %%mm4 \n\t"\
338 "paddd %%mm3, %%mm5 \n\t"\
339 "add %6, %%"REG_S" \n\t" \
340 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
341 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
342 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
343 "test %%"REG_S", %%"REG_S" \n\t"\
344 "movq %%mm2, %%mm0 \n\t"\
345 "punpcklwd %%mm3, %%mm2 \n\t"\
346 "punpckhwd %%mm3, %%mm0 \n\t"\
347 "pmaddwd %%mm1, %%mm2 \n\t"\
348 "pmaddwd %%mm1, %%mm0 \n\t"\
349 "paddd %%mm2, %%mm6 \n\t"\
350 "paddd %%mm0, %%mm7 \n\t"\
352 "psrad $16, %%mm4 \n\t"\
353 "psrad $16, %%mm5 \n\t"\
354 "psrad $16, %%mm6 \n\t"\
355 "psrad $16, %%mm7 \n\t"\
356 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
357 "packssdw %%mm5, %%mm4 \n\t"\
358 "packssdw %%mm7, %%mm6 \n\t"\
359 "paddw %%mm0, %%mm4 \n\t"\
360 "paddw %%mm0, %%mm6 \n\t"\
361 "movq %%mm4, "U_TEMP"(%0) \n\t"\
362 "movq %%mm6, "V_TEMP"(%0) \n\t"\
364 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
365 "lea "offset"(%0), %%"REG_d" \n\t"\
366 "mov (%%"REG_d"), %%"REG_S" \n\t"\
367 "pxor %%mm1, %%mm1 \n\t"\
368 "pxor %%mm5, %%mm5 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"\
370 "pxor %%mm6, %%mm6 \n\t"\
373 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
374 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
375 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
376 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
377 "movq %%mm0, %%mm3 \n\t"\
378 "punpcklwd %%mm4, %%mm0 \n\t"\
379 "punpckhwd %%mm4, %%mm3 \n\t"\
380 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
381 "pmaddwd %%mm4, %%mm0 \n\t"\
382 "pmaddwd %%mm4, %%mm3 \n\t"\
383 "paddd %%mm0, %%mm1 \n\t"\
384 "paddd %%mm3, %%mm5 \n\t"\
385 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
386 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
387 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
388 "test %%"REG_S", %%"REG_S" \n\t"\
389 "movq %%mm2, %%mm0 \n\t"\
390 "punpcklwd %%mm3, %%mm2 \n\t"\
391 "punpckhwd %%mm3, %%mm0 \n\t"\
392 "pmaddwd %%mm4, %%mm2 \n\t"\
393 "pmaddwd %%mm4, %%mm0 \n\t"\
394 "paddd %%mm2, %%mm7 \n\t"\
395 "paddd %%mm0, %%mm6 \n\t"\
397 "psrad $16, %%mm1 \n\t"\
398 "psrad $16, %%mm5 \n\t"\
399 "psrad $16, %%mm7 \n\t"\
400 "psrad $16, %%mm6 \n\t"\
401 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
402 "packssdw %%mm5, %%mm1 \n\t"\
403 "packssdw %%mm6, %%mm7 \n\t"\
404 "paddw %%mm0, %%mm1 \n\t"\
405 "paddw %%mm0, %%mm7 \n\t"\
406 "movq "U_TEMP"(%0), %%mm3 \n\t"\
407 "movq "V_TEMP"(%0), %%mm4 \n\t"\
409 #define YSCALEYUV2PACKEDX_ACCURATE \
410 YSCALEYUV2PACKEDX_ACCURATE_UV \
411 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/* YUV -> RGB matrix step: takes mm1/mm7 = two groups of 4 Y samples and
 * mm3/mm4 = U/V, subtracts the bias constants from the context, applies
 * the per-component coefficients with pmulhw, duplicates the chroma terms
 * across pixel pairs (punpcklwd/punpckhwd with themselves), adds luma and
 * packs to unsigned bytes, leaving B in mm2, G in mm4, R in mm5 (second
 * group in mm0/mm3/mm6 before the pack) for the WRITE* macros. */
413 #define YSCALEYUV2RGBX \
414 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
415 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
416 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
417 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
418 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
419 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
420 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
421 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
422 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
423 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
424 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
425 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
426 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
427 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
428 "paddw %%mm3, %%mm4 \n\t"\
429 "movq %%mm2, %%mm0 \n\t"\
430 "movq %%mm5, %%mm6 \n\t"\
431 "movq %%mm4, %%mm3 \n\t"\
432 "punpcklwd %%mm2, %%mm2 \n\t"\
433 "punpcklwd %%mm5, %%mm5 \n\t"\
434 "punpcklwd %%mm4, %%mm4 \n\t"\
435 "paddw %%mm1, %%mm2 \n\t"\
436 "paddw %%mm1, %%mm5 \n\t"\
437 "paddw %%mm1, %%mm4 \n\t"\
438 "punpckhwd %%mm0, %%mm0 \n\t"\
439 "punpckhwd %%mm6, %%mm6 \n\t"\
440 "punpckhwd %%mm3, %%mm3 \n\t"\
441 "paddw %%mm7, %%mm0 \n\t"\
442 "paddw %%mm7, %%mm6 \n\t"\
443 "paddw %%mm7, %%mm3 \n\t"\
444 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
445 "packuswb %%mm0, %%mm2 \n\t"\
446 "packuswb %%mm6, %%mm5 \n\t"\
447 "packuswb %%mm3, %%mm4 \n\t"\
/* Interleave packed B/G/R/A byte registers into 8 BGRA pixels (32 bytes)
 * via two levels of punpck, store with four MOVNTQs at dst+index*4, then
 * advance the pixel index and compare against the width (the branch line
 * is missing from this truncated view). REAL_/plain pair again exists for
 * proper argument expansion before stringization. */
449 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
450 "movq "#b", "#q2" \n\t" /* B */\
451 "movq "#r", "#t" \n\t" /* R */\
452 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
453 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
454 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
455 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
456 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
457 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
458 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
459 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
460 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
461 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
463 MOVNTQ( q0, (dst, index, 4))\
464 MOVNTQ( b, 8(dst, index, 4))\
465 MOVNTQ( q2, 16(dst, index, 4))\
466 MOVNTQ( q3, 24(dst, index, 4))\
468 "add $8, "#index" \n\t"\
469 "cmp "#dstw", "#index" \n\t"\
471 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/* Accurate-rounding multi-tap vertical scaler to 32-bit RGBA. With a real
 * alpha plane: runs the accurate UV+Y pass, parks B/G/Y in the context
 * scratch slots, runs the accurate pass again on the alpha filter list,
 * shifts alpha >>3 and packs it, then writes BGRA. Without alpha, fills
 * the alpha register with 0xFF (pcmpeqd) and writes opaque pixels.
 * NOTE(review): braces, 'dummy' declaration and asm wrappers are missing
 * from this truncated view; code left byte-identical. */
473 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
474 const int16_t **lumSrc, int lumFilterSize,
475 const int16_t *chrFilter, const int16_t **chrUSrc,
476 const int16_t **chrVSrc,
477 int chrFilterSize, const int16_t **alpSrc,
478 uint8_t *dest, int dstW, int dstY)
481 x86_reg dstW_reg = dstW;
482 x86_reg uv_off = c->uv_off << 1;
484 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
485 YSCALEYUV2PACKEDX_ACCURATE
487 "movq %%mm2, "U_TEMP"(%0) \n\t"
488 "movq %%mm4, "V_TEMP"(%0) \n\t"
489 "movq %%mm5, "Y_TEMP"(%0) \n\t"
490 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
491 "movq "Y_TEMP"(%0), %%mm5 \n\t"
492 "psraw $3, %%mm1 \n\t"
493 "psraw $3, %%mm7 \n\t"
494 "packuswb %%mm7, %%mm1 \n\t"
495 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
496 YSCALEYUV2PACKEDX_END
498 YSCALEYUV2PACKEDX_ACCURATE
500 "pcmpeqd %%mm7, %%mm7 \n\t"
501 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
502 YSCALEYUV2PACKEDX_END
/* Fast (non-accurate) multi-tap vertical scaler to 32-bit RGBA; same
 * alpha/no-alpha split as yuv2rgb32_X_ar but using the pmulhw-based
 * YSCALEYUV2PACKEDX path. The alpha branch filters the alpha plane with
 * YSCALEYUV2PACKEDX_YA into mm1/mm7, >>3 and packs before the BGRA store.
 * NOTE(review): braces and the YSCALEYUV2PACKEDX/YSCALEYUV2RGBX invocation
 * lines are missing from this truncated view; code left byte-identical. */
506 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
507 const int16_t **lumSrc, int lumFilterSize,
508 const int16_t *chrFilter, const int16_t **chrUSrc,
509 const int16_t **chrVSrc,
510 int chrFilterSize, const int16_t **alpSrc,
511 uint8_t *dest, int dstW, int dstY)
514 x86_reg dstW_reg = dstW;
515 x86_reg uv_off = c->uv_off << 1;
517 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
520 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
521 "psraw $3, %%mm1 \n\t"
522 "psraw $3, %%mm7 \n\t"
523 "packuswb %%mm7, %%mm1 \n\t"
524 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
525 YSCALEYUV2PACKEDX_END
529 "pcmpeqd %%mm7, %%mm7 \n\t"
530 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
531 YSCALEYUV2PACKEDX_END
/* Pack B (mm2), G (mm4), R (mm5) bytes into RGB565: mask to 5/6/5 bits
 * (bF8/bFC constants), shift blue down, interleave with the red/zero
 * registers, shift green into place and OR the halves together; two
 * MOVNTQ stores cover 8 pixels at dst+index*2, then index advances and is
 * compared with the width (branch line missing from this view). */
535 #define REAL_WRITERGB16(dst, dstw, index) \
536 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
537 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
538 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
539 "psrlq $3, %%mm2 \n\t"\
541 "movq %%mm2, %%mm1 \n\t"\
542 "movq %%mm4, %%mm3 \n\t"\
544 "punpcklbw %%mm7, %%mm3 \n\t"\
545 "punpcklbw %%mm5, %%mm2 \n\t"\
546 "punpckhbw %%mm7, %%mm4 \n\t"\
547 "punpckhbw %%mm5, %%mm1 \n\t"\
549 "psllq $3, %%mm3 \n\t"\
550 "psllq $3, %%mm4 \n\t"\
552 "por %%mm3, %%mm2 \n\t"\
553 "por %%mm4, %%mm1 \n\t"\
555 MOVNTQ(%%mm2, (dst, index, 2))\
556 MOVNTQ(%%mm1, 8(dst, index, 2))\
558 "add $8, "#index" \n\t"\
559 "cmp "#dstw", "#index" \n\t"\
561 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/* Accurate-rounding multi-tap vertical scaler to RGB565: accurate filter
 * pass, zero mm7, add per-channel positional dither from the context
 * (paddusb saturates) and emit with WRITERGB16.
 * NOTE(review): braces, 'dummy' declaration and the YSCALEYUV2RGBX line
 * are missing from this truncated view; code left byte-identical. */
563 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
564 const int16_t **lumSrc, int lumFilterSize,
565 const int16_t *chrFilter, const int16_t **chrUSrc,
566 const int16_t **chrVSrc,
567 int chrFilterSize, const int16_t **alpSrc,
568 uint8_t *dest, int dstW, int dstY)
571 x86_reg dstW_reg = dstW;
572 x86_reg uv_off = c->uv_off << 1;
574 YSCALEYUV2PACKEDX_ACCURATE
576 "pxor %%mm7, %%mm7 \n\t"
577 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
579 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
580 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
581 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
583 WRITERGB16(%4, %5, %%REGa)
584 YSCALEYUV2PACKEDX_END
/* Fast multi-tap vertical scaler to RGB565; identical dither+pack tail to
 * yuv2rgb565_X_ar but preceded by the fast filter pass (the
 * YSCALEYUV2PACKEDX/YSCALEYUV2RGBX lines are missing from this truncated
 * view; code left byte-identical). */
587 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
588 const int16_t **lumSrc, int lumFilterSize,
589 const int16_t *chrFilter, const int16_t **chrUSrc,
590 const int16_t **chrVSrc,
591 int chrFilterSize, const int16_t **alpSrc,
592 uint8_t *dest, int dstW, int dstY)
595 x86_reg dstW_reg = dstW;
596 x86_reg uv_off = c->uv_off << 1;
600 "pxor %%mm7, %%mm7 \n\t"
601 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
603 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
604 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
605 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
607 WRITERGB16(%4, %5, %%REGa)
608 YSCALEYUV2PACKEDX_END
/* RGB555 variant of WRITERGB16: all three channels masked to 5 bits (bF8),
 * red additionally shifted right one so the top bit stays clear, green
 * shifted by 2 instead of 3; otherwise the same interleave/OR/store
 * pattern covering 8 pixels per iteration. */
611 #define REAL_WRITERGB15(dst, dstw, index) \
612 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
613 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
614 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
615 "psrlq $3, %%mm2 \n\t"\
616 "psrlq $1, %%mm5 \n\t"\
618 "movq %%mm2, %%mm1 \n\t"\
619 "movq %%mm4, %%mm3 \n\t"\
621 "punpcklbw %%mm7, %%mm3 \n\t"\
622 "punpcklbw %%mm5, %%mm2 \n\t"\
623 "punpckhbw %%mm7, %%mm4 \n\t"\
624 "punpckhbw %%mm5, %%mm1 \n\t"\
626 "psllq $2, %%mm3 \n\t"\
627 "psllq $2, %%mm4 \n\t"\
629 "por %%mm3, %%mm2 \n\t"\
630 "por %%mm4, %%mm1 \n\t"\
632 MOVNTQ(%%mm2, (dst, index, 2))\
633 MOVNTQ(%%mm1, 8(dst, index, 2))\
635 "add $8, "#index" \n\t"\
636 "cmp "#dstw", "#index" \n\t"\
638 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/* Accurate-rounding multi-tap vertical scaler to RGB555; mirrors
 * yuv2rgb565_X_ar with WRITERGB15 as the pack/store tail.
 * NOTE(review): braces, 'dummy' declaration and the YSCALEYUV2RGBX line
 * are missing from this truncated view; code left byte-identical. */
640 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
641 const int16_t **lumSrc, int lumFilterSize,
642 const int16_t *chrFilter, const int16_t **chrUSrc,
643 const int16_t **chrVSrc,
644 int chrFilterSize, const int16_t **alpSrc,
645 uint8_t *dest, int dstW, int dstY)
648 x86_reg dstW_reg = dstW;
649 x86_reg uv_off = c->uv_off << 1;
651 YSCALEYUV2PACKEDX_ACCURATE
653 "pxor %%mm7, %%mm7 \n\t"
654 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
656 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
657 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
658 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
660 WRITERGB15(%4, %5, %%REGa)
661 YSCALEYUV2PACKEDX_END
/* Fast multi-tap vertical scaler to RGB555; identical dither+pack tail to
 * yuv2rgb555_X_ar preceded by the fast filter pass (the
 * YSCALEYUV2PACKEDX/YSCALEYUV2RGBX lines are missing from this truncated
 * view; code left byte-identical). */
664 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
665 const int16_t **lumSrc, int lumFilterSize,
666 const int16_t *chrFilter, const int16_t **chrUSrc,
667 const int16_t **chrVSrc,
668 int chrFilterSize, const int16_t **alpSrc,
669 uint8_t *dest, int dstW, int dstY)
672 x86_reg dstW_reg = dstW;
673 x86_reg uv_off = c->uv_off << 1;
677 "pxor %%mm7, %%mm7 \n\t"
678 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
680 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
681 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
682 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
684 WRITERGB15(%4, %5, %%REGa)
685 YSCALEYUV2PACKEDX_END
/* Plain-MMX 24-bit BGR writer: first expands B/G/R bytes into four
 * 0RGB0RGB quadwords via punpck, then squeezes the padding byte out with
 * psllq/psrlq/punpckhdq shuffles so three MOVNTQ stores emit 8 packed
 * 3-byte pixels (24 bytes). dst is advanced by 24 and index by 8; the
 * conditional branch after the cmp is missing from this truncated view. */
688 #define WRITEBGR24MMX(dst, dstw, index) \
689 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
690 "movq %%mm2, %%mm1 \n\t" /* B */\
691 "movq %%mm5, %%mm6 \n\t" /* R */\
692 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
693 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
694 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
695 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
696 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
697 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
698 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
699 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
700 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
701 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
703 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
704 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
705 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
706 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
708 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
709 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
710 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
711 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
713 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
714 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
715 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
716 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
718 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
719 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
720 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
721 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
722 MOVNTQ(%%mm0, (dst))\
724 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
725 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
726 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
727 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
728 MOVNTQ(%%mm6, 8(dst))\
730 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
731 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
732 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
733 MOVNTQ(%%mm5, 16(dst))\
735 "add $24, "#dst" \n\t"\
737 "add $8, "#index" \n\t"\
738 "cmp "#dstw", "#index" \n\t"\
/* MMX2 24-bit BGR writer: uses pshufw to replicate byte groups and the
 * ff_M24A/B/C channel masks to compose each 8-byte output directly,
 * needing only three mask/shift/OR rounds and three MOVNTQs per 8 pixels
 * — fewer shuffles than the plain-MMX version. WRITEBGR24 selects this
 * path when compiled for MMX2, the plain-MMX path otherwise (the
 * #else/#endif lines are missing from this truncated view). */
741 #define WRITEBGR24MMX2(dst, dstw, index) \
742 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
743 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
744 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
745 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
746 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
747 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
749 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
750 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
751 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
753 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
754 "por %%mm1, %%mm6 \n\t"\
755 "por %%mm3, %%mm6 \n\t"\
756 MOVNTQ(%%mm6, (dst))\
758 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
759 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
760 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
761 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
763 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
764 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
765 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
767 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
768 "por %%mm3, %%mm6 \n\t"\
769 MOVNTQ(%%mm6, 8(dst))\
771 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
772 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
773 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
775 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
776 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
777 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
779 "por %%mm1, %%mm3 \n\t"\
780 "por %%mm3, %%mm6 \n\t"\
781 MOVNTQ(%%mm6, 16(dst))\
783 "add $24, "#dst" \n\t"\
785 "add $8, "#index" \n\t"\
786 "cmp "#dstw", "#index" \n\t"\
789 #if COMPILE_TEMPLATE_MMX2
791 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
794 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* Accurate-rounding multi-tap vertical scaler to packed 24-bit BGR.
 * Because BGR24 rows are 3 bytes per pixel, the destination address is
 * computed as dest + 3*index (lea REG_a + REG_a*2 into REG_c, then add
 * the base) before handing off to WRITEBGR24; REG_c joins the clobber
 * list accordingly. NOTE(review): braces, 'dummy' declaration and the
 * YSCALEYUV2RGBX line are missing from this truncated view. */
797 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
798 const int16_t **lumSrc, int lumFilterSize,
799 const int16_t *chrFilter, const int16_t **chrUSrc,
800 const int16_t **chrVSrc,
801 int chrFilterSize, const int16_t **alpSrc,
802 uint8_t *dest, int dstW, int dstY)
805 x86_reg dstW_reg = dstW;
806 x86_reg uv_off = c->uv_off << 1;
808 YSCALEYUV2PACKEDX_ACCURATE
810 "pxor %%mm7, %%mm7 \n\t"
811 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
812 "add %4, %%"REG_c" \n\t"
813 WRITEBGR24(%%REGc, %5, %%REGa)
814 :: "r" (&c->redDither),
815 "m" (dummy), "m" (dummy), "m" (dummy),
816 "r" (dest), "m" (dstW_reg), "m"(uv_off)
817 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/* Fast multi-tap vertical scaler to packed 24-bit BGR; same 3*index
 * destination addressing and clobbers as yuv2bgr24_X_ar, preceded by the
 * fast filter pass (the YSCALEYUV2PACKEDX/YSCALEYUV2RGBX lines are
 * missing from this truncated view; code left byte-identical). */
821 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
822 const int16_t **lumSrc, int lumFilterSize,
823 const int16_t *chrFilter, const int16_t **chrUSrc,
824 const int16_t **chrVSrc,
825 int chrFilterSize, const int16_t **alpSrc,
826 uint8_t *dest, int dstW, int dstY)
829 x86_reg dstW_reg = dstW;
830 x86_reg uv_off = c->uv_off << 1;
834 "pxor %%mm7, %%mm7 \n\t"
835 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
836 "add %4, %%"REG_c" \n\t"
837 WRITEBGR24(%%REGc, %5, %%REGa)
838 :: "r" (&c->redDither),
839 "m" (dummy), "m" (dummy), "m" (dummy),
840 "r" (dest), "m" (dstW_reg), "m"(uv_off)
841 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/* Pack filtered Y (mm1/mm7), U (mm3) and V (mm4) words into YUY2
 * (Y U Y V byte order): pack U and V to bytes, interleave them (UVUV...),
 * pack the two luma groups, then punpckl/punpckhbw to interleave luma
 * with chroma; two MOVNTQs store 8 YUY2 pixels at dst+index*2. */
845 #define REAL_WRITEYUY2(dst, dstw, index) \
846 "packuswb %%mm3, %%mm3 \n\t"\
847 "packuswb %%mm4, %%mm4 \n\t"\
848 "packuswb %%mm7, %%mm1 \n\t"\
849 "punpcklbw %%mm4, %%mm3 \n\t"\
850 "movq %%mm1, %%mm7 \n\t"\
851 "punpcklbw %%mm3, %%mm1 \n\t"\
852 "punpckhbw %%mm3, %%mm7 \n\t"\
854 MOVNTQ(%%mm1, (dst, index, 2))\
855 MOVNTQ(%%mm7, 8(dst, index, 2))\
857 "add $8, "#index" \n\t"\
858 "cmp "#dstw", "#index" \n\t"\
860 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Accurate-rounding multi-tap vertical scaler straight to packed YUY2 —
 * no RGB matrix needed, only a >>3 on the four filtered component
 * registers before WRITEYUY2 packs and stores them.
 * NOTE(review): braces and 'dummy' declaration are missing from this
 * truncated view; code left byte-identical. */
862 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
863 const int16_t **lumSrc, int lumFilterSize,
864 const int16_t *chrFilter, const int16_t **chrUSrc,
865 const int16_t **chrVSrc,
866 int chrFilterSize, const int16_t **alpSrc,
867 uint8_t *dest, int dstW, int dstY)
870 x86_reg dstW_reg = dstW;
871 x86_reg uv_off = c->uv_off << 1;
873 YSCALEYUV2PACKEDX_ACCURATE
874 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
875 "psraw $3, %%mm3 \n\t"
876 "psraw $3, %%mm4 \n\t"
877 "psraw $3, %%mm1 \n\t"
878 "psraw $3, %%mm7 \n\t"
879 WRITEYUY2(%4, %5, %%REGa)
880 YSCALEYUV2PACKEDX_END
/* Fast multi-tap vertical scaler to packed YUY2; identical >>3 and
 * WRITEYUY2 tail to yuv2yuyv422_X_ar, preceded by the fast filter pass
 * (the YSCALEYUV2PACKEDX line is missing from this truncated view;
 * code left byte-identical). */
883 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
884 const int16_t **lumSrc, int lumFilterSize,
885 const int16_t *chrFilter, const int16_t **chrUSrc,
886 const int16_t **chrVSrc,
887 int chrFilterSize, const int16_t **alpSrc,
888 uint8_t *dest, int dstW, int dstY)
891 x86_reg dstW_reg = dstW;
892 x86_reg uv_off = c->uv_off << 1;
895 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
896 "psraw $3, %%mm3 \n\t"
897 "psraw $3, %%mm4 \n\t"
898 "psraw $3, %%mm1 \n\t"
899 "psraw $3, %%mm7 \n\t"
900 WRITEYUY2(%4, %5, %%REGa)
901 YSCALEYUV2PACKEDX_END
/* Two-line (bilinear) vertical interpolation building blocks used by the
 * *_2 output functions.
 * _UV: loads U/V from two chroma buffers (%2/%3; V reached by temporarily
 *      adding the UV_OFFx2 context field to the index), blends them with
 *      the chroma alpha coefficient from the context via the classic
 *      a + (b-a)*alpha pmulhw trick, then subtracts the U/V bias and
 *      premultiplies the green coefficients.
 * _YA: same blend for 16 luma (or alpha) samples from buffers b1/b2 using
 *      the luma alpha coefficient, results in mm1/mm7.
 * _COEFF: finishes the RGB matrix exactly like YSCALEYUV2RGBX but with
 *      the context pointer passed as a macro argument.
 * YSCALEYUV2RGB chains UV + YA(%0,%1) + COEFF for the common case.
 * NOTE(review): loop labels between these lines are missing from this
 * truncated view; code left byte-identical. */
904 #define REAL_YSCALEYUV2RGB_UV(index, c) \
905 "xor "#index", "#index" \n\t"\
908 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
909 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
910 "add "UV_OFFx2"("#c"), "#index" \n\t" \
911 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
912 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
913 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
914 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
915 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
916 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
917 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
918 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
919 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
920 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
921 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
922 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
923 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
924 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
925 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
926 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
927 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
928 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
929 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
931 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
932 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
933 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
934 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
935 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
936 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
937 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
938 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
939 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
940 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
941 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
942 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
943 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
945 #define REAL_YSCALEYUV2RGB_COEFF(c) \
946 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
947 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
948 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
949 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
950 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
951 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
952 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
953 "paddw %%mm3, %%mm4 \n\t"\
954 "movq %%mm2, %%mm0 \n\t"\
955 "movq %%mm5, %%mm6 \n\t"\
956 "movq %%mm4, %%mm3 \n\t"\
957 "punpcklwd %%mm2, %%mm2 \n\t"\
958 "punpcklwd %%mm5, %%mm5 \n\t"\
959 "punpcklwd %%mm4, %%mm4 \n\t"\
960 "paddw %%mm1, %%mm2 \n\t"\
961 "paddw %%mm1, %%mm5 \n\t"\
962 "paddw %%mm1, %%mm4 \n\t"\
963 "punpckhwd %%mm0, %%mm0 \n\t"\
964 "punpckhwd %%mm6, %%mm6 \n\t"\
965 "punpckhwd %%mm3, %%mm3 \n\t"\
966 "paddw %%mm7, %%mm0 \n\t"\
967 "paddw %%mm7, %%mm6 \n\t"\
968 "paddw %%mm7, %%mm3 \n\t"\
969 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
970 "packuswb %%mm0, %%mm2 \n\t"\
971 "packuswb %%mm6, %%mm5 \n\t"\
972 "packuswb %%mm3, %%mm4 \n\t"\
974 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
976 #define YSCALEYUV2RGB(index, c) \
977 REAL_YSCALEYUV2RGB_UV(index, c) \
978 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
979 REAL_YSCALEYUV2RGB_COEFF(c)
/* Vertical bilinear (two-line) blend to 32-bit RGBA. Three code paths:
 *  - 64-bit with alpha: r8 can hold the index, so alpha buffers are passed
 *    directly as asm operands %6/%7 and blended with YSCALEYUV2RGB_YA.
 *  - 32-bit with alpha: too few registers, so abuf0/abuf1 are smuggled
 *    through the context's u_temp/v_temp fields, EBX/EBP are saved
 *    manually (ESP_OFFSET spill) and reloaded around the asm.
 *  - no alpha: pcmpeqd fills the alpha register with 0xFF for opaque
 *    output.
 * The magic literal 8280(%5) is DSTW_OFFSET into the context — see the
 * note at the end of this chunk: "the preprocessor can't handle that
 * there". NOTE(review): the #if/#else architecture guards, braces and
 * input-list tails are missing from this truncated view; code left
 * byte-identical. */
982 * vertical bilinear scale YV12 to RGB
984 static void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
985 const uint16_t *buf1, const uint16_t *ubuf0,
986 const uint16_t *ubuf1, const uint16_t *vbuf0,
987 const uint16_t *vbuf1, const uint16_t *abuf0,
988 const uint16_t *abuf1, uint8_t *dest,
989 int dstW, int yalpha, int uvalpha, int y)
991 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
994 YSCALEYUV2RGB(%%r8, %5)
995 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
996 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
997 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
998 "packuswb %%mm7, %%mm1 \n\t"
999 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1000 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
1001 "a" (&c->redDither),
1002 "r" (abuf0), "r" (abuf1)
1006 c->u_temp=(intptr_t)abuf0;
1007 c->v_temp=(intptr_t)abuf1;
1009 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1010 "mov %4, %%"REG_b" \n\t"
1011 "push %%"REG_BP" \n\t"
1012 YSCALEYUV2RGB(%%REGBP, %5)
1015 "mov "U_TEMP"(%5), %0 \n\t"
1016 "mov "V_TEMP"(%5), %1 \n\t"
1017 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1018 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1019 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1020 "packuswb %%mm7, %%mm1 \n\t"
1023 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1024 "pop %%"REG_BP" \n\t"
1025 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1026 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1032 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1033 "mov %4, %%"REG_b" \n\t"
1034 "push %%"REG_BP" \n\t"
1035 YSCALEYUV2RGB(%%REGBP, %5)
1036 "pcmpeqd %%mm7, %%mm7 \n\t"
1037 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1038 "pop %%"REG_BP" \n\t"
1039 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1040 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* yuv2bgr24_2: one output line of packed 24-bit BGR from two vertically
 * interpolated input lines; mm7 is zeroed for WRITEBGR24. */
1046 static void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
1047 const uint16_t *buf1, const uint16_t *ubuf0,
1048 const uint16_t *ubuf1, const uint16_t *vbuf0,
1049 const uint16_t *vbuf1, const uint16_t *abuf0,
1050 const uint16_t *abuf1, uint8_t *dest,
1051 int dstW, int yalpha, int uvalpha, int y)
1053 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1055 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1056 "mov %4, %%"REG_b" \n\t"
1057 "push %%"REG_BP" \n\t"
1058 YSCALEYUV2RGB(%%REGBP, %5)
1059 "pxor %%mm7, %%mm7 \n\t"
1060 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1061 "pop %%"REG_BP" \n\t"
1062 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1063 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* yuv2rgb555_2: one output line of packed RGB555 from two vertically
 * interpolated input lines; per-channel dither bytes from the context
 * are added (saturating) before WRITERGB15 truncates to 5:5:5. */
1068 static void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
1069 const uint16_t *buf1, const uint16_t *ubuf0,
1070 const uint16_t *ubuf1, const uint16_t *vbuf0,
1071 const uint16_t *vbuf1, const uint16_t *abuf0,
1072 const uint16_t *abuf1, uint8_t *dest,
1073 int dstW, int yalpha, int uvalpha, int y)
1075 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1077 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1078 "mov %4, %%"REG_b" \n\t"
1079 "push %%"REG_BP" \n\t"
1080 YSCALEYUV2RGB(%%REGBP, %5)
1081 "pxor %%mm7, %%mm7 \n\t"
1082 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1085 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1086 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1088 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1089 "pop %%"REG_BP" \n\t"
1090 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1091 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* yuv2rgb565_2: identical to yuv2rgb555_2 except the final pack is 5:6:5
 * (WRITERGB16 instead of WRITERGB15). */
1096 static void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
1097 const uint16_t *buf1, const uint16_t *ubuf0,
1098 const uint16_t *ubuf1, const uint16_t *vbuf0,
1099 const uint16_t *vbuf1, const uint16_t *abuf0,
1100 const uint16_t *abuf1, uint8_t *dest,
1101 int dstW, int yalpha, int uvalpha, int y)
1103 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1105 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1106 "mov %4, %%"REG_b" \n\t"
1107 "push %%"REG_BP" \n\t"
1108 YSCALEYUV2RGB(%%REGBP, %5)
1109 "pxor %%mm7, %%mm7 \n\t"
1110 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1112 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1113 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1114 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1116 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1117 "pop %%"REG_BP" \n\t"
1118 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1119 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* REAL_YSCALEYUV2PACKED: vertically interpolate luma and chroma of two
 * input rows down to 8-bit values for packed-YUV output (WRITEYUY2).
 * The filter coefficients at CHR/LUM_MMX_FILTER_OFFSET+8 are pre-shifted
 * right by 3 in place before the loop body. */
1124 #define REAL_YSCALEYUV2PACKED(index, c) \
1125 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1126 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
1127 "psraw $3, %%mm0 \n\t"\
1128 "psraw $3, %%mm1 \n\t"\
1129 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1130 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1131 "xor "#index", "#index" \n\t"\
1134 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1135 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1136 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1137 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1138 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1139 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1140 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1141 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1142 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1143 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1144 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1145 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
1146 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
1147 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
1148 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
1149 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
1150 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
1151 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
1152 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
1153 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
1154 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
1155 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1156 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1157 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
1158 "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
1159 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1160 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1162 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* yuv2yuyv422_2: one output line of packed YUYV from two vertically
 * interpolated input lines (YSCALEYUV2PACKED + WRITEYUY2). */
1164 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
1165 const uint16_t *buf1, const uint16_t *ubuf0,
1166 const uint16_t *ubuf1, const uint16_t *vbuf0,
1167 const uint16_t *vbuf1, const uint16_t *abuf0,
1168 const uint16_t *abuf1, uint8_t *dest,
1169 int dstW, int yalpha, int uvalpha, int y)
1171 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1173 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1174 "mov %4, %%"REG_b" \n\t"
1175 "push %%"REG_BP" \n\t"
1176 YSCALEYUV2PACKED(%%REGBP, %5)
1177 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1178 "pop %%"REG_BP" \n\t"
1179 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1180 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* REAL_YSCALEYUV2RGB1: YUV->RGB conversion of a SINGLE input row (no
 * vertical interpolation; chroma from uvbuf0 only — see callers' note
 * about the resulting 0.5-pixel chroma shift).  The second half repeats
 * the coefficient/packing sequence of REAL_YSCALEYUV2RGB_COEFF; on exit
 * mm2=B, mm4=G, mm5=R. */
1185 #define REAL_YSCALEYUV2RGB1(index, c) \
1186 "xor "#index", "#index" \n\t"\
1189 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1190 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1191 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1192 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1193 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
1194 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
1195 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1196 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1197 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1198 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1199 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1200 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1201 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1202 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1203 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1204 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1205 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1206 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1207 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1208 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1209 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1210 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1211 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1212 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1213 "paddw %%mm3, %%mm4 \n\t"\
1214 "movq %%mm2, %%mm0 \n\t"\
1215 "movq %%mm5, %%mm6 \n\t"\
1216 "movq %%mm4, %%mm3 \n\t"\
1217 "punpcklwd %%mm2, %%mm2 \n\t"\
1218 "punpcklwd %%mm5, %%mm5 \n\t"\
1219 "punpcklwd %%mm4, %%mm4 \n\t"\
1220 "paddw %%mm1, %%mm2 \n\t"\
1221 "paddw %%mm1, %%mm5 \n\t"\
1222 "paddw %%mm1, %%mm4 \n\t"\
1223 "punpckhwd %%mm0, %%mm0 \n\t"\
1224 "punpckhwd %%mm6, %%mm6 \n\t"\
1225 "punpckhwd %%mm3, %%mm3 \n\t"\
1226 "paddw %%mm7, %%mm0 \n\t"\
1227 "paddw %%mm7, %%mm6 \n\t"\
1228 "paddw %%mm7, %%mm3 \n\t"\
1229 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1230 "packuswb %%mm0, %%mm2 \n\t"\
1231 "packuswb %%mm6, %%mm5 \n\t"\
1232 "packuswb %%mm3, %%mm4 \n\t"\
1234 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
1236 // do vertical chrominance interpolation
/* REAL_YSCALEYUV2RGB1b: like REAL_YSCALEYUV2RGB1 but averages the two
 * chroma rows (paddw then psrlw $5, i.e. (uvbuf0+uvbuf1) scaled down;
 * the FIXME below notes the sum may overflow before the shift). */
1237 #define REAL_YSCALEYUV2RGB1b(index, c) \
1238 "xor "#index", "#index" \n\t"\
1241 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1242 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1243 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1244 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1245 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1246 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1247 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1248 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1249 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1250 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1251 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1252 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1253 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1254 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1255 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1256 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1257 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1258 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1259 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1260 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1261 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1262 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1263 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1264 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1265 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1266 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1267 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1268 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1269 "paddw %%mm3, %%mm4 \n\t"\
1270 "movq %%mm2, %%mm0 \n\t"\
1271 "movq %%mm5, %%mm6 \n\t"\
1272 "movq %%mm4, %%mm3 \n\t"\
1273 "punpcklwd %%mm2, %%mm2 \n\t"\
1274 "punpcklwd %%mm5, %%mm5 \n\t"\
1275 "punpcklwd %%mm4, %%mm4 \n\t"\
1276 "paddw %%mm1, %%mm2 \n\t"\
1277 "paddw %%mm1, %%mm5 \n\t"\
1278 "paddw %%mm1, %%mm4 \n\t"\
1279 "punpckhwd %%mm0, %%mm0 \n\t"\
1280 "punpckhwd %%mm6, %%mm6 \n\t"\
1281 "punpckhwd %%mm3, %%mm3 \n\t"\
1282 "paddw %%mm7, %%mm0 \n\t"\
1283 "paddw %%mm7, %%mm6 \n\t"\
1284 "paddw %%mm7, %%mm3 \n\t"\
1285 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1286 "packuswb %%mm0, %%mm2 \n\t"\
1287 "packuswb %%mm6, %%mm5 \n\t"\
1288 "packuswb %%mm3, %%mm4 \n\t"\
1290 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Load 8 alpha samples from abuf0 (asm operand %1), scale them down by
 * >>7 and pack to bytes in mm7 for use as the alpha channel in
 * WRITEBGR32. */
1292 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1293 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1294 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1295 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1296 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1297 "packuswb %%mm1, %%mm7 \n\t"
1298 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1301 * YV12 to RGB without scaling or interpolating
/* yuv2rgb32_1: single-source-line RGB32 output.  uvalpha < 2048 selects
 * the fast chroma path (YSCALEYUV2RGB1, chroma from ubuf0 only — see the
 * note below about the 0.5-pixel shift), otherwise the two chroma rows
 * are averaged (YSCALEYUV2RGB1b).  Alpha from abuf0 when present, else
 * forced to 0xFF via pcmpeqd. */
1303 static void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
1304 const uint16_t *ubuf0, const uint16_t *ubuf1,
1305 const uint16_t *vbuf0, const uint16_t *vbuf1,
1306 const uint16_t *abuf0, uint8_t *dest,
1307 int dstW, int uvalpha, enum PixelFormat dstFormat,
1310 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1312 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1313 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1315 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1316 "mov %4, %%"REG_b" \n\t"
1317 "push %%"REG_BP" \n\t"
1318 YSCALEYUV2RGB1(%%REGBP, %5)
1319 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1320 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1321 "pop %%"REG_BP" \n\t"
1322 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1323 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1328 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1329 "mov %4, %%"REG_b" \n\t"
1330 "push %%"REG_BP" \n\t"
1331 YSCALEYUV2RGB1(%%REGBP, %5)
1332 "pcmpeqd %%mm7, %%mm7 \n\t"
1333 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1334 "pop %%"REG_BP" \n\t"
1335 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1336 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1341 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1343 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1344 "mov %4, %%"REG_b" \n\t"
1345 "push %%"REG_BP" \n\t"
1346 YSCALEYUV2RGB1b(%%REGBP, %5)
1347 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1348 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1349 "pop %%"REG_BP" \n\t"
1350 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1351 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1356 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1357 "mov %4, %%"REG_b" \n\t"
1358 "push %%"REG_BP" \n\t"
1359 YSCALEYUV2RGB1b(%%REGBP, %5)
1360 "pcmpeqd %%mm7, %%mm7 \n\t"
1361 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1362 "pop %%"REG_BP" \n\t"
1363 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1364 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* yuv2bgr24_1: single-source-line BGR24 output; chroma fast path vs.
 * averaged path selected by uvalpha, same scheme as yuv2rgb32_1. */
1371 static void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
1372 const uint16_t *ubuf0, const uint16_t *ubuf1,
1373 const uint16_t *vbuf0, const uint16_t *vbuf1,
1374 const uint16_t *abuf0, uint8_t *dest,
1375 int dstW, int uvalpha, enum PixelFormat dstFormat,
1378 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1380 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1382 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1383 "mov %4, %%"REG_b" \n\t"
1384 "push %%"REG_BP" \n\t"
1385 YSCALEYUV2RGB1(%%REGBP, %5)
1386 "pxor %%mm7, %%mm7 \n\t"
1387 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1388 "pop %%"REG_BP" \n\t"
1389 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1390 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1395 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1396 "mov %4, %%"REG_b" \n\t"
1397 "push %%"REG_BP" \n\t"
1398 YSCALEYUV2RGB1b(%%REGBP, %5)
1399 "pxor %%mm7, %%mm7 \n\t"
1400 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1401 "pop %%"REG_BP" \n\t"
1402 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1403 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* yuv2rgb555_1: single-source-line RGB555 output with per-channel dither
 * added before packing; chroma path selected by uvalpha as above. */
1409 static void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
1410 const uint16_t *ubuf0, const uint16_t *ubuf1,
1411 const uint16_t *vbuf0, const uint16_t *vbuf1,
1412 const uint16_t *abuf0, uint8_t *dest,
1413 int dstW, int uvalpha, enum PixelFormat dstFormat,
1416 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1418 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1420 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1421 "mov %4, %%"REG_b" \n\t"
1422 "push %%"REG_BP" \n\t"
1423 YSCALEYUV2RGB1(%%REGBP, %5)
1424 "pxor %%mm7, %%mm7 \n\t"
1425 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1427 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1428 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1429 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1431 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1432 "pop %%"REG_BP" \n\t"
1433 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1434 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1439 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1440 "mov %4, %%"REG_b" \n\t"
1441 "push %%"REG_BP" \n\t"
1442 YSCALEYUV2RGB1b(%%REGBP, %5)
1443 "pxor %%mm7, %%mm7 \n\t"
1444 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1446 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1447 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1448 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1450 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* yuv2rgb565_1: identical to yuv2rgb555_1 except the final pack is
 * 5:6:5 (WRITERGB16 instead of WRITERGB15). */
1459 static void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
1460 const uint16_t *ubuf0, const uint16_t *ubuf1,
1461 const uint16_t *vbuf0, const uint16_t *vbuf1,
1462 const uint16_t *abuf0, uint8_t *dest,
1463 int dstW, int uvalpha, enum PixelFormat dstFormat,
1466 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1468 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1470 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1471 "mov %4, %%"REG_b" \n\t"
1472 "push %%"REG_BP" \n\t"
1473 YSCALEYUV2RGB1(%%REGBP, %5)
1474 "pxor %%mm7, %%mm7 \n\t"
1475 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1477 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1478 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1479 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1481 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1482 "pop %%"REG_BP" \n\t"
1483 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1484 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1489 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1490 "mov %4, %%"REG_b" \n\t"
1491 "push %%"REG_BP" \n\t"
1492 YSCALEYUV2RGB1b(%%REGBP, %5)
1493 "pxor %%mm7, %%mm7 \n\t"
1494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1496 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1497 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1498 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1500 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1501 "pop %%"REG_BP" \n\t"
1502 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1503 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* REAL_YSCALEYUV2PACKED1: single-row variant for packed-YUV output —
 * luma and chroma are just scaled down by >>7, no interpolation.
 * Chroma comes from uvbuf0 only (operand %2). */
1509 #define REAL_YSCALEYUV2PACKED1(index, c) \
1510 "xor "#index", "#index" \n\t"\
1513 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1514 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1515 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1516 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1517 "psraw $7, %%mm3 \n\t" \
1518 "psraw $7, %%mm4 \n\t" \
1519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1521 "psraw $7, %%mm1 \n\t" \
1522 "psraw $7, %%mm7 \n\t" \
1524 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* REAL_YSCALEYUV2PACKED1b: like PACKED1 but averages the two chroma
 * rows: (uvbuf0 + uvbuf1) summed, then >>8 (sum of two >>7 scalings). */
1526 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1527 "xor "#index", "#index" \n\t"\
1530 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1531 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1532 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1533 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1534 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1535 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1536 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1537 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1538 "psrlw $8, %%mm3 \n\t" \
1539 "psrlw $8, %%mm4 \n\t" \
1540 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1541 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1542 "psraw $7, %%mm1 \n\t" \
1543 "psraw $7, %%mm7 \n\t"
1544 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/* yuv2yuyv422_1: single-source-line YUYV output; chroma fast path vs.
 * averaged path selected by uvalpha, same scheme as the RGB *_1 funcs. */
1546 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
1547 const uint16_t *ubuf0, const uint16_t *ubuf1,
1548 const uint16_t *vbuf0, const uint16_t *vbuf1,
1549 const uint16_t *abuf0, uint8_t *dest,
1550 int dstW, int uvalpha, enum PixelFormat dstFormat,
1553 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1555 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1557 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1558 "mov %4, %%"REG_b" \n\t"
1559 "push %%"REG_BP" \n\t"
1560 YSCALEYUV2PACKED1(%%REGBP, %5)
1561 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1562 "pop %%"REG_BP" \n\t"
1563 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1569 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1570 "mov %4, %%"REG_b" \n\t"
1571 "push %%"REG_BP" \n\t"
1572 YSCALEYUV2PACKED1b(%%REGBP, %5)
1573 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1574 "pop %%"REG_BP" \n\t"
1575 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1576 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1582 #if !COMPILE_TEMPLATE_MMX2
1583 //FIXME yuy2* can read up to 7 samples too much
/* yuy2ToY: extract the luma bytes (even bytes, selected by the
 * bm01010101 mask) from packed YUYV into dst; 8 pixels per iteration,
 * counting a negative index up to zero. */
1585 static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
1586 int width, uint32_t *unused)
1589 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1590 "mov %0, %%"REG_a" \n\t"
1592 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1593 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1594 "pand %%mm2, %%mm0 \n\t"
1595 "pand %%mm2, %%mm1 \n\t"
1596 "packuswb %%mm1, %%mm0 \n\t"
1597 "movq %%mm0, (%2, %%"REG_a") \n\t"
1598 "add $8, %%"REG_a" \n\t"
1600 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
/* yuy2ToUV: deinterleave the chroma bytes (odd bytes, via psrlw $8) of
 * packed YUYV into separate U (%2) and V (%3) planes; the assert
 * documents that YUYV carries chroma in a single plane (src1 == src2). */
1605 static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
1606 const uint8_t *src1, const uint8_t *src2,
1607 int width, uint32_t *unused)
1610 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1611 "mov %0, %%"REG_a" \n\t"
1613 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1614 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1615 "psrlw $8, %%mm0 \n\t"
1616 "psrlw $8, %%mm1 \n\t"
1617 "packuswb %%mm1, %%mm0 \n\t"
1618 "movq %%mm0, %%mm1 \n\t"
1619 "psrlw $8, %%mm0 \n\t"
1620 "pand %%mm4, %%mm1 \n\t"
1621 "packuswb %%mm0, %%mm0 \n\t"
1622 "packuswb %%mm1, %%mm1 \n\t"
1623 "movd %%mm0, (%3, %%"REG_a") \n\t"
1624 "movd %%mm1, (%2, %%"REG_a") \n\t"
1625 "add $4, %%"REG_a" \n\t"
1627 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1630 assert(src1 == src2);
/* LEToUV: take the high byte of each 16-bit little-endian sample
 * (psrlw $8) from two source planes; src1 -> dstU, src2 -> dstV,
 * 8 output bytes per plane per iteration. */
1633 static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
1634 const uint8_t *src1, const uint8_t *src2,
1635 int width, uint32_t *unused)
1638 "mov %0, %%"REG_a" \n\t"
1640 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1641 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1642 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1643 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1644 "psrlw $8, %%mm0 \n\t"
1645 "psrlw $8, %%mm1 \n\t"
1646 "psrlw $8, %%mm2 \n\t"
1647 "psrlw $8, %%mm3 \n\t"
1648 "packuswb %%mm1, %%mm0 \n\t"
1649 "packuswb %%mm3, %%mm2 \n\t"
1650 "movq %%mm0, (%3, %%"REG_a") \n\t"
1651 "movq %%mm2, (%4, %%"REG_a") \n\t"
1652 "add $8, %%"REG_a" \n\t"
1654 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1659 /* This is almost identical to the previous, and exists only because
1660 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* uyvyToY: extract luma from packed UYVY — luma sits in the odd bytes,
 * so it is isolated with psrlw $8 instead of a mask. */
1661 static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
1662 int width, uint32_t *unused)
1665 "mov %0, %%"REG_a" \n\t"
1667 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1668 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1669 "psrlw $8, %%mm0 \n\t"
1670 "psrlw $8, %%mm1 \n\t"
1671 "packuswb %%mm1, %%mm0 \n\t"
1672 "movq %%mm0, (%2, %%"REG_a") \n\t"
1673 "add $8, %%"REG_a" \n\t"
1675 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
/* uyvyToUV: deinterleave chroma from packed UYVY — chroma sits in the
 * even bytes, masked out with bm01010101, then split into U (%2) and
 * V (%3) planes; asserts src1 == src2 as for yuy2ToUV. */
1680 static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
1681 const uint8_t *src1, const uint8_t *src2,
1682 int width, uint32_t *unused)
1685 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1686 "mov %0, %%"REG_a" \n\t"
1688 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1689 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1690 "pand %%mm4, %%mm0 \n\t"
1691 "pand %%mm4, %%mm1 \n\t"
1692 "packuswb %%mm1, %%mm0 \n\t"
1693 "movq %%mm0, %%mm1 \n\t"
1694 "psrlw $8, %%mm0 \n\t"
1695 "pand %%mm4, %%mm1 \n\t"
1696 "packuswb %%mm0, %%mm0 \n\t"
1697 "packuswb %%mm1, %%mm1 \n\t"
1698 "movd %%mm0, (%3, %%"REG_a") \n\t"
1699 "movd %%mm1, (%2, %%"REG_a") \n\t"
1700 "add $4, %%"REG_a" \n\t"
1702 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1705 assert(src1 == src2);
/* BEToUV: companion to LEToUV for big-endian 16-bit samples — keeps the
 * low byte of each 16-bit word (pand with bm01010101) from two planes;
 * src1 -> dstU, src2 -> dstV. */
1708 static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
1709 const uint8_t *src1, const uint8_t *src2,
1710 int width, uint32_t *unused)
1713 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1714 "mov %0, %%"REG_a" \n\t"
1716 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1717 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1718 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1719 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1720 "pand %%mm4, %%mm0 \n\t"
1721 "pand %%mm4, %%mm1 \n\t"
1722 "pand %%mm4, %%mm2 \n\t"
1723 "pand %%mm4, %%mm3 \n\t"
1724 "packuswb %%mm1, %%mm0 \n\t"
1725 "packuswb %%mm3, %%mm2 \n\t"
1726 "movq %%mm0, (%3, %%"REG_a") \n\t"
1727 "movq %%mm2, (%4, %%"REG_a") \n\t"
1728 "add $8, %%"REG_a" \n\t"
1730 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
/* nvXXtoUV: split an interleaved chroma plane into two planes —
 * even bytes (masked) go to dst1, odd bytes (shifted) go to dst2.
 * Callers pick the dst order to handle both NV12 and NV21. */
1735 static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1736 const uint8_t *src, int width)
1739 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1740 "mov %0, %%"REG_a" \n\t"
1742 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1743 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1744 "movq %%mm0, %%mm2 \n\t"
1745 "movq %%mm1, %%mm3 \n\t"
1746 "pand %%mm4, %%mm0 \n\t"
1747 "pand %%mm4, %%mm1 \n\t"
1748 "psrlw $8, %%mm2 \n\t"
1749 "psrlw $8, %%mm3 \n\t"
1750 "packuswb %%mm1, %%mm0 \n\t"
1751 "packuswb %%mm3, %%mm2 \n\t"
1752 "movq %%mm0, (%2, %%"REG_a") \n\t"
1753 "movq %%mm2, (%3, %%"REG_a") \n\t"
1754 "add $8, %%"REG_a" \n\t"
1756 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
/* NV12: interleaved chroma is U,V,U,V..., so U goes first. */
1761 static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1762 const uint8_t *src1, const uint8_t *src2,
1763 int width, uint32_t *unused)
1765 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
/* NV21: interleaved chroma is V,U,V,U..., so the dst order is swapped. */
1768 static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1769 const uint8_t *src1, const uint8_t *src2,
1770 int width, uint32_t *unused)
1772 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1774 #endif /* !COMPILE_TEMPLATE_MMX2 */
/* bgr24ToY_mmx: convert packed 24-bit BGR or RGB to the intermediate
 * 16-bit luma plane.  The coefficient pair (mm5/mm6) is selected by
 * srcFormat, then 4 pixels are processed per iteration with pmaddwd,
 * offset (ff_bgr24toYOffset), >>9 and a signed pack to words. */
1776 static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
1777 int width, enum PixelFormat srcFormat)
1780 if(srcFormat == PIX_FMT_BGR24) {
1782 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1783 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1788 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1789 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1795 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1796 "mov %2, %%"REG_a" \n\t"
1797 "pxor %%mm7, %%mm7 \n\t"
1799 PREFETCH" 64(%0) \n\t"
1800 "movd (%0), %%mm0 \n\t"
1801 "movd 2(%0), %%mm1 \n\t"
1802 "movd 6(%0), %%mm2 \n\t"
1803 "movd 8(%0), %%mm3 \n\t"
1805 "punpcklbw %%mm7, %%mm0 \n\t"
1806 "punpcklbw %%mm7, %%mm1 \n\t"
1807 "punpcklbw %%mm7, %%mm2 \n\t"
1808 "punpcklbw %%mm7, %%mm3 \n\t"
1809 "pmaddwd %%mm5, %%mm0 \n\t"
1810 "pmaddwd %%mm6, %%mm1 \n\t"
1811 "pmaddwd %%mm5, %%mm2 \n\t"
1812 "pmaddwd %%mm6, %%mm3 \n\t"
1813 "paddd %%mm1, %%mm0 \n\t"
1814 "paddd %%mm3, %%mm2 \n\t"
1815 "paddd %%mm4, %%mm0 \n\t"
1816 "paddd %%mm4, %%mm2 \n\t"
1817 "psrad $9, %%mm0 \n\t"
1818 "psrad $9, %%mm2 \n\t"
1819 "packssdw %%mm2, %%mm0 \n\t"
1820 "movq %%mm0, (%1, %%"REG_a") \n\t"
1821 "add $8, %%"REG_a" \n\t"
1824 : "r" (dst+width), "g" ((x86_reg)-2*width)
/* Public wrapper: BGR24 input luma conversion. */
1829 static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src,
1830 int width, uint32_t *unused)
1832 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
/* Public wrapper: RGB24 input luma conversion (swapped coefficients). */
1835 static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src,
1836 int width, uint32_t *unused)
1838 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
/* bgr24ToUV_mmx: convert packed 24-bit BGR/RGB to intermediate 16-bit
 * U and V planes.  The coefficient table (operand %4) is selected from
 * ff_bgr24toUV[] by srcFormat; 4 pixels per iteration: pmaddwd against
 * the U and V coefficient rows, add ff_bgr24toUVOffset, >>9, signed
 * pack, and store U to %1 / V to %2. */
1841 static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
1842 const uint8_t *src, int width,
1843 enum PixelFormat srcFormat)
1846 "movq 24(%4), %%mm6 \n\t"
1847 "mov %3, %%"REG_a" \n\t"
1848 "pxor %%mm7, %%mm7 \n\t"
1850 PREFETCH" 64(%0) \n\t"
1851 "movd (%0), %%mm0 \n\t"
1852 "movd 2(%0), %%mm1 \n\t"
1853 "punpcklbw %%mm7, %%mm0 \n\t"
1854 "punpcklbw %%mm7, %%mm1 \n\t"
1855 "movq %%mm0, %%mm2 \n\t"
1856 "movq %%mm1, %%mm3 \n\t"
1857 "pmaddwd (%4), %%mm0 \n\t"
1858 "pmaddwd 8(%4), %%mm1 \n\t"
1859 "pmaddwd 16(%4), %%mm2 \n\t"
1860 "pmaddwd %%mm6, %%mm3 \n\t"
1861 "paddd %%mm1, %%mm0 \n\t"
1862 "paddd %%mm3, %%mm2 \n\t"
1864 "movd 6(%0), %%mm1 \n\t"
1865 "movd 8(%0), %%mm3 \n\t"
1867 "punpcklbw %%mm7, %%mm1 \n\t"
1868 "punpcklbw %%mm7, %%mm3 \n\t"
1869 "movq %%mm1, %%mm4 \n\t"
1870 "movq %%mm3, %%mm5 \n\t"
1871 "pmaddwd (%4), %%mm1 \n\t"
1872 "pmaddwd 8(%4), %%mm3 \n\t"
1873 "pmaddwd 16(%4), %%mm4 \n\t"
1874 "pmaddwd %%mm6, %%mm5 \n\t"
1875 "paddd %%mm3, %%mm1 \n\t"
1876 "paddd %%mm5, %%mm4 \n\t"
1878 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1879 "paddd %%mm3, %%mm0 \n\t"
1880 "paddd %%mm3, %%mm2 \n\t"
1881 "paddd %%mm3, %%mm1 \n\t"
1882 "paddd %%mm3, %%mm4 \n\t"
1883 "psrad $9, %%mm0 \n\t"
1884 "psrad $9, %%mm2 \n\t"
1885 "psrad $9, %%mm1 \n\t"
1886 "psrad $9, %%mm4 \n\t"
1887 "packssdw %%mm1, %%mm0 \n\t"
1888 "packssdw %%mm4, %%mm2 \n\t"
1889 "movq %%mm0, (%1, %%"REG_a") \n\t"
1890 "movq %%mm2, (%2, %%"REG_a") \n\t"
1891 "add $8, %%"REG_a" \n\t"
1894 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
/* Public wrapper: BGR24 chroma conversion; src2 is unused (packed RGB
 * has a single plane), which the assert documents. */
1899 static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
1900 const uint8_t *src1, const uint8_t *src2,
1901 int width, uint32_t *unused)
1903 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1904 assert(src1 == src2);
/* Public wrapper: RGB24 chroma conversion (coefficient table swapped). */
1907 static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
1908 const uint8_t *src1, const uint8_t *src2,
1909 int width, uint32_t *unused)
1912 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1915 #if !COMPILE_TEMPLATE_MMX2
1916 // bilinear / bicubic scaling
/*
 * Horizontal scaler (bilinear / bicubic) for 8-bit input, MMX version.
 * For each output pixel i it evaluates an FIR filter:
 *   dst[i] = (sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7
 * Three code paths: dedicated unrolled loops for filterSize==4 and
 * filterSize==8 (two output pixels per iteration, pmaddwd pairs the
 * multiply-accumulate), and a generic inner loop for any filterSize
 * that is a multiple of 4.
 * NOTE(review): several statement lines are elided in this chunk —
 * the __asm__ volatile openers, the loop labels and conditional jumps,
 * the closing operand lists, and the surrounding braces.  The comments
 * below describe only what the visible instructions establish.
 */
1917 static void RENAME(hScale)(int16_t *dst, int dstW,
1918 const uint8_t *src, int srcW,
1919 int xInc, const int16_t *filter,
1920 const int16_t *filterPos, int filterSize)
1922 assert(filterSize % 4 == 0 && filterSize>0);
1923 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
/* counter runs from -2*dstW up toward 0 so the loop test is a cheap
 * sign check; filterPos/dst are rebased so (base + counter) indexes
 * element 0 at the loop start. */
1924 x86_reg counter= -2*dstW;
1926 filterPos-= counter/2;
/* ebx and ebp are call-saved; they are pushed/popped by hand because
 * the asm uses 7 general registers. */
1930 "push %%"REG_b" \n\t"
1932 "pxor %%mm7, %%mm7 \n\t"
1933 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1934 "mov %%"REG_a", %%"REG_BP" \n\t"
/* Per iteration: load two filterPos entries, the two 4-tap filters,
 * four source bytes for each position, widen bytes to words and
 * multiply-accumulate with pmaddwd. */
1937 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1938 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1939 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1940 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1941 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1942 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1943 "punpcklbw %%mm7, %%mm0 \n\t"
1944 "punpcklbw %%mm7, %%mm2 \n\t"
1945 "pmaddwd %%mm1, %%mm0 \n\t"
1946 "pmaddwd %%mm2, %%mm3 \n\t"
/* Horizontal add of the dword partial sums, scale down by 7 bits and
 * pack to two signed 16-bit results. */
1947 "movq %%mm0, %%mm4 \n\t"
1948 "punpckldq %%mm3, %%mm0 \n\t"
1949 "punpckhdq %%mm3, %%mm4 \n\t"
1950 "paddd %%mm4, %%mm0 \n\t"
1951 "psrad $7, %%mm0 \n\t"
1952 "packssdw %%mm0, %%mm0 \n\t"
1953 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1954 "add $4, %%"REG_BP" \n\t"
1957 "pop %%"REG_BP" \n\t"
1959 "pop %%"REG_b" \n\t"
1962 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* filterSize==8: same structure as above but each pixel needs two
 * pmaddwd rounds (taps 0-3 then 4-7), accumulated before the final
 * horizontal add. */
1967 } else if (filterSize==8) {
1968 x86_reg counter= -2*dstW;
1970 filterPos-= counter/2;
1974 "push %%"REG_b" \n\t"
1976 "pxor %%mm7, %%mm7 \n\t"
1977 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1978 "mov %%"REG_a", %%"REG_BP" \n\t"
1981 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1982 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1983 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
1984 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
1985 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1986 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1987 "punpcklbw %%mm7, %%mm0 \n\t"
1988 "punpcklbw %%mm7, %%mm2 \n\t"
1989 "pmaddwd %%mm1, %%mm0 \n\t"
1990 "pmaddwd %%mm2, %%mm3 \n\t"
/* second half of the 8-tap filter: taps 4..7 at byte offsets 8/24 of
 * the filter row and +4 of the source window. */
1992 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
1993 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
1994 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
1995 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
1996 "punpcklbw %%mm7, %%mm4 \n\t"
1997 "punpcklbw %%mm7, %%mm2 \n\t"
1998 "pmaddwd %%mm1, %%mm4 \n\t"
1999 "pmaddwd %%mm2, %%mm5 \n\t"
2000 "paddd %%mm4, %%mm0 \n\t"
2001 "paddd %%mm5, %%mm3 \n\t"
2002 "movq %%mm0, %%mm4 \n\t"
2003 "punpckldq %%mm3, %%mm0 \n\t"
2004 "punpckhdq %%mm3, %%mm4 \n\t"
2005 "paddd %%mm4, %%mm0 \n\t"
2006 "psrad $7, %%mm0 \n\t"
2007 "packssdw %%mm0, %%mm0 \n\t"
2008 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2009 "add $4, %%"REG_BP" \n\t"
2012 "pop %%"REG_BP" \n\t"
2014 "pop %%"REG_b" \n\t"
2017 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* Generic path: inner loop walks the source window 4 bytes at a time
 * ('add $4' / compare against 'offset' = src+filterSize), accumulating
 * into mm4/mm5 for two output pixels at once. */
2023 const uint8_t *offset = src+filterSize;
2024 x86_reg counter= -2*dstW;
2025 //filter-= counter*filterSize/2;
2026 filterPos-= counter/2;
2029 "pxor %%mm7, %%mm7 \n\t"
2032 "mov %2, %%"REG_c" \n\t"
2033 "movzwl (%%"REG_c", %0), %%eax \n\t"
2034 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2035 "mov %5, %%"REG_c" \n\t"
2036 "pxor %%mm4, %%mm4 \n\t"
2037 "pxor %%mm5, %%mm5 \n\t"
2039 "movq (%1), %%mm1 \n\t"
2040 "movq (%1, %6), %%mm3 \n\t" /* %6 = filterSize*2 bytes = next filter row */
2041 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2042 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2043 "punpcklbw %%mm7, %%mm0 \n\t"
2044 "punpcklbw %%mm7, %%mm2 \n\t"
2045 "pmaddwd %%mm1, %%mm0 \n\t"
2046 "pmaddwd %%mm2, %%mm3 \n\t"
2047 "paddd %%mm3, %%mm5 \n\t"
2048 "paddd %%mm0, %%mm4 \n\t"
2050 "add $4, %%"REG_c" \n\t"
2051 "cmp %4, %%"REG_c" \n\t"
2054 "movq %%mm4, %%mm0 \n\t"
2055 "punpckldq %%mm5, %%mm4 \n\t"
2056 "punpckhdq %%mm5, %%mm0 \n\t"
2057 "paddd %%mm0, %%mm4 \n\t"
2058 "psrad $7, %%mm4 \n\t"
2059 "packssdw %%mm4, %%mm4 \n\t"
2060 "mov %3, %%"REG_a" \n\t"
2061 "movd %%mm4, (%%"REG_a", %0) \n\t"
2065 : "+r" (counter), "+r" (filter)
2066 : "m" (filterPos), "m" (dst), "m"(offset),
2067 "m" (src), "r" ((x86_reg)filterSize*2)
2068 : "%"REG_a, "%"REG_c, "%"REG_d
2072 #endif /* !COMPILE_TEMPLATE_MMX2 */
/*
 * Horizontal scaler for 16-bit-per-sample input (9/10/16-bit planar
 * and gray16 sources).  Same FIR structure as RENAME(hScale) but:
 *  - source samples are already words, so no punpcklbw widening and
 *    source indexing uses scale-by-2 (",2") addressing;
 *  - the down-shift is the runtime 'shift' argument, kept in %%mm7 and
 *    applied with a register psrad instead of an immediate.
 * The MMX paths require shift<15 (pmaddwd accumulates signed words);
 * for shift>=15 a plain C loop at the end computes the same sum and
 * clips to the 15-bit range.
 * NOTE(review): as with hScale, the asm openers, loop labels/jumps and
 * closing braces are elided from this chunk.
 */
2074 static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
2075 const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
2079 assert(filterSize % 4 == 0 && filterSize>0);
2080 if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
2081 x86_reg counter= -2*dstW;
2083 filterPos-= counter/2;
2086 "movd %5, %%mm7 \n\t" /* mm7 = shift count for psrad */
2088 "push %%"REG_b" \n\t"
2090 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2091 "mov %%"REG_a", %%"REG_BP" \n\t"
2094 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2095 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2096 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2097 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2098 "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
2099 "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
2100 "pmaddwd %%mm1, %%mm0 \n\t"
2101 "pmaddwd %%mm2, %%mm3 \n\t"
2102 "movq %%mm0, %%mm4 \n\t"
2103 "punpckldq %%mm3, %%mm0 \n\t"
2104 "punpckhdq %%mm3, %%mm4 \n\t"
2105 "paddd %%mm4, %%mm0 \n\t"
2106 "psrad %%mm7, %%mm0 \n\t"
2107 "packssdw %%mm0, %%mm0 \n\t"
2108 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2109 "add $4, %%"REG_BP" \n\t"
2112 "pop %%"REG_BP" \n\t"
2114 "pop %%"REG_b" \n\t"
2117 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
/* 8-tap path: two pmaddwd rounds per pixel; the second round reads
 * taps 4..7 (filter offsets 8/24) and source words at byte offset 8
 * (i.e. 4 samples further). */
2122 } else if (filterSize==8 && shift<15) {
2123 x86_reg counter= -2*dstW;
2125 filterPos-= counter/2;
2128 "movd %5, %%mm7 \n\t"
2130 "push %%"REG_b" \n\t"
2132 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2133 "mov %%"REG_a", %%"REG_BP" \n\t"
2136 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2137 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2138 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2139 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2140 "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
2141 "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
2142 "pmaddwd %%mm1, %%mm0 \n\t"
2143 "pmaddwd %%mm2, %%mm3 \n\t"
2145 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2146 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2147 "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
2148 "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
2149 "pmaddwd %%mm1, %%mm4 \n\t"
2150 "pmaddwd %%mm2, %%mm5 \n\t"
2151 "paddd %%mm4, %%mm0 \n\t"
2152 "paddd %%mm5, %%mm3 \n\t"
2153 "movq %%mm0, %%mm4 \n\t"
2154 "punpckldq %%mm3, %%mm0 \n\t"
2155 "punpckhdq %%mm3, %%mm4 \n\t"
2156 "paddd %%mm4, %%mm0 \n\t"
2157 "psrad %%mm7, %%mm0 \n\t"
2158 "packssdw %%mm0, %%mm0 \n\t"
2159 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2160 "add $4, %%"REG_BP" \n\t"
2163 "pop %%"REG_BP" \n\t"
2165 "pop %%"REG_b" \n\t"
2168 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
/* Generic MMX path: like hScale's generic loop but the window pointer
 * advances 8 bytes (= 4 uint16_t samples) per inner iteration. */
2173 } else if (shift<15){
2174 const uint16_t *offset = src+filterSize;
2175 x86_reg counter= -2*dstW;
2176 //filter-= counter*filterSize/2;
2177 filterPos-= counter/2;
2180 "movd %7, %%mm7 \n\t"
2183 "mov %2, %%"REG_c" \n\t"
2184 "movzwl (%%"REG_c", %0), %%eax \n\t"
2185 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2186 "mov %5, %%"REG_c" \n\t"
2187 "pxor %%mm4, %%mm4 \n\t"
2188 "pxor %%mm5, %%mm5 \n\t"
2190 "movq (%1), %%mm1 \n\t"
2191 "movq (%1, %6), %%mm3 \n\t"
2192 "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
2193 "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
2194 "pmaddwd %%mm1, %%mm0 \n\t"
2195 "pmaddwd %%mm2, %%mm3 \n\t"
2196 "paddd %%mm3, %%mm5 \n\t"
2197 "paddd %%mm0, %%mm4 \n\t"
2199 "add $8, %%"REG_c" \n\t"
2200 "cmp %4, %%"REG_c" \n\t"
2203 "movq %%mm4, %%mm0 \n\t"
2204 "punpckldq %%mm5, %%mm4 \n\t"
2205 "punpckhdq %%mm5, %%mm0 \n\t"
2206 "paddd %%mm0, %%mm4 \n\t"
2207 "psrad %%mm7, %%mm4 \n\t"
2208 "packssdw %%mm4, %%mm4 \n\t"
2209 "mov %3, %%"REG_a" \n\t"
2210 "movd %%mm4, (%%"REG_a", %0) \n\t"
2214 : "+r" (counter), "+r" (filter)
2215 : "m" (filterPos), "m" (dst), "m"(offset),
2216 "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
2217 : "%"REG_a, "%"REG_c, "%"REG_d
/* Scalar C fallback (shift >= 15): exact reference implementation,
 * clipped to INT16_MAX since the shifted sum can exceed 15 bits. */
2220 for (i=0; i<dstW; i++) {
2221 int srcPos= filterPos[i];
2223 for (j=0; j<filterSize; j++) {
2224 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2226 dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
2231 #if COMPILE_TEMPLATE_MMX2
/*
 * Fast bilinear horizontal luma scaling using runtime-generated MMX2
 * filter code (c->lumMmx2FilterCode).  The asm sets up registers
 * (src in REG_c, dst in REG_D, filter in REG_d, filterPos in REG_b),
 * then jumps into the generated code via CALL_MMX2_FILTER_CODE, eight
 * chunks in sequence.  rbx/ebx is saved to 'ebxsave' by hand because
 * it cannot appear in the clobber list when it is the PIC/frame
 * register.
 * NOTE(review): the asm volatile openers, the #if/#else structure
 * around the two CALL_MMX2_FILTER_CODE definitions, and the calls into
 * the generated code are partially elided in this chunk.
 */
2232 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2233 int dstWidth, const uint8_t *src,
2236 int16_t *filterPos = c->hLumFilterPos;
2237 int16_t *filter = c->hLumFilter;
2238 void *mmx2FilterCode= c->lumMmx2FilterCode;
2241 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2246 "mov %%"REG_b", %5 \n\t" /* spill rbx/ebx to ebxsave */
2248 "pxor %%mm7, %%mm7 \n\t"
2249 "mov %0, %%"REG_c" \n\t"
2250 "mov %1, %%"REG_D" \n\t"
2251 "mov %2, %%"REG_d" \n\t"
2252 "mov %3, %%"REG_b" \n\t"
2253 "xor %%"REG_a", %%"REG_a" \n\t" // i
2254 PREFETCH" (%%"REG_c") \n\t"
2255 PREFETCH" 32(%%"REG_c") \n\t"
2256 PREFETCH" 64(%%"REG_c") \n\t"
/* Two variants of the trampoline into the generated filter code: the
 * x86_64 form keeps the src advance in REG_S, the 32-bit form adds it
 * directly into REG_c.  Do not reformat — backslash continuations. */
2259 #define CALL_MMX2_FILTER_CODE \
2260 "movl (%%"REG_b"), %%esi \n\t"\
2262 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2263 "add %%"REG_S", %%"REG_c" \n\t"\
2264 "add %%"REG_a", %%"REG_D" \n\t"\
2265 "xor %%"REG_a", %%"REG_a" \n\t"\
2268 #define CALL_MMX2_FILTER_CODE \
2269 "movl (%%"REG_b"), %%esi \n\t"\
2271 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2272 "add %%"REG_a", %%"REG_D" \n\t"\
2273 "xor %%"REG_a", %%"REG_a" \n\t"\
2275 #endif /* ARCH_X86_64 */
/* Eight sequential chunks of generated filter code cover the full
 * output row. */
2277 CALL_MMX2_FILTER_CODE
2278 CALL_MMX2_FILTER_CODE
2279 CALL_MMX2_FILTER_CODE
2280 CALL_MMX2_FILTER_CODE
2281 CALL_MMX2_FILTER_CODE
2282 CALL_MMX2_FILTER_CODE
2283 CALL_MMX2_FILTER_CODE
2284 CALL_MMX2_FILTER_CODE
2287 "mov %5, %%"REG_b" \n\t" /* restore rbx/ebx */
2289 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2290 "m" (mmx2FilterCode)
2294 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Tail: output pixels whose source position would read past srcW-1
 * are filled by replicating the last source pixel, scaled into the
 * 15-bit intermediate range (*128). */
2300 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2301 dst[i] = src[srcW-1]*128;
/*
 * Fast bilinear horizontal chroma scaling using runtime-generated MMX2
 * filter code (c->chrMmx2FilterCode).  Same trampoline scheme as
 * RENAME(hyscale_fast), but run twice inside one asm statement: four
 * CALL_MMX2_FILTER_CODE chunks for src1->dst1 (U), then the registers
 * are re-seeded from operands %5/%6 and four more chunks process
 * src2->dst2 (V).  rbx/ebx is spilled to 'ebxsave' by hand for the
 * same PIC/frame-register reason.
 * NOTE(review): the asm volatile openers and closing braces are elided
 * in this chunk.
 */
2304 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
2305 int dstWidth, const uint8_t *src1,
2306 const uint8_t *src2, int srcW, int xInc)
2308 int16_t *filterPos = c->hChrFilterPos;
2309 int16_t *filter = c->hChrFilter;
2310 void *mmx2FilterCode= c->chrMmx2FilterCode;
2313 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2318 "mov %%"REG_b", %7 \n\t" /* spill rbx/ebx to ebxsave */
2320 "pxor %%mm7, %%mm7 \n\t"
2321 "mov %0, %%"REG_c" \n\t"
2322 "mov %1, %%"REG_D" \n\t"
2323 "mov %2, %%"REG_d" \n\t"
2324 "mov %3, %%"REG_b" \n\t"
2325 "xor %%"REG_a", %%"REG_a" \n\t" // i
2326 PREFETCH" (%%"REG_c") \n\t"
2327 PREFETCH" 32(%%"REG_c") \n\t"
2328 PREFETCH" 64(%%"REG_c") \n\t"
/* first plane: src1 -> dst1 */
2330 CALL_MMX2_FILTER_CODE
2331 CALL_MMX2_FILTER_CODE
2332 CALL_MMX2_FILTER_CODE
2333 CALL_MMX2_FILTER_CODE
/* second plane: re-seed pointers and repeat for src2 -> dst2 */
2334 "xor %%"REG_a", %%"REG_a" \n\t" // i
2335 "mov %5, %%"REG_c" \n\t" // src
2336 "mov %6, %%"REG_D" \n\t" // buf2
2337 PREFETCH" (%%"REG_c") \n\t"
2338 PREFETCH" 32(%%"REG_c") \n\t"
2339 PREFETCH" 64(%%"REG_c") \n\t"
2341 CALL_MMX2_FILTER_CODE
2342 CALL_MMX2_FILTER_CODE
2343 CALL_MMX2_FILTER_CODE
2344 CALL_MMX2_FILTER_CODE
2347 "mov %7, %%"REG_b" \n\t" /* restore rbx/ebx */
2349 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
2350 "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
2354 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Tail: replicate the last source pixel (scaled by 128) for output
 * positions that would read past the end of the source row. */
2360 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2361 dst1[i] = src1[srcW-1]*128;
2362 dst2[i] = src2[srcW-1]*128;
2365 #endif /* COMPILE_TEMPLATE_MMX2 */
2367 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
2369 enum PixelFormat srcFormat = c->srcFormat,
2370 dstFormat = c->dstFormat;
2372 if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
2373 && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
2374 if (c->flags & SWS_ACCURATE_RND) {
2375 c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
2376 c->yuv2yuvX = RENAME(yuv2yuvX_ar );
2377 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
2378 switch (c->dstFormat) {
2379 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
2380 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
2381 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
2382 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
2383 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
2388 int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2389 c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 );
2390 c->yuv2yuvX = RENAME(yuv2yuvX );
2391 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
2392 switch (c->dstFormat) {
2393 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
2394 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
2395 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
2396 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
2397 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
2402 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
2403 switch (c->dstFormat) {
2405 c->yuv2packed1 = RENAME(yuv2rgb32_1);
2406 c->yuv2packed2 = RENAME(yuv2rgb32_2);
2409 c->yuv2packed1 = RENAME(yuv2bgr24_1);
2410 c->yuv2packed2 = RENAME(yuv2bgr24_2);
2412 case PIX_FMT_RGB555:
2413 c->yuv2packed1 = RENAME(yuv2rgb555_1);
2414 c->yuv2packed2 = RENAME(yuv2rgb555_2);
2416 case PIX_FMT_RGB565:
2417 c->yuv2packed1 = RENAME(yuv2rgb565_1);
2418 c->yuv2packed2 = RENAME(yuv2rgb565_2);
2420 case PIX_FMT_YUYV422:
2421 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
2422 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
2430 #if !COMPILE_TEMPLATE_MMX2
2431 c->hScale = RENAME(hScale );
2432 #endif /* !COMPILE_TEMPLATE_MMX2 */
2434 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2435 #if COMPILE_TEMPLATE_MMX2
2436 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2438 c->hyscale_fast = RENAME(hyscale_fast);
2439 c->hcscale_fast = RENAME(hcscale_fast);
2441 #endif /* COMPILE_TEMPLATE_MMX2 */
2442 c->hyscale_fast = NULL;
2443 c->hcscale_fast = NULL;
2444 #if COMPILE_TEMPLATE_MMX2
2446 #endif /* COMPILE_TEMPLATE_MMX2 */
2448 #if !COMPILE_TEMPLATE_MMX2
2450 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2451 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2452 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2453 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2454 case PIX_FMT_GRAY16LE :
2455 case PIX_FMT_YUV420P9LE:
2456 case PIX_FMT_YUV422P10LE:
2457 case PIX_FMT_YUV420P10LE:
2458 case PIX_FMT_YUV420P16LE:
2459 case PIX_FMT_YUV422P16LE:
2460 case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
2462 #endif /* !COMPILE_TEMPLATE_MMX2 */
2463 if (!c->chrSrcHSubSample) {
2465 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2466 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2471 switch (srcFormat) {
2472 #if !COMPILE_TEMPLATE_MMX2
2473 case PIX_FMT_YUYV422 :
2474 case PIX_FMT_Y400A :
2475 c->lumToYV12 = RENAME(yuy2ToY); break;
2476 case PIX_FMT_UYVY422 :
2477 c->lumToYV12 = RENAME(uyvyToY); break;
2478 #endif /* !COMPILE_TEMPLATE_MMX2 */
2479 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
2480 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
2483 #if !COMPILE_TEMPLATE_MMX2
2485 switch (srcFormat) {
2486 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
2490 #endif /* !COMPILE_TEMPLATE_MMX2 */
2491 if(isAnyRGB(c->srcFormat))
2492 c->hScale16= RENAME(hScale16);