libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file
  25  * postprocessing.
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
   51 # more or less self-invented filters, so the exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding,...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
   65 make the mainloop more flexible (variable number of blocks at once);
   66         the if/else stuff per block is slowing things down
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use git log
  75
  76 #include "config.h"
  77 #include "libavutil/avutil.h"
  78 #include "libavutil/avassert.h"
  79 #include "libavutil/intreadwrite.h"
  80 #include <inttypes.h>
  81 #include <stdio.h>
  82 #include <stdlib.h>
  83 #include <string.h>
  84 //#undef HAVE_MMXEXT_INLINE
  85 //#define HAVE_AMD3DNOW_INLINE
  86 //#undef HAVE_MMX_INLINE
  87 //#undef ARCH_X86
  88 //#define DEBUG_BRIGHTNESS
  89 #include "postprocess.h"
  90 #include "postprocess_internal.h"
  91 #include "libavutil/avstring.h"
  92 #include "libavutil/ppc/util_altivec.h"
  93
  94 #include "libavutil/ffversion.h"
  95 const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
  96
  97 unsigned postproc_version(void)
  98 {
  99     av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
 100     return LIBPOSTPROC_VERSION_INT;
 101 }
 102
 103 const char *postproc_configuration(void)
 104 {
 105     return FFMPEG_CONFIGURATION;
 106 }
 107
 108 const char *postproc_license(void)
 109 {
 110 #define LICENSE_PREFIX "libpostproc license: "
 111     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 112 }
 113
 114 #define GET_MODE_BUFFER_SIZE 500
 115 #define OPTIONS_ARRAY_SIZE 10
 116 #define BLOCK_SIZE 8
 117 #define TEMP_STRIDE 8
 118 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 119
 120 #if ARCH_X86 && HAVE_INLINE_ASM
 121 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
 122 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
 123 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
 124 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
 125 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
 126 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
 127 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
 128 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
 129 #endif
 130
 131 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 132
 133
 134 static const struct PPFilter filters[]=
 135 {
 136     {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
 137     {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
 138 /*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
 139     {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
 140     {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
 141     {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
 142     {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
 143     {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
 144     {"dr", "dering",                1, 5, 6, DERING},
 145     {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
 146     {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
 147     {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
 148     {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
 149     {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
 150     {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
 151     {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
 152     {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
 153     {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
 154     {"be", "bitexact",              1, 0, 0, BITEXACT},
 155     {"vi", "visualize",             1, 0, 0, VISUALIZE},
 156     {NULL, NULL,0,0,0,0} //End Marker
 157 };
 158
 159 static const char * const replaceTable[]=
 160 {
 161     "default",      "hb:a,vb:a,dr:a",
 162     "de",           "hb:a,vb:a,dr:a",
 163     "fast",         "h1:a,v1:a,dr:a",
 164     "fa",           "h1:a,v1:a,dr:a",
 165     "ac",           "ha:a:128:7,va:a,dr:a",
 166     NULL //End Marker
 167 };
 168
 169 /* The horizontal functions exist only in C because the MMX
 170  * code is faster with vertical filters and transposing. */
 171
 172 /**
 173  * Check if the given 8x8 Block is mostly "flat"
 174  */
 175 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 176 {
 177     int numEq= 0;
 178     int y;
 179     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 180     const int dcThreshold= dcOffset*2 + 1;
 181
 182     for(y=0; y<BLOCK_SIZE; y++){
 183         numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
 184         numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
 185         numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
 186         numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
 187         numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
 188         numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
 189         numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
 190         src+= stride;
 191     }
 192     return numEq > c->ppMode.flatnessThreshold;
 193 }
 194
 195 /**
 196  * Check if the middle 8x8 Block in the given 8x16 block is flat
 197  */
 198 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 199 {
 200     int numEq= 0;
 201     int y;
 202     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 203     const int dcThreshold= dcOffset*2 + 1;
 204
 205     src+= stride*4; // src points to begin of the 8x8 Block
 206     for(y=0; y<BLOCK_SIZE-1; y++){
 207         numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
 208         numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
 209         numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
 210         numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
 211         numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
 212         numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
 213         numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
 214         numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
 215         src+= stride;
 216     }
 217     return numEq > c->ppMode.flatnessThreshold;
 218 }
 219
 220 static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 221 {
 222     int i;
 223     for(i=0; i<2; i++){
 224         if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 225         src += stride;
 226         if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 227         src += stride;
 228         if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 229         src += stride;
 230         if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 231         src += stride;
 232     }
 233     return 1;
 234 }
 235
 236 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 237 {
 238     int x;
 239     src+= stride*4;
 240     for(x=0; x<BLOCK_SIZE; x+=4){
 241         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 242         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 243         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 244         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 245     }
 246     return 1;
 247 }
 248
 249 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 250 {
 251     if( isHorizDC_C(src, stride, c) ){
 252         return isHorizMinMaxOk_C(src, stride, c->QP);
 253     }else{
 254         return 2;
 255     }
 256 }
 257
 258 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 259 {
 260     if( isVertDC_C(src, stride, c) ){
 261         return isVertMinMaxOk_C(src, stride, c->QP);
 262     }else{
 263         return 2;
 264     }
 265 }
 266
 267 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 268 {
 269     int y;
 270     for(y=0; y<BLOCK_SIZE; y++){
 271         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 272
 273         if(FFABS(middleEnergy) < 8*c->QP){
 274             const int q=(dst[3] - dst[4])/2;
 275             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 276             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 277
 278             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 279             d= FFMAX(d, 0);
 280
 281             d= (5*d + 32) >> 6;
 282             d*= FFSIGN(-middleEnergy);
 283
 284             if(q>0)
 285             {
 286                 d = FFMAX(d, 0);
 287                 d = FFMIN(d, q);
 288             }
 289             else
 290             {
 291                 d = FFMIN(d, 0);
 292                 d = FFMAX(d, q);
 293             }
 294
 295             dst[3]-= d;
 296             dst[4]+= d;
 297         }
 298         dst+= stride;
 299     }
 300 }
 301
 302 /**
 303  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 304  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 305  */
 306 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 307 {
 308     int y;
 309     for(y=0; y<BLOCK_SIZE; y++){
 310         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 311         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 312
 313         int sums[10];
 314         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 315         sums[1] = sums[0] - first  + dst[3];
 316         sums[2] = sums[1] - first  + dst[4];
 317         sums[3] = sums[2] - first  + dst[5];
 318         sums[4] = sums[3] - first  + dst[6];
 319         sums[5] = sums[4] - dst[0] + dst[7];
 320         sums[6] = sums[5] - dst[1] + last;
 321         sums[7] = sums[6] - dst[2] + last;
 322         sums[8] = sums[7] - dst[3] + last;
 323         sums[9] = sums[8] - dst[4] + last;
 324
 325         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 326         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 327         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 328         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 329         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 330         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 331         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 332         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 333
 334         dst+= stride;
 335     }
 336 }
 337
 338 /**
 339  * Experimental Filter 1 (Horizontal)
 340  * will not damage linear gradients
 341  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 342  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 343  * MMX2 version does correct clipping C version does not
 344  * not identical with the vertical one
 345  */
 346 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 347 {
 348     int y;
 349     static uint64_t lut[256];
 350     if(!lut[255])
 351     {
 352         int i;
 353         for(i=0; i<256; i++)
 354         {
 355             int v= i < 128 ? 2*i : 2*(i-256);
 356 /*
 357 //Simulate 112242211 9-Tap filter
 358             uint64_t a= (v/16)  & 0xFF;
 359             uint64_t b= (v/8)   & 0xFF;
 360             uint64_t c= (v/4)   & 0xFF;
 361             uint64_t d= (3*v/8) & 0xFF;
 362 */
 363 //Simulate piecewise linear interpolation
 364             uint64_t a= (v/16)   & 0xFF;
 365             uint64_t b= (v*3/16) & 0xFF;
 366             uint64_t c= (v*5/16) & 0xFF;
 367             uint64_t d= (7*v/16) & 0xFF;
 368             uint64_t A= (0x100 - a)&0xFF;
 369             uint64_t B= (0x100 - b)&0xFF;
 370             uint64_t C= (0x100 - c)&0xFF;
 371             uint64_t D= (0x100 - c)&0xFF;
 372
 373             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 374                        (D<<24) | (C<<16) | (B<<8)  | (A);
 375             //lut[i] = (v<<32) | (v<<24);
 376         }
 377     }
 378
 379     for(y=0; y<BLOCK_SIZE; y++){
 380         int a= src[1] - src[2];
 381         int b= src[3] - src[4];
 382         int c= src[5] - src[6];
 383
 384         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 385
 386         if(d < QP){
 387             int v = d * FFSIGN(-b);
 388
 389             src[1] +=v/8;
 390             src[2] +=v/4;
 391             src[3] +=3*v/8;
 392             src[4] -=3*v/8;
 393             src[5] -=v/4;
 394             src[6] -=v/8;
 395         }
 396         src+=stride;
 397     }
 398 }
 399
 400 /**
 401  * accurate deblock filter
 402  */
 403 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 404                                             int stride, const PPContext *c, int mode)
 405 {
 406     int y;
 407     const int QP= c->QP;
 408     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 409     const int dcThreshold= dcOffset*2 + 1;
 410 //START_TIMER
 411     src+= step*4; // src points to begin of the 8x8 Block
 412     for(y=0; y<8; y++){
 413         int numEq= 0;
 414
 415         numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
 416         numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
 417         numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
 418         numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
 419         numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
 420         numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
 421         numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
 422         numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
 423         numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
 424         if(numEq > c->ppMode.flatnessThreshold){
 425             int min, max, x;
 426
 427             if(src[0] > src[step]){
 428                 max= src[0];
 429                 min= src[step];
 430             }else{
 431                 max= src[step];
 432                 min= src[0];
 433             }
 434             for(x=2; x<8; x+=2){
 435                 if(src[x*step] > src[(x+1)*step]){
 436                         if(src[x    *step] > max) max= src[ x   *step];
 437                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 438                 }else{
 439                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 440                         if(src[ x   *step] < min) min= src[ x   *step];
 441                 }
 442             }
 443             if(max-min < 2*QP){
 444                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 445                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 446
 447                 int sums[10];
 448                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 449                 sums[1] = sums[0] - first       + src[3*step];
 450                 sums[2] = sums[1] - first       + src[4*step];
 451                 sums[3] = sums[2] - first       + src[5*step];
 452                 sums[4] = sums[3] - first       + src[6*step];
 453                 sums[5] = sums[4] - src[0*step] + src[7*step];
 454                 sums[6] = sums[5] - src[1*step] + last;
 455                 sums[7] = sums[6] - src[2*step] + last;
 456                 sums[8] = sums[7] - src[3*step] + last;
 457                 sums[9] = sums[8] - src[4*step] + last;
 458
 459                 if (mode & VISUALIZE) {
 460                     src[0*step] =
 461                     src[1*step] =
 462                     src[2*step] =
 463                     src[3*step] =
 464                     src[4*step] =
 465                     src[5*step] =
 466                     src[6*step] =
 467                     src[7*step] = 128;
 468                 }
 469                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 470                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 471                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 472                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 473                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 474                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 475                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 476                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 477             }
 478         }else{
 479             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 480
 481             if(FFABS(middleEnergy) < 8*QP){
 482                 const int q=(src[3*step] - src[4*step])/2;
 483                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 484                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 485
 486                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 487                 d= FFMAX(d, 0);
 488
 489                 d= (5*d + 32) >> 6;
 490                 d*= FFSIGN(-middleEnergy);
 491
 492                 if(q>0){
 493                     d = FFMAX(d, 0);
 494                     d = FFMIN(d, q);
 495                 }else{
 496                     d = FFMIN(d, 0);
 497                     d = FFMAX(d, q);
 498                 }
 499
 500                 if ((mode & VISUALIZE) && d) {
 501                     d= (d < 0) ? 32 : -32;
 502                     src[3*step]= av_clip_uint8(src[3*step] - d);
 503                     src[4*step]= av_clip_uint8(src[4*step] + d);
 504                     d = 0;
 505                 }
 506
 507                 src[3*step]-= d;
 508                 src[4*step]+= d;
 509             }
 510         }
 511
 512         src += stride;
 513     }
 514 /*if(step==16){
 515     STOP_TIMER("step16")
 516 }else{
 517     STOP_TIMER("stepX")
 518 }*/
 519 }
 520
  521 //Note: we have C, MMX, MMX2 and 3DNOW versions (plus the SSE2 and AltiVec templates included below); there is no combined 3DNOW+MMX2 one
  522 //Plain C versions
  523 //we always compile the C version for testing, which needs bitexactness
 524 #define TEMPLATE_PP_C 1
 525 #include "postprocess_template.c"
 526
 527 #if HAVE_ALTIVEC
 528 #   define TEMPLATE_PP_ALTIVEC 1
 529 #   include "postprocess_altivec_template.c"
 530 #   include "postprocess_template.c"
 531 #endif
 532
 533 #if ARCH_X86 && HAVE_INLINE_ASM
 534 #    if CONFIG_RUNTIME_CPUDETECT
 535 #        define TEMPLATE_PP_MMX 1
 536 #        include "postprocess_template.c"
 537 #        define TEMPLATE_PP_MMXEXT 1
 538 #        include "postprocess_template.c"
 539 #        define TEMPLATE_PP_3DNOW 1
 540 #        include "postprocess_template.c"
 541 #        define TEMPLATE_PP_SSE2 1
 542 #        include "postprocess_template.c"
 543 #    else
 544 #        if HAVE_SSE2_INLINE
 545 #            define TEMPLATE_PP_SSE2 1
 546 #            include "postprocess_template.c"
 547 #        elif HAVE_MMXEXT_INLINE
 548 #            define TEMPLATE_PP_MMXEXT 1
 549 #            include "postprocess_template.c"
 550 #        elif HAVE_AMD3DNOW_INLINE
 551 #            define TEMPLATE_PP_3DNOW 1
 552 #            include "postprocess_template.c"
 553 #        elif HAVE_MMX_INLINE
 554 #            define TEMPLATE_PP_MMX 1
 555 #            include "postprocess_template.c"
 556 #        endif
 557 #    endif
 558 #endif
 559
 560 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 561                       const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
 562
 563 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 564         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
 565 {
 566     pp_fn pp = postProcess_C;
 567     PPContext *c= (PPContext *)vc;
 568     PPMode *ppMode= (PPMode *)vm;
 569     c->ppMode= *ppMode; //FIXME
 570
 571     if (!(ppMode->lumMode & BITEXACT)) {
 572 #if CONFIG_RUNTIME_CPUDETECT
 573 #if ARCH_X86 && HAVE_INLINE_ASM
 574         // ordered per speed fastest first
 575         if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
 576         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
 577         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
 578         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
 579 #elif HAVE_ALTIVEC
 580         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 581 #endif
 582 #else /* CONFIG_RUNTIME_CPUDETECT */
 583 #if     HAVE_SSE2_INLINE
 584         pp = postProcess_SSE2;
 585 #elif   HAVE_MMXEXT_INLINE
 586         pp = postProcess_MMX2;
 587 #elif HAVE_AMD3DNOW_INLINE
 588         pp = postProcess_3DNow;
 589 #elif HAVE_MMX_INLINE
 590         pp = postProcess_MMX;
 591 #elif HAVE_ALTIVEC
 592         pp = postProcess_altivec;
 593 #endif
 594 #endif /* !CONFIG_RUNTIME_CPUDETECT */
 595     }
 596
 597     pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 598 }
 599
 600 /* -pp Command line Help
 601 */
 602 const char pp_help[] =
 603 "Available postprocessing filters:\n"
 604 "Filters                        Options\n"
 605 "short  long name       short   long option     Description\n"
 606 "*      *               a       autoq           CPU power dependent enabler\n"
 607 "                       c       chrom           chrominance filtering enabled\n"
 608 "                       y       nochrom         chrominance filtering disabled\n"
 609 "                       n       noluma          luma filtering disabled\n"
 610 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 611 "       1. difference factor: default=32, higher -> more deblocking\n"
 612 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 613 "                       the h & v deblocking filters share these\n"
 614 "                       so you can't set different thresholds for h / v\n"
 615 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 616 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 617 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 618 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 619 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 620 "dr     dering                                  deringing filter\n"
 621 "al     autolevels                              automatic brightness / contrast\n"
 622 "                       f        fullyrange     stretch luminance to (0..255)\n"
 623 "lb     linblenddeint                           linear blend deinterlacer\n"
 624 "li     linipoldeint                            linear interpolating deinterlace\n"
 625 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 626 "md     mediandeint                             median deinterlacer\n"
 627 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 628 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 629 "de     default                                 hb:a,vb:a,dr:a\n"
 630 "fa     fast                                    h1:a,v1:a,dr:a\n"
 631 "ac                                             ha:a:128:7,va:a,dr:a\n"
 632 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 633 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 634 "fq     forceQuant      <quantizer>             force quantizer\n"
 635 "Usage:\n"
 636 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 637 "long form example:\n"
 638 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 639 "short form example:\n"
 640 "vb:a/hb:a/lb                                   de,-vb\n"
 641 "more examples:\n"
 642 "tn:64:128:256\n"
 643 "\n"
 644 ;
 645
 646 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 647 {
 648     char temp[GET_MODE_BUFFER_SIZE];
 649     char *p= temp;
 650     static const char filterDelimiters[] = ",/";
 651     static const char optionDelimiters[] = ":|";
 652     struct PPMode *ppMode;
 653     char *filterToken;
 654
 655     if (!name)  {
 656         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 657         return NULL;
 658     }
 659
 660     if (!strcmp(name, "help")) {
 661         const char *p;
 662         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
 663             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
 664             av_log(NULL, AV_LOG_INFO, "%s", temp);
 665         }
 666         return NULL;
 667     }
 668
 669     ppMode= av_malloc(sizeof(PPMode));
 670     if (!ppMode)
 671         return NULL;
 672
 673     ppMode->lumMode= 0;
 674     ppMode->chromMode= 0;
 675     ppMode->maxTmpNoise[0]= 700;
 676     ppMode->maxTmpNoise[1]= 1500;
 677     ppMode->maxTmpNoise[2]= 3000;
 678     ppMode->maxAllowedY= 234;
 679     ppMode->minAllowedY= 16;
 680     ppMode->baseDcDiff= 256/8;
 681     ppMode->flatnessThreshold= 56-16-1;
 682     ppMode->maxClippedThreshold= (AVRational){1,100};
 683     ppMode->error=0;
 684
 685     memset(temp, 0, GET_MODE_BUFFER_SIZE);
 686     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 687
 688     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 689
 690     for(;;){
 691         const char *filterName;
 692         int q= 1000000; //PP_QUALITY_MAX;
 693         int chrom=-1;
 694         int luma=-1;
 695         const char *option;
 696         const char *options[OPTIONS_ARRAY_SIZE];
 697         int i;
 698         int filterNameOk=0;
 699         int numOfUnknownOptions=0;
 700         int enable=1; //does the user want us to enabled or disabled the filter
 701         char *tokstate;
 702
 703         filterToken= av_strtok(p, filterDelimiters, &tokstate);
 704         if(!filterToken) break;
 705         p+= strlen(filterToken) + 1; // p points to next filterToken
 706         filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
 707         if (!filterName) {
 708             ppMode->error++;
 709             break;
 710         }
 711         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 712
 713         if(*filterName == '-'){
 714             enable=0;
 715             filterName++;
 716         }
 717
 718         for(;;){ //for all options
 719             option= av_strtok(NULL, optionDelimiters, &tokstate);
 720             if(!option) break;
 721
 722             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 723             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 724             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 725             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 726             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 727             else{
 728                 options[numOfUnknownOptions] = option;
 729                 numOfUnknownOptions++;
 730             }
 731             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 732         }
 733         options[numOfUnknownOptions] = NULL;
 734
 735         /* replace stuff from the replace Table */
 736         for(i=0; replaceTable[2*i]; i++){
 737             if(!strcmp(replaceTable[2*i], filterName)){
 738                 size_t newlen = strlen(replaceTable[2*i + 1]);
 739                 int plen;
 740                 int spaceLeft;
 741
 742                 p--, *p=',';
 743
 744                 plen= strlen(p);
 745                 spaceLeft= p - temp + plen;
 746                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 747                     ppMode->error++;
 748                     break;
 749                 }
 750                 memmove(p + newlen, p, plen+1);
 751                 memcpy(p, replaceTable[2*i + 1], newlen);
 752                 filterNameOk=1;
 753             }
 754         }
 755
 756         for(i=0; filters[i].shortName; i++){
 757             if(   !strcmp(filters[i].longName, filterName)
 758                || !strcmp(filters[i].shortName, filterName)){
 759                 ppMode->lumMode &= ~filters[i].mask;
 760                 ppMode->chromMode &= ~filters[i].mask;
 761
 762                 filterNameOk=1;
 763                 if(!enable) break; // user wants to disable it
 764
 765                 if(q >= filters[i].minLumQuality && luma)
 766                     ppMode->lumMode|= filters[i].mask;
 767                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 768                     if(q >= filters[i].minChromQuality)
 769                             ppMode->chromMode|= filters[i].mask;
 770
 771                 if(filters[i].mask == LEVEL_FIX){
 772                     int o;
 773                     ppMode->minAllowedY= 16;
 774                     ppMode->maxAllowedY= 234;
 775                     for(o=0; options[o]; o++){
 776                         if(  !strcmp(options[o],"fullyrange")
 777                            ||!strcmp(options[o],"f")){
 778                             ppMode->minAllowedY= 0;
 779                             ppMode->maxAllowedY= 255;
 780                             numOfUnknownOptions--;
 781                         }
 782                     }
 783                 }
 784                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 785                 {
 786                     int o;
 787                     int numOfNoises=0;
 788
 789                     for(o=0; options[o]; o++){
 790                         char *tail;
 791                         ppMode->maxTmpNoise[numOfNoises]=
 792                             strtol(options[o], &tail, 0);
 793                         if(tail!=options[o]){
 794                             numOfNoises++;
 795                             numOfUnknownOptions--;
 796                             if(numOfNoises >= 3) break;
 797                         }
 798                     }
 799                 }
 800                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 801                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 802                     int o;
 803
 804                     for(o=0; options[o] && o<2; o++){
 805                         char *tail;
 806                         int val= strtol(options[o], &tail, 0);
 807                         if(tail==options[o]) break;
 808
 809                         numOfUnknownOptions--;
 810                         if(o==0) ppMode->baseDcDiff= val;
 811                         else ppMode->flatnessThreshold= val;
 812                     }
 813                 }
 814                 else if(filters[i].mask == FORCE_QUANT){
 815                     int o;
 816                     ppMode->forcedQuant= 15;
 817
 818                     for(o=0; options[o] && o<1; o++){
 819                         char *tail;
 820                         int val= strtol(options[o], &tail, 0);
 821                         if(tail==options[o]) break;
 822
 823                         numOfUnknownOptions--;
 824                         ppMode->forcedQuant= val;
 825                     }
 826                 }
 827             }
 828         }
 829         if(!filterNameOk) ppMode->error++;
 830         ppMode->error += numOfUnknownOptions;
 831     }
 832
 833     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 834     if(ppMode->error){
 835         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 836         av_free(ppMode);
 837         return NULL;
 838     }
 839     return ppMode;
 840 }
 841
 842 void pp_free_mode(pp_mode *mode){
 843     av_free(mode);
 844 }
 845
 846 static void reallocAlign(void **p, int size){
 847     av_free(*p);
 848     *p= av_mallocz(size);
 849 }
 850
 851 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 852     int mbWidth = (width+15)>>4;
 853     int mbHeight= (height+15)>>4;
 854     int i;
 855
 856     c->stride= stride;
 857     c->qpStride= qpStride;
 858
 859     reallocAlign((void **)&c->tempDst, stride*24+32);
 860     reallocAlign((void **)&c->tempSrc, stride*24);
 861     reallocAlign((void **)&c->tempBlocks, 2*16*8);
 862     reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
 863     for(i=0; i<256; i++)
 864             c->yHistogram[i]= width*height/64*15/256;
 865
 866     for(i=0; i<3; i++){
 867         //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 868         reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
 869         reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 870     }
 871
 872     reallocAlign((void **)&c->deintTemp, 2*width+32);
 873     reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 874     reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 875     reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
 876 }
 877
 878 static const char * context_to_name(void * ptr) {
 879     return "postproc";
 880 }
 881
 882 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 883
 884 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
 885     PPContext *c= av_mallocz(sizeof(PPContext));
 886     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
 887     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 888
 889     if (!c)
 890         return NULL;
 891
 892     c->av_class = &av_codec_context_class;
 893     if(cpuCaps&PP_FORMAT){
 894         c->hChromaSubSample= cpuCaps&0x3;
 895         c->vChromaSubSample= (cpuCaps>>4)&0x3;
 896     }else{
 897         c->hChromaSubSample= 1;
 898         c->vChromaSubSample= 1;
 899     }
 900     if (cpuCaps & PP_CPU_CAPS_AUTO) {
 901         c->cpuCaps = av_get_cpu_flags();
 902     } else {
 903         c->cpuCaps = 0;
 904         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
 905         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
 906         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
 907         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
 908     }
 909
 910     reallocBuffers(c, width, height, stride, qpStride);
 911
 912     c->frameNum=-1;
 913
 914     return c;
 915 }
 916
 917 av_cold void pp_free_context(void *vc){
 918     PPContext *c = (PPContext*)vc;
 919     int i;
 920
 921     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
 922         av_free(c->tempBlurred[i]);
 923     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
 924         av_free(c->tempBlurredPast[i]);
 925
 926     av_free(c->tempBlocks);
 927     av_free(c->yHistogram);
 928     av_free(c->tempDst);
 929     av_free(c->tempSrc);
 930     av_free(c->deintTemp);
 931     av_free(c->stdQPTable);
 932     av_free(c->nonBQPTable);
 933     av_free(c->forcedQPTable);
 934
 935     memset(c, 0, sizeof(PPContext));
 936
 937     av_free(c);
 938 }
 939
 940 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 941                      uint8_t * dst[3], const int dstStride[3],
 942                      int width, int height,
 943                      const QP_STORE_T *QP_store,  int QPStride,
 944                      pp_mode *vm,  void *vc, int pict_type)
 945 {
 946     int mbWidth = (width+15)>>4;
 947     int mbHeight= (height+15)>>4;
 948     PPMode *mode = vm;
 949     PPContext *c = vc;
 950     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 951     int absQPStride = FFABS(QPStride);
 952
 953     // c->stride and c->QPStride are always positive
 954     if(c->stride < minStride || c->qpStride < absQPStride)
 955         reallocBuffers(c, width, height,
 956                        FFMAX(minStride, c->stride),
 957                        FFMAX(c->qpStride, absQPStride));
 958
 959     if(!QP_store || (mode->lumMode & FORCE_QUANT)){
 960         int i;
 961         QP_store= c->forcedQPTable;
 962         absQPStride = QPStride = 0;
 963         if(mode->lumMode & FORCE_QUANT)
 964             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 965         else
 966             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 967     }
 968
 969     if(pict_type & PP_PICT_TYPE_QP2){
 970         int i;
 971         const int count= FFMAX(mbHeight * absQPStride, mbWidth);
 972         for(i=0; i<(count>>2); i++){
 973             AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
 974         }
 975         for(i<<=2; i<count; i++){
 976             c->stdQPTable[i] = QP_store[i]>>1;
 977         }
 978         QP_store= c->stdQPTable;
 979         QPStride= absQPStride;
 980     }
 981
 982     if(0){
 983         int x,y;
 984         for(y=0; y<mbHeight; y++){
 985             for(x=0; x<mbWidth; x++){
 986                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 987             }
 988             av_log(c, AV_LOG_INFO, "\n");
 989         }
 990         av_log(c, AV_LOG_INFO, "\n");
 991     }
 992
 993     if((pict_type&7)!=3){
 994         if (QPStride >= 0){
 995             int i;
 996             const int count= FFMAX(mbHeight * QPStride, mbWidth);
 997             for(i=0; i<(count>>2); i++){
 998                 AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
 999             }
1000             for(i<<=2; i<count; i++){
1001                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1002             }
1003         } else {
1004             int i,j;
1005             for(i=0; i<mbHeight; i++) {
1006                 for(j=0; j<absQPStride; j++) {
1007                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1008                 }
1009             }
1010         }
1011     }
1012
1013     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1014            mode->lumMode, mode->chromMode);
1015
1016     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1017                 width, height, QP_store, QPStride, 0, mode, c);
1018
1019     if (!(src[1] && src[2] && dst[1] && dst[2]))
1020         return;
1021
1022     width  = (width )>>c->hChromaSubSample;
1023     height = (height)>>c->vChromaSubSample;
1024
1025     if(mode->chromMode){
1026         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1027                     width, height, QP_store, QPStride, 1, mode, c);
1028         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1029                     width, height, QP_store, QPStride, 2, mode, c);
1030     }
1031     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1032         linecpy(dst[1], src[1], height, srcStride[1]);
1033         linecpy(dst[2], src[2], height, srcStride[2]);
1034     }else{
1035         int y;
1036         for(y=0; y<height; y++){
1037             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1038             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1039         }
1040     }
1041 }