OSDN Git Service

add MMX LoopFilter.
author Noumi Akira <noumiakira@users.sourceforge.jp>
Mon, 13 Jul 2009 07:59:47 +0000 (16:59 +0900)
committer Noumi Akira <noumiakira@users.sourceforge.jp>
Mon, 13 Jul 2009 07:59:47 +0000 (16:59 +0900)
Lib/QTheoraEx/FrameReconstructor_MMX.c

index a9d1950..183adb1 100644 (file)
@@ -1067,6 +1067,258 @@ static void Reconstruct_CPlane_MMX(
 
 /* */
 
+/* Loop-filter limit constants, one copy per 16-bit lane of an MMX register.
+ * Every field is written by MFilter_Setup(). */
+typedef struct MFilter {
+
+       __m64 L;   /* lim */
+       __m64 L2;  /* L shifted left by one bit */
+
+       __m64 NL;  /* 0 - lim */
+       __m64 NL2; /* NL shifted left by one bit */
+
+} MFilter_t;
+
+/* Fills *t with lim, 0 - lim and their one-bit left shifts, per 16-bit lane. */
+static void MFilter_Setup(MFilter_t* t, INT32 lim)
+{
+       const __m64 limit    = _mm_set1_pi16(lim);
+       const __m64 negLimit = _mm_sub_pi16(_mm_setzero_si64(), limit);
+
+       /* The doubled variants come from shifting each lane left by one bit. */
+       t->L   = limit;
+       t->NL  = negLimit;
+       t->L2  = _mm_slli_pi16(limit,    1);
+       t->NL2 = _mm_slli_pi16(negLimit, 1);
+}
+
+ALIGN(0x10) static const UINT16 UR_4[4] = { 4, 4, 4, 4 }; /* per-lane bias added before the >> 3 in the MMX filters */
+
+/* */
+
+#if 0 /* not compiled: scalar counterpart of Filter_LoopFilterH_MMX, presumably kept for reference */
+static void Filter_LoopFilterH(
+       const LoopFilter_t* t, /* provides the Delta lookup table */
+       UINT8*              b, /* first row; bytes b[-2] .. b[1] of each row are read */
+       INT32               s) /* byte step from one row to the next */
+{
+       const INT16* d = t->Delta + 127; /* biased base: index -127 maps to Delta[0] */
+       /* p0/p1: slot [0] is the adjusted pixel, slot [1] is 0, picked when [0] < 0 */
+       INT32 p0[2];
+       INT32 p1[2];
+       /* q0/q1: slot [0] is the floored pixel, slot [1] is 255, picked when [0] > 255 */
+       INT32 q0[2];
+       INT32 q1[2];
+       /* rows are walked from b up to (not including) b + 8 * s */
+       UINT8* p   = b;
+       UINT8* end = p + s * 8;
+
+       p0[1] = 0;
+       p1[1] = 0;
+       q0[1] = 255;
+       q1[1] = 255;
+
+       for (; p < end; p += s) {
+               INT32 x = (p[-2] - p[1]) + 3 * (p[0] - p[-1]);
+               INT32 v = d[(x + 4) >> 3]; /* d[] maps (x + 4) >> 3 to the correction v */
+               /* add v to p[-1] and subtract it from p[0] */
+               p0[0] = p[-1] + v;
+               p1[0] = p[ 0] - v;
+               /* clamp below: a true comparison (1) selects the 0 slot */
+               q0[0] = p0[(p0[0] < 0)];
+               q1[0] = p1[(p1[0] < 0)];
+               /* clamp above while storing: a true comparison (1) selects the 255 slot */
+               p[-1] = q0[(q0[0] > 255)];
+               p[ 0] = q1[(q1[0] > 255)];
+       }
+}
+#endif
+
+static void Filter_LoopFilterH_MMX( /* filters the edge between columns b[-1] and b[0] */
+       const MFilter_t* t, /* limit vectors L, L2, NL, NL2 */
+       UINT8*           b, /* first row; bytes b[-2] .. b[1] of each row are read */
+       INT32            s) /* byte step from one row to the next */
+{
+       UINT8* p = (UINT8*)b;
+       UINT8* e = p + s * 8; /* end after 8 rows; the loop advances 4 rows per pass */
+       /* NOTE(review): the UINT32 / UINT16 casts below touch addresses that are not naturally aligned in general; confirm the target tolerates this */
+       const __m64 z = _mm_setzero_si64(); /* zero high bytes for the 8 -> 16 bit widening */
+       /* each pass loads 4 rows x 4 pixels (p[-2] .. p[1]) and widens them to 16-bit lanes */
+       for (; p < e; p += 4 * s) {
+               __m64 S0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 0 * s))), z);
+               __m64 S1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 1 * s))), z);
+               __m64 S2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 2 * s))), z);
+               __m64 S3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 3 * s))), z);
+               /* 4x4 transpose, step 1: interleave rows 0/1 and rows 2/3 lane by lane */
+               __m64 u0 = _mm_unpacklo_pi16(S0, S1);
+               __m64 u1 = _mm_unpackhi_pi16(S0, S1);
+               __m64 u2 = _mm_unpacklo_pi16(S2, S3);
+               __m64 u3 = _mm_unpackhi_pi16(S2, S3);
+               /* step 2: P0 = column p[-1], P1 = column p[0], one lane per row */
+               __m64 P0 = _mm_unpackhi_pi32(u0, u2);
+               __m64 P1 = _mm_unpacklo_pi32(u1, u3);
+               /* X = p[-2] - p[1], Y = p[0] - p[-1], R = (X + 3 * Y + 4) >> 3 */
+               __m64 X = _mm_sub_pi16(_mm_unpacklo_pi32(u0, u2), _mm_unpackhi_pi32(u1, u3));
+               __m64 Y = _mm_sub_pi16(P1, P0);
+               __m64 R = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(_mm_add_pi16(X, Y), _mm_slli_pi16(Y, 1)), *((__m64*)UR_4)), 3);
+               /* m1 marks lanes with R > L, m2 marks lanes with R < NL */
+               __m64 m1 = _mm_cmpgt_pi16(R,     t->L);
+               __m64 m2 = _mm_cmpgt_pi16(t->NL, R   );
+
+               __m64 r, D;
+               /* r = L2 - R where m1 is set, NL2 - R where m2 is set, otherwise R */
+               r = _mm_or_si64(_mm_andnot_si64(m1, R), _mm_and_si64(_mm_sub_pi16(t->L2,  R), m1));
+               r = _mm_or_si64(_mm_andnot_si64(m2, r), _mm_and_si64(_mm_sub_pi16(t->NL2, R), m2));
+               /* force r to 0 in lanes where R > L2 or R < NL2 */
+               r = _mm_andnot_si64(_mm_cmpgt_pi16(R,      t->L2), r);
+               r = _mm_andnot_si64(_mm_cmpgt_pi16(t->NL2, R    ), r);
+               /* add r to column p[-1] and subtract it from column p[0] */
+               P0 = _mm_add_pi16(P0, r);
+               P1 = _mm_sub_pi16(P1, r);
+               /* saturate to unsigned bytes and interleave: D holds one (P0, P1) byte pair per row */
+               D = _mm_unpacklo_pi8(_mm_packs_pu16(P0, P0), _mm_packs_pu16(P1, P1));
+
+               {
+                       UINT32 d0 = _mm_cvtsi64_si32(D); /* byte pairs of rows 0 and 1 */
+                       UINT32 d1 = _mm_cvtsi64_si32(_mm_unpackhi_pi32(D, D)); /* byte pairs of rows 2 and 3 */
+                       /* each 16-bit store rewrites p[-1] and p[0] of one row (low byte first, as on little-endian x86) */
+                       *((UINT16*)(p - 1 + 0 * s)) = (UINT16)d0;
+                       *((UINT16*)(p - 1 + 1 * s)) =         d0 >> 16;
+                       *((UINT16*)(p - 1 + 2 * s)) = (UINT16)d1;
+                       *((UINT16*)(p - 1 + 3 * s)) =         d1 >> 16;
+               }
+       }
+}
+
+/* */
+
+#if 0 /* not compiled: scalar counterpart of Filter_LoopFilterV_MMX, presumably kept for reference */
+static void Filter_LoopFilterV(
+       const LoopFilter_t* t, /* provides the Delta lookup table */
+       UINT8*              b, /* first pixel of the row at offset 0; rows -2s .. +s are read */
+       INT32               s) /* byte step from one row to the next */
+{
+       const INT16* d = t->Delta + 127; /* biased base: index -127 maps to Delta[0] */
+       /* p0/p1: slot [0] is the adjusted pixel, slot [1] is 0, picked when [0] < 0 */
+       INT32 p0[2];
+       INT32 p1[2];
+       /* q0/q1: slot [0] is the floored pixel, slot [1] is 255, picked when [0] > 255 */
+       INT32 q0[2];
+       INT32 q1[2];
+       /* the 8 columns b[0] .. b[7] are walked one at a time */
+       UINT8* p   = b;
+       UINT8* end = p + 8;
+
+       p0[1] = 0;
+       p1[1] = 0;
+       q0[1] = 255;
+       q1[1] = 255;
+
+       for (; p < end; p++) {
+               INT32 x = (p[-2 * s] - p[1 * s]) + 3 * (p[0] - p[-1 * s]);
+               INT32 v = d[(x + 4) >> 3]; /* d[] maps (x + 4) >> 3 to the correction v */
+               /* add v to p[-s] and subtract it from p[0] */
+               p0[0] = p[-s] + v;
+               p1[0] = p[ 0] - v;
+               /* clamp below: a true comparison (1) selects the 0 slot */
+               q0[0] = p0[(p0[0] < 0)];
+               q1[0] = p1[(p1[0] < 0)];
+               /* clamp above while storing: a true comparison (1) selects the 255 slot */
+               p[-s] = q0[(q0[0] > 255)];
+               p[ 0] = q1[(q1[0] > 255)];
+       }
+}
+#endif
+
+static void Filter_LoopFilterV_MMX( /* filters the edge between the rows at b - s and b */
+       const MFilter_t* t, /* limit vectors L, L2, NL, NL2 */
+       UINT8*           b, /* first of 8 pixels in the row at offset 0; rows -2s .. +s are read */
+       INT32            s) /* byte step from one row to the next */
+{
+       UINT8* p = (UINT8*)b;
+       UINT8* e = p + 8; /* 8 columns; the loop advances 4 columns per pass */
+       /* NOTE(review): the UINT32 casts below assume 4-byte accesses at p and p +/- k * s are acceptable on the target; confirm alignment */
+       const __m64 z = _mm_setzero_si64(); /* zero high bytes for the 8 -> 16 bit widening */
+       /* each pass widens 4 adjacent pixels from each of the 4 rows to 16-bit lanes */
+       for (; p < e; p += 4) {
+               __m64 P0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - s))), z); /* row at -s */
+               __m64 P1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p + 0))), z); /* row at 0 */
+               /* X = row(-2s) - row(+s), Y = P1 - P0, R = (X + 3 * Y + 4) >> 3 */
+               __m64 X = _mm_sub_pi16(
+                       _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 * s))), z),
+                       _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p +     s))), z));
+               __m64 Y = _mm_sub_pi16(P1, P0);
+               __m64 R = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(_mm_add_pi16(X, Y), _mm_slli_pi16(Y, 1)), *((__m64*)UR_4)), 3);
+               /* m1 marks lanes with R > L, m2 marks lanes with R < NL */
+               __m64 m1 = _mm_cmpgt_pi16(R,     t->L);
+               __m64 m2 = _mm_cmpgt_pi16(t->NL, R   );
+
+               __m64 r;
+               /* r = L2 - R where m1 is set, NL2 - R where m2 is set, otherwise R */
+               r = _mm_or_si64(_mm_andnot_si64(m1, R), _mm_and_si64(_mm_sub_pi16(t->L2,  R), m1));
+               r = _mm_or_si64(_mm_andnot_si64(m2, r), _mm_and_si64(_mm_sub_pi16(t->NL2, R), m2));
+               /* force r to 0 in lanes where R > L2 or R < NL2 */
+               r = _mm_andnot_si64(_mm_cmpgt_pi16(R,      t->L2), r);
+               r = _mm_andnot_si64(_mm_cmpgt_pi16(t->NL2, R    ), r);
+               /* add r to the row at -s and subtract it from the row at 0 */
+               P0 = _mm_add_pi16(P0, r);
+               P1 = _mm_sub_pi16(P1, r);
+               /* saturate to unsigned bytes and write the 4 pixels of each row back */
+               *((UINT32*)(p - s)) = _mm_cvtsi64_si32(_mm_packs_pu16(P0, P0));
+               *((UINT32*)(p + 0)) = _mm_cvtsi64_si32(_mm_packs_pu16(P1, P1));
+       }
+}
+
+/* */
+
+/* Runs the MMX loop filter over the coded 8x8 blocks of the three planes
+ * starting at t->Frame[1]; t->DC is consumed one entry per block. */
+static void FrameLoopFilter_MMX(
+       FrameDecoder_t* t)
+{
+       INT32 pi, col, row;
+
+       const INT16* blk = t->DC;
+       Plane_t*     pl  = t->Frame[1];
+
+       ALIGN(0x10) MFilter_t lim;
+
+       MFilter_Setup(&lim, t->Filter.Limit);
+
+       for (pi = 0; pi < 3; pi++, pl++) {
+               INT32  cols   = t->Index->BX[pi];
+               INT32  rows   = t->Index->BY[pi];
+               UINT8* rowPtr = pl->Plane;
+
+               for (row = 0; row < rows; row++, rowPtr += pl->Pitch * 8) {
+                       UINT8* px = rowPtr;
+
+                       for (col = 0; col < cols; col++, px += 8, blk++) {
+                               if (*blk == NOT_CODED) {
+                                       continue; /* an uncoded block triggers no filtering itself */
+                               }
+
+                               /* Left and top edges, skipped in the first block column / row. */
+                               if (col > 0) {
+                                       Filter_LoopFilterH_MMX(&lim, px, pl->Pitch);
+                               }
+                               if (row > 0) {
+                                       Filter_LoopFilterV_MMX(&lim, px, pl->Pitch);
+                               }
+                               /* Right and bottom edges, only when the neighbour there is uncoded. */
+                               if (col < cols - 1 && blk[ 1] == NOT_CODED) {
+                                       Filter_LoopFilterH_MMX(&lim, px + 8, pl->Pitch);
+                               }
+                               if (row < rows - 1 && blk[cols] == NOT_CODED) {
+                                       Filter_LoopFilterV_MMX(&lim, px + 8 * pl->Pitch, pl->Pitch);
+                               }
+                       }
+               }
+       }
+}
+
+/* */
+
 void QT_ReconstructFrame_MMX(
        FrameDecoder_t* t)
 {
@@ -1075,7 +1327,7 @@ void QT_ReconstructFrame_MMX(
        Reconstruct_CPlane_MMX(t);
 
        if (t->Filter.Limit > 0) {
-               QT_FrameLoopFilter(t);
+               FrameLoopFilter_MMX(t);
        }
 }