/* */
/*
 * Precomputed packed limits for the MMX deblocking loop filter.
 * Each register holds the same signed 16-bit value in all four lanes,
 * filled in by MFilter_Setup().
 */
struct MFilter {

	__m64 L;	/* +L  : filter limit */
	__m64 L2;	/* +2L : twice the limit */

	__m64 NL;	/* -L  : negated limit */
	__m64 NL2;	/* -2L : twice the negated limit */

}; /* MFilter */

typedef struct MFilter MFilter_t;
+
+static void MFilter_Setup(MFilter_t* t, INT32 lim)
+{
+ const __m64 z = _mm_setzero_si64();
+
+ __m64 l = _mm_set1_pi16(lim);
+ __m64 nl = _mm_sub_pi16(z, l);
+
+ t->L = l;
+ t->L2 = _mm_slli_pi16(l, 1);
+ t->NL = nl;
+ t->NL2 = _mm_slli_pi16(nl, 1);
+}
+
+ALIGN(0x10) static const UINT16 UR_4[4] = { 4, 4, 4, 4 };
+
+/* */
+
#if 0
/*
 * Reference C implementation of the horizontal edge loop filter, kept
 * disabled as documentation for Filter_LoopFilterH_MMX below.
 *
 * Filters the vertical boundary at b for 8 rows: per row, a delta is
 * derived from the four pixels straddling the edge, limited through the
 * t->Delta lookup table (centred at +127), and the two edge pixels are
 * adjusted and clamped to [0,255].
 *
 * The two-element arrays implement a branch-free clamp: slot 0 holds the
 * computed value, slot 1 the saturation bound, and the boolean result of
 * the comparison (0 or 1) selects between them.
 */
static void Filter_LoopFilterH(
	const LoopFilter_t* t,
	UINT8* b,
	INT32 s)
{
	const INT16* d = t->Delta + 127;	/* centred limiter table */

	INT32 p0[2];
	INT32 p1[2];

	INT32 q0[2];
	INT32 q1[2];

	UINT8* p = b;
	UINT8* end = p + s * 8;

	p0[1] = 0;		/* lower clamp bound */
	p1[1] = 0;
	q0[1] = 255;	/* upper clamp bound */
	q1[1] = 255;

	for (; p < end; p += s) {
		/* Rounded delta across the edge, then table-limited. */
		INT32 x = (p[-2] - p[1]) + 3 * (p[0] - p[-1]);
		INT32 v = d[(x + 4) >> 3];

		p0[0] = p[-1] + v;
		p1[0] = p[ 0] - v;

		q0[0] = p0[(p0[0] < 0)];	/* clamp below at 0 */
		q1[0] = p1[(p1[0] < 0)];

		p[-1] = q0[(q0[0] > 255)];	/* clamp above at 255 */
		p[ 0] = q1[(q1[0] > 255)];
	}
}
#endif
+
/*
 * MMX horizontal edge loop filter: filters the vertical boundary at b
 * for 8 rows, processing 4 rows per iteration.
 *
 * t - packed limits +L/+2L/-L/-2L (see MFilter_Setup)
 * b - first row, positioned on the edge; pixels b[-2..1] of each row used
 * s - plane pitch in bytes
 *
 * Per lane: R = ((p[-2] - p[1]) + 3*(p[0] - p[-1]) + 4) >> 3, bounded to
 *   R               for |R| <  L
 *   sign(R)*(2L-|R|) for L <= |R| < 2L
 *   0               for |R| >= 2L
 * (the same limiter the scalar Delta table encodes), then added to p[-1]
 * and subtracted from p[0] with unsigned byte saturation.
 */
static void Filter_LoopFilterH_MMX(
	const MFilter_t* t,
	UINT8* b,
	INT32 s)
{
	UINT8* p = (UINT8*)b;
	UINT8* e = p + s * 8;

	const __m64 z = _mm_setzero_si64();

	for (; p < e; p += 4 * s) {
		/* Load the 4 edge-straddling bytes of 4 consecutive rows,
		   zero-extended to 16 bits: Sn = { p[-2], p[-1], p[0], p[1] }. */
		__m64 S0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 0 * s))), z);
		__m64 S1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 1 * s))), z);
		__m64 S2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 2 * s))), z);
		__m64 S3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 3 * s))), z);

		/* 4x4 16-bit transpose, stage 1: interleave row pairs. */
		__m64 u0 = _mm_unpacklo_pi16(S0, S1);
		__m64 u1 = _mm_unpackhi_pi16(S0, S1);
		__m64 u2 = _mm_unpacklo_pi16(S2, S3);
		__m64 u3 = _mm_unpackhi_pi16(S2, S3);

		/* Stage 2: P0 = column p[-1], P1 = column p[0], across the 4 rows. */
		__m64 P0 = _mm_unpackhi_pi32(u0, u2);
		__m64 P1 = _mm_unpacklo_pi32(u1, u3);

		/* X = p[-2] - p[1] (remaining transposed columns), Y = p[0] - p[-1]. */
		__m64 X = _mm_sub_pi16(_mm_unpacklo_pi32(u0, u2), _mm_unpackhi_pi32(u1, u3));
		__m64 Y = _mm_sub_pi16(P1, P0);
		/* R = (X + 3*Y + 4) >> 3, arithmetic shift on each lane. */
		__m64 R = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(_mm_add_pi16(X, Y), _mm_slli_pi16(Y, 1)), *((__m64*)UR_4)), 3);

		/* Lane masks: m1 where R > L, m2 where R < -L. */
		__m64 m1 = _mm_cmpgt_pi16(R, t->L);
		__m64 m2 = _mm_cmpgt_pi16(t->NL, R );

		__m64 r, D;

		/* Fold out-of-range lanes toward zero: 2L - R resp. -2L - R. */
		r = _mm_or_si64(_mm_andnot_si64(m1, R), _mm_and_si64(_mm_sub_pi16(t->L2, R), m1));
		r = _mm_or_si64(_mm_andnot_si64(m2, r), _mm_and_si64(_mm_sub_pi16(t->NL2, R), m2));

		/* Zero the lanes with |R| >= 2L. */
		r = _mm_andnot_si64(_mm_cmpgt_pi16(R, t->L2), r);
		r = _mm_andnot_si64(_mm_cmpgt_pi16(t->NL2, R ), r);

		P0 = _mm_add_pi16(P0, r);
		P1 = _mm_sub_pi16(P1, r);

		/* Pack back to bytes with unsigned saturation and interleave so D
		   holds the { p[-1], p[0] } pair of each of the 4 rows. */
		D = _mm_unpacklo_pi8(_mm_packs_pu16(P0, P0), _mm_packs_pu16(P1, P1));

		{
			/* Scatter the 16-bit pixel pairs back to their rows. */
			UINT32 d0 = _mm_cvtsi64_si32(D);
			UINT32 d1 = _mm_cvtsi64_si32(_mm_unpackhi_pi32(D, D));

			*((UINT16*)(p - 1 + 0 * s)) = (UINT16)d0;
			*((UINT16*)(p - 1 + 1 * s)) = d0 >> 16;
			*((UINT16*)(p - 1 + 2 * s)) = (UINT16)d1;
			*((UINT16*)(p - 1 + 3 * s)) = d1 >> 16;
		}
	}
}
+
+/* */
+
#if 0
/*
 * Reference C implementation of the vertical edge loop filter, kept
 * disabled as documentation for Filter_LoopFilterV_MMX below.
 *
 * Same limiter and clamp scheme as Filter_LoopFilterH, but the edge is
 * horizontal: the loop walks 8 columns and the four source pixels are
 * one stride s apart instead of adjacent.
 */
static void Filter_LoopFilterV(
	const LoopFilter_t* t,
	UINT8* b,
	INT32 s)
{
	const INT16* d = t->Delta + 127;	/* centred limiter table */

	INT32 p0[2];
	INT32 p1[2];

	INT32 q0[2];
	INT32 q1[2];

	UINT8* p = b;
	UINT8* end = p + 8;

	p0[1] = 0;		/* lower clamp bound */
	p1[1] = 0;
	q0[1] = 255;	/* upper clamp bound */
	q1[1] = 255;

	for (; p < end; p++) {
		/* Rounded delta across the edge, then table-limited. */
		INT32 x = (p[-2 * s] - p[1 * s]) + 3 * (p[0] - p[-1 * s]);
		INT32 v = d[(x + 4) >> 3];

		p0[0] = p[-s] + v;
		p1[0] = p[ 0] - v;

		q0[0] = p0[(p0[0] < 0)];	/* clamp below at 0 */
		q1[0] = p1[(p1[0] < 0)];

		p[-s] = q0[(q0[0] > 255)];	/* clamp above at 255 */
		p[ 0] = q1[(q1[0] > 255)];
	}
}
#endif
+
/*
 * MMX vertical edge loop filter: filters the horizontal boundary at b
 * across 8 columns, 4 columns per iteration.
 *
 * t - packed limits +L/+2L/-L/-2L (see MFilter_Setup)
 * b - first pixel of the row just below the edge (rows b[-2s..s] used)
 * s - plane pitch in bytes
 *
 * The four source pixels of each column sit in different rows, so each
 * 32-bit load already delivers 4 lanes of the same row -- no transpose
 * is needed.  Delta and bounding math are identical to the H kernel.
 */
static void Filter_LoopFilterV_MMX(
	const MFilter_t* t,
	UINT8* b,
	INT32 s)
{
	UINT8* p = (UINT8*)b;
	UINT8* e = p + 8;

	const __m64 z = _mm_setzero_si64();

	for (; p < e; p += 4) {
		/* P0 = row above the edge, P1 = row below, zero-extended to 16 bits. */
		__m64 P0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - s))), z);
		__m64 P1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p + 0))), z);

		/* X = p[-2s] - p[s], Y = p[0] - p[-s]. */
		__m64 X = _mm_sub_pi16(
			_mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 * s))), z),
			_mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p + s))), z));
		__m64 Y = _mm_sub_pi16(P1, P0);
		/* R = (X + 3*Y + 4) >> 3, arithmetic shift on each lane. */
		__m64 R = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(_mm_add_pi16(X, Y), _mm_slli_pi16(Y, 1)), *((__m64*)UR_4)), 3);

		/* Lane masks: m1 where R > L, m2 where R < -L. */
		__m64 m1 = _mm_cmpgt_pi16(R, t->L);
		__m64 m2 = _mm_cmpgt_pi16(t->NL, R );

		__m64 r;

		/* Fold out-of-range lanes toward zero: 2L - R resp. -2L - R. */
		r = _mm_or_si64(_mm_andnot_si64(m1, R), _mm_and_si64(_mm_sub_pi16(t->L2, R), m1));
		r = _mm_or_si64(_mm_andnot_si64(m2, r), _mm_and_si64(_mm_sub_pi16(t->NL2, R), m2));

		/* Zero the lanes with |R| >= 2L. */
		r = _mm_andnot_si64(_mm_cmpgt_pi16(R, t->L2), r);
		r = _mm_andnot_si64(_mm_cmpgt_pi16(t->NL2, R ), r);

		P0 = _mm_add_pi16(P0, r);
		P1 = _mm_sub_pi16(P1, r);

		/* Pack back to bytes with unsigned saturation, 4 columns at once. */
		*((UINT32*)(p - s)) = _mm_cvtsi64_si32(_mm_packs_pu16(P0, P0));
		*((UINT32*)(p + 0)) = _mm_cvtsi64_si32(_mm_packs_pu16(P1, P1));
	}
}
+
+/* */
+
+static void FrameLoopFilter_MMX(
+ FrameDecoder_t* t)
+{
+ INT32 i;
+ INT32 x, y;
+
+ const INT16* b = t->DC;
+
+ Plane_t* plane = t->Frame[1];
+
+ ALIGN(0x10) MFilter_t mf;
+
+ MFilter_Setup(&mf, t->Filter.Limit);
+
+ for (i = 0; i < 3; i++, plane++) {
+ INT32 bx = t->Index->BX[i];
+ INT32 by = t->Index->BY[i];
+
+ UINT8* r0 = plane->Plane;
+
+ for (y = 0; y < by; y++, r0 += plane->Pitch * 8) {
+ UINT8* r = r0;
+
+ for (x = 0; x < bx; x++, r += 8, b++) {
+ if (*b != NOT_CODED) {
+ if (x > 0) {
+ Filter_LoopFilterH_MMX(&mf, r, plane->Pitch);
+ }
+
+ if (y > 0) {
+ Filter_LoopFilterV_MMX(&mf, r, plane->Pitch);
+ }
+
+ if (x < bx - 1 && b[ 1] == NOT_CODED) {
+ Filter_LoopFilterH_MMX(&mf, r + 8, plane->Pitch);
+ }
+
+ if (y < by - 1 && b[bx] == NOT_CODED) {
+ Filter_LoopFilterV_MMX(&mf, r + 8 * plane->Pitch, plane->Pitch);
+ }
+ }
+ }
+ }
+ }
+}
+
+/* */
+
/*
 * MMX frame reconstruction entry point: rebuilds all plane data, then
 * runs the in-loop deblocking filter when enabled.
 *
 * A filter limit of 0 disables the loop filter entirely.
 */
void QT_ReconstructFrame_MMX(
	FrameDecoder_t* t)
{
	Reconstruct_CPlane_MMX(t);
	if (t->Filter.Limit > 0) {
		FrameLoopFilter_MMX(t);
	}
}