// This file is distributed under a BSD license. See LICENSE.txt for details.

// FRIED
// transform innerloops

#include "_types.hpp"
#include "fried_internal.hpp"

namespace FRIED
{
  // new 1d dct (11A 6S)
  // b norm: 1.3260  d norm: 1.5104
  static void ndct4(sInt &ar,sInt &br,sInt &cr,sInt &dr)
  {
    sInt a,b,c,d;

    a = ar;
    b = br;
    c = cr;
    d = dr;

    // stage 1
    a += d;
    c -= b;
    d <<= 1;
    d -= a;

    // stage 2
    b += (c - a) >> 1;
    a += b;
    c -= (d >> 1) - (d >> 3);
    d += (c >> 1) - (c >> 3);

    // store (with reordering!)
    ar = a;
    br = d; // !
    cr = b; // !
    dr = c; // !
  }

  // new 1d idct
  static void indct4(sS16 &ar,sS16 &br,sS16 &cr,sS16 &dr)
  {
    sInt a,b,c,d;

    // load (with reordering!)
    a = ar;
    b = cr; // !
    c = dr; // !
    d = br; // !

    // stage 2
    d -= (c >> 1) - (c >> 3);
    c += (d >> 1) - (d >> 3);
    a -= b;
    b -= (c - a) >> 1;

    // stage 1
    d += a;
    d >>= 1;
    c += b;
    a -= d;

    ar = a;
    br = b;
    cr = c;
    dr = d;
  }

  // 1d wht (lifting-based)
  static void wht4(sInt &ar,sInt &br,sInt &cr,sInt &dr)
  {
    sInt a,b,c,d,t;

    // load
    a = ar;
    b = br;
    c = cr;
    d = dr;

    // computation
    a += d;
    c -= b;
    t = (c - a) >> 1;
    d += t;
    b += t;
    c -= d;
    a += b;

    // store (with reordering)
    ar = a;
    br = d; // !
    cr = b; // !
    dr = c; // !
  }

  // 1d iwht
  static void iwht4(sInt &ar,sInt &br,sInt &cr,sInt &dr)
  {
    sInt a,b,c,d,t;

    // load (with reordering)
    a = ar;
    b = cr; // !
    c = dr; // !
    d = br; // !

    // computation
    a -= b;
    c += d;
    t = (c - a) >> 1;
    b -= t;
    d -= t;
    c += b;
    a -= d;

    // store
    ar = a;
    br = b;
    cr = c;
    dr = d;
  }

  // several dct variants. ndcts generate permuted output,
  // indcts expect permuted input. all permutation handling
  // is done during coefficient reordering.
  //
  // current cost:
  //   88A 48S
  //
  // dct 4x4:  72A 24M
  // h.264 IT: 64A 16S

  void ndct42D(sInt *x0,sInt *x1,sInt *x2,sInt *x3)
  {
    // transpose in
    sSwap(x0[1],x1[0]);
    sSwap(x0[2],x2[0]);
    sSwap(x0[3],x3[0]);
    sSwap(x1[2],x2[1]);
    sSwap(x1[3],x3[1]);
    sSwap(x2[3],x3[2]);

    // horizontal
    ndct4(x0[0],x0[1],x0[2],x0[3]);
    ndct4(x1[0],x1[1],x1[2],x1[3]);
    ndct4(x2[0],x2[1],x2[2],x2[3]);
    ndct4(x3[0],x3[1],x3[2],x3[3]);

    // vertical
    ndct4(x0[0],x1[0],x2[0],x3[0]);
    ndct4(x0[1],x1[1],x2[1],x3[1]);
    ndct4(x0[2],x1[2],x2[2],x3[2]);
    ndct4(x0[3],x1[3],x2[3],x3[3]);
  }
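
  // The lifting steps above make the 1d transforms exactly invertible in
  // integer arithmetic (within 16bit range): indct4 retraces ndct4 step by
  // step in reverse order, and the store/load reorderings cancel out.
  // A minimal round-trip sketch; the test function is hypothetical and not
  // part of the codec, it only assumes sInt/sS16/sBool from "_types.hpp".
#if 0
  static sBool ndct4_roundtrip_test()
  {
    sInt a = 10,b = -3,c = 7,d = 1;
    sInt a0 = a,b0 = b,c0 = c,d0 = d;

    ndct4(a,b,c,d);                     // forward, writes permuted output

    sS16 as = (sS16) a,bs = (sS16) b;   // inverse works on 16bit values
    sS16 cs = (sS16) c,ds = (sS16) d;
    indct4(as,bs,cs,ds);                // inverse, expects permuted input

    return as == a0 && bs == b0 && cs == c0 && ds == d0;
  }
#endif
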
  // FIXME: no output shifting as of yet
  void indct42D(sS16 *x0,sS16 *x1,sS16 *x2,sS16 *x3)
  {
#if 1
    __asm
    {
      mov       eax, [x0];
      mov       ebx, [x1];
      mov       ecx, [x2];
      mov       edx, [x3];

      // load (ok, this is going to be somewhat confusing)
      movq      mm0, [eax]; // mm0=a
      movq      mm1, [ecx]; // mm1=c
      movq      mm2, [edx]; // mm2=d
      movq      mm3, [ebx]; // mm3=b

      // vertical pass
      movq      mm4, mm2;
      movq      mm5, mm2;
      psraw     mm4, 1;
      psraw     mm5, 3;
      psubw     mm3, mm4;
      paddw     mm3, mm5;
      psubw     mm0, mm1;
      pxor      mm6, mm6;
      movq      mm4, mm3;
      movq      mm5, mm3;
      psraw     mm4, 1;
      psraw     mm5, 3;
      paddw     mm3, mm0;
      psubw     mm6, mm0;
      paddw     mm2, mm4;
      psraw     mm3, 1;
      psubw     mm2, mm5;
      psubw     mm0, mm3;
      paddw     mm6, mm2;
      psraw     mm6, 1;
      psubw     mm1, mm6;
      paddw     mm2, mm1;

      // transpose (afterwards value from mm4 now in mm2)
      movq      mm4, mm0;
      movq      mm5, mm2;
      punpcklwd mm0, mm1;
      punpckhwd mm4, mm1;
      punpcklwd mm2, mm3;
      punpckhwd mm5, mm3;
      movq      mm1, mm0;
      movq      mm3, mm4;
      punpckldq mm0, mm2;
      punpckhdq mm1, mm2;
      punpckldq mm4, mm5;
      punpckhdq mm3, mm5;
      // translation: mm1 => mm4, mm2 => mm3, mm3 => mm1, mm4 => mm2

      // horizontal pass
      movq      mm2, mm3;
      movq      mm5, mm3;
      psraw     mm2, 1;
      psraw     mm5, 3;
      psubw     mm1, mm2;
      paddw     mm1, mm5;
      psubw     mm0, mm4;
      pxor      mm6, mm6;
      movq      mm2, mm1;
      movq      mm5, mm1;
      psraw     mm2, 1;
      psraw     mm5, 3;
      paddw     mm1, mm0;
      psubw     mm6, mm0;
      paddw     mm3, mm2;
      psraw     mm1, 1;
      psubw     mm3, mm5;
      psubw     mm0, mm1;
      paddw     mm6, mm3;
      psraw     mm6, 1;
      psubw     mm4, mm6;
      paddw     mm3, mm4;

      movq      [eax], mm0;
      movq      [ebx], mm4;
      movq      [ecx], mm3;
      movq      [edx], mm1;

      emms;
    }
#else
    // vertical
    indct4(x0[0],x1[0],x2[0],x3[0]);
    indct4(x0[1],x1[1],x2[1],x3[1]);
    indct4(x0[2],x1[2],x2[2],x3[2]);
    indct4(x0[3],x1[3],x2[3],x3[3]);

    // horizontal
    indct4(x0[0],x0[1],x0[2],x0[3]);
    indct4(x1[0],x1[1],x1[2],x1[3]);
    indct4(x2[0],x2[1],x2[2],x2[3]);
    indct4(x3[0],x3[1],x3[2],x3[3]);
#endif
  }

  void ndct42D_MB(sInt *x0,sInt *x1,sInt *x2,sInt *x3)
  {
    // horizontal
    wht4(x0[ 0],x0[ 4],x0[ 8],x0[12]);
    wht4(x1[ 0],x1[ 4],x1[ 8],x1[12]);
    wht4(x2[ 0],x2[ 4],x2[ 8],x2[12]);
    wht4(x3[ 0],x3[ 4],x3[ 8],x3[12]);

    // vertical
    wht4(x0[ 0],x1[ 0],x2[ 0],x3[ 0]);
    wht4(x0[ 4],x1[ 4],x2[ 4],x3[ 4]);
    wht4(x0[ 8],x1[ 8],x2[ 8],x3[ 8]);
    wht4(x0[12],x1[12],x2[12],x3[12]);
  }

  // hardcoded now to save on call costs
  void indct42D_MB(sS16 *x0,sS16 *x1,sS16 *x2,sS16 *x3)
  {
    sInt temp[16];
    sInt i,a,b,c,d,t;

    // vertical (inlined iwht4; loads follow its input permutation)
    for(i=0;i<4;i++)
    {
      a = x0[i*4];
      b = x2[i*4]; // !
      c = x3[i*4]; // !
      d = x1[i*4]; // !

      a -= b;
      c += d;
      t = (c - a) >> 1;
      b -= t;
      d -= t;
      c += b;
      a -= d;

      temp[ 0+i] = a;
      temp[ 4+i] = b;
      temp[ 8+i] = c;
      temp[12+i] = d;
    }

    // horizontal (inlined iwht4; loads follow its input permutation)
    for(i=0;i<4;i++)
    {
      a = temp[i*4+0];
      b = temp[i*4+2]; // !
      c = temp[i*4+3]; // !
      d = temp[i*4+1]; // !

      a -= b;
      c += d;
      t = (c - a) >> 1;
      b -= t;
      d -= t;
      c += b;
      a -= d;

      switch(i)
      {
      case 0: x0[0] = a; x0[4] = b; x0[8] = c; x0[12] = d; break;
      case 1: x1[0] = a; x1[4] = b; x1[8] = c; x1[12] = d; break;
      case 2: x2[0] = a; x2[4] = b; x2[8] = c; x2[12] = d; break;
      case 3: x3[0] = a; x3[4] = b; x3[8] = c; x3[12] = d; break;
      }
    }
  }
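
  // wht4/iwht4 are an exactly invertible lifting pair as well, and since
  // iwht4 loads in the order wht4 stores, chaining the two directly
  // reconstructs the input bit for bit. A minimal round-trip sketch; the
  // test function is hypothetical and not part of the codec.
#if 0
  static sBool wht4_roundtrip_test()
  {
    sInt a = 5,b = -2,c = 9,d = 4;
    sInt a0 = a,b0 = b,c0 = c,d0 = d;

    wht4(a,b,c,d);  // forward, writes permuted output
    iwht4(a,b,c,d); // inverse, expects permuted input

    return a == a0 && b == b0 && c == c0 && d == d0;
  }
#endif
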
  static void rot_pp(sInt &u,sInt &v)
  {
    v -= u;
    u <<= 1;
    u += v >> 1;
    v += u >> 1;
  }

  static __forceinline void irot_pp(sInt &ur,sInt &vr)
  {
    sInt u,v;

    u = ur;
    v = vr;

    v -= u >> 1;
    u -= v >> 1;
    u >>= 1;
    v += u;

    ur = u;
    vr = v;
  }

  // gain: 2 (+1bit)
  static void lbtpre1D(sInt &a,sInt &b,sInt &c,sInt &d)
  {
    // stage 1 butterfly
    d -= a;
    c -= b;
    a += a + d;
    b += b + c;

    // rotation
    rot_pp(c,d);

    // stage 3 butterfly
    a -= d - 1;
    b -= c;
    c += c + b + 1;
    d += d + a;

    a >>= 1;
    b >>= 1;
    c >>= 1;
    d >>= 1;
  }

  static void lbtpost1D(sS16 &ar,sS16 &br,sS16 &cr,sS16 &dr)
  {
    sInt a,b,c,d;

    a = ar;
    b = br;
    c = cr;
    d = dr;

    // stage 1 butterfly
    d -= a;
    c -= b;
    a += a + d;
    b += b + c;

    // inverse rotation
    irot_pp(c,d);

    // stage 3 butterfly
    a -= d;
    b -= c;
    c += c + b;
    d += d + a;

    ar = a << 1;
    br = b << 1;
    cr = c << 1;
    dr = d << 1;
  }

  // several variants of lbt pre/postfilters
  void lbt4pre2x4(sInt *x0,sInt *x1)
  {
    lbtpre1D(x0[0],x0[1],x0[2],x0[3]);
    lbtpre1D(x1[0],x1[1],x1[2],x1[3]);
  }

  void lbt4post2x4(sS16 *x0,sS16 *x1)
  {
    lbtpost1D(x0[0],x0[1],x0[2],x0[3]);
    lbtpost1D(x1[0],x1[1],x1[2],x1[3]);
  }

  void lbt4pre4x2(sInt *x0,sInt *x1,sInt *x2,sInt *x3)
  {
    lbtpre1D(x0[0],x1[0],x2[0],x3[0]);
    lbtpre1D(x0[1],x1[1],x2[1],x3[1]);
  }

  void lbt4post4x2(sS16 *x0,sS16 *x1,sS16 *x2,sS16 *x3)
  {
    lbtpost1D(x0[0],x1[0],x2[0],x3[0]);
    lbtpost1D(x0[1],x1[1],x2[1],x3[1]);
  }

  void lbt4pre4x4(sInt *x0,sInt *x1,sInt *x2,sInt *x3)
  {
    // horizontal
    lbtpre1D(x0[0],x0[1],x0[2],x0[3]);
    lbtpre1D(x1[0],x1[1],x1[2],x1[3]);
    lbtpre1D(x2[0],x2[1],x2[2],x2[3]);
    lbtpre1D(x3[0],x3[1],x3[2],x3[3]);

    // vertical
    lbtpre1D(x0[0],x1[0],x2[0],x3[0]);
    lbtpre1D(x0[1],x1[1],x2[1],x3[1]);
    lbtpre1D(x0[2],x1[2],x2[2],x3[2]);
    lbtpre1D(x0[3],x1[3],x2[3],x3[3]);
  }

  void lbt4post4x4(sS16 *x0,sS16 *x1,sS16 *x2,sS16 *x3)
  {
#if 1
    __asm
    {
      mov       eax, [x0];
      mov       ebx, [x1];
      mov       ecx, [x2];
      mov       edx, [x3];

      // split the load, because movq SUCKS for non-8byte-aligned data
      movq      mm0, [eax];
      movq      mm1, [ebx];
      movq      mm2, [ecx];
      movq      mm3, [edx];
      /*movd      mm0, [eax];
      movd      mm1, [ebx];
      movd      mm2, [ecx];
      movd      mm3, [edx];
      punpckldq mm0, [eax+4];
      punpckldq mm1, [ebx+4];
      punpckldq mm2, [ecx+4];
      punpckldq mm3, [edx+4];*/

      // vertical lbt postfilter stage 1
      psubw     mm3, mm0;
      psubw     mm2, mm1;
      movq      mm4, mm2;
      paddw     mm0, mm0;
      paddw     mm1, mm1;
      paddw     mm0, mm3;
      paddw     mm1, mm2;
      psraw     mm4, 1;

      // vertical lbt postfilter stage 2
      paddw     mm2, mm2;
      psubw     mm3, mm4;
      psubw     mm2, mm3;
      psraw     mm2, 2;
      paddw     mm3, mm2;

      // vertical lbt postfilter stage 3
      psubw     mm0, mm3;
      psubw     mm1, mm2;
      paddw     mm2, mm2;
      paddw     mm3, mm3;
      paddw     mm2, mm1;
      paddw     mm3, mm0;

      // transpose (afterwards value from mm4 now in mm2)
      movq      mm4, mm0;
      movq      mm5, mm2;
      punpcklwd mm0, mm1;
      punpckhwd mm4, mm1;
      punpcklwd mm2, mm3;
      punpckhwd mm5, mm3;
      movq      mm1, mm0;
      movq      mm3, mm4;
      punpckldq mm0, mm2;
      punpckhdq mm1, mm2;
      punpckldq mm4, mm5;
      punpckhdq mm3, mm5;

      // horizontal lbt postfilter stage 1
      psubw     mm3, mm0;
      psubw     mm4, mm1;
      movq      mm2, mm4;
      paddw     mm0, mm0;
      paddw     mm1, mm1;
      psraw     mm2, 1;
      paddw     mm0, mm3;
      paddw     mm1, mm4;

      // horizontal lbt postfilter stage 2
      paddw     mm4, mm4;
      psubw     mm3, mm2;
      psubw     mm4, mm3;
      psraw     mm4, 2;
      paddw     mm3, mm4;

      // horizontal lbt postfilter stage 3
      psubw     mm1, mm4;
      psubw     mm0, mm3;
      paddw     mm4, mm4;
      paddw     mm3, mm3;
      paddw     mm4, mm1;
      paddw     mm3, mm0;

      // transpose
      movq      mm2, mm0;
      movq      mm5, mm4;
      punpcklwd mm0, mm1;
      punpckhwd mm2, mm1;
      punpcklwd mm4, mm3;
      punpckhwd mm5, mm3;
#if 1
      movq      mm1, mm0;
      movq      mm3, mm2;
      punpckldq mm0, mm4;
      punpckhdq mm1, mm4;
      punpckldq mm2, mm5;
      punpckhdq mm3, mm5;

      movq      [eax], mm0;
      movq      [ebx], mm1;
      movq      [ecx], mm2;
      movq      [edx], mm3;
#else
      // store + second half of transpose
      movd      [eax], mm0;
      movd      [eax+4], mm4;
      movd      [ecx], mm2;
      movd      [ecx+4], mm5;
      psrlq     mm0, 32;
      psrlq     mm4, 32;
      psrlq     mm2, 32;
      psrlq     mm5, 32;
      movd      [ebx], mm0;
      movd      [ebx+4], mm4;
      movd      [edx], mm2;
      movd      [edx+4], mm5;
#endif

      emms;
    }
#else
    // vertical
    lbtpost1D(x0[0],x1[0],x2[0],x3[0]);
    lbtpost1D(x0[1],x1[1],x2[1],x3[1]);
    lbtpost1D(x0[2],x1[2],x2[2],x3[2]);
    lbtpost1D(x0[3],x1[3],x2[3],x3[3]);

    // horizontal
    lbtpost1D(x0[0],x0[1],x0[2],x0[3]);
    lbtpost1D(x1[0],x1[1],x1[2],x1[3]);
    lbtpost1D(x2[0],x2[1],x2[2],x2[3]);
    lbtpost1D(x3[0],x3[1],x3[2],x3[3]);
#endif
  }
}
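
// rot_pp/irot_pp form an exact integer lifting pair: irot_pp retraces the
// left shift and the lifting updates of rot_pp in reverse order, so the
// rotation itself loses no information (the final >>1 in lbtpre1D is where
// precision can be dropped). A minimal round-trip sketch, reopening the
// namespace; the test function is hypothetical and not part of the codec.
#if 0
namespace FRIED
{
  static sBool rot_pp_roundtrip_test()
  {
    sInt u = -3,v = 4;
    sInt u0 = u,v0 = v;

    rot_pp(u,v);  // forward rotation (gains a bit on u)
    irot_pp(u,v); // inverse rotation

    return u == u0 && v == v0;
  }
}
#endif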