/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#include "avcdec_lib.h"
//#define USE_PRED_BLOCK
#include <stdio.h>

/*
 * Saturate x to the range [0, 255] in place: a negative value becomes 0 and
 * a value above 255 becomes 255.  The macro expands to a complete
 * if-statement, so call sites write it without a trailing semicolon.
 * x is evaluated more than once and is cast without parentheses, so pass a
 * plain int variable.  Relies on a 32-bit int whose >> is arithmetic.
 */
#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
                 x = 0xFF & (~(x>>31));}

/*
 * Reverse the four bytes of a 32-bit value.  The masks are unsigned so that
 * every shift is carried out in unsigned arithmetic even when x is a signed
 * int (shifting a set bit into the sign position of an int is undefined).
 */
#define SWAP_BYTES(x) ((((x)&0xFFu)<<24) | (((x)&0xFF00u)<<8) | (((x)&0xFF0000u)>>8) | (((x)&0xFF000000u)>>24))
/* Exchange the two low bytes of x; bits above bit 15 are discarded. */
#define SWAP_HALFBYTES(x) ((((x)&0xFF)<<8) | (((x)&0xFF00)>>8))

/*
 * Dispatch table of chroma motion-compensation kernels.
 * The original note gives the index as (blkwidth << 2) + (dy << 1) + dx.
 * NOTE(review): the code that forms the index is not in this part of the
 * file; with eight entries the first term is presumably a one-bit width
 * selector -- confirm against the caller.
 */
static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int,AVCCommonObj*) =
{
    ChromaFullMC_SIMD,          /* [0] */
    ChromaHorizontalMC_SIMD,    /* [1] */
    ChromaVerticalMC_SIMD,      /* [2] */
    ChromaDiagonalMC_SIMD,      /* [3] */
    ChromaFullMC_SIMD,          /* [4] same full-pel kernel as [0] */
    ChromaHorizontalMC2_SIMD,   /* [5] */
    ChromaVerticalMC2_SIMD,     /* [6] */
    ChromaDiagonalMC2_SIMD      /* [7] */
};
/*
 * Perform motion prediction and compensation for one inter macroblock, then
 * run the inverse transform on every 4x4 block whose cbp4x4 bit is set
 * (presumably adding the residue onto the prediction -- itrans/ictrans are
 * defined elsewhere).
 *
 * video  : decoder state; supplies the current picture and RefPicList0.
 * currMB : macroblock being reconstructed (partition layout, L0 reference
 *          indices and motion vectors, coefficient blocks, cbp4x4).
 *
 * Without USE_PRED_BLOCK the prediction is written straight into the current
 * picture and the transform works in place there.  With USE_PRED_BLOCK both
 * steps use the currMB->pred scratch area (luma pitch 20, chroma pitch 12).
 *
 * Both build variants now pass `video` to LumaMotionComp, matching its
 * definition in this file.  The ChromaMotionComp calls are aligned the same
 * way; NOTE(review): its prototype is not visible here, the trailing `video`
 * argument is taken from the USE_PRED_BLOCK calls -- confirm.
 */
void InterMBPrediction(AVCCommonObj *video , AVCMacroblock *currMB)
{
    AVCPictureData *currPic = video->currPic;
    int mbPartIdx, subMbPartIdx;
    int ref_idx;
    int offset_MbPart_indx = 0;
    int16 *mv;
    int x_pos, y_pos;   /* signed: a motion vector may point left of / above the picture */
    uint8 *curL, *curCb, *curCr;
    uint8 *ref_l, *ref_Cb, *ref_Cr;
    uint8 *predBlock, *predCb, *predCr;
    int block_x, block_y, offset_x, offset_y, offsetP, offset;
    int x_position = (currMB->mb_x << 4);
    int y_position = (currMB->mb_y << 4);
    int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
    int picWidth = currPic->pitch;
    int picHeight = currPic->height;
    int16 *dataBlock;
    uint32 cbp4x4;
    uint32 tmp_word;

    /* top-left sample of this macroblock in the luma and chroma planes */
    tmp_word = y_position * picWidth;
    curL = currPic->Sl + tmp_word + x_position;
    offset = (tmp_word >> 2) + (x_position >> 1);   /* (y/2)*(pitch/2) + x/2 */
    curCb = currPic->Scb + offset;
    curCr = currPic->Scr + offset;

#ifdef USE_PRED_BLOCK
    predBlock = currMB->pred + 84;
    predCb = currMB->pred + 452;
    predCr = currMB->pred + 596;
#else
    predBlock = curL;
    predCb = curCb;
    predCr = curCr;
#endif

    GetMotionVectorPredictor(video, 0);

    for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
    {
        MbHeight = currMB->SubMbPartHeight[mbPartIdx];
        MbWidth = currMB->SubMbPartWidth[mbPartIdx];
        mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
        mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
        ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
        offset_indx = 0;

        ref_l = video->RefPicList0[ref_idx]->Sl;
        ref_Cb = video->RefPicList0[ref_idx]->Scb;
        ref_Cr = video->RefPicList0[ref_idx]->Scr;

        for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
        {
            /* position of this sub-partition in units of 4x4 blocks */
            block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);  // check this
            block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
            mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
            offset_x = x_position + (block_x << 2);
            offset_y = y_position + (block_y << 2);
            x_pos = (offset_x << 2) + *mv++;   /* quarter pel */
            y_pos = (offset_y << 2) + *mv;     /* quarter pel */

#ifdef USE_PRED_BLOCK
            offsetP = (block_y * 80) + (block_x << 2);
            LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
                           predBlock + offsetP, 20, MbWidth, MbHeight, video);
#else
            offsetP = (block_y << 2) * picWidth + (block_x << 2);
            LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
                           predBlock + offsetP, picWidth, MbWidth, MbHeight, video);
#endif

#ifdef USE_PRED_BLOCK
            offsetP = (block_y * 24) + (block_x << 1);
            ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
                             predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1, video);
            ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
                             predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1, video);
#else
            /* (block_y * 2 rows) * (picWidth / 2) == block_y * picWidth */
            offsetP = (block_y * picWidth) + (block_x << 1);
            ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
                             predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1, video);
            ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
                             predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1, video);
#endif

            /* 1 when the sub-partition is 8 wide: the next index then maps
               to the block row below instead of the next column */
            offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
        }
        /* 1 when the partition is 16 wide: partition 1 then lands at
           (X = 0, Y = 1) instead of (X = 1, Y = 0) */
        offset_MbPart_indx = currMB->MbPartWidth >> 4;
    }

    /* used in decoder, used to be if(!encFlag)  */

    /* transform in raster scan order; cbp4x4 is consumed one bit per block */
    dataBlock = currMB->block;
    cbp4x4 = currMB->cbp4x4;

    /* luma: 4x4 grid of blocks; coefficient blocks sit 4 entries apart
       within a row of blocks and 64 entries apart between rows */
    for (block_y = 4; block_y > 0; block_y--)
    {
        for (block_x = 4; block_x > 0; block_x--)
        {
            if (cbp4x4&1)
            {
#ifdef USE_PRED_BLOCK
                itrans(dataBlock, predBlock, predBlock, 20);
#else
                itrans(dataBlock, curL, curL, picWidth);
#endif
            }
            cbp4x4 >>= 1;
            dataBlock += 4;
#ifdef USE_PRED_BLOCK
            predBlock += 4;
#else
            curL += 4;
#endif
        }
        dataBlock += 48;
#ifdef USE_PRED_BLOCK
        predBlock += 64;                    /* 4 rows of pitch 20, minus the 16 already stepped */
#else
        curL += ((picWidth << 2) - 16);     /* 4 picture rows down, back to the left edge */
#endif
    }

    /* chroma: per row of blocks, two Cb blocks followed by two Cr blocks */
    picWidth = (picWidth >> 1);
    for (block_y = 2; block_y > 0; block_y--)
    {
        for (block_x = 2; block_x > 0; block_x--)
        {
            if (cbp4x4&1)
            {
#ifdef USE_PRED_BLOCK
                ictrans(dataBlock, predCb, predCb, 12);
#else
                ictrans(dataBlock, curCb, curCb, picWidth);
#endif
            }
            cbp4x4 >>= 1;
            dataBlock += 4;
#ifdef USE_PRED_BLOCK
            predCb += 4;
#else
            curCb += 4;
#endif
        }
        for (block_x = 2; block_x > 0; block_x--)
        {
            if (cbp4x4&1)
            {
#ifdef USE_PRED_BLOCK
                ictrans(dataBlock, predCr, predCr, 12);
#else
                ictrans(dataBlock, curCr, curCr, picWidth);
#endif
            }
            cbp4x4 >>= 1;
            dataBlock += 4;
#ifdef USE_PRED_BLOCK
            predCr += 4;
#else
            curCr += 4;
#endif
        }
        dataBlock += 48;
#ifdef USE_PRED_BLOCK
        predCb += 40;                       /* 4 rows of pitch 12, minus the 8 already stepped */
        predCr += 40;
#else
        curCb += ((picWidth << 2) - 8);
        curCr += ((picWidth << 2) - 8);
#endif
    }

#ifdef MB_BASED_DEBLOCK
    SaveNeighborForIntraPred(video,currMB,offset);
#endif
}


/*
 * Perform the actual luma motion compensation for one block.
 *
 * ref        : origin (sample 0,0) of the reference luma plane
 * picwidth   : used both as the row stride of ref and as the horizontal bound
 * picheight  : vertical bound of the reference plane
 * x_pos/y_pos: block position in quarter-sample units; the low two bits
 *              select the fractional position, the rest the integer sample
 * pred       : destination, row stride pred_pitch
 * blkwidth/blkheight : block size in samples
 * video      : handed on to the kernels that take it
 *
 * For each fractional position the matching interpolation kernel is called.
 * When the samples the kernel needs fall outside the picture, CreatePad first
 * builds an edge-replicated copy in a local buffer with pitch 24 and the
 * kernel reads from that copy instead.
 */
void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
                    int x_pos, int y_pos,
                    uint8 *pred, int pred_pitch,
                    int blkwidth, int blkheight,AVCCommonObj *video)
{
    uint8 pad[24][24];  /* padded reference window; size is a multiple of 4 for packing */
    int mid[21][21];    /* intermediate results of the two-pass filters */
    uint8 *ref2;
    int x_plain, y_plain;   /* block itself lies inside the picture */
    int x_taps, y_taps;     /* block plus 2 samples before / 3 after lies inside */
    int half_x, half_y;
    const int dx = x_pos & 3;
    const int dy = y_pos & 3;

    /* round down to full-pel resolution */
    x_pos >>= 2;
    y_pos >>= 2;

    x_plain = (x_pos >= 0) && (x_pos + blkwidth <= picwidth);
    y_plain = (y_pos >= 0) && (y_pos + blkheight <= picheight);
    x_taps = (x_pos - 2 >= 0) && (x_pos + 3 + blkwidth <= picwidth);
    y_taps = (y_pos - 2 >= 0) && (y_pos + 3 + blkheight <= picheight);

    /* full-pel position: G */
    if (dx == 0 && dy == 0)
    {
        if (x_plain && y_plain)
        {
            FullPelMC(ref + y_pos * picwidth + x_pos, picwidth,
                      pred, pred_pitch, blkwidth, blkheight,video);
        }
        else
        {
            CreatePad(ref, picwidth, picheight, x_pos, y_pos, &pad[0][0], blkwidth, blkheight);
            FullPelMC(&pad[0][0], 24, pred, pred_pitch, blkwidth, blkheight,video);
        }
        return;
    }

    /* no vertical interpolation: a, b, c */
    if (dy == 0)
    {
        if (x_taps && y_plain)
        {
            HorzInterp1MC(ref + y_pos * picwidth + x_pos, picwidth,
                          pred, pred_pitch, blkwidth, blkheight, dx);
        }
        else    /* need padding */
        {
            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &pad[0][0], blkwidth + 5, blkheight);
            HorzInterp1MC(&pad[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
        }
        return;
    }

    /* no horizontal interpolation: d, h, n */
    if (dx == 0)
    {
        if (x_plain && y_taps)
        {
            VertInterp1MC(ref + y_pos * picwidth + x_pos, picwidth,
                          pred, pred_pitch, blkwidth, blkheight, dy,video);
        }
        else    /* need padding */
        {
            CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &pad[0][0], blkwidth, blkheight + 5);
            VertInterp1MC(&pad[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy,video);
        }
        return;
    }

    /* horizontal cross: i, j, k -- vertical pass into mid, then horizontal pass */
    if (dy == 2)
    {
        if (x_taps && y_taps)
        {
            /* start two samples to the left so the second pass has its taps */
            VertInterp2MC(ref + y_pos * picwidth + x_pos - 2, picwidth,
                          &mid[0][0], 21, blkwidth + 5, blkheight);
        }
        else    /* need padding */
        {
            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &pad[0][0], blkwidth + 5, blkheight + 5);
            VertInterp2MC(&pad[2][0], 24, &mid[0][0], 21, blkwidth + 5, blkheight);
        }
        HorzInterp2MC(&mid[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
        return;
    }

    /* vertical cross: f, q -- horizontal pass into mid, then vertical pass */
    if (dx == 2)
    {
        if (x_taps && y_taps)
        {
            /* start two lines up so the second pass has its taps */
            HorzInterp3MC(ref + (y_pos - 2) * picwidth + x_pos, picwidth,
                          &mid[0][0], 21, blkwidth, blkheight + 5);
        }
        else    /* need padding */
        {
            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &pad[0][0], blkwidth + 5, blkheight + 5);
            HorzInterp3MC(&pad[0][2], 24, &mid[0][0], 21, blkwidth, blkheight + 5);
        }
        VertInterp3MC(&mid[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
        return;
    }

    /* diagonal: e, g, p, r -- dx and dy are both odd here */
    half_x = dx / 2;
    half_y = dy / 2;

    if (x_pos - 2 >= 0 && x_pos + 3 + half_x + blkwidth <= picwidth &&
            y_pos - 2 >= 0 && y_pos + 3 + blkheight + half_y <= picheight)
    {
        ref2 = ref + (y_pos + half_y) * picwidth + x_pos;
        ref += (y_pos * picwidth) + x_pos + half_x;

        DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight,video);
    }
    else    /* need padding */
    {
        CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &pad[0][0],
                  blkwidth + 5 + half_x, blkheight + 5 + half_y);

        ref2 = &pad[2 + half_y][2];
        ref = &pad[2][2 + half_x];

        DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight,video);
    }
}

/*
 * Copy a block from a reference pointer that is NOT 4-byte aligned into
 * `out`, whose rows are 24 bytes apart.
 *
 * ref       : reference plane pointer; its address modulo 4 selects the case
 * picwidth  : row stride of ref
 * y_pos     : first row to copy (ref is advanced by y_pos * picwidth)
 * out       : destination buffer, row pitch 24
 * blkwidth  : block width; expected to be a multiple of 4
 * blkheight : number of rows
 * video     : unused
 *
 * If ref is already 4-byte aligned nothing is written (presumably the caller
 * then reads ref directly -- confirm against the caller).
 *
 * The earlier word-based version produced a straight byte copy only on a
 * big-endian host, converted the pointer to a 32-bit integer, and accessed
 * uint8 data through uint16/uint32 pointers.  This version copies bytes, so
 * it gives that same straight copy on any host.  Group count and pointer
 * strides are unchanged: with lead = 4 - (address & 3), four bytes are copied
 * for col = lead, lead + 4, ... while col < blkwidth.
 */
void CreateAlign(uint8 *ref, int picwidth, int y_pos,
                 uint8 *out, int blkwidth, int blkheight,AVCCommonObj *video)
{
    int row, col, k;
    int lead;       /* bytes from ref up to the next 4-byte boundary: 3, 2 or 1 */
    int out_skip;

    (void)video;

    /* only the two low address bits matter */
    lead = (int)((size_t)ref & 0x3);
    if (lead == 0)
    {
        return;
    }
    lead = 4 - lead;

    out_skip = 24 - blkwidth;
    ref += y_pos * picwidth;

    for (row = 0; row < blkheight; row++)
    {
        uint8 *src = ref;

        for (col = lead; col < blkwidth; col += 4)
        {
            for (k = 0; k < 4; k++)
            {
                out[k] = src[k];
            }
            src += 4;
            out += 4;
        }
        ref += picwidth;    /* next source row */
        out += out_skip;    /* next destination row (pitch 24) */
    }
}

/*
 * Build a blkwidth x blkheight window of the reference plane at
 * (x_pos, y_pos) in `out` (row pitch 24).  Positions outside the picture are
 * filled by repeating the nearest edge sample.
 *
 * Each output row consists of two runs.  When the window sticks out on the
 * left, the first run repeats the left-most sample and the second run walks
 * through the picture.  Otherwise the first run walks and the second run
 * repeats the sample the walk stopped on.  Vertically, the source pointer
 * either stays on the same picture row or moves one row down after each
 * output row; the choice flips once, at row `switch_row`.
 */
void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
               uint8 *out, int blkwidth, int blkheight)
{
    int walk_first;     /* 1: first run advances through ref, 0: it repeats */
    int first_run;      /* length of the first run */
    int second_run;     /* length of the second run */
    int rewind;         /* undoes the horizontal advance made during one row */
    int step_top, step_bottom, step;
    int switch_row;
    int out_skip;
    int row, n;

    /* horizontal split */
    if (x_pos < 0)
    {
        walk_first = 0;
        first_run = (blkwidth + x_pos > 0) ? -x_pos : blkwidth;
        x_pos = 0;
    }
    else if (x_pos + blkwidth > picwidth)
    {
        walk_first = 1;
        /* clip negative to zero */
        first_run = (picwidth > x_pos) ? picwidth - x_pos - 1 : 0;
    }
    else
    {
        walk_first = 1;
        first_run = blkwidth;   /* just one run */
    }

    /* the pointer advanced by whichever run was the walking one */
    rewind = walk_first ? -first_run : first_run - blkwidth;

    /* vertical split, taking the horizontal result into account */
    if (y_pos < 0)
    {
        step_top = rewind;                  /* stay on the first picture row */
        step_bottom = picwidth + rewind;    /* then move down */
        switch_row = (blkheight + y_pos > 0) ? -y_pos : blkheight;
        y_pos = 0;
    }
    else
    {
        step_top = picwidth + rewind;       /* move down */
        step_bottom = rewind;               /* then stay on the last picture row */
        if (y_pos + blkheight > picheight)
        {
            switch_row = (picheight > y_pos) ? picheight - 1 - y_pos : 0;
        }
        else
        {
            switch_row = blkheight;         /* never switches */
        }
    }

    /* clamp the start position into the picture */
    if (y_pos > picheight - 1)
    {
        y_pos = picheight - 1;
    }
    if (x_pos > picwidth - 1)
    {
        x_pos = picwidth - 1;
    }
    ref += y_pos * picwidth + x_pos;

    out_skip = 24 - blkwidth;
    second_run = blkwidth - first_run;
    step = step_top;

    for (row = 0; row < blkheight; row++)
    {
        if (row == switch_row)
        {
            step = step_bottom;
        }
        for (n = first_run; n > 0; n--)
        {
            *out++ = *ref;
            ref += walk_first;
        }
        for (n = second_run; n > 0; n--)
        {
            *out++ = *ref;
            ref += 1 - walk_first;
        }
        out += out_skip;
        ref += step;
    }
}

/*
 * Horizontal luma interpolation with the 6-tap filter (1,-5,20,20,-5,1),
 * rounded and divided by 32.
 *
 * in        : first integer sample of the block; the two samples to its left
 *             and the three past the right end of each row are also read
 * inpitch   : row stride of in
 * out       : destination; written one 32-bit word (four pixels) at a time,
 *             so it is assumed word aligned
 * outpitch  : row stride of out in bytes (stepped in words, so presumably a
 *             multiple of 4)
 * blkwidth  : block width; four pixels are produced per iteration
 * blkheight : number of rows
 * dx        : horizontal quarter-sample phase.  Even: the half-sample value
 *             is stored.  Odd: the half-sample value is averaged (rounding
 *             up) with the integer sample on its left (dx == 1) or on its
 *             right (dx == 3).
 *
 * Each row is first computed two pixels at a time in packed 16-bit lanes,
 * without clipping.  r13 collects every packed result; if any lane left the
 * range 0..255 the row is recomputed pixel by pixel with CLIP_RESULT.  r13
 * is not cleared between rows, so after the first such row every later row
 * also takes the scalar path (same results, just slower).
 *
 * Every packed word is passed through SWAP_BYTES before the store, which
 * puts pixel 0 in the most significant byte.  NOTE(review): that is the
 * lowest address only on a big-endian target -- confirm the target.
 *
 * Row ends are now tested by comparing pointers instead of converting
 * pointers to uint32, and the top-byte shift is done in uint32.
 */
void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
                   int blkwidth, int blkheight, int dx)
{
    uint8 *p_ref;
    uint8 *row_end;
    uint32 *p_cur;
    uint32 pkres;
    int result, curr_offset, ref_offset;
    int j;
    int32 r0, r1, r2, r3, r4, r5;
    int32 r13, r6;

    p_cur = (uint32*)out; /* assume it's word aligned */
    curr_offset = (outpitch - blkwidth) >> 2;
    p_ref = in;
    ref_offset = inpitch - blkwidth;

    if (dx&1)
    {
        /* index, relative to p_ref in the loops below, of the integer
           sample to average with */
        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
        p_ref -= 2;
        r13 = 0;
        for (j = blkheight; j > 0; j--)
        {
            row_end = p_ref + blkwidth;
            r0 = p_ref[0];
            r1 = p_ref[2];
            r0 |= (r1 << 16);           /* 0,c,0,a */
            r1 = p_ref[1];
            r2 = p_ref[3];
            r1 |= (r2 << 16);           /* 0,d,0,b */
            while (p_ref < row_end)
            {
                r2 = *(p_ref += 4); /* move pointer to e */
                r3 = p_ref[2];
                r2 |= (r3 << 16);           /* 0,g,0,e */
                r3 = p_ref[1];
                r4 = p_ref[3];
                r3 |= (r4 << 16);           /* 0,h,0,f */

                r4 = r0 + r3;       /* c+h, a+f */
                r5 = r0 + r1;   /* c+d, a+b */
                r6 = r2 + r3;   /* g+h, e+f */
                r5 >>= 16;
                r5 |= (r6 << 16);   /* e+f, c+d */
                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
                r4 += 0x100010; /* +16, +16 */
                r5 = r1 + r2;       /* d+g, b+e */
                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
                r4 >>= 5;
                r13 |= r4;      /* check clipping */

                /* average pixels 0 and 2 with their integer samples */
                r5 = p_ref[dx+2];
                r6 = p_ref[dx+4];
                r5 |= (r6 << 16);
                r4 += r5;
                r4 += 0x10001;
                r4 = (r4 >> 1) & 0xFF00FF;

                r5 = p_ref[4];  /* i */
                r6 = (r5 << 16);
                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
                r5 += r1;       /* d+i, b+g */ /* r5 not free */
                r1 >>= 16;
                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
                r1 += r2;       /* f+g, d+e */
                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
                r0 >>= 16;
                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
                r0 += r3;       /* e+h, c+f */
                r5 += 0x100010; /* 16,16 */
                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
                r5 >>= 5;
                r13 |= r5;      /* check clipping */

                /* average pixels 1 and 3 with their integer samples */
                r0 = p_ref[dx+3];
                r1 = p_ref[dx+5];
                r0 |= (r1 << 16);
                r5 += r0;
                r5 += 0x10001;
                r5 = (r5 >> 1) & 0xFF00FF;

                r4 |= (r5 << 8);    /* pack them together */
                r4 = SWAP_BYTES(r4);    /* pixel 0 into the top byte */
                *p_cur++ = r4;
                r1 = r3;
                r0 = r2;
            }
            p_cur += curr_offset; /* move to the next line */
            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */

            if (r13&0xFF000700) /* need clipping */
            {
                /* move back to the beginning of the line */
                p_ref -= (ref_offset + blkwidth);   /* input */
                p_cur -= (outpitch >> 2);

                row_end = p_ref + blkwidth;
                while (p_ref < row_end)
                {

                    r0 = *p_ref++;
                    r1 = *p_ref++;
                    r2 = *p_ref++;
                    r3 = *p_ref++;
                    r4 = *p_ref++;
                    /* first pixel */
                    r5 = *p_ref++;
                    result = (r0 + r5);
                    r0 = (r1 + r4);
                    result -= (r0 * 5);
                    r0 = (r2 + r3);
                    result += (r0 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    /* 3/4 pel,  no need to clip */
                    result = (result + p_ref[dx] + 1);
                    pkres = (result >> 1) ;
                    /* second pixel */
                    r0 = *p_ref++;
                    result = (r1 + r0);
                    r1 = (r2 + r5);
                    result -= (r1 * 5);
                    r1 = (r3 + r4);
                    result += (r1 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    /* 3/4 pel,  no need to clip */
                    result = (result + p_ref[dx] + 1);
                    result = (result >> 1);
                    pkres  |= (result << 8);
                    /* third pixel */
                    r1 = *p_ref++;
                    result = (r2 + r1);
                    r2 = (r3 + r0);
                    result -= (r2 * 5);
                    r2 = (r4 + r5);
                    result += (r2 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    /* 3/4 pel,  no need to clip */
                    result = (result + p_ref[dx] + 1);
                    result = (result >> 1);
                    pkres  |= (result << 16);
                    /* fourth pixel */
                    r2 = *p_ref++;
                    result = (r3 + r2);
                    r3 = (r4 + r1);
                    result -= (r3 * 5);
                    r3 = (r5 + r0);
                    result += (r3 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    /* 3/4 pel,  no need to clip */
                    result = (result + p_ref[dx] + 1);
                    result = (result >> 1);
                    pkres  |= ((uint32)result << 24);   /* unsigned: may set the top bit */
                    pkres = SWAP_BYTES(pkres);    /* pixel 0 into the top byte */
                    *p_cur++ = pkres; /* write 4 pixels */
                    p_ref -= 5;  /* offset back to the middle of filter */
                }
                p_cur += curr_offset;  /* move to the next line */
                p_ref += ref_offset;    /* move to the next line */
            }
        }
    }
    else
    {
        p_ref -= 2;
        r13 = 0;
        for (j = blkheight; j > 0; j--)
        {
            row_end = p_ref + blkwidth;
            r0 = p_ref[0];
            r1 = p_ref[2];
            r0 |= (r1 << 16);           /* 0,c,0,a */
            r1 = p_ref[1];
            r2 = p_ref[3];
            r1 |= (r2 << 16);           /* 0,d,0,b */
            while (p_ref < row_end)
            {
                r2 = *(p_ref += 4); /* move pointer to e */
                r3 = p_ref[2];
                r2 |= (r3 << 16);           /* 0,g,0,e */
                r3 = p_ref[1];
                r4 = p_ref[3];
                r3 |= (r4 << 16);           /* 0,h,0,f */

                r4 = r0 + r3;       /* c+h, a+f */
                r5 = r0 + r1;   /* c+d, a+b */
                r6 = r2 + r3;   /* g+h, e+f */
                r5 >>= 16;
                r5 |= (r6 << 16);   /* e+f, c+d */
                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
                r4 += 0x100010; /* +16, +16 */
                r5 = r1 + r2;       /* d+g, b+e */
                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
                r4 >>= 5;
                r13 |= r4;      /* check clipping */
                r4 &= 0xFF00FF; /* mask */

                r5 = p_ref[4];  /* i */
                r6 = (r5 << 16);
                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
                r5 += r1;       /* d+i, b+g */ /* r5 not free */
                r1 >>= 16;
                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
                r1 += r2;       /* f+g, d+e */
                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
                r0 >>= 16;
                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
                r0 += r3;       /* e+h, c+f */
                r5 += 0x100010; /* 16,16 */
                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
                r5 >>= 5;
                r13 |= r5;      /* check clipping */
                r5 &= 0xFF00FF; /* mask */

                r4 |= (r5 << 8);    /* pack them together */
                r4 = SWAP_BYTES(r4);    /* pixel 0 into the top byte */
                *p_cur++ = r4;
                r1 = r3;
                r0 = r2;
            }
            p_cur += curr_offset; /* move to the next line */
            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */

            if (r13&0xFF000700) /* need clipping */
            {
                /* move back to the beginning of the line */
                p_ref -= (ref_offset + blkwidth);   /* input */
                p_cur -= (outpitch >> 2);

                row_end = p_ref + blkwidth;
                while (p_ref < row_end)
                {

                    r0 = *p_ref++;
                    r1 = *p_ref++;
                    r2 = *p_ref++;
                    r3 = *p_ref++;
                    r4 = *p_ref++;
                    /* first pixel */
                    r5 = *p_ref++;
                    result = (r0 + r5);
                    r0 = (r1 + r4);
                    result -= (r0 * 5);
                    r0 = (r2 + r3);
                    result += (r0 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    pkres  = result;
                    /* second pixel */
                    r0 = *p_ref++;
                    result = (r1 + r0);
                    r1 = (r2 + r5);
                    result -= (r1 * 5);
                    r1 = (r3 + r4);
                    result += (r1 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    pkres  |= (result << 8);
                    /* third pixel */
                    r1 = *p_ref++;
                    result = (r2 + r1);
                    r2 = (r3 + r0);
                    result -= (r2 * 5);
                    r2 = (r4 + r5);
                    result += (r2 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    pkres  |= (result << 16);
                    /* fourth pixel */
                    r2 = *p_ref++;
                    result = (r3 + r2);
                    r3 = (r4 + r1);
                    result -= (r3 * 5);
                    r3 = (r5 + r0);
                    result += (r3 * 20);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    pkres  |= ((uint32)result << 24);   /* unsigned: may set the top bit */
                    pkres = SWAP_BYTES(pkres);    /* pixel 0 into the top byte */
                    *p_cur++ = pkres;   /* write 4 pixels */
                    p_ref -= 5;  /* offset back to the middle of filter */
                }
                p_cur += curr_offset; /* move to the next line */
                p_ref += ref_offset;
            }
        }
    }

    return ;
}

/*
 * Horizontal 6-tap interpolation from int samples to clipped 8-bit pixels.
 *
 * For every output position x the filter
 *     in[x-2] - 5*in[x-1] + 20*in[x] + 20*in[x+1] - 5*in[x+2] + in[x+3]
 * is evaluated, rounded with (+512) >> 10 and clipped to 0..255.
 * When dx is odd the value is then averaged (rounding up) with
 * clip((s + 16) >> 5), where s is in[x] if (dx >> 1) == 0 and in[x+1]
 * otherwise.
 *
 * in, inpitch    source samples and row stride, in ints.  Two samples to
 *                the left and three to the right of each row are read.
 * out, outpitch  destination pixels and row stride in bytes.  Pixels are
 *                stored four at a time through a uint32 pointer, so out is
 *                assumed to be word aligned.
 * blkwidth, blkheight
 *                block size in pixels; each row is produced in groups of 4.
 * dx             horizontal fractional position selector (see above).
 *
 * NOTE(review): the >>10 / >>5 scaling suggests `in` holds the unnormalised
 * output of an earlier 6-tap pass -- confirm against the callers.
 */
void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
                   int blkwidth, int blkheight, int dx)
{
    int *p_ref = in;
    int *row_end;
    uint32 *p_cur = (uint32*)out; /* assume it's word aligned */
    uint32 pkres;
    int curr_offset = (outpitch - blkwidth) >> 2; /* row padding, in words */
    int ref_offset = inpitch - blkwidth;
    int quarter_pel = dx & 1;
    int avg_idx = (dx >> 1) ? 1 : 0; /* sample averaged with, relative to x */
    int result, result2;
    int j, k;
    int t0, t1, t2, t3, t4, t5;

    for (j = blkheight; j > 0; j--)
    {
        /* Compare pointers directly: casting them to uint32 would truncate
         * the address on targets with pointers wider than 32 bits. */
        row_end = p_ref + blkwidth;
        while (p_ref < row_end)
        {
            /* Prime the sliding window with the five oldest taps. */
            t0 = p_ref[-2];
            t1 = p_ref[-1];
            t2 = p_ref[0];
            t3 = p_ref[1];
            t4 = p_ref[2];
            pkres = 0;

            for (k = 0; k < 4; k++)
            {
                t5 = p_ref[3];
                result = (t0 + t5) - 5 * (t1 + t4) + 20 * (t2 + t3);
                result = (result + 512) >> 10;
                CLIP_RESULT(result)

                if (quarter_pel)
                {
                    result2 = (p_ref[avg_idx] + 16) >> 5;
                    CLIP_RESULT(result2)
                    /* both operands are already in 0..255, no need to clip */
                    result = (result + result2 + 1) >> 1;
                }

                /* Pixel k goes into byte k, counted from the LSB.  Shift as
                 * unsigned so a value >= 128 never reaches the sign bit. */
                pkres |= ((uint32)result << (k << 3));

                t0 = t1;
                t1 = t2;
                t2 = t3;
                t3 = t4;
                t4 = t5;
                p_ref++;
            }

            /* Reverse the byte order before the word store (added for
             * endianness): pixel 0 ends up in the most significant byte. */
            pkres = SWAP_BYTES(pkres);
            *p_cur++ = pkres; /* write 4 pixels */
        }
        p_cur += curr_offset;  /* move to the next line */
        p_ref += ref_offset;   /* move to the next line */
    }
}

/*
 * Horizontal 6-tap filter from 8-bit pixels to unnormalised int samples.
 *
 * For every output position x this stores
 *     in[x-2] - 5*in[x-1] + 20*in[x] + 20*in[x+1] - 5*in[x+2] + in[x+3]
 * with no rounding, shifting or clipping.
 *
 * in, inpitch    source pixels and row stride in bytes.  Two pixels to the
 *                left and three to the right of each row are read.
 * out, outpitch  destination samples and row stride, in ints.
 * blkwidth, blkheight
 *                block size in pixels; each row is produced in groups of 4.
 */
void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
                   int blkwidth, int blkheight)
{
    uint8 *p_ref = in;
    uint8 *row_end;
    int   *p_cur = out;
    int curr_offset = outpitch - blkwidth;
    int ref_offset = inpitch - blkwidth;
    int j, k;
    int t0, t1, t2, t3, t4, t5;

    for (j = blkheight; j > 0; j--)
    {
        /* Compare pointers directly: casting them to uint32 would truncate
         * the address on targets with pointers wider than 32 bits. */
        row_end = p_ref + blkwidth;
        while (p_ref < row_end)
        {
            /* Prime the sliding window with the five oldest taps. */
            t0 = p_ref[-2];
            t1 = p_ref[-1];
            t2 = p_ref[0];
            t3 = p_ref[1];
            t4 = p_ref[2];

            for (k = 0; k < 4; k++)
            {
                t5 = p_ref[3];
                *p_cur++ = (t0 + t5) - 5 * (t1 + t4) + 20 * (t2 + t3);

                t0 = t1;
                t1 = t2;
                t2 = t3;
                t3 = t4;
                t4 = t5;
                p_ref++;
            }
        }
        p_cur += curr_offset; /* move to the next line */
        p_ref += ref_offset;
    }
}
/*
 * Vertical 6-tap interpolation from 8-bit pixels to clipped 8-bit pixels.
 *
 * For every output position (x, y) the filter
 *     in[y-2] - 5*in[y-1] + 20*in[y] + 20*in[y+1] - 5*in[y+2] + in[y+3]
 * (all taken in column x) is evaluated, rounded with (+16) >> 5 and clipped
 * to 0..255.  When dy is odd the value is then averaged (rounding up) with
 * the source pixel in row y if (dy >> 1) == 0, or in row y+1 otherwise.
 *
 * The block is processed in strips of four columns.  Each strip is first
 * computed with packed arithmetic: a 32-bit word of four pixels is split
 * into two words holding two 16-bit lanes each, so one add/multiply works
 * on two pixels.  That fast path does not clip; it only records (in r13)
 * whether any lane left the 0..255 range.  If so, the whole strip is
 * recomputed pixel by pixel with proper clipping.
 *
 * in, inpitch    source pixels and row stride in bytes.  Rows y-2 .. y+3
 *                are read.  If `in` is not 4-byte aligned the source is
 *                first passed through CreateAlign() into a local 24x24
 *                buffer (presumably an aligned copy including the extra
 *                rows -- see CreateAlign for the exact contract).
 * out, outpitch  destination pixels and row stride in bytes.  Written with
 *                32-bit stores, so out is assumed to be word aligned.
 * blkwidth, blkheight
 *                block size in pixels; columns are handled in groups of 4
 *                and rows in groups of 4 in the scalar fallback.
 * dy             vertical fractional position selector (see above).
 * video          forwarded unchanged to CreateAlign().
 *
 * The packed path relies on arithmetic right shift of negative int32
 * values, as the original implementation did.
 */
void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
                   int blkwidth, int blkheight, int dy , AVCCommonObj *video)
{
    uint8 *p_cur, *p_ref, *p_end;
    int result, col_rewind, ref_offset, quarter_pel;
    int j, i, k;
    int t0, t1, t2, t3, t4, t5;
    int32 r0, r1, r2, r6, r7, r8, r13;
    uint8  tmp_in[24][24];

    /* not word-aligned; test the low address bits without truncating the
     * pointer to 32 bits first */
    if (((size_t)in) & 0x3)
    {
        CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5,video);
        in = &tmp_in[2][0];
        inpitch = 24;
    }
    p_cur = out;
    col_rewind = outpitch * blkheight; /* distance from below a column back to its top */
    ref_offset = blkheight * inpitch;  /* for limit */

    quarter_pel = dy & 1;
    /* From here on dy is the offset, relative to row y, of the pixel the
     * filtered value is averaged with (only used when quarter_pel is set). */
    dy = (dy >> 1) ? inpitch : 0;

    for (j = 0; j < blkwidth; j += 4, in += 4)
    {
        r13 = 0;
        p_ref = in;
        p_end = p_ref + ref_offset; /* limit */
        while (p_ref < p_end)
        {
            /* outer taps, weight 1: rows y-2 and y+3 */
            r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
            r6 = (r0 >> 8) & 0xFF00FF; /* bytes at bits 8..15 and 24..31 */
            r0 &= 0xFF00FF;            /* bytes at bits 0..7 and 16..23 */

            r1 = *((uint32*)(p_ref + 3 * inpitch));
            r7 = (r1 >> 8) & 0xFF00FF;
            r1 &= 0xFF00FF;

            r0 += r1;
            r6 += r7;

            /* centre taps, weight 20: rows y+1 and y */
            r2 = *((uint32*)(p_ref + inpitch));
            r8 = (r2 >> 8) & 0xFF00FF;
            r2 &= 0xFF00FF;

            r1 = *((uint32*)p_ref);
            r7 = (r1 >> 8) & 0xFF00FF;
            r1 &= 0xFF00FF;
            r1 += r2;

            r7 += r8;

            r0 += 20 * r1;
            r6 += 20 * r7;
            r0 += 0x100010; /* +16 rounding term in both lanes */
            r6 += 0x100010;

            /* inner taps, weight -5: rows y-1 and y+2 */
            r2 = *((uint32*)(p_ref - inpitch));
            r8 = (r2 >> 8) & 0xFF00FF;
            r2 &= 0xFF00FF;

            r1 = *((uint32*)(p_ref + (inpitch << 1)));
            r7 = (r1 >> 8) & 0xFF00FF;
            r1 &= 0xFF00FF;
            r1 += r2;

            r7 += r8;

            r0 -= 5 * r1;
            r6 -= 5 * r7;

            r0 >>= 5;
            r6 >>= 5;
            /* remember every bit that was ever set; checked after the strip */
            r13 |= r6;
            r13 |= r0;

            if (quarter_pel)
            {
                /* average with the selected source row, rounding up */
                r1 = *((uint32*)(p_ref + dy));
                r2 = (r1 >> 8) & 0xFF00FF;
                r1 &= 0xFF00FF;
                r0 += r1;
                r6 += r2;
                r0 += 0x10001;
                r6 += 0x10001;
                r0 >>= 1;
                r6 >>= 1;
            }

            r0 &= 0xFF00FF;
            r6 &= 0xFF00FF;
            /* pack it back; shift as unsigned so bit 31 is never reached
             * through a signed left shift */
            *((uint32*)p_cur) = (uint32)r0 | ((uint32)r6 << 8);
            p_cur += outpitch;
            p_ref += inpitch;
        }
        p_cur += 4 - col_rewind; /* top of the next 4-column strip */

        if (r13 & 0xFF000700) /* this column need clipping */
        {
            p_cur -= 4; /* back to the strip just written */
            for (i = 0; i < 4; i++)
            {
                p_ref = in + i;
                p_end = p_ref + ref_offset; /* limit */
                while (p_ref < p_end)
                {
                    /* Prime the sliding window with the five oldest taps. */
                    t0 = *(p_ref - (inpitch << 1));
                    t1 = *(p_ref - inpitch);
                    t2 = *p_ref;
                    t3 = *(p_ref + inpitch);
                    t4 = *(p_ref + (inpitch << 1));

                    for (k = 0; k < 4; k++)
                    {
                        t5 = *(p_ref + 3 * inpitch);
                        result = (t0 + t5) - 5 * (t1 + t4) + 20 * (t2 + t3);
                        result = (result + 16) >> 5;
                        CLIP_RESULT(result)
                        if (quarter_pel)
                        {
                            /* both operands are in 0..255, no need to clip */
                            result = (result + p_ref[dy] + 1) >> 1;
                        }
                        *p_cur = result;
                        p_cur += outpitch;

                        t0 = t1;
                        t1 = t2;
                        t2 = t3;
                        t3 = t4;
                        t4 = t5;
                        p_ref += inpitch;
                    }
                }
                p_cur += 1 - col_rewind; /* top of the next column */
            }
        }
    }
}

/*
 * Vertical 6-tap filter from 8-bit pixels to unnormalised int samples.
 *
 * For every output position (x, y) this stores
 *     in[y-2] - 5*in[y-1] + 20*in[y] + 20*in[y+1] - 5*in[y+2] + in[y+3]
 * (all taken in column x) with no rounding, shifting or clipping.
 * The block is walked one column at a time, top to bottom.
 *
 * in, inpitch    source pixels and row stride in bytes.  Rows y-2 .. y+3
 *                are read.
 * out, outpitch  destination samples and row stride, in ints.
 * blkwidth, blkheight
 *                block size in pixels; each column is produced in groups
 *                of 4 rows.
 */
void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
                   int blkwidth, int blkheight)
{
    int *p_cur = out;
    uint8 *p_ref, *col_end;
    int col_rewind = outpitch * blkheight; /* distance from below a column back to its top */
    int ref_offset = blkheight * inpitch;  /* for limit */
    int j, k;
    int t0, t1, t2, t3, t4, t5;

    for (j = 0; j < blkwidth; j++)
    {
        p_ref = in++;
        /* Compare pointers directly: casting them to uint32 would truncate
         * the address on targets with pointers wider than 32 bits. */
        col_end = p_ref + ref_offset;
        while (p_ref < col_end)
        {
            /* Prime the sliding window with the five oldest taps. */
            t0 = *(p_ref - (inpitch << 1));
            t1 = *(p_ref - inpitch);
            t2 = *p_ref;
            t3 = *(p_ref + inpitch);
            t4 = *(p_ref + (inpitch << 1));

            for (k = 0; k < 4; k++)
            {
                t5 = *(p_ref + 3 * inpitch);
                *p_cur = (t0 + t5) - 5 * (t1 + t4) + 20 * (t2 + t3);
                p_cur += outpitch;

                t0 = t1;
                t1 = t2;
                t2 = t3;
                t3 = t4;
                t4 = t5;
                p_ref += inpitch;
            }
        }
        p_cur += 1 - col_rewind; /* back up and one sample to the right */
    }
}

/*
 * Vertical 6-tap interpolation from int samples to clipped 8-bit pixels.
 *
 * For every output position (x, y) the filter
 *     in[y-2] - 5*in[y-1] + 20*in[y] + 20*in[y+1] - 5*in[y+2] + in[y+3]
 * (all taken in column x) is evaluated, rounded with (+512) >> 10 and
 * clipped to 0..255.  When dy is odd the value is then averaged (rounding
 * up) with clip((s + 16) >> 5), where s is the sample in row y if
 * (dy >> 1) == 0 and in row y+1 otherwise.
 * The block is walked one column at a time, top to bottom.
 *
 * in, inpitch    source samples and row stride, in ints.  Rows y-2 .. y+3
 *                are read.
 * out, outpitch  destination pixels and row stride in bytes.
 * blkwidth, blkheight
 *                block size in pixels; each column is produced in groups
 *                of 4 rows.
 * dy             vertical fractional position selector (see above).
 *
 * NOTE(review): the >>10 / >>5 scaling suggests `in` holds the unnormalised
 * output of an earlier 6-tap pass -- confirm against the callers.
 */
void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
                   int blkwidth, int blkheight, int dy)
{
    uint8 *p_cur = out;
    int *p_ref, *col_end;
    int col_rewind = outpitch * blkheight; /* distance from below a column back to its top */
    int ref_offset = blkheight * inpitch;  /* for limit */
    int quarter_pel = dy & 1;
    int avg_off = (dy >> 1) ? inpitch : 0; /* sample averaged with, relative to row y */
    int result, result2;
    int j, k;
    int t0, t1, t2, t3, t4, t5;

    for (j = 0; j < blkwidth; j++)
    {
        p_ref = in++;
        /* Compare pointers directly: casting them to uint32 would truncate
         * the address on targets with pointers wider than 32 bits. */
        col_end = p_ref + ref_offset;
        while (p_ref < col_end)
        {
            /* Prime the sliding window with the five oldest taps. */
            t0 = *(p_ref - (inpitch << 1));
            t1 = *(p_ref - inpitch);
            t2 = *p_ref;
            t3 = *(p_ref + inpitch);
            t4 = *(p_ref + (inpitch << 1));

            for (k = 0; k < 4; k++)
            {
                t5 = *(p_ref + 3 * inpitch);
                result = (t0 + t5) - 5 * (t1 + t4) + 20 * (t2 + t3);
                result = (result + 512) >> 10;
                CLIP_RESULT(result)

                if (quarter_pel)
                {
                    result2 = (p_ref[avg_off] + 16) >> 5;
                    CLIP_RESULT(result2)
                    /* both operands are already in 0..255, no need to clip */
                    result = (result + result2 + 1) >> 1;
                }

                *p_cur = result;
                p_cur += outpitch;

                t0 = t1;
                t1 = t2;
                t2 = t3;
                t3 = t4;
                t4 = t5;
                p_ref += inpitch;
            }
        }
        p_cur += 1 - col_rewind; /* back up and one pixel to the right */
    }
}

/*
 * DiagonalInterpMC
 *
 * Builds a blkwidth x blkheight prediction block as the rounded average of two
 * 6-tap (1,-5,20,20,-5,1) interpolations:
 *   stage 1 - a horizontal filter over the rows that start at in1 (reads begin
 *             at in1 - 2); results are stored one byte per pixel in tmp_res,
 *             whose row pitch is 24 bytes;
 *   stage 2 - a vertical filter over the columns that start at in2 (rows
 *             in2 - 2*inpitch .. in2 + 3*inpitch are read for the first output
 *             row); each result is combined with the matching tmp_res byte as
 *             (v + h + 1) >> 1 and written to out.
 *
 * Each stage first runs a packed path that keeps two pixels per 32-bit
 * register (one per 16-bit lane) and ORs the unmasked results into r13.  When
 * r13 has any of the bits 0xFF000700 set, the affected row (stage 1) or
 * 4-pixel-wide column group (stage 2) is recomputed pixel by pixel with
 * CLIP_RESULT.
 *
 * in1, in2     source pointers for the horizontal / vertical filter
 * inpitch      source row pitch in bytes
 * out          destination block
 * outpitch     destination row pitch in bytes
 * blkwidth     block width;  the loops advance 4 pixels at a time
 * blkheight    block height; the scalar vertical path advances 4 rows at a time
 * video        only forwarded to CreateAlign
 *
 * NOTE(review): every 32-bit load/store is wrapped in SWAP_BYTES ("for
 * endian"); this looks like a big-endian host adaptation - confirm.
 * NOTE(review): pointers are compared after a cast to uint32, which is only
 * lossless when pointers are 32 bits wide.
 * NOTE(review): tmp_res/tmp_in are 24x24, so blkwidth and blkheight + 5 are
 * presumably expected to stay within 24 - confirm against callers.
 */
void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
                      uint8 *out, int outpitch,
                      int blkwidth, int blkheight , AVCCommonObj *video)
{
    int j, i;
    int result;
    uint8 *p_cur, *p_ref, *p_tmp8;
    int curr_offset, ref_offset;
    uint8 tmp_res[24][24], tmp_in[24][24];
    uint32 *p_tmp;
    uint32 tmp, pkres, tmp_result;
    int32 r0, r1, r2, r3, r4, r5;
    int32 r6, r7, r8, r9, r10, r13;

    /* clear both scratch buffers before use */
    for(i=0;i<24;i++)
        for(j=0;j<24;j++){
            tmp_res[i][j] = 0;
            tmp_in[i][j] = 0;
        }



    ref_offset = inpitch - blkwidth;    /* step from the end of one source row to the start of the next */
    p_ref = in1 - 2;                    /* the 6-tap window starts two samples to the left */



    /* perform horizontal interpolation */
    /* not word-aligned */
    /* It is faster to read 1 byte at time to avoid calling CreateAlign */
    /*  if(((uint32)p_ref)&0x3)
        {
            CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
            p_ref = &tmp_in[0][0];
            ref_offset = 24-blkwidth;
        }*/

    p_tmp = (uint32*) & (tmp_res[0][0]);    /* horizontal results are written 4 pixels (one word) at a time */


    for (j = blkheight; j > 0; j--)
    {
        r13 = 0;                            /* accumulates unmasked results of this row for the clip test */
        tmp = (uint32)(p_ref + blkwidth);   /* end-of-row limit for p_ref */

        //r0 = *((uint32*)p_ref);   /* d,c,b,a */
        //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
        //r0 &= 0xFF00FF;           /* 0,c,0,a */
        /* It is faster to read 1 byte at a time,  */
        r0 = p_ref[0];
        r1 = p_ref[2];
        r0 |= (r1 << 16);           /* 0,c,0,a */
        r1 = p_ref[1];
        r2 = p_ref[3];
        r1 |= (r2 << 16);           /* 0,d,0,b */

        while ((uint32)p_ref < tmp)
        {
            //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
            //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
            //r2 &= 0xFF00FF;           /* 0,g,0,e */
            /* It is faster to read 1 byte at a time,  */
            r2 = *(p_ref += 4);
            r3 = p_ref[2];
            r2 |= (r3 << 16);           /* 0,g,0,e */
            r3 = p_ref[1];
            r4 = p_ref[3];
            r3 |= (r4 << 16);           /* 0,h,0,f */

            r4 = r0 + r3;       /* c+h, a+f */
            r5 = r0 + r1;   /* c+d, a+b */
            r6 = r2 + r3;   /* g+h, e+f */
            r5 >>= 16;
            r5 |= (r6 << 16);   /* e+f, c+d */
            r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
            r4 += 0x100010; /* +16, +16 */
            r5 = r1 + r2;       /* d+g, b+e */
            r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
            r4 >>= 5;
            r13 |= r4;      /* check clipping */
            r4 &= 0xFF00FF; /* mask */

            r5 = p_ref[4];  /* i */
            r6 = (r5 << 16);
            r5 = r6 | (r2 >> 16);/* 0,i,0,g */
            r5 += r1;       /* d+i, b+g */ /* r5 not free */
            r1 >>= 16;
            r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
            r1 += r2;       /* f+g, d+e */
            r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
            r0 >>= 16;
            r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
            r0 += r3;       /* e+h, c+f */
            r5 += 0x100010; /* 16,16 */
            r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
            r5 >>= 5;
            r13 |= r5;      /* check clipping */
            r5 &= 0xFF00FF; /* mask */

            r4 |= (r5 << 8);    /* pack them together */
            r4 = SWAP_BYTES(r4);    // for endian test
            *p_tmp++ = r4;

            /* the last four samples read become the first four of the next group */
            r1 = r3;
            r0 = r2;

        }
        p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
        p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */

        /* bits 8-10 and 24-31 are above the low 8 bits of the two lanes */
        if (r13&0xFF000700) /* need clipping */
        {

            /* move back to the beginning of the line */
            p_ref -= (ref_offset + blkwidth);   /* input */
            p_tmp -= 6; /* intermediate output */   /* 6 words = one 24-byte tmp_res row */

            tmp = (uint32)(p_ref + blkwidth);

            /* scalar redo of the row: 4 output pixels per iteration, each clipped */
            while ((uint32)p_ref < tmp)
            {
                r0 = *p_ref++;
                r1 = *p_ref++;
                r2 = *p_ref++;
                r3 = *p_ref++;
                r4 = *p_ref++;
                /* first pixel */
                r5 = *p_ref++;
                result = (r0 + r5);
                r0 = (r1 + r4);
                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
                r0 = (r2 + r3);
                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
                result = (result + 16) >> 5;
                CLIP_RESULT(result)
                pkres = result;
                /* second pixel */
                r0 = *p_ref++;
                result = (r1 + r0);
                r1 = (r2 + r5);
                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
                r1 = (r3 + r4);
                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
                result = (result + 16) >> 5;
                CLIP_RESULT(result)
                pkres |= (result << 8);
                /* third pixel */
                r1 = *p_ref++;
                result = (r2 + r1);
                r2 = (r3 + r0);
                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
                r2 = (r4 + r5);
                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
                result = (result + 16) >> 5;
                CLIP_RESULT(result)
                pkres |= (result << 16);
                /* fourth pixel */
                r2 = *p_ref++;
                result = (r3 + r2);
                r3 = (r4 + r1);
                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
                r3 = (r5 + r0);
                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
                result = (result + 16) >> 5;
                CLIP_RESULT(result)
                pkres |= (result << 24);

                pkres = SWAP_BYTES(pkres);
                *p_tmp++ = pkres; /* write 4 pixel */
                p_ref -= 5;     /* 9 samples were read; net advance is 4 */
            }
            p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */

        }
    }



    /*  perform vertical interpolation */
    /* not word-aligned */
    if (((uint32)in2)&0x3)
    {

        /* NOTE(review): presumably copies blkheight + 5 rows starting two rows
           above in2 into tmp_in so that the word loads below are aligned - confirm
           against CreateAlign */
        CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5,video);
        in2 = &tmp_in[2][0];
        inpitch = 24;

    }


    p_cur = out;
    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
    pkres = blkheight * inpitch; /* reuse it for limit */

    curr_offset += 3;   /* the packed path moves 4 pixels right per column group */

    for (j = 0; j < blkwidth; j += 4, in2 += 4)
    {
        r13 = 0;        /* accumulates unmasked vertical results of this column group */
        p_ref = in2;
        p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
        p_tmp8 -= 24;  /* compensate for the first offset */
        p_cur -= outpitch;  /* compensate for the first offset */
        tmp = (uint32)(p_ref + pkres); /* limit */

        /* one output row (4 pixels) per iteration */
        while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
        {
            /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign,  */
            /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
            r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
            r6 |= (r1<<16);         p_ref+=inpitch; */
            r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
            r0 = SWAP_BYTES(r0);	// for endian


            p_ref += inpitch;
            r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
            r0 &= 0xFF00FF;

            /*p_ref8 = p_ref+(inpitch<<1);
            r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
            r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
            r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
            r1 = SWAP_BYTES(r1);	//for endian


            r7 = (r1 >> 8) & 0xFF00FF;
            r1 &= 0xFF00FF;

            /* outer taps (weight 1) */
            r0 += r1;
            r6 += r7;

            /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
            r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
            r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
            r2 = SWAP_BYTES(r2);	//for endian


            r8 = (r2 >> 8) & 0xFF00FF;
            r2 &= 0xFF00FF;

            /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
            r2 = p_ref8[3];         r7 |= (r2<<16);*/
            r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
            r1 = SWAP_BYTES(r1);	// for endian


            r7 = (r1 >> 8) & 0xFF00FF;
            r1 &= 0xFF00FF;
            r1 += r2;

            r7 += r8;

            /* centre taps (weight 20) plus the rounding term 16 per lane */
            r0 += 20 * r1;
            r6 += 20 * r7;
            r0 += 0x100010;
            r6 += 0x100010;

            /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
            r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
            r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
            r2 = SWAP_BYTES(r2);	//for endian

            r8 = (r2 >> 8) & 0xFF00FF;
            r2 &= 0xFF00FF;

            /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
            r2 = p_ref8[3];         r7 |= (r2<<16);*/
            r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
            r1 = SWAP_BYTES(r1);	// for endian

            r7 = (r1 >> 8) & 0xFF00FF;

            r1 &= 0xFF00FF;
            r1 += r2;

            r7 += r8;

            /* inner taps (weight -5) */
            r0 -= 5 * r1;
            r6 -= 5 * r7;

            r0 >>= 5;
            r6 >>= 5;
            /* clip */
            r13 |= r6;
            r13 |= r0;
            //CLIPPACK(r6,result)
            /* add with horizontal results */
            r10 = *((uint32*)(p_tmp8 += 24));
            r10 = SWAP_BYTES(r10);	// for endian


            r9 = (r10 >> 8) & 0xFF00FF;
            r10 &= 0xFF00FF;

            /* rounded average of vertical and horizontal results, per lane */
            r0 += r10;
            r0 += 0x10001;
            r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */

            r6 += r9;
            r6 += 0x10001;
            r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */

            r0 |= (r6 << 8);  /* pack it back */

            r0 = SWAP_BYTES(r0);	// for endian
            *((uint32*)(p_cur += outpitch)) = r0;
        }

        p_cur += curr_offset; /* offset to the next pixel */
        if (r13 & 0xFF000700) /* this column need clipping */
        {
            p_cur -= 4;     /* back to the first pixel of this column group */

            /* redo the four columns one at a time with per-pixel clipping */
            for (i = 0; i < 4; i++)
            {
                p_ref = in2 + i;
                p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
                p_tmp8 -= 24;  /* compensate for the first offset */
                p_cur -= outpitch;  /* compensate for the first offset */
                tmp = (uint32)(p_ref + pkres); /* limit */
                /* four output rows per iteration */
                while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
                {
                    r0 = *(p_ref - (inpitch << 1));
                    r1 = *(p_ref - inpitch);
                    r2 = *p_ref;
                    r3 = *(p_ref += inpitch);  /* modify pointer before loading */
                    r4 = *(p_ref += inpitch);
                    /* first pixel */
                    r5 = *(p_ref += inpitch);
                    result = (r0 + r5);
                    r0 = (r1 + r4);
                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
                    r0 = (r2 + r3);
                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
                    result = (result + tmp_result + 1);  /* no clip */
                    result = (result >> 1);
                    *(p_cur += outpitch) = result;
                    /* second pixel */
                    r0 = *(p_ref += inpitch);
                    result = (r1 + r0);
                    r1 = (r2 + r5);
                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
                    r1 = (r3 + r4);
                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
                    result = (result + tmp_result + 1);  /* no clip */
                    result = (result >> 1);
                    *(p_cur += outpitch) = result;
                    /* third pixel */
                    r1 = *(p_ref += inpitch);
                    result = (r2 + r1);
                    r2 = (r3 + r0);
                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
                    r2 = (r4 + r5);
                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
                    result = (result + tmp_result + 1);  /* no clip */
                    result = (result >> 1);
                    *(p_cur += outpitch) = result;
                    /* fourth pixel */
                    r2 = *(p_ref += inpitch);
                    result = (r3 + r2);
                    r3 = (r4 + r1);
                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
                    r3 = (r5 + r0);
                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
                    result = (result + 16) >> 5;
                    CLIP_RESULT(result)
                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
                    result = (result + tmp_result + 1);  /* no clip */
                    result = (result >> 1);
                    *(p_cur += outpitch) = result;
                    p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
                }
                p_cur += (curr_offset - 3);     /* back to the top row, one pixel to the right */
            }

        }
    }



    return ;
}

/* position G */
/*
 * FullPelMC
 *
 * Integer-position motion compensation: copies a blkwidth x blkheight block
 * from the reference (in) to the prediction buffer (out) without filtering.
 *
 * The copy is done byte by byte, so the result does not depend on the byte
 * order of the host or on the alignment of in or out.  Each row is still
 * walked in steps of 4 bytes, exactly like the former word-based loops, so
 * blkwidth is expected to be a multiple of 4.
 *
 * in         first sample of the source block
 * inpitch    source row pitch in bytes
 * out        first sample of the destination block
 * outpitch   destination row pitch in bytes
 * blkwidth   block width in pixels
 * blkheight  block height in pixels
 * video      not used by this function
 */
void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
               int blkwidth, int blkheight,AVCCommonObj *video)
{
    int i, j;
    int offset_in = inpitch - blkwidth;     /* end of one source row -> start of the next */
    int offset_out = outpitch - blkwidth;   /* same for the destination */
    uint8 b0, b1, b2, b3;

    for (j = blkheight; j > 0; j--)
    {
        for (i = blkwidth; i > 0; i -= 4)
        {
            /* read all four samples before writing so the group can be
               combined into single accesses by the compiler where legal */
            b0 = in[0];
            b1 = in[1];
            b2 = in[2];
            b3 = in[3];

            out[0] = b0;
            out[1] = b1;
            out[2] = b2;
            out[3] = b3;

            in += 4;
            out += 4;
        }
        out += offset_out;
        in += offset_in;
    }

    return ;
}

/*
 * ChromaMotionComp
 *
 * Chroma motion compensation entry point.  x_pos / y_pos carry a 3-bit
 * fraction in their low bits; the integer part selects the reference block
 * and the fraction selects the interpolation routine from ChromaMC_SIMD.
 *
 * When the block (plus one extra column/row if the matching fraction is
 * non-zero) does not lie completely inside the picwidth x picheight plane,
 * CreatePad is asked to build the source block in a local 24-byte-pitch
 * buffer and the routine reads from there instead.
 */
void ChromaMotionComp(uint8 *ref, int picwidth, int picheight,
                      int x_pos, int y_pos,
                      uint8 *pred, int pred_pitch,
                      int blkwidth, int blkheight,AVCCommonObj* video)
{
    uint8 padded[24][24];
    int frac_x = x_pos & 7;
    int frac_y = y_pos & 7;
    int extra_col = (frac_x + 7) >> 3;      /* 1 when frac_x != 0, else 0 */
    int extra_row = (frac_y + 7) >> 3;      /* 1 when frac_y != 0, else 0 */
    int inside;
    int selector;

    /* drop the fraction: full-pel position */
    x_pos >>= 3;
    y_pos >>= 3;

    inside = (x_pos >= 0) && (x_pos + blkwidth + extra_col <= picwidth) &&
             (y_pos >= 0) && (y_pos + blkheight + extra_row <= picheight);

    if (!inside)
    {
        CreatePad(ref, picwidth, picheight, x_pos, y_pos, &padded[0][0],
                  blkwidth + extra_col, blkheight + extra_row);
        ref = &padded[0][0];
        picwidth = 24;      /* pitch of the local buffer */
    }
    else
    {
        ref += y_pos * picwidth + x_pos;
    }

    /* bit 0: horizontal fraction present, bit 1: vertical fraction present,
       bit 2: set only for blkwidth == 2 (the "MC2" variants) */
    selector = ((blkwidth << 1) & 0x7) + (extra_row << 1) + extra_col;

    (*(ChromaMC_SIMD[selector]))(ref, picwidth , frac_x, frac_y, pred, pred_pitch, blkwidth, blkheight,video);
    return ;
}


/* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done)  */
/*
 * ChromaDiagonalMC_SIMD
 *
 * Bilinear chroma interpolation with both a horizontal (dx) and a vertical
 * (dy) fraction, for blocks whose width is a multiple of 4.
 *
 * Pass 1 filters blkheight + 1 source rows horizontally with weights
 * (8 - dx, dx) and keeps the unrounded sums in a local buffer, two pixels per
 * 32-bit word (one per 16-bit lane).  Each buffer row holds 8 words: words
 * 0..3 carry the even pixel pairs (p0,p2 / p4,p6 ...), words 4..7 the odd
 * pairs (p1,p3 / p5,p7 ...).
 * Pass 2 blends vertically adjacent buffer rows with weights (8 - dy, dy),
 * adds 32, shifts right by 6 and writes the four resulting pixels.
 *
 * The intermediate buffer is an int32 array, so the word accesses are
 * aligned, and the output pixels are stored as individual bytes, so the
 * result does not depend on host byte order or on the alignment of pOut.
 *
 * pRef       top-left source sample; blkwidth + 1 columns and blkheight + 1
 *            rows are read
 * srcPitch   source row pitch in bytes
 * dx, dy     fractional offsets (the caller passes x_pos & 7, y_pos & 7)
 * pOut       destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   block width; the buffer layout allows at most 16
 * blkheight  block height; the buffer holds 9 rows, so at most 8
 * video      not used by this function
 */
void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                           uint8 *pOut, int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int32 r0, r1, r2, r3, result0, result1;
    int32 temp[72];         /* 9 rows x 8 words (same 288 bytes as before, now word-aligned) */
    int32 *row;
    uint8 *ref, *out;
    int i, j;
    int dx_8 = 8 - dx;
    int dy_8 = 8 - dy;

    /* horizontal first */
    row = temp;
    for (i = 0; i < blkheight + 1; i++)
    {
        ref = pRef;
        r0 = ref[0];
        for (j = 0; j < blkwidth; j += 4)
        {
            r0 |= (ref[2] << 16);               /* 0,p2,0,p0 */
            r1 = ref[1] | (ref[3] << 16);       /* 0,p3,0,p1 */
            row[j >> 2] = dx_8 * r0 + dx * r1;  /* even outputs: columns j, j+2 */

            r2 = ref[4];
            r0 = (r0 >> 16) | (r2 << 16);       /* 0,p4,0,p2 */
            row[4 + (j >> 2)] = dx_8 * r1 + dx * r0;    /* odd outputs: columns j+1, j+3 */

            ref += 4;
            r0 = r2;                            /* p4 is p0 of the next group */
        }
        pRef += srcPitch;
        row += 8;
    }

    /* then vertical, one 4-pixel-wide column group at a time */
    for (j = 0; j < blkwidth; j += 4)
    {
        row = temp + (j >> 2);
        r0 = row[0];        /* even pair of the upper row */
        r1 = row[4];        /* odd pair of the upper row */
        out = pOut;
        for (i = 0; i < blkheight; i++)
        {
            row += 8;       /* lower row */
            r2 = row[0];
            r3 = row[4];

            result0 = dy_8 * r0 + 0x00200020;
            result0 += dy * r2;
            result0 >>= 6;
            result0 &= 0x00FF00FF;      /* pixel j in bits 0-7, pixel j+2 in bits 16-23 */

            result1 = dy_8 * r1 + 0x00200020;
            result1 += dy * r3;
            result1 >>= 6;
            result1 &= 0x00FF00FF;      /* pixel j+1 in bits 0-7, pixel j+3 in bits 16-23 */

            out[0] = (uint8)result0;
            out[1] = (uint8)result1;
            out[2] = (uint8)(result0 >> 16);
            out[3] = (uint8)(result1 >> 16);

            /* the lower row becomes the upper row of the next output line */
            r0 = r2;
            r1 = r3;
            out += predPitch;
        }
        pOut += 4;
    }


    return;
}

/*
 * ChromaHorizontalMC_SIMD
 *
 * Bilinear chroma interpolation with a horizontal fraction only, for blocks
 * whose width is a multiple of 4.  Each output pixel is
 * ((8 - dx) * left + dx * right + 4) >> 3, computed for two pixels at a time
 * in the 16-bit lanes of a 32-bit register.
 *
 * Output pixels are stored as individual bytes, so the result does not depend
 * on host byte order or on the alignment of pOut.
 *
 * pRef       top-left source sample; blkwidth + 1 columns are read per row
 * srcPitch   source row pitch in bytes
 * dx         horizontal fractional offset (the caller passes x_pos & 7)
 * dy         not used by this function
 * pOut       destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   block width in pixels
 * blkheight  block height in pixels
 * video      not used by this function
 */
void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                             uint8 *pOut, int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int32 r0, r1, r2, result0, result1;
    uint8 *ref, *out;
    int i, j;
    int dx_8 = 8 - dx;

    for (i = 0; i < blkheight; i++)
    {
        ref = pRef;
        out = pOut;

        r0 = ref[0];
        for (j = 0; j < blkwidth; j += 4)
        {
            r0 |= (ref[2] << 16);               /* 0,p2,0,p0 */
            r1 = ref[1] | (ref[3] << 16);       /* 0,p3,0,p1 */

            result0 = dx_8 * r0 + 0x00040004;   /* +4 per lane for rounding */
            result0 += dx * r1;
            result0 >>= 3;
            result0 &= 0x00FF00FF;              /* pixel j in bits 0-7, pixel j+2 in bits 16-23 */

            result1 = dx_8 * r1 + 0x00040004;

            r2 = ref[4];
            r1 = (r0 >> 16) | (r2 << 16);       /* 0,p4,0,p2 */
            result1 += dx * r1;
            result1 >>= 3;
            result1 &= 0x00FF00FF;              /* pixel j+1 in bits 0-7, pixel j+3 in bits 16-23 */

            out[0] = (uint8)result0;
            out[1] = (uint8)result1;
            out[2] = (uint8)(result0 >> 16);
            out[3] = (uint8)(result1 >> 16);

            ref += 4;
            out += 4;
            r0 = r2;                            /* p4 is p0 of the next group */
        }

        pRef += srcPitch;
        pOut += predPitch;
    }


    return;
}

/*
 * ChromaVerticalMC_SIMD
 *
 * Bilinear chroma interpolation with a vertical fraction only, for blocks
 * whose width is a multiple of 4.  Each output pixel is
 * ((8 - dy) * upper + dy * lower + 4) >> 3, computed for two pixels at a time
 * in the 16-bit lanes of a 32-bit register.  The block is processed in
 * 4-pixel-wide column groups, top to bottom.
 *
 * Output pixels are stored as individual bytes, so the result does not depend
 * on host byte order or on the alignment of pOut.
 *
 * pRef       top-left source sample; blkheight + 1 rows are read
 * srcPitch   source row pitch in bytes
 * dx         not used by this function
 * dy         vertical fractional offset (the caller passes y_pos & 7)
 * pOut       destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   block width in pixels
 * blkheight  block height in pixels
 * video      not used by this function
 */
void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                           uint8 *pOut, int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int32 r0, r1, r2, r3, result0, result1;
    int i, j;
    uint8 *ref, *out;
    int dy_8 = 8 - dy;

    for (i = 0; i < blkwidth; i += 4)
    {
        ref = pRef;
        out = pOut;

        /* upper row: even pixels in r0, odd pixels in r1 */
        r0 = ref[0] | (ref[2] << 16);
        r1 = ref[1] | (ref[3] << 16);
        ref += srcPitch;
        for (j = 0; j < blkheight; j++)
        {
            result0 = dy_8 * r0 + 0x00040004;   /* +4 per lane for rounding */
            r2 = ref[0] | (ref[2] << 16);       /* lower row, even pixels */
            result0 += dy * r2;
            result0 >>= 3;
            result0 &= 0x00FF00FF;              /* pixel i in bits 0-7, pixel i+2 in bits 16-23 */
            r0 = r2;

            result1 = dy_8 * r1 + 0x00040004;
            r3 = ref[1] | (ref[3] << 16);       /* lower row, odd pixels */
            result1 += dy * r3;
            result1 >>= 3;
            result1 &= 0x00FF00FF;              /* pixel i+1 in bits 0-7, pixel i+3 in bits 16-23 */
            r1 = r3;

            out[0] = (uint8)result0;
            out[1] = (uint8)result1;
            out[2] = (uint8)(result0 >> 16);
            out[3] = (uint8)(result1 >> 16);

            ref += srcPitch;
            out += predPitch;
        }
        pOut += 4;
        pRef += 4;
    }
    return;
}

/*
 * ChromaDiagonalMC2_SIMD
 *
 * Bilinear chroma interpolation with both fractions for 2-pixel-wide blocks.
 *
 * Pass 1 filters blkheight + 1 source rows horizontally; the two unrounded
 * sums 8*a + dx*(b - a) of a row share one 32-bit word (left pixel in the low
 * 16-bit lane, right pixel in the high lane).
 * Pass 2 blends adjacent rows with weights (8 - dy, dy), adds 32, shifts
 * right by 6 and writes the two pixels.
 *
 * Output pixels are stored as individual bytes, so the result does not depend
 * on host byte order or on the alignment of pOut.
 *
 * pRef       top-left source sample; 3 columns and blkheight + 1 rows are read
 * srcPitch   source row pitch in bytes
 * dx, dy     fractional offsets (the caller passes x_pos & 7, y_pos & 7)
 * pOut       destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   not used by this function (always handles 2 columns)
 * blkheight  block height; the intermediate buffer holds 9 rows, so at most 8
 * video      not used by this function
 */
void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                            uint8 *pOut,  int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int32 r0, r1, temp0, temp1, result;
    int32 temp[9];
    int32 *out;
    int i, r_temp;
    int dy_8 = 8 - dy;

    /* horizontal first */
    out = temp;
    for (i = 0; i < blkheight + 1; i++)
    {
        r_temp = pRef[1];
        temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
        temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
        r0 = temp0 | (temp1 << 16);
        *out++ = r0;
        pRef += srcPitch;
    }

    /* then vertical */
    out = temp;

    r0 = *out++;

    for (i = 0; i < blkheight; i++)
    {
        result = dy_8 * r0 + 0x00200020;    /* +32 per lane for rounding */
        r1 = *out++;
        result += dy * r1;
        result >>= 6;
        result &= 0x00FF00FF;               /* left pixel in bits 0-7, right pixel in bits 16-23 */

        pOut[0] = (uint8)result;
        pOut[1] = (uint8)(result >> 16);

        r0 = r1;
        pOut += predPitch;
    }
    return;
}

/*
 * ChromaHorizontalMC2_SIMD
 *
 * Bilinear chroma interpolation with a horizontal fraction only, for
 * 2-pixel-wide blocks.  Each output pixel is (8*a + dx*(b - a) + 4) >> 3,
 * where a and b are horizontally adjacent source samples.
 *
 * Output pixels are stored as individual bytes, so the result does not depend
 * on host byte order or on the alignment of pOut.
 *
 * pRef       top-left source sample; 3 columns are read per row
 * srcPitch   source row pitch in bytes
 * dx         horizontal fractional offset (the caller passes x_pos & 7)
 * dy         not used by this function
 * pOut       destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   not used by this function (always handles 2 columns)
 * blkheight  block height in pixels
 * video      not used by this function
 */
void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                              uint8 *pOut, int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int i, temp, temp0, temp1;

    for (i = 0; i < blkheight; i++)
    {
        temp = pRef[1];
        temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
        temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;

        pOut[0] = (uint8)temp0;
        pOut[1] = (uint8)temp1;

        pRef += srcPitch;
        pOut += predPitch;

    }
    return;
}
/*
 * ChromaVerticalMC2_SIMD
 *
 * Bilinear chroma interpolation with a vertical fraction only, for
 * 2-pixel-wide blocks.  The two pixels of a row share one 32-bit register
 * (left pixel in the low 16-bit lane, right pixel in the high lane); each
 * output pixel is ((8 - dy) * upper + dy * lower + 4) >> 3.
 *
 * Output pixels are stored as individual bytes, so the result does not depend
 * on host byte order or on the alignment of pOut.
 *
 * pRef       top-left source sample; 2 columns and blkheight + 1 rows are read
 * srcPitch   source row pitch in bytes
 * dx         not used by this function
 * dy         vertical fractional offset (the caller passes y_pos & 7)
 * pOut       destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   not used by this function (always handles 2 columns)
 * blkheight  block height in pixels
 * video      not used by this function
 */
void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                            uint8 *pOut, int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int32 r0, r1, result;
    int i;
    int dy_8 = 8 - dy;

    r0 = pRef[0] | (pRef[1] << 16);         /* upper row */
    pRef += srcPitch;
    for (i = 0; i < blkheight; i++)
    {
        result = dy_8 * r0 + 0x00040004;    /* +4 per lane for rounding */
        r1 = pRef[0] | (pRef[1] << 16);     /* lower row */
        result += dy * r1;
        result >>= 3;
        result &= 0x00FF00FF;               /* left pixel in bits 0-7, right pixel in bits 16-23 */

        pOut[0] = (uint8)result;
        pOut[1] = (uint8)(result >> 16);

        r0 = r1;
        pRef += srcPitch;
        pOut += predPitch;
    }
    return;
}

/*
 * ChromaFullMC_SIMD
 *
 * Integer-position chroma motion compensation: copies a blkwidth x blkheight
 * block from pRef to pOut without filtering.
 *
 * The copy is done byte by byte, so the result does not depend on the byte
 * order of the host or on the alignment of pRef or pOut.  Each row is still
 * walked in steps of 2 bytes, exactly like the former halfword-based loops,
 * so blkwidth is expected to be even.
 *
 * pRef       first sample of the source block
 * srcPitch   source row pitch in bytes
 * dx, dy     not used by this function
 * pOut       first sample of the destination block
 * predPitch  destination row pitch in bytes
 * blkwidth   block width in pixels
 * blkheight  block height in pixels
 * video      not used by this function
 */
void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                       uint8 *pOut, int predPitch, int blkwidth, int blkheight,AVCCommonObj* video)
{
    int i, j;
    int offset_in = srcPitch - blkwidth;    /* end of one source row -> start of the next */
    int offset_out = predPitch - blkwidth;  /* same for the destination */
    uint8 b0, b1;

    for (j = blkheight; j > 0; j--)
    {
        for (i = blkwidth; i > 0; i -= 2)
        {
            b0 = pRef[0];
            b1 = pRef[1];

            pOut[0] = b0;
            pOut[1] = b1;

            pRef += 2;
            pOut += 2;
        }
        pOut += offset_out;
        pRef += offset_in;
    }
    return ;
}
