"Fossies" - the Fresh Open Source Software Archive

Member "quicktime4linux-2.3/rtjpeg_core.c" (9 Jan 2007, 96149 Bytes) of package /linux/privat/old/quicktime4linux-2.3-src.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /* 
    2    bttvgrab 0.15.4 [1999-03-23]
    3    (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
    4 
    5    Maintained by: Joerg Walter
    6    Current version at http://moes.pmnet.uni-oldenburg.de/bttvgrab/
    7 
    8     This program is free software; you can redistribute it and/or modify
    9     it under the terms of the GNU General Public License as published by
   10     the Free Software Foundation; either version 2 of the License, or
   11     (at your option) any later version.
   12 
   13     This program is distributed in the hope that it will be useful,
   14     but WITHOUT ANY WARRANTY; without even the implied warranty of
   15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   16     GNU General Public License for more details.
   17 
   18     You should have received a copy of the GNU General Public License
   19     along with this program; if not, write to the Free Software
   20     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   21 
   22    This file is a modified version of RTjpeg 0.1.2, (C) Justin Schoeman 1998
   23 */
   24 
   25 
   26 /*
   27 
   28 Main Routines
   29 
   30 This file contains most of the initialisation and control functions
   31 
   32 (C) Justin Schoeman 1998
   33 
   34 */
   35 
   36 #include <sys/types.h>
   37 #include <stdio.h>
   38 #include <stdlib.h>
   39 #include <string.h>
   40 #include "rtjpeg_core.h"
   41 
   /*
    * RTjpeg_ZZ - coefficient scan order.
    *
    * Entry k is the index, inside a 64-element (8x8, row-major) block, of the
    * k-th coefficient in stream order.  RTjpeg_b2s() reads data[RTjpeg_ZZ[k]]
    * when packing and RTjpeg_s2b() writes data[RTjpeg_ZZ[k]] when unpacking.
    * Each initializer row below is one anti-diagonal of the block, walked in
    * alternating directions (a zigzag scan whose second entry is index 8,
    * i.e. it steps down before it steps right).
    */
   42 static const unsigned char RTjpeg_ZZ[64]={
   43 0,
   44 8, 1,
   45 2, 9, 16,
   46 24, 17, 10, 3,
   47 4, 11, 18, 25, 32,
   48 40, 33, 26, 19, 12, 5,
   49 6, 13, 20, 27, 34, 41, 48,
   50 56, 49, 42, 35, 28, 21, 14, 7,
   51 15, 22, 29, 36, 43, 50, 57,
   52 58, 51, 44, 37, 30, 23,
   53 31, 38, 45, 52, 59,
   54 60, 53, 46, 39,
   55 47, 54, 61,
   56 62, 55,
   57 63 };
   58 
   /*
    * RTjpeg_aan_tab - per-coefficient scale factors in 32.32 fixed point.
    *
    * Entry 0 is exactly 2^32 (i.e. 1.0); the table is symmetric
    * (entry [r][c] == entry [c][r]).  RTjpeg_dct_init() uses each entry as the
    * divisor of the corresponding quantiser value shifted left by 32 bits.
    * NOTE(review): the name suggests these are the AAN (Arai/Agui/Nakajima)
    * DCT post-scale factors - confirm against the RTjpeg documentation.
    */
   59 static const __u64 RTjpeg_aan_tab[64]={
   60 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, 
   61 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, 
   62 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, 
   63 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, 
   64 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, 
   65 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, 
   66 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, 
   67 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, 
   68 };
   69 
   /*
    * RTjpeg_lum_quant_tbl - base quantiser steps for luminance blocks,
    * 8 rows of 8 entries in row-major order.
    * The values coincide with the example luminance table of the JPEG
    * standard (ITU-T T.81, Annex K).  The table is not referenced in the
    * part of the file visible here; NOTE(review): presumably it seeds
    * RTjpeg_lqt during initialisation - confirm against the rest of the file.
    */
   70 static const unsigned char RTjpeg_lum_quant_tbl[64] = {
   71     16,  11,  10,  16,  24,  40,  51,  61,
   72     12,  12,  14,  19,  26,  58,  60,  55,
   73     14,  13,  16,  24,  40,  57,  69,  56,
   74     14,  17,  22,  29,  51,  87,  80,  62,
   75     18,  22,  37,  56,  68, 109, 103,  77,
   76     24,  35,  55,  64,  81, 104, 113,  92,
   77     49,  64,  78,  87, 103, 121, 120, 101,
   78     72,  92,  95,  98, 112, 100, 103,  99
   79  };
   80 
   /*
    * RTjpeg_chrom_quant_tbl - base quantiser steps for chrominance blocks,
    * 8 rows of 8 entries in row-major order.
    * The values coincide with the example chrominance table of the JPEG
    * standard (ITU-T T.81, Annex K).  The table is not referenced in the
    * part of the file visible here; NOTE(review): presumably it seeds
    * RTjpeg_cqt during initialisation - confirm against the rest of the file.
    */
   81 static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
   82     17,  18,  24,  47,  99,  99,  99,  99,
   83     18,  21,  26,  66,  99,  99,  99,  99,
   84     24,  26,  56,  99,  99,  99,  99,  99,
   85     47,  66,  99,  99,  99,  99,  99,  99,
   86     99,  99,  99,  99,  99,  99,  99,  99,
   87     99,  99,  99,  99,  99,  99,  99,  99,
   88     99,  99,  99,  99,  99,  99,  99,  99,
   89     99,  99,  99,  99,  99,  99,  99,  99
   90  };
   91  
   92 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
   93 {
   94  register int ci, co=1, tmp;
   95  register __s16 ZZvalue;
   96 
   97  (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
   98  
   99  for(ci=1; ci<=bt8; ci++) 
  100  {
  101     ZZvalue = data[RTjpeg_ZZ[ci]];
  102 
  103    if(ZZvalue>0) 
  104     {
  105      strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
  106    } 
  107     else 
  108     {
  109      strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
  110    }
  111  }
  112 
  113  for(; ci<64; ci++) 
  114  {
  115   ZZvalue = data[RTjpeg_ZZ[ci]];
  116 
  117   if(ZZvalue>0)
  118   {
  119    strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue;
  120   } 
  121   else if(ZZvalue<0)
  122   {
  123    strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue;
  124   } 
  125   else /* compress zeros */
  126   {
  127    tmp=ci;
  128    do
  129    {
  130     ci++;
  131    } 
  132     while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
  133 
  134    strm[co++]=(__s8)(63+(ci-tmp));
  135    ci--;
  136   }
  137  }
  138  return (int)co;
  139 }
  140 
  141 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
  142 {
  143  int ci=1, co=1, tmp;
  144  register int i;
  145 
  146  i=RTjpeg_ZZ[0];
  147  data[i]=((__u8)strm[0])*qtbl[i];
  148 
  149  for(co=1; co<=bt8; co++)
  150  {
  151   i=RTjpeg_ZZ[co];
  152   data[i]=strm[ci++]*qtbl[i];
  153  }
  154  
  155  for(; co<64; co++)
  156  {
  157   if(strm[ci]>63)
  158   {
  159    tmp=co+strm[ci]-63;
  160    for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
  161    co--;
  162   } else
  163   {
  164    i=RTjpeg_ZZ[co];
  165    data[i]=strm[ci]*qtbl[i];
  166   }
  167   ci++;
  168  }
  169  return (int)ci;
  170 }
  171 
  172 #if defined(MMX)
  /*
   * RTjpeg_quant_init (MMX build) - repack both quantiser tables as 16-bit
   * values, in place.
   *
   * For each of RTjpeg_lqt and RTjpeg_cqt, element i is read at the table's
   * declared element width and stored, truncated to __s16, at 16-bit
   * position i of the same storage.  The MMX RTjpeg_quant() below loads the
   * table four 16-bit words at a time, which matches this packed layout.
   *
   * NOTE(review): the tables are declared outside this view; the in-place
   * conversion presumably relies on their elements being wider than 16 bits,
   * so that a store at position i never overwrites an element not yet read.
   * It also accesses the tables through an __s16 pointer of a different
   * type - confirm the build does not depend on strict-aliasing assumptions.
   */
  173 void RTjpeg_quant_init(void)
  174 {
  175  int i;
  176  __s16 *qtbl;
  177  
  178  qtbl=(__s16 *)RTjpeg_lqt;
  179  for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];
  180 
  181  qtbl=(__s16 *)RTjpeg_cqt;
  182  for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
  183 }
  184 
  /* Four 16-bit lanes of 1: paired with each quantiser word so the
     multiply-add below adds the rounding term unchanged. */
  185 static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL;
  /* Four 16-bit lanes of 0x7fff (32767): the rounding term, same value as the
     "+32767" in the non-MMX RTjpeg_quant(). */
  186 static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL;
  187 
  /*
   * RTjpeg_quant (MMX build) - quantise one 64-coefficient block in place.
   *
   * Each of the 16 loop iterations handles four coefficients: the result is
   * (block[i]*q[i] + 32767) >> 16, packed back to 16 bits and stored over the
   * input.  Although qtbl is declared __s32 *, it is read here as packed
   * 16-bit words (one quadword = four quantiser values), i.e. in the layout
   * produced by the MMX RTjpeg_quant_init().
   *
   * NOTE(review): mmx_t and the movq_/punpck/pmaddwd/psrad/packssdw macros are
   * defined outside this view; they presumably expand to the like-named MMX
   * instructions.  mm6 and mm7 hold the constants for the whole loop.
   */
  188 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
  189 {
  190  int i;
  191  mmx_t *bl, *ql;
  192  
  193  ql=(mmx_t *)qtbl;
  194  bl=(mmx_t *)block;
  195  
  196  movq_m2r(RTjpeg_ones, mm6);
  197  movq_m2r(RTjpeg_half, mm7);
  198 
  199  for(i=16; i; i--) 
  200  {
  201   movq_m2r(*(ql++), mm0); /* quant vals (4) */
  202   movq_m2r(*bl, mm2); /* block vals (4) */
  203   movq_r2r(mm0, mm1);
  204   movq_r2r(mm2, mm3);
  205   
  206   punpcklwd_r2r(mm6, mm0); /*           1 qb 1 qa */
  207   punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
  208   
  209   punpcklwd_r2r(mm7, mm2); /*                   32767 bb 32767 ba */
  210   punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
  211   
  212   pmaddwd_r2r(mm2, mm0); /*                         32767+bb*qb 32767+ba*qa */
  213   pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
  214   
  215   psrad_i2r(16, mm0);
  216   psrad_i2r(16, mm1);
  217   
  218   packssdw_r2r(mm1, mm0);
  219   
  220   movq_r2m(mm0, *(bl++));
  221   
  222  }
  223 }
  224 #else
  /*
   * RTjpeg_quant_init (non-MMX build) - intentionally empty.
   * The C version of RTjpeg_quant() reads the quantiser table at its declared
   * __s32 width, so no repacking is done here; the function exists so that
   * both builds offer the same entry point.
   */
  225 void RTjpeg_quant_init(void)
  226 {
  227 }
  228 
  229 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
  230 {
  231  int i;
  232  
  233  for(i=0; i<64; i++)
  234    block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16);
  235 }
  236 #endif
  237 
  238 /*
  239  * Perform the forward DCT on one block of samples.
  240  */
  /*
   * DCT multiplier constants.
   * MMX build: each constant is replicated in four 16-bit lanes and scaled by
   * 2^14 (0x2D41 = 11585 ~ 0.7071 * 16384, 0x187E ~ 0.3827 * 16384,
   * 0x22A3 ~ 0.5412 * 16384, 0x539F ~ 1.3066 * 16384).
   * C build: the same four multipliers scaled by 2^8 (98, 139, 181, 334).
   */
  241 #ifdef MMX
  242 static mmx_t RTjpeg_C4   =(mmx_t)(long long)0x2D412D412D412D41LL;
  243 static mmx_t RTjpeg_C6   =(mmx_t)(long long)0x187E187E187E187ELL;
  244 static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
  245 static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
  246 static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;
  247 
  248 #else
  249 
  250 #define FIX_0_382683433  ((__s32)   98)     /* FIX(0.382683433) */
  251 #define FIX_0_541196100  ((__s32)  139)     /* FIX(0.541196100) */
  252 #define FIX_0_707106781  ((__s32)  181)     /* FIX(0.707106781) */
  253 #define FIX_1_306562965  ((__s32)  334)     /* FIX(1.306562965) */
  254 
  /* DESCALE10 rounds and drops 8 fractional bits, DESCALE20 rounds and drops
     16; the numbers in the names are not the shift counts.  Both truncate the
     result to __s16.  D_MULTIPLY is a plain product cast to __s32. */
  255 #define DESCALE10(x) (__s16)( ((x)+128) >> 8)
  256 #define DESCALE20(x)  (__s16)(((x)+32768) >> 16)
  257 #define D_MULTIPLY(var,const)  ((__s32) ((var) * (const)))
  258 #endif
  259 
  260 void RTjpeg_dct_init(void)
  261 {
  262  int i;
  263  
  264  for(i=0; i<64; i++)
  265  {
  266   RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
  267   RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
  268  }
  269 }
  270 
  271 void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
  272 {
  273 #ifndef MMX
  274   __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  275   __s32 tmp10, tmp11, tmp12, tmp13;
  276   __s32 z1, z2, z3, z4, z5, z11, z13;
  277   __u8 *idataptr;
  278   __s16 *odataptr;
  279   __s32 *wsptr;
  280   int ctr;
  281 
  282   idataptr = idata;
  283   wsptr = RTjpeg_ws;
  284   for (ctr = 7; ctr >= 0; ctr--) {
  285     tmp0 = idataptr[0] + idataptr[7];
  286     tmp7 = idataptr[0] - idataptr[7];
  287     tmp1 = idataptr[1] + idataptr[6];
  288     tmp6 = idataptr[1] - idataptr[6];
  289     tmp2 = idataptr[2] + idataptr[5];
  290     tmp5 = idataptr[2] - idataptr[5];
  291     tmp3 = idataptr[3] + idataptr[4];
  292     tmp4 = idataptr[3] - idataptr[4];
  293     
  294     tmp10 = (tmp0 + tmp3);  /* phase 2 */
  295     tmp13 = tmp0 - tmp3;
  296     tmp11 = (tmp1 + tmp2);
  297     tmp12 = tmp1 - tmp2;
  298     
  299     wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
  300     wsptr[4] = (tmp10 - tmp11)<<8;
  301     
  302     z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  303     wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
  304     wsptr[6] = (tmp13<<8) - z1;
  305     
  306     tmp10 = tmp4 + tmp5;    /* phase 2 */
  307     tmp11 = tmp5 + tmp6;
  308     tmp12 = tmp6 + tmp7;
  309 
  310     z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  311     z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  312     z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  313     z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  314 
  315     z11 = (tmp7<<8) + z3;       /* phase 5 */
  316     z13 = (tmp7<<8) - z3;
  317 
  318     wsptr[5] = z13 + z2;    /* phase 6 */
  319     wsptr[3] = z13 - z2;
  320     wsptr[1] = z11 + z4;
  321     wsptr[7] = z11 - z4;
  322 
  323     idataptr += rskip<<3;       /* advance pointer to next row */
  324     wsptr += 8;
  325   }
  326 
  327   wsptr = RTjpeg_ws;
  328   odataptr=odata;
  329   for (ctr = 7; ctr >= 0; ctr--) {
  330     tmp0 = wsptr[0] + wsptr[56];
  331     tmp7 = wsptr[0] - wsptr[56];
  332     tmp1 = wsptr[8] + wsptr[48];
  333     tmp6 = wsptr[8] - wsptr[48];
  334     tmp2 = wsptr[16] + wsptr[40];
  335     tmp5 = wsptr[16] - wsptr[40];
  336     tmp3 = wsptr[24] + wsptr[32];
  337     tmp4 = wsptr[24] - wsptr[32];
  338     
  339     tmp10 = tmp0 + tmp3;    /* phase 2 */
  340     tmp13 = tmp0 - tmp3;
  341     tmp11 = tmp1 + tmp2;
  342     tmp12 = tmp1 - tmp2;
  343     
  344     odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
  345     odataptr[32] = DESCALE10(tmp10 - tmp11);
  346     
  347     z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  348     odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
  349     odataptr[48] = DESCALE20((tmp13<<8) - z1);
  350 
  351     tmp10 = tmp4 + tmp5;    /* phase 2 */
  352     tmp11 = tmp5 + tmp6;
  353     tmp12 = tmp6 + tmp7;
  354 
  355     z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  356     z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  357     z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  358     z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  359 
  360     z11 = (tmp7<<8) + z3;       /* phase 5 */
  361     z13 = (tmp7<<8) - z3;
  362 
  363     odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
  364     odataptr[24] = DESCALE20(z13 - z2);
  365     odataptr[8] = DESCALE20(z11 + z4);
  366     odataptr[56] = DESCALE20(z11 - z4);
  367 
  368     odataptr++;         /* advance pointer to next column */
  369     wsptr++;
  370   }
  371 #else
  372   mmx_t tmp6, tmp7;
  373   register mmx_t *dataptr = (mmx_t *)odata;
  374   mmx_t *idata2 = (mmx_t *)idata;
  375 
  376    /* first copy the input 8 bit to the destination 16 bits */
  377 
  378    movq_m2r(RTjpeg_zero, mm2);
  379 
  380 
  381     movq_m2r(*idata2, mm0);      
  382     movq_r2r(mm0, mm1);                 
  383 
  384     punpcklbw_r2r(mm2, mm0);
  385     movq_r2m(mm0, *(dataptr));
  386 
  387     punpckhbw_r2r(mm2, mm1);
  388     movq_r2m(mm1, *(dataptr+1));
  389     
  390     idata2 += rskip;
  391 
  392     movq_m2r(*idata2, mm0);      
  393     movq_r2r(mm0, mm1);                 
  394 
  395     punpcklbw_r2r(mm2, mm0);
  396     movq_r2m(mm0, *(dataptr+2));
  397 
  398     punpckhbw_r2r(mm2, mm1);
  399     movq_r2m(mm1, *(dataptr+3));
  400     
  401     idata2 += rskip;
  402 
  403     movq_m2r(*idata2, mm0);      
  404     movq_r2r(mm0, mm1);                 
  405 
  406     punpcklbw_r2r(mm2, mm0);
  407     movq_r2m(mm0, *(dataptr+4));
  408 
  409     punpckhbw_r2r(mm2, mm1);
  410     movq_r2m(mm1, *(dataptr+5));
  411     
  412     idata2 += rskip;
  413 
  414     movq_m2r(*idata2, mm0);      
  415     movq_r2r(mm0, mm1);                 
  416 
  417     punpcklbw_r2r(mm2, mm0);
  418     movq_r2m(mm0, *(dataptr+6));
  419 
  420     punpckhbw_r2r(mm2, mm1);
  421     movq_r2m(mm1, *(dataptr+7));
  422     
  423     idata2 += rskip;
  424 
  425     movq_m2r(*idata2, mm0);      
  426     movq_r2r(mm0, mm1);                 
  427 
  428     punpcklbw_r2r(mm2, mm0);
  429     movq_r2m(mm0, *(dataptr+8));
  430 
  431     punpckhbw_r2r(mm2, mm1);
  432     movq_r2m(mm1, *(dataptr+9));
  433     
  434     idata2 += rskip;
  435 
  436     movq_m2r(*idata2, mm0);      
  437     movq_r2r(mm0, mm1);                 
  438 
  439     punpcklbw_r2r(mm2, mm0);
  440     movq_r2m(mm0, *(dataptr+10));
  441 
  442     punpckhbw_r2r(mm2, mm1);
  443     movq_r2m(mm1, *(dataptr+11));
  444     
  445     idata2 += rskip;
  446 
  447     movq_m2r(*idata2, mm0);      
  448     movq_r2r(mm0, mm1);                 
  449 
  450     punpcklbw_r2r(mm2, mm0);
  451     movq_r2m(mm0, *(dataptr+12));
  452 
  453     punpckhbw_r2r(mm2, mm1);
  454     movq_r2m(mm1, *(dataptr+13));
  455     
  456     idata2 += rskip;
  457 
  458     movq_m2r(*idata2, mm0);      
  459     movq_r2r(mm0, mm1);                 
  460 
  461     punpcklbw_r2r(mm2, mm0);
  462     movq_r2m(mm0, *(dataptr+14));
  463 
  464     punpckhbw_r2r(mm2, mm1);
  465     movq_r2m(mm1, *(dataptr+15));
  466 
  467 /*  Start Transpose to do calculations on rows */
  468 
  469     movq_m2r(*(dataptr+9), mm7);            /* m03:m02|m01:m00 - first line (line 4)and copy into m5 */
  470 
  471     movq_m2r(*(dataptr+13), mm6);           /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
  472     movq_r2r(mm7, mm5);                 
  473 
  474     punpcklwd_m2r(*(dataptr+11), mm7);  /* m11:m01|m10:m00 - interleave first and second lines */
  475     movq_r2r(mm6, mm2);                      
  476 
  477     punpcklwd_m2r(*(dataptr+15), mm6);  /* m31:m21|m30:m20 - interleave third and fourth lines */
  478     movq_r2r(mm7, mm1);
  479 
  480     movq_m2r(*(dataptr+11), mm3);         /* m13:m13|m11:m10 - second line    */
  481     punpckldq_r2r(mm6, mm7);                /* m30:m20|m10:m00 - interleave to produce result 1 */
  482 
  483     movq_m2r(*(dataptr+15), mm0);         /* m13:m13|m11:m10 - fourth line */
  484     punpckhdq_r2r(mm6, mm1);                /* m31:m21|m11:m01 - interleave to produce result 2 */
  485 
  486     movq_r2m(mm7,*(dataptr+9));         /* write result 1 */
  487     punpckhwd_r2r(mm3, mm5);                /* m13:m03|m12:m02 - interleave first and second lines */
  488     
  489     movq_r2m(mm1,*(dataptr+11));            /* write result 2 */
  490     punpckhwd_r2r(mm0, mm2);                /* m33:m23|m32:m22 - interleave third and fourth lines */
  491 
  492     movq_r2r(mm5, mm1);
  493     punpckldq_r2r(mm2, mm5);                /* m32:m22|m12:m02 - interleave to produce result 3 */
  494 
  495     movq_m2r(*(dataptr+1), mm0);            /* m03:m02|m01:m00 - first line, 4x4 */
  496     punpckhdq_r2r(mm2, mm1);                /* m33:m23|m13:m03 - interleave to produce result 4 */
  497 
  498     movq_r2m(mm5,*(dataptr+13));            /* write result 3 */
  499 
  500     /* last 4x4 done */
  501 
  502     movq_r2m(mm1, *(dataptr+15));           /* write result 4, last 4x4 */
  503 
  504     movq_m2r(*(dataptr+5), mm2);            /* m23:m22|m21:m20 - third line */
  505     movq_r2r(mm0, mm6);
  506 
  507     punpcklwd_m2r(*(dataptr+3), mm0);   /* m11:m01|m10:m00 - interleave first and second lines */
  508     movq_r2r(mm2, mm7);
  509 
  510     punpcklwd_m2r(*(dataptr+7), mm2);   /* m31:m21|m30:m20 - interleave third and fourth lines */
  511     movq_r2r(mm0, mm4);
  512 
  513     /* */
  514     movq_m2r(*(dataptr+8), mm1);            /* n03:n02|n01:n00 - first line  */
  515     punpckldq_r2r(mm2, mm0);                /* m30:m20|m10:m00 - interleave to produce first result */
  516 
  517     movq_m2r(*(dataptr+12), mm3);           /* n23:n22|n21:n20 - third line */
  518     punpckhdq_r2r(mm2, mm4);                /* m31:m21|m11:m01 - interleave to produce second result */
  519 
  520     punpckhwd_m2r(*(dataptr+3), mm6);   /* m13:m03|m12:m02 - interleave first and second lines */
  521     movq_r2r(mm1, mm2);                 /* copy first line */
  522 
  523     punpckhwd_m2r(*(dataptr+7), mm7);   /* m33:m23|m32:m22 - interleave third and fourth lines */
  524     movq_r2r(mm6, mm5);                     /* copy first intermediate result */
  525 
  526     movq_r2m(mm0, *(dataptr+8));            /* write result 1 */
  527     punpckhdq_r2r(mm7, mm5);                /* m33:m23|m13:m03 - produce third result */
  528 
  529     punpcklwd_m2r(*(dataptr+10), mm1);  /* n11:n01|n10:n00 - interleave first and second lines */
  530     movq_r2r(mm3, mm0);                     /* copy third line */
  531 
  532     punpckhwd_m2r(*(dataptr+10), mm2);  /* n13:n03|n12:n02 - interleave first and second lines */
  533 
  534     movq_r2m(mm4, *(dataptr+10));           /* write result 2 out */
  535     punpckldq_r2r(mm7, mm6);                /* m32:m22|m12:m02 - produce fourth result */
  536 
  537     punpcklwd_m2r(*(dataptr+14), mm3);  /* n31:n21|n30:n20 - interleave third and fourth lines */
  538     movq_r2r(mm1, mm4);
  539 
  540     movq_r2m(mm6, *(dataptr+12));           /* write result 3 out */
  541     punpckldq_r2r(mm3, mm1);                /* n30:n20|n10:n00 - produce first result */
  542 
  543     punpckhwd_m2r(*(dataptr+14), mm0);  /* n33:n23|n32:n22 - interleave third and fourth lines */
  544     movq_r2r(mm2, mm6);
  545 
  546     movq_r2m(mm5, *(dataptr+14));           /* write result 4 out */
  547     punpckhdq_r2r(mm3, mm4);                /* n31:n21|n11:n01- produce second result */
  548 
  549     movq_r2m(mm1, *(dataptr+1));            /* write result 5 out - (first result for other 4 x 4 block) */
  550     punpckldq_r2r(mm0, mm2);                /* n32:n22|n12:n02- produce third result */
  551 
  552     movq_r2m(mm4, *(dataptr+3));            /* write result 6 out */
  553     punpckhdq_r2r(mm0, mm6);                /* n33:n23|n13:n03 - produce fourth result */
  554 
  555     movq_r2m(mm2, *(dataptr+5));            /* write result 7 out */
  556 
  557     movq_m2r(*dataptr, mm0);                /* m03:m02|m01:m00 - first line, first 4x4 */
  558 
  559     movq_r2m(mm6, *(dataptr+7));            /* write result 8 out */
  560 
  561 
  562 /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */
  563 
  564     movq_m2r(*(dataptr+4), mm7);            /* m23:m22|m21:m20 - third line */
  565     movq_r2r(mm0, mm2);
  566 
  567     punpcklwd_m2r(*(dataptr+2), mm0);   /* m11:m01|m10:m00 - interleave first and second lines */
  568     movq_r2r(mm7, mm4);
  569 
  570     punpcklwd_m2r(*(dataptr+6), mm7);   /* m31:m21|m30:m20 - interleave third and fourth lines */
  571     movq_r2r(mm0, mm1);
  572 
  573     movq_m2r(*(dataptr+2), mm6);            /* m13:m12|m11:m10 - second line */
  574     punpckldq_r2r(mm7, mm0);                /* m30:m20|m10:m00 - interleave to produce result 1 */
  575 
  576     movq_m2r(*(dataptr+6), mm5);            /* m33:m32|m31:m30 - fourth line */
  577     punpckhdq_r2r(mm7, mm1);                /* m31:m21|m11:m01 - interleave to produce result 2 */
  578 
  579     movq_r2r(mm0, mm7);                     /* write result 1 */
  580     punpckhwd_r2r(mm6, mm2);                /* m13:m03|m12:m02 - interleave first and second lines */
  581 
  582     psubw_m2r(*(dataptr+14), mm7);      /* tmp07=x0-x7  /* Stage 1 */ */
  583     movq_r2r(mm1, mm6);                     /* write result 2 */
  584 
  585     paddw_m2r(*(dataptr+14), mm0);      /* tmp00=x0+x7  /* Stage 1 */ */
  586     punpckhwd_r2r(mm5, mm4);            /* m33:m23|m32:m22 - interleave third and fourth lines */
  587 
  588     paddw_m2r(*(dataptr+12), mm1);      /* tmp01=x1+x6  /* Stage 1 */ */
  589     movq_r2r(mm2, mm3);                     /* copy first intermediate result */
  590 
  591     psubw_m2r(*(dataptr+12), mm6);      /* tmp06=x1-x6  /* Stage 1 */ */
  592     punpckldq_r2r(mm4, mm2);                /* m32:m22|m12:m02 - interleave to produce result 3 */
  593 
  594    movq_r2m(mm7, tmp7);
  595     movq_r2r(mm2, mm5);                     /* write result 3 */
  596 
  597    movq_r2m(mm6, tmp6);
  598     punpckhdq_r2r(mm4, mm3);                /* m33:m23|m13:m03 - interleave to produce result 4 */
  599 
  600     paddw_m2r(*(dataptr+10), mm2);      /* tmp02=x2+5 /* Stage 1 */ */
  601     movq_r2r(mm3, mm4);                     /* write result 4 */
  602 
  603 /************************************************************************************************
  604                     End of Transpose
  605 ************************************************************************************************/
  606 
  607 
  608    paddw_m2r(*(dataptr+8), mm3);        /* tmp03=x3+x4 /* stage 1*/ */
  609    movq_r2r(mm0, mm7);
  610 
  611    psubw_m2r(*(dataptr+8), mm4);        /* tmp04=x3-x4 /* stage 1*/ */
  612    movq_r2r(mm1, mm6);
  613 
  614     paddw_r2r(mm3, mm0);                    /* tmp10 = tmp00 + tmp03 /* even 2 */ */
  615     psubw_r2r(mm3, mm7);                    /* tmp13 = tmp00 - tmp03 /* even 2 */ */
  616 
  617     psubw_r2r(mm2, mm6);                    /* tmp12 = tmp01 - tmp02 /* even 2 */ */
  618     paddw_r2r(mm2, mm1);                    /* tmp11 = tmp01 + tmp02 /* even 2 */ */
  619 
  620    psubw_m2r(*(dataptr+10), mm5);       /* tmp05=x2-x5 /* stage 1*/ */
  621     paddw_r2r(mm7, mm6);                        /* tmp12 + tmp13 */
  622 
  623     /* stage 3 */
  624 
  625    movq_m2r(tmp6, mm2);
  626    movq_r2r(mm0, mm3);
  627 
  628     psllw_i2r(2, mm6);          /* m8 * 2^2 */
  629     paddw_r2r(mm1, mm0);        
  630 
  631     pmulhw_m2r(RTjpeg_C4, mm6);         /* z1 */
  632     psubw_r2r(mm1, mm3);        
  633 
  634    movq_r2m(mm0, *dataptr);
  635    movq_r2r(mm7, mm0);
  636    
  637     /* Odd part */
  638    movq_r2m(mm3, *(dataptr+8));
  639     paddw_r2r(mm5, mm4);                        /* tmp10 */
  640 
  641    movq_m2r(tmp7, mm3);
  642     paddw_r2r(mm6, mm0);                        /* tmp32 */
  643 
  644     paddw_r2r(mm2, mm5);                        /* tmp11 */
  645     psubw_r2r(mm6, mm7);                        /* tmp33 */
  646 
  647    movq_r2m(mm0, *(dataptr+4));
  648     paddw_r2r(mm3, mm2);                        /* tmp12 */
  649 
  650     /* stage 4 */
  651 
  652    movq_r2m(mm7, *(dataptr+12));
  653     movq_r2r(mm4, mm1);                     /* copy of tmp10 */
  654 
  655     psubw_r2r(mm2, mm1);                        /* tmp10 - tmp12 */
  656     psllw_i2r(2, mm4);          /* m8 * 2^2 */
  657 
  658     movq_m2r(RTjpeg_C2mC6, mm0);        
  659     psllw_i2r(2, mm1);
  660 
  661     pmulhw_m2r(RTjpeg_C6, mm1);         /* z5 */
  662     psllw_i2r(2, mm2);
  663 
  664     pmulhw_r2r(mm0, mm4);                   /* z5 */
  665 
  666     /* stage 5 */
  667 
  668     pmulhw_m2r(RTjpeg_C2pC6, mm2);
  669     psllw_i2r(2, mm5);
  670 
  671     pmulhw_m2r(RTjpeg_C4, mm5);         /* z3 */
  672     movq_r2r(mm3, mm0);                     /* copy tmp7 */
  673 
  674    movq_m2r(*(dataptr+1), mm7);
  675     paddw_r2r(mm1, mm4);                        /* z2 */
  676 
  677     paddw_r2r(mm1, mm2);                        /* z4 */
  678 
  679     paddw_r2r(mm5, mm0);                        /* z11 */
  680     psubw_r2r(mm5, mm3);                        /* z13 */
  681 
  682     /* stage 6 */
  683 
  684     movq_r2r(mm3, mm5);                     /* copy z13 */
  685     psubw_r2r(mm4, mm3);                        /* y3=z13 - z2 */
  686 
  687     paddw_r2r(mm4, mm5);                        /* y5=z13 + z2 */
  688     movq_r2r(mm0, mm6);                     /* copy z11 */
  689 
  690    movq_r2m(mm3, *(dataptr+6));             /*save y3 */
  691     psubw_r2r(mm2, mm0);                        /* y7=z11 - z4 */
  692 
  693    movq_r2m(mm5, *(dataptr+10));        /*save y5 */
  694     paddw_r2r(mm2, mm6);                        /* y1=z11 + z4 */
  695 
  696    movq_r2m(mm0, *(dataptr+14));        /*save y7 */
  697 
  698     /************************************************
  699      *  End of 1st 4 rows
  700      ************************************************/
  701 
  702    movq_m2r(*(dataptr+3), mm1);             /* load x1   /* stage 1 */ */
  703     movq_r2r(mm7, mm0);                     /* copy x0 */
  704 
  705    movq_r2m(mm6, *(dataptr+2));             /*save y1 */
  706 
  707    movq_m2r(*(dataptr+5), mm2);             /* load x2   /* stage 1 */ */
  708     movq_r2r(mm1, mm6);                     /* copy x1 */
  709 
  710    paddw_m2r(*(dataptr+15), mm0);       /* tmp00 = x0 + x7 */
  711 
  712    movq_m2r(*(dataptr+7), mm3);             /* load x3   /* stage 1 */ */
  713     movq_r2r(mm2, mm5);                     /* copy x2 */
  714 
  715    psubw_m2r(*(dataptr+15), mm7);       /* tmp07 = x0 - x7 */
  716     movq_r2r(mm3, mm4);                     /* copy x3 */
  717 
  718    paddw_m2r(*(dataptr+13), mm1);       /* tmp01 = x1 + x6 */
  719 
  720     movq_r2m(mm7, tmp7);                        /* save tmp07 */
  721     movq_r2r(mm0, mm7);                     /* copy tmp00 */
  722 
  723    psubw_m2r(*(dataptr+13), mm6);       /* tmp06 = x1 - x6 */
  724 
  725    /* stage 2, Even Part */
  726 
  727    paddw_m2r(*(dataptr+9), mm3);        /* tmp03 = x3 + x4 */
  728 
  729     movq_r2m(mm6, tmp6);                        /* save tmp07 */
  730     movq_r2r(mm1, mm6);                     /* copy tmp01 */
  731 
  732    paddw_m2r(*(dataptr+11), mm2);       /* tmp02 = x2 + x5 */
  733     paddw_r2r(mm3, mm0);                /* tmp10 = tmp00 + tmp03 */
  734 
  735     psubw_r2r(mm3, mm7);                /* tmp13 = tmp00 - tmp03 */
  736 
  737    psubw_m2r(*(dataptr+9), mm4);        /* tmp04 = x3 - x4 */
  738     psubw_r2r(mm2, mm6);                /* tmp12 = tmp01 - tmp02 */
  739 
  740     paddw_r2r(mm2, mm1);                /* tmp11 = tmp01 + tmp02 */
  741 
  742    psubw_m2r(*(dataptr+11), mm5);       /* tmp05 = x2 - x5 */
  743     paddw_r2r(mm7, mm6);                /*  tmp12 + tmp13 */
  744 
  745    /* stage 3, Even and stage 4 & 5 even */
  746 
  747     movq_m2r(tmp6, mm2);                    /* load tmp6 */
  748     movq_r2r(mm0, mm3);                     /* copy tmp10 */
  749 
  750     psllw_i2r(2, mm6);          /* shift z1 */
  751     paddw_r2r(mm1, mm0);                    /* y0=tmp10 + tmp11 */
  752 
  753     pmulhw_m2r(RTjpeg_C4, mm6);         /* z1 */
  754     psubw_r2r(mm1, mm3);                    /* y4=tmp10 - tmp11 */
  755 
  756    movq_r2m(mm0, *(dataptr+1));             /*save y0 */
  757     movq_r2r(mm7, mm0);                     /* copy tmp13 */
  758   
  759     /* odd part */
  760 
  761    movq_r2m(mm3, *(dataptr+9));             /*save y4 */
  762     paddw_r2r(mm5, mm4);                /* tmp10 = tmp4 + tmp5 */
  763 
  764     movq_m2r(tmp7, mm3);                    /* load tmp7 */
  765     paddw_r2r(mm6, mm0);                /* tmp32 = tmp13 + z1 */
  766 
  767     paddw_r2r(mm2, mm5);                /* tmp11 = tmp5 + tmp6 */
  768     psubw_r2r(mm6, mm7);                /* tmp33 = tmp13 - z1 */
  769 
  770    movq_r2m(mm0, *(dataptr+5));             /*save y2 */
  771     paddw_r2r(mm3, mm2);                /* tmp12 = tmp6 + tmp7 */
  772 
  773     /* stage 4 */
  774 
  775    movq_r2m(mm7, *(dataptr+13));        /*save y6 */
  776     movq_r2r(mm4, mm1);                     /* copy tmp10 */
  777 
  778     psubw_r2r(mm2, mm1);                    /* tmp10 - tmp12 */
  779     psllw_i2r(2, mm4);          /* shift tmp10 */
  780 
  781     movq_m2r(RTjpeg_C2mC6, mm0);            /* load C2mC6 */
  782     psllw_i2r(2, mm1);          /* shift (tmp10-tmp12) */
  783 
  784     pmulhw_m2r(RTjpeg_C6, mm1);         /* z5 */
  785     psllw_i2r(2, mm5);          /* prepare for multiply  */
  786 
  787     pmulhw_r2r(mm0, mm4);                   /* multiply by converted real */
  788 
  789     /* stage 5 */
  790 
  791     pmulhw_m2r(RTjpeg_C4, mm5);         /* z3 */
  792     psllw_i2r(2, mm2);          /* prepare for multiply  */
  793 
  794     pmulhw_m2r(RTjpeg_C2pC6, mm2);      /* multiply */
  795     movq_r2r(mm3, mm0);                     /* copy tmp7 */
  796 
  797     movq_m2r(*(dataptr+9), mm7);            /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
  798     paddw_r2r(mm1, mm4);                        /* z2 */
  799 
  800     paddw_r2r(mm5, mm0);                        /* z11 */
  801     psubw_r2r(mm5, mm3);                        /* z13 */
  802 
  803     /* stage 6 */
  804 
  805     movq_r2r(mm3, mm5);                     /* copy z13 */
  806     paddw_r2r(mm1, mm2);                        /* z4 */
  807 
  808     movq_r2r(mm0, mm6);                     /* copy z11 */
  809     psubw_r2r(mm4, mm5);                        /* y3 */
  810 
  811     paddw_r2r(mm2, mm6);                        /* y1 */
  812     paddw_r2r(mm4, mm3);                        /* y5 */
  813 
  814    movq_r2m(mm5, *(dataptr+7));             /*save y3 */
  815 
  816    movq_r2m(mm6, *(dataptr+3));             /*save y1 */
  817     psubw_r2r(mm2, mm0);                        /* y7 */
  818     
  819 /************************************************************************************************
  820                     Start of Transpose
  821 ************************************************************************************************/
  822 
  823     movq_m2r(*(dataptr+13), mm6);           /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
  824     movq_r2r(mm7, mm5);                     /* copy first line */
  825 
  826     punpcklwd_r2r(mm3, mm7);                /* m11:m01|m10:m00 - interleave first and second lines */
  827     movq_r2r(mm6, mm2);                     /* copy third line */
  828 
  829     punpcklwd_r2r(mm0, mm6);                /* m31:m21|m30:m20 - interleave third and fourth lines */
  830     movq_r2r(mm7, mm1);                     /* copy first intermediate result */
  831 
  832     punpckldq_r2r(mm6, mm7);                /* m30:m20|m10:m00 - interleave to produce result 1 */
  833 
  834     punpckhdq_r2r(mm6, mm1);                /* m31:m21|m11:m01 - interleave to produce result 2 */
  835 
  836     movq_r2m(mm7, *(dataptr+9));            /* write result 1 */
  837     punpckhwd_r2r(mm3, mm5);                /* m13:m03|m12:m02 - interleave first and second lines */
  838 
  839     movq_r2m(mm1, *(dataptr+11));           /* write result 2 */
  840     punpckhwd_r2r(mm0, mm2);                /* m33:m23|m32:m22 - interleave third and fourth lines */
  841 
  842     movq_r2r(mm5, mm1);                     /* copy first intermediate result */
  843     punpckldq_r2r(mm2, mm5);                /* m32:m22|m12:m02 - interleave to produce result 3 */
  844 
  845     movq_m2r(*(dataptr+1), mm0);            /* m03:m02|m01:m00 - first line, 4x4 */
  846     punpckhdq_r2r(mm2, mm1);                /* m33:m23|m13:m03 - interleave to produce result 4 */
  847 
  848     movq_r2m(mm5, *(dataptr+13));           /* write result 3 */
  849 
  850     /****** last 4x4 done */
  851 
  852     movq_r2m(mm1, *(dataptr+15));           /* write result 4, last 4x4 */
  853 
  854     movq_m2r(*(dataptr+5), mm2);            /* m23:m22|m21:m20 - third line */
  855     movq_r2r(mm0, mm6);                     /* copy first line */
  856 
  857     punpcklwd_m2r(*(dataptr+3), mm0);   /* m11:m01|m10:m00 - interleave first and second lines */
  858     movq_r2r(mm2, mm7);                     /* copy third line */
  859 
  860     punpcklwd_m2r(*(dataptr+7), mm2);   /* m31:m21|m30:m20 - interleave third and fourth lines */
  861     movq_r2r(mm0, mm4);                     /* copy first intermediate result */
  862 
  863     
  864 
  865     movq_m2r(*(dataptr+8), mm1);            /* n03:n02|n01:n00 - first line  */
  866     punpckldq_r2r(mm2, mm0);                /* m30:m20|m10:m00 - interleave to produce first result */
  867 
  868     movq_m2r(*(dataptr+12), mm3);           /* n23:n22|n21:n20 - third line */
  869     punpckhdq_r2r(mm2, mm4);                /* m31:m21|m11:m01 - interleave to produce second result */
  870 
  871     punpckhwd_m2r(*(dataptr+3), mm6);   /* m13:m03|m12:m02 - interleave first and second lines */
  872     movq_r2r(mm1, mm2);                     /* copy first line */
  873 
  874     punpckhwd_m2r(*(dataptr+7), mm7);   /* m33:m23|m32:m22 - interleave third and fourth lines */
  875     movq_r2r(mm6, mm5);                     /* copy first intermediate result */
  876 
  877     movq_r2m(mm0, *(dataptr+8));            /* write result 1 */
  878     punpckhdq_r2r(mm7, mm5);                /* m33:m23|m13:m03 - produce third result */
  879 
  880     punpcklwd_m2r(*(dataptr+10), mm1);  /* n11:n01|n10:n00 - interleave first and second lines */
  881     movq_r2r(mm3, mm0);                     /* copy third line */
  882 
  883     punpckhwd_m2r(*(dataptr+10), mm2);  /* n13:n03|n12:n02 - interleave first and second lines */
  884 
  885     movq_r2m(mm4, *(dataptr+10));           /* write result 2 out */
  886     punpckldq_r2r(mm7, mm6);                /* m32:m22|m12:m02 - produce fourth result */
  887 
  888     punpcklwd_m2r(*(dataptr+14), mm3);  /* n33:n23|n32:n22 - interleave third and fourth lines */
  889     movq_r2r(mm1, mm4);                     /* copy second intermediate result */
  890 
  891     movq_r2m(mm6, *(dataptr+12));           /* write result 3 out */
  892     punpckldq_r2r(mm3, mm1);                /*  */
  893 
  894     punpckhwd_m2r(*(dataptr+14), mm0);  /* n33:n23|n32:n22 - interleave third and fourth lines */
  895     movq_r2r(mm2, mm6);                     /* copy second intermediate result */
  896 
  897     movq_r2m(mm5, *(dataptr+14));           /* write result 4 out */
  898     punpckhdq_r2r(mm3, mm4);                /* n31:n21|n11:n01- produce second result */
  899 
  900     movq_r2m(mm1, *(dataptr+1));            /* write result 5 out - (first result for other 4 x 4 block) */
  901     punpckldq_r2r(mm0, mm2);                /* n32:n22|n12:n02- produce third result */
  902 
  903     movq_r2m(mm4, *(dataptr+3));            /* write result 6 out */
  904     punpckhdq_r2r(mm0, mm6);                /* n33:n23|n13:n03 - produce fourth result */
  905 
  906     movq_r2m(mm2, *(dataptr+5));            /* write result 7 out */
  907 
  908     movq_m2r(*dataptr, mm0);                /* m03:m02|m01:m00 - first line, first 4x4 */
  909 
  910     movq_r2m(mm6, *(dataptr+7));            /* write result 8 out */
  911 
  912 /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */
  913 
  914     movq_m2r(*(dataptr+4), mm7);            /* m23:m22|m21:m20 - third line */
  915     movq_r2r(mm0, mm2);                     /* copy first line */
  916 
  917     punpcklwd_m2r(*(dataptr+2), mm0);   /* m11:m01|m10:m00 - interleave first and second lines */
  918     movq_r2r(mm7, mm4);                     /* copy third line */
  919     
  920     punpcklwd_m2r(*(dataptr+6), mm7);   /* m31:m21|m30:m20 - interleave third and fourth lines */
  921     movq_r2r(mm0, mm1);                     /* copy first intermediate result */
  922 
  923     movq_m2r(*(dataptr+2), mm6);            /* m13:m12|m11:m10 - second line */
  924     punpckldq_r2r(mm7, mm0);                /* m30:m20|m10:m00 - interleave to produce result 1 */
  925 
  926     movq_m2r(*(dataptr+6), mm5);            /* m33:m32|m31:m30 - fourth line */
  927     punpckhdq_r2r(mm7, mm1);                /* m31:m21|m11:m01 - interleave to produce result 2 */
  928 
  929     movq_r2r(mm0, mm7);                     /* write result 1 */
  930     punpckhwd_r2r(mm6, mm2);                /* m13:m03|m12:m02 - interleave first and second lines */
  931 
  932     psubw_m2r(*(dataptr+14), mm7);      /* tmp07=x0-x7  /* Stage 1 */ */
  933     movq_r2r(mm1, mm6);                     /* write result 2 */
  934 
  935     paddw_m2r(*(dataptr+14), mm0);      /* tmp00=x0+x7  /* Stage 1 */ */
  936     punpckhwd_r2r(mm5, mm4);            /* m33:m23|m32:m22 - interleave third and fourth lines */
  937 
  938     paddw_m2r(*(dataptr+12), mm1);      /* tmp01=x1+x6  /* Stage 1 */ */
  939     movq_r2r(mm2, mm3);                     /* copy first intermediate result */
  940 
  941     psubw_m2r(*(dataptr+12), mm6);      /* tmp06=x1-x6  /* Stage 1 */ */
  942     punpckldq_r2r(mm4, mm2);                /* m32:m22|m12:m02 - interleave to produce result 3 */
  943 
  944     movq_r2m(mm7, tmp7);                        /* save tmp07 */
  945     movq_r2r(mm2, mm5);                     /* write result 3 */
  946 
  947     movq_r2m(mm6, tmp6);                        /* save tmp06 */
  948 
  949     punpckhdq_r2r(mm4, mm3);                /* m33:m23|m13:m03 - interleave to produce result 4 */
  950 
  951     paddw_m2r(*(dataptr+10), mm2);      /* tmp02=x2+x5 /* stage 1 */ */
  952     movq_r2r(mm3, mm4);                     /* write result 4 */
  953 
  954 /************************************************************************************************
  955                     End of Transpose 2
  956 ************************************************************************************************/
  957 
  958    paddw_m2r(*(dataptr+8), mm3);        /* tmp03=x3+x4 /* stage 1*/ */
  959    movq_r2r(mm0, mm7);
  960 
  961    psubw_m2r(*(dataptr+8), mm4);        /* tmp04=x3-x4 /* stage 1*/ */
  962    movq_r2r(mm1, mm6);
  963 
  964     paddw_r2r(mm3, mm0);                    /* tmp10 = tmp00 + tmp03 /* even 2 */ */
  965     psubw_r2r(mm3, mm7);                    /* tmp13 = tmp00 - tmp03 /* even 2 */ */
  966 
  967     psubw_r2r(mm2, mm6);                    /* tmp12 = tmp01 - tmp02 /* even 2 */ */
  968     paddw_r2r(mm2, mm1);                    /* tmp11 = tmp01 + tmp02 /* even 2 */ */
  969 
  970    psubw_m2r(*(dataptr+10), mm5);       /* tmp05=x2-x5 /* stage 1*/ */
  971     paddw_r2r(mm7, mm6);                        /* tmp12 + tmp13 */
  972 
  973     /* stage 3 */
  974 
  975    movq_m2r(tmp6, mm2);
  976    movq_r2r(mm0, mm3);
  977 
  978     psllw_i2r(2, mm6);          /* m8 * 2^2 */
  979     paddw_r2r(mm1, mm0);        
  980 
  981     pmulhw_m2r(RTjpeg_C4, mm6);         /* z1 */
  982     psubw_r2r(mm1, mm3);        
  983 
  984    movq_r2m(mm0, *dataptr);
  985    movq_r2r(mm7, mm0);
  986    
  987     /* Odd part */
  988    movq_r2m(mm3, *(dataptr+8));
  989     paddw_r2r(mm5, mm4);                        /* tmp10 */
  990 
  991    movq_m2r(tmp7, mm3);
  992     paddw_r2r(mm6, mm0);                        /* tmp32 */
  993 
  994     paddw_r2r(mm2, mm5);                        /* tmp11 */
  995     psubw_r2r(mm6, mm7);                        /* tmp33 */
  996 
  997    movq_r2m(mm0, *(dataptr+4));
  998     paddw_r2r(mm3, mm2);                        /* tmp12 */
  999 
 1000     /* stage 4 */
 1001    movq_r2m(mm7, *(dataptr+12));
 1002     movq_r2r(mm4, mm1);                     /* copy of tmp10 */
 1003 
 1004     psubw_r2r(mm2, mm1);                        /* tmp10 - tmp12 */
 1005     psllw_i2r(2, mm4);          /* m8 * 2^2 */
 1006 
 1007     movq_m2r(RTjpeg_C2mC6, mm0);
 1008     psllw_i2r(2, mm1);
 1009 
 1010     pmulhw_m2r(RTjpeg_C6, mm1);         /* z5 */
 1011     psllw_i2r(2, mm2);
 1012 
 1013     pmulhw_r2r(mm0, mm4);                   /* z5 */
 1014 
 1015     /* stage 5 */
 1016 
 1017     pmulhw_m2r(RTjpeg_C2pC6, mm2);
 1018     psllw_i2r(2, mm5);
 1019 
 1020     pmulhw_m2r(RTjpeg_C4, mm5);         /* z3 */
 1021     movq_r2r(mm3, mm0);                     /* copy tmp7 */
 1022 
 1023    movq_m2r(*(dataptr+1), mm7);
 1024     paddw_r2r(mm1, mm4);                        /* z2 */
 1025 
 1026     paddw_r2r(mm1, mm2);                        /* z4 */
 1027 
 1028     paddw_r2r(mm5, mm0);                        /* z11 */
 1029     psubw_r2r(mm5, mm3);                        /* z13 */
 1030 
 1031     /* stage 6 */
 1032 
 1033     movq_r2r(mm3, mm5);                     /* copy z13 */
 1034     psubw_r2r(mm4, mm3);                        /* y3=z13 - z2 */
 1035 
 1036     paddw_r2r(mm4, mm5);                        /* y5=z13 + z2 */
 1037     movq_r2r(mm0, mm6);                     /* copy z11 */
 1038 
 1039    movq_r2m(mm3, *(dataptr+6));             /*save y3 */
 1040     psubw_r2r(mm2, mm0);                        /* y7=z11 - z4 */
 1041 
 1042    movq_r2m(mm5, *(dataptr+10));        /*save y5 */
 1043     paddw_r2r(mm2, mm6);                        /* y1=z11 + z4 */
 1044 
 1045    movq_r2m(mm0, *(dataptr+14));        /*save y7 */
 1046 
 1047     /************************************************
 1048      *  End of 1st 4 rows
 1049      ************************************************/
 1050 
 1051    movq_m2r(*(dataptr+3), mm1);             /* load x1   /* stage 1 */ */
 1052     movq_r2r(mm7, mm0);                     /* copy x0 */
 1053 
 1054    movq_r2m(mm6, *(dataptr+2));             /*save y1 */
 1055 
 1056    movq_m2r(*(dataptr+5), mm2);             /* load x2   /* stage 1 */ */
 1057     movq_r2r(mm1, mm6);                     /* copy x1 */
 1058 
 1059    paddw_m2r(*(dataptr+15), mm0);       /* tmp00 = x0 + x7 */
 1060 
 1061    movq_m2r(*(dataptr+7), mm3);             /* load x3   /* stage 1 */ */
 1062     movq_r2r(mm2, mm5);                     /* copy x2 */
 1063 
 1064    psubw_m2r(*(dataptr+15), mm7);       /* tmp07 = x0 - x7 */
 1065     movq_r2r(mm3, mm4);                     /* copy x3 */
 1066 
 1067    paddw_m2r(*(dataptr+13), mm1);       /* tmp01 = x1 + x6 */
 1068 
 1069     movq_r2m(mm7, tmp7);                        /* save tmp07 */
 1070     movq_r2r(mm0, mm7);                     /* copy tmp00 */
 1071 
 1072    psubw_m2r(*(dataptr+13), mm6);       /* tmp06 = x1 - x6 */
 1073 
 1074    /* stage 2, Even Part */
 1075 
 1076    paddw_m2r(*(dataptr+9), mm3);        /* tmp03 = x3 + x4 */
 1077 
  1078     movq_r2m(mm6, tmp6);                        /* save tmp06 */
 1079     movq_r2r(mm1, mm6);                     /* copy tmp01 */
 1080 
 1081    paddw_m2r(*(dataptr+11), mm2);       /* tmp02 = x2 + x5 */
 1082     paddw_r2r(mm3, mm0);                /* tmp10 = tmp00 + tmp03 */
 1083 
 1084     psubw_r2r(mm3, mm7);                /* tmp13 = tmp00 - tmp03 */
 1085 
 1086    psubw_m2r(*(dataptr+9), mm4);        /* tmp04 = x3 - x4 */
 1087     psubw_r2r(mm2, mm6);                /* tmp12 = tmp01 - tmp02 */
 1088 
 1089     paddw_r2r(mm2, mm1);                /* tmp11 = tmp01 + tmp02 */
 1090 
 1091    psubw_m2r(*(dataptr+11), mm5);       /* tmp05 = x2 - x5 */
 1092     paddw_r2r(mm7, mm6);                /*  tmp12 + tmp13 */
 1093 
 1094    /* stage 3, Even and stage 4 & 5 even */
 1095 
 1096     movq_m2r(tmp6, mm2);                    /* load tmp6 */
 1097     movq_r2r(mm0, mm3);                     /* copy tmp10 */
 1098 
 1099     psllw_i2r(2, mm6);          /* shift z1 */
 1100     paddw_r2r(mm1, mm0);                    /* y0=tmp10 + tmp11 */
 1101 
 1102     pmulhw_m2r(RTjpeg_C4, mm6);         /* z1 */
 1103     psubw_r2r(mm1, mm3);                    /* y4=tmp10 - tmp11 */
 1104 
 1105    movq_r2m(mm0, *(dataptr+1));             /*save y0 */
 1106     movq_r2r(mm7, mm0);                     /* copy tmp13 */
 1107   
 1108     /* odd part */
 1109 
 1110    movq_r2m(mm3, *(dataptr+9));             /*save y4 */
 1111     paddw_r2r(mm5, mm4);                /* tmp10 = tmp4 + tmp5 */
 1112 
 1113     movq_m2r(tmp7, mm3);                    /* load tmp7 */
 1114     paddw_r2r(mm6, mm0);                /* tmp32 = tmp13 + z1 */
 1115 
 1116     paddw_r2r(mm2, mm5);                /* tmp11 = tmp5 + tmp6 */
 1117     psubw_r2r(mm6, mm7);                /* tmp33 = tmp13 - z1 */
 1118 
 1119    movq_r2m(mm0, *(dataptr+5));             /*save y2 */
 1120     paddw_r2r(mm3, mm2);                /* tmp12 = tmp6 + tmp7 */
 1121 
 1122     /* stage 4 */
 1123 
 1124    movq_r2m(mm7, *(dataptr+13));        /*save y6 */
 1125     movq_r2r(mm4, mm1);                     /* copy tmp10 */
 1126 
 1127     psubw_r2r(mm2, mm1);                    /* tmp10 - tmp12 */
 1128     psllw_i2r(2, mm4);          /* shift tmp10 */
 1129 
 1130     movq_m2r(RTjpeg_C2mC6, mm0);            /* load C2mC6 */
 1131     psllw_i2r(2, mm1);          /* shift (tmp10-tmp12) */
 1132 
 1133     pmulhw_m2r(RTjpeg_C6, mm1);         /* z5 */
 1134     psllw_i2r(2, mm5);          /* prepare for multiply  */
 1135 
 1136     pmulhw_r2r(mm0, mm4);                   /* multiply by converted real */
 1137 
 1138     /* stage 5 */
 1139 
 1140     pmulhw_m2r(RTjpeg_C4, mm5);         /* z3 */
 1141     psllw_i2r(2, mm2);          /* prepare for multiply  */
 1142 
 1143     pmulhw_m2r(RTjpeg_C2pC6, mm2);      /* multiply */
 1144     movq_r2r(mm3, mm0);                     /* copy tmp7 */
 1145 
 1146     movq_m2r(*(dataptr+9), mm7);            /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
 1147     paddw_r2r(mm1, mm4);                        /* z2 */
 1148 
 1149     paddw_r2r(mm5, mm0);                        /* z11 */
 1150     psubw_r2r(mm5, mm3);                        /* z13 */
 1151 
 1152     /* stage 6 */
 1153 
 1154     movq_r2r(mm3, mm5);                     /* copy z13 */
 1155     paddw_r2r(mm1, mm2);                        /* z4 */
 1156 
 1157     movq_r2r(mm0, mm6);                     /* copy z11 */
 1158     psubw_r2r(mm4, mm5);                        /* y3 */
 1159 
 1160     paddw_r2r(mm2, mm6);                        /* y1 */
 1161     paddw_r2r(mm4, mm3);                        /* y5 */
 1162 
 1163    movq_r2m(mm5, *(dataptr+7));             /*save y3 */
  1164     psubw_r2r(mm2, mm0);                        /* y7=z11 - z4 */
 1165 
 1166    movq_r2m(mm3, *(dataptr+11));        /*save y5 */
 1167 
 1168    movq_r2m(mm6, *(dataptr+3));             /*save y1 */
 1169 
 1170    movq_r2m(mm0, *(dataptr+15));        /*save y7 */
 1171     
 1172 
 1173 #endif
 1174 }
 1175 
 1176 #define FIX_1_082392200  ((__s32)  277)     /* FIX(1.082392200) */
 1177 #define FIX_1_414213562  ((__s32)  362)     /* FIX(1.414213562) */
 1178 #define FIX_1_847759065  ((__s32)  473)     /* FIX(1.847759065) */
 1179 #define FIX_2_613125930  ((__s32)  669)     /* FIX(2.613125930) */
 1180 
 1181 #define DESCALE(x) (__s16)( ((x)+4) >> 3)
 1182 
 1183 /* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */
 1184 
 1185 #define RL(x) ((x)>235) ? 235 : (((x)<16) ? 16 : (x))
 1186 #define MULTIPLY(var,const)  (((__s32) ((var) * (const)) + 128)>>8)
 1187 
 1188 void RTjpeg_idct_init(void)
 1189 {
 1190  int i;
 1191  
 1192  for(i=0; i<64; i++)
 1193  {
 1194   RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32;
 1195   RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32;
 1196  }
 1197 }
 1198 
 1199 void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
 1200 {
 1201 #ifdef MMX
 1202 
 1203 static mmx_t fix_141            = (mmx_t)(long long)0x5a825a825a825a82LL;
 1204 static mmx_t fix_184n261    = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
 1205 static mmx_t fix_184            = (mmx_t)(long long)0x7641764176417641LL;
 1206 static mmx_t fix_n184       = (mmx_t)(long long)0x896f896f896f896fLL;
 1207 static mmx_t fix_108n184    = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
 1208 
 1209   mmx_t workspace[64];
 1210   mmx_t *wsptr = workspace;
 1211   register mmx_t *dataptr = (mmx_t *)odata;
 1212   mmx_t *idata = (mmx_t *)data;
 1213 
 1214   rskip = rskip>>3;
 1215 /*
 1216  * Perform inverse DCT on one block of coefficients.
 1217  */
 1218 
 1219     /* Odd part */
 1220 
 1221     movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */
 1222 
 1223     movq_m2r(*(idata+6), mm0);      /* load idata[DCTSIZE*3] */
 1224 
 1225     movq_m2r(*(idata+2), mm3);      /* load idata[DCTSIZE*1] */
 1226 
 1227     movq_r2r(mm1, mm2);             /* copy tmp6    /* phase 6 */ */
 1228 
 1229     movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */
 1230 
 1231     paddw_r2r(mm0, mm1);                /* z13 = tmp6 + tmp5; */
 1232 
 1233     psubw_r2r(mm0, mm2);                /* z10 = tmp6 - tmp5    */
 1234 
 1235     psllw_i2r(2, mm2);              /* shift z10 */
 1236     movq_r2r(mm2, mm0);                 /* copy z10 */
 1237 
 1238     pmulhw_m2r(fix_184n261, mm2);   /* MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ */
 1239     movq_r2r(mm3, mm5);             /* copy tmp4 */
 1240 
 1241     pmulhw_m2r(fix_n184, mm0);      /* MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ */
 1242     paddw_r2r(mm4, mm3);                /* z11 = tmp4 + tmp7; */
 1243 
 1244     movq_r2r(mm3, mm6);             /* copy z11         /* phase 5 */ */
 1245     psubw_r2r(mm4, mm5);                /* z12 = tmp4 - tmp7; */
 1246 
 1247     psubw_r2r(mm1, mm6);                /* z11-z13 */
 1248     psllw_i2r(2, mm5);              /*  shift z12 */
 1249 
 1250     movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */
 1251     movq_r2r(mm5, mm7);             /*  copy z12 */
 1252 
 1253     pmulhw_m2r(fix_108n184, mm5); /*    MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part */
 1254     paddw_r2r(mm1, mm3);                /* tmp7 = z11 + z13;     */
 1255 
 1256     /*ok */
 1257 
 1258     /* Even part */
 1259     pmulhw_m2r(fix_184, mm7);       /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ */
 1260     psllw_i2r(2, mm6);
 1261 
 1262     movq_m2r(*(idata+4), mm1);      /* load idata[DCTSIZE*2] */
 1263 
 1264     paddw_r2r(mm5, mm0);                /*  tmp10 */
 1265 
 1266     paddw_r2r(mm7, mm2);                /* tmp12 */
 1267 
 1268     pmulhw_m2r(fix_141, mm6);       /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ */
 1269     psubw_r2r(mm3, mm2);                /* tmp6 = tmp12 - tmp7 */
 1270 
 1271     movq_r2r(mm1, mm5);             /* copy tmp1 */
 1272     paddw_r2r(mm4, mm1);                /* tmp13= tmp1 + tmp3;  /* phases 5-3 */ */
 1273 
 1274     psubw_r2r(mm4, mm5);                /* tmp1-tmp3 */
 1275     psubw_r2r(mm2, mm6);                /* tmp5 = tmp11 - tmp6; */
 1276 
 1277     movq_r2m(mm1, *(wsptr));        /* save tmp13 in workspace */
 1278     psllw_i2r(2, mm5);  /* shift tmp1-tmp3 */
 1279     
 1280     movq_m2r(*(idata), mm7);        /* load idata[DCTSIZE*0] */
 1281 
 1282     pmulhw_m2r(fix_141, mm5);       /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */
 1283     paddw_r2r(mm6, mm0);                /* tmp4 = tmp10 + tmp5; */
 1284 
 1285     movq_m2r(*(idata+8), mm4);  /* load idata[DCTSIZE*4] */
 1286     
 1287     psubw_r2r(mm1, mm5);                /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ */
 1288 
 1289     movq_r2m(mm0, *(wsptr+4));      /* save tmp4 in workspace */
 1290     movq_r2r(mm7, mm1);             /* copy tmp0    /* phase 3 */ */
 1291 
 1292     movq_r2m(mm5, *(wsptr+2));      /* save tmp12 in workspace */
 1293     psubw_r2r(mm4, mm1);                /* tmp11 = tmp0 - tmp2;  */
 1294 
 1295     paddw_r2r(mm4, mm7);                /* tmp10 = tmp0 + tmp2; */
 1296    movq_r2r(mm1, mm5);              /* copy tmp11 */
 1297     
 1298     paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */
 1299     movq_r2r(mm7, mm4);             /* copy tmp10       /* phase 2 */ */
 1300 
 1301     paddw_m2r(*(wsptr), mm7);       /* tmp0 = tmp10 + tmp13;     */
 1302 
 1303     psubw_m2r(*(wsptr), mm4);       /* tmp3 = tmp10 - tmp13; */
 1304     movq_r2r(mm7, mm0);             /*  copy tmp0 */
 1305 
 1306     psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */
 1307     paddw_r2r(mm3, mm7);                /*  wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */
 1308     
 1309     psubw_r2r(mm3, mm0);                /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */
 1310 
 1311     movq_r2m(mm7, *(wsptr));        /*  wsptr[DCTSIZE*0] */
 1312     movq_r2r(mm1, mm3);             /*  copy tmp1 */
 1313 
 1314     movq_r2m(mm0, *(wsptr+14));     /* wsptr[DCTSIZE*7] */
 1315     paddw_r2r(mm2, mm1);                /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */
 1316 
 1317     psubw_r2r(mm2, mm3);                /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */
 1318 
 1319     movq_r2m(mm1, *(wsptr+2));      /* wsptr[DCTSIZE*1] */
 1320     movq_r2r(mm4, mm1);             /*  copy tmp3 */
 1321 
 1322     movq_r2m(mm3, *(wsptr+12));     /* wsptr[DCTSIZE*6] */
 1323 
 1324     paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */
 1325 
 1326     psubw_m2r(*(wsptr+4), mm1);     /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */
 1327 
 1328     movq_r2m(mm4, *(wsptr+8));      
 1329     movq_r2r(mm5, mm7);             /* copy tmp2 */
 1330 
 1331     paddw_r2r(mm6, mm5);                /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */
 1332 
 1333     movq_r2m(mm1, *(wsptr+6));  
 1334     psubw_r2r(mm6, mm7);                /*  wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */
 1335 
 1336     movq_r2m(mm5, *(wsptr+4));  
 1337 
 1338     movq_r2m(mm7, *(wsptr+10));     
 1339 
 1340     /*ok */
 1341 
 1342 
 1343 /*****************************************************************/
 1344 
 1345     idata++;
 1346     wsptr++;
 1347 
 1348 /*****************************************************************/
 1349 
 1350     movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */
 1351 
 1352     movq_m2r(*(idata+6), mm0);      /* load idata[DCTSIZE*3] */
 1353 
 1354     movq_m2r(*(idata+2),    mm3);       /* load idata[DCTSIZE*1] */
 1355     movq_r2r(mm1, mm2);             /*  copy tmp6   /* phase 6 */ */
 1356 
 1357     movq_m2r(*(idata+14),   mm4);       /* load idata[DCTSIZE*7] */
 1358     paddw_r2r(mm0, mm1);                /*  z13 = tmp6 + tmp5; */
 1359 
 1360     psubw_r2r(mm0, mm2);                /*  z10 = tmp6 - tmp5    */
 1361 
 1362     psllw_i2r(2, mm2);              /*  shift z10 */
 1363     movq_r2r(mm2, mm0);             /*  copy z10 */
 1364 
 1365     pmulhw_m2r(fix_184n261, mm2);   /* MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ */
 1366     movq_r2r(mm3, mm5);             /*  copy tmp4 */
 1367 
 1368     pmulhw_m2r(fix_n184, mm0);      /* MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ */
 1369     paddw_r2r(mm4, mm3);                /* z11 = tmp4 + tmp7; */
 1370 
 1371     movq_r2r(mm3, mm6);             /* copy z11         /* phase 5 */ */
 1372     psubw_r2r(mm4, mm5);                /*  z12 = tmp4 - tmp7; */
 1373 
 1374     psubw_r2r(mm1, mm6);                /* z11-z13 */
 1375     psllw_i2r(2, mm5);              /*  shift z12 */
 1376 
 1377     movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */
 1378     movq_r2r(mm5, mm7);             /* copy z12 */
 1379 
 1380     pmulhw_m2r(fix_108n184, mm5);   /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part */
 1381     paddw_r2r(mm1, mm3);                /* tmp7 = z11 + z13;     */
 1382 
 1383     /*ok */
 1384 
 1385     /* Even part */
 1386     pmulhw_m2r(fix_184, mm7);       /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ */
 1387     psllw_i2r(2, mm6);
 1388 
 1389     movq_m2r(*(idata+4), mm1);      /* load idata[DCTSIZE*2] */
 1390 
 1391     paddw_r2r(mm5, mm0);                /*  tmp10 */
 1392 
 1393     paddw_r2r(mm7, mm2);                /* tmp12 */
 1394 
 1395     pmulhw_m2r(fix_141, mm6);       /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ */
 1396     psubw_r2r(mm3, mm2);                /* tmp6 = tmp12 - tmp7 */
 1397 
 1398     movq_r2r(mm1, mm5);             /* copy tmp1 */
 1399     paddw_r2r(mm4, mm1);                /* tmp13= tmp1 + tmp3;  /* phases 5-3 */ */
 1400 
 1401     psubw_r2r(mm4, mm5);                /* tmp1-tmp3 */
 1402     psubw_r2r(mm2, mm6);                /* tmp5 = tmp11 - tmp6; */
 1403 
 1404     movq_r2m(mm1, *(wsptr));        /* save tmp13 in workspace */
 1405     psllw_i2r(2, mm5);              /* shift tmp1-tmp3 */
 1406     
 1407     movq_m2r(*(idata), mm7);        /* load idata[DCTSIZE*0] */
 1408     paddw_r2r(mm6, mm0);                /* tmp4 = tmp10 + tmp5; */
 1409 
 1410     pmulhw_m2r(fix_141, mm5);       /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */
 1411 
 1412     movq_m2r(*(idata+8), mm4);    /* load idata[DCTSIZE*4] */
 1413     
 1414     psubw_r2r(mm1, mm5);                /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ */
 1415 
 1416     movq_r2m(mm0, *(wsptr+4));      /* save tmp4 in workspace */
 1417     movq_r2r(mm7, mm1);             /* copy tmp0    /* phase 3 */ */
 1418 
 1419     movq_r2m(mm5, *(wsptr+2));      /* save tmp12 in workspace */
 1420     psubw_r2r(mm4, mm1);                /* tmp11 = tmp0 - tmp2;  */
 1421 
 1422     paddw_r2r(mm4, mm7);                /* tmp10 = tmp0 + tmp2; */
 1423    movq_r2r(mm1, mm5);              /* copy tmp11 */
 1424     
 1425     paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */
 1426     movq_r2r(mm7, mm4);             /* copy tmp10       /* phase 2 */ */
 1427 
 1428     paddw_m2r(*(wsptr), mm7);       /* tmp0 = tmp10 + tmp13;     */
 1429 
 1430     psubw_m2r(*(wsptr), mm4);       /* tmp3 = tmp10 - tmp13; */
 1431     movq_r2r(mm7, mm0);             /* copy tmp0 */
 1432 
 1433     psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */
 1434     paddw_r2r(mm3, mm7);                /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */
 1435     
 1436     psubw_r2r(mm3, mm0);                /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */
 1437 
 1438     movq_r2m(mm7, *(wsptr));        /* wsptr[DCTSIZE*0] */
 1439     movq_r2r(mm1, mm3);             /* copy tmp1 */
 1440 
 1441     movq_r2m(mm0, *(wsptr+14));     /* wsptr[DCTSIZE*7] */
 1442     paddw_r2r(mm2, mm1);                /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */
 1443 
 1444     psubw_r2r(mm2, mm3);                /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */
 1445 
 1446     movq_r2m(mm1, *(wsptr+2));      /* wsptr[DCTSIZE*1] */
 1447     movq_r2r(mm4, mm1);             /* copy tmp3 */
 1448 
 1449     movq_r2m(mm3, *(wsptr+12));     /* wsptr[DCTSIZE*6] */
 1450 
 1451     paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */
 1452 
 1453     psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */
 1454 
 1455     movq_r2m(mm4, *(wsptr+8));      
 1456     movq_r2r(mm5, mm7);             /* copy tmp2 */
 1457 
 1458     paddw_r2r(mm6, mm5);                /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */
 1459 
 1460     movq_r2m(mm1, *(wsptr+6));      
 1461     psubw_r2r(mm6, mm7);                /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */
 1462 
 1463     movq_r2m(mm5, *(wsptr+4));  
 1464 
 1465     movq_r2m(mm7, *(wsptr+10));
 1466 
 1467 /*****************************************************************/
 1468 
 1469   /* Pass 2: process rows from work array, store into output array. */
 1470   /* Note that we must descale the results by a factor of 8 == 2**3, */
 1471   /* and also undo the PASS1_BITS scaling. */
 1472 
 1473 /*****************************************************************/
 1474     /* Even part */
 1475 
 1476     wsptr--;
 1477 
 1478 /*    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
 1479 /*    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
 1480 /*    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
 1481 /*    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
 1482     movq_m2r(*(wsptr), mm0);        /* wsptr[0,0],[0,1],[0,2],[0,3] */
 1483 
 1484     movq_m2r(*(wsptr+1),    mm1);       /* wsptr[0,4],[0,5],[0,6],[0,7] */
 1485     movq_r2r(mm0, mm2);
 1486     
 1487     movq_m2r(*(wsptr+2), mm3);      /* wsptr[1,0],[1,1],[1,2],[1,3] */
 1488     paddw_r2r(mm1, mm0);                /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */
 1489 
 1490     movq_m2r(*(wsptr+3), mm4);      /* wsptr[1,4],[1,5],[1,6],[1,7] */
 1491     psubw_r2r(mm1, mm2);                /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */
 1492 
 1493     movq_r2r(mm0, mm6);
 1494     movq_r2r(mm3, mm5);
 1495     
 1496     paddw_r2r(mm4, mm3);                /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
 1497     movq_r2r(mm2, mm1);
 1498 
 1499     psubw_r2r(mm4, mm5);                /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
 1500     punpcklwd_r2r(mm3, mm0);        /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */
 1501 
 1502     movq_m2r(*(wsptr+7), mm7);      /* wsptr[3,4],[3,5],[3,6],[3,7] */
 1503     punpckhwd_r2r(mm3, mm6);        /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */
 1504 
 1505     movq_m2r(*(wsptr+4), mm3);      /* wsptr[2,0],[2,1],[2,2],[2,3] */
 1506     punpckldq_r2r(mm6, mm0);        /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
 1507 
 1508     punpcklwd_r2r(mm5, mm1);        /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
 1509     movq_r2r(mm3, mm4);
 1510 
 1511     movq_m2r(*(wsptr+6), mm6);      /* wsptr[3,0],[3,1],[3,2],[3,3] */
 1512     punpckhwd_r2r(mm5, mm2);        /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */
 1513 
 1514     movq_m2r(*(wsptr+5), mm5);      /* wsptr[2,4],[2,5],[2,6],[2,7] */
 1515     punpckldq_r2r(mm2, mm1);        /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
 1516 
 1517     
 1518     paddw_r2r(mm5, mm3);                /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
 1519     movq_r2r(mm6, mm2);
 1520 
 1521     psubw_r2r(mm5, mm4);                /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
 1522     paddw_r2r(mm7, mm6);                /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */
 1523 
 1524     movq_r2r(mm3, mm5);
 1525     punpcklwd_r2r(mm6, mm3);        /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */
 1526     
 1527     psubw_r2r(mm7, mm2);                /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
 1528     punpckhwd_r2r(mm6, mm5);        /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */
 1529 
 1530     movq_r2r(mm4, mm7);
 1531     punpckldq_r2r(mm5, mm3);        /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */
 1532                          
 1533     punpcklwd_r2r(mm2, mm4);        /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */
 1534 
 1535     punpckhwd_r2r(mm2, mm7);        /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */
 1536 
 1537     punpckldq_r2r(mm7, mm4);        /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
 1538     movq_r2r(mm1, mm6);
 1539 
 1540     /*ok */
 1541 
 1542 /*  mm0 =   ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
 1543 /*  mm1 =   ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
 1544 
 1545 
 1546     movq_r2r(mm0, mm2);
 1547     punpckhdq_r2r(mm4, mm6);        /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */
 1548 
 1549     punpckldq_r2r(mm4, mm1);        /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
 1550     psllw_i2r(2, mm6);
 1551 
 1552     pmulhw_m2r(fix_141, mm6);
 1553     punpckldq_r2r(mm3, mm0);        /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */
 1554 
 1555     punpckhdq_r2r(mm3, mm2);        /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
 1556     movq_r2r(mm0, mm7);
 1557 
 1558 /*    tmp0 = tmp10 + tmp13; */
 1559 /*    tmp3 = tmp10 - tmp13; */
 1560     paddw_r2r(mm2, mm0);                /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
 1561     psubw_r2r(mm2, mm7);                /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */
 1562 
 1563 /*    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
 1564     psubw_r2r(mm2, mm6);                /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
 1565 /*    tmp1 = tmp11 + tmp12; */
 1566 /*    tmp2 = tmp11 - tmp12; */
 1567     movq_r2r(mm1, mm5);
 1568 
 1569     /*OK */
 1570 
 1571     /* Odd part */
 1572 
 1573 /*    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
 1574 /*    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
 1575 /*    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
 1576 /*    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
 1577     movq_m2r(*(wsptr), mm3);        /* wsptr[0,0],[0,1],[0,2],[0,3] */
 1578     paddw_r2r(mm6, mm1);                /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */
 1579 
 1580     movq_m2r(*(wsptr+1), mm4);      /* wsptr[0,4],[0,5],[0,6],[0,7] */
 1581     psubw_r2r(mm6, mm5);                /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */
 1582 
 1583     movq_r2r(mm3, mm6);
 1584     punpckldq_r2r(mm4, mm3);        /* wsptr[0,0],[0,1],[0,4],[0,5] */
 1585 
 1586     punpckhdq_r2r(mm6, mm4);        /* wsptr[0,6],[0,7],[0,2],[0,3] */
 1587     movq_r2r(mm3, mm2);
 1588 
 1589 /*Save tmp0 and tmp1 in wsptr */
 1590     movq_r2m(mm0, *(wsptr));        /* save tmp0 */
 1591     paddw_r2r(mm4, mm2);                /* wsptr[xxx],[0,z11],[xxx],[0,z13] */
 1592 
 1593     
 1594 /*Continue with z10 --- z13 */
 1595     movq_m2r(*(wsptr+2), mm6);      /* wsptr[1,0],[1,1],[1,2],[1,3] */
 1596     psubw_r2r(mm4, mm3);                /* wsptr[xxx],[0,z12],[xxx],[0,z10] */
 1597 
 1598     movq_m2r(*(wsptr+3), mm0);      /* wsptr[1,4],[1,5],[1,6],[1,7] */
 1599     movq_r2r(mm6, mm4);
 1600 
 1601     movq_r2m(mm1, *(wsptr+1));      /* save tmp1 */
 1602     punpckldq_r2r(mm0, mm6);        /* wsptr[1,0],[1,1],[1,4],[1,5] */
 1603 
 1604     punpckhdq_r2r(mm4, mm0);        /* wsptr[1,6],[1,7],[1,2],[1,3] */
 1605     movq_r2r(mm6, mm1);
 1606     
 1607 /*Save tmp2 and tmp3 in wsptr */
 1608     paddw_r2r(mm0, mm6);                /* wsptr[xxx],[1,z11],[xxx],[1,z13] */
 1609     movq_r2r(mm2, mm4);
 1610     
 1611 /*Continue with z10 --- z13 */
 1612     movq_r2m(mm5, *(wsptr+2));      /* save tmp2 */
 1613     punpcklwd_r2r(mm6, mm2);        /* wsptr[xxx],[xxx],[0,z11],[1,z11] */
 1614 
 1615     psubw_r2r(mm0, mm1);                /* wsptr[xxx],[1,z12],[xxx],[1,z10] */
 1616     punpckhwd_r2r(mm6, mm4);        /* wsptr[xxx],[xxx],[0,z13],[1,z13] */
 1617 
 1618     movq_r2r(mm3, mm0);
 1619     punpcklwd_r2r(mm1, mm3);        /* wsptr[xxx],[xxx],[0,z12],[1,z12] */
 1620 
 1621     movq_r2m(mm7, *(wsptr+3));      /* save tmp3 */
 1622     punpckhwd_r2r(mm1, mm0);        /* wsptr[xxx],[xxx],[0,z10],[1,z10] */
 1623 
 1624     movq_m2r(*(wsptr+4), mm6);      /* wsptr[2,0],[2,1],[2,2],[2,3] */
 1625     punpckhdq_r2r(mm2, mm0);        /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */
 1626 
 1627     movq_m2r(*(wsptr+5), mm7);  /* wsptr[2,4],[2,5],[2,6],[2,7] */
 1628     punpckhdq_r2r(mm4, mm3);        /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */
 1629 
 1630     movq_m2r(*(wsptr+6), mm1);  /* wsptr[3,0],[3,1],[3,2],[3,3] */
 1631     movq_r2r(mm6, mm4);
 1632 
 1633     punpckldq_r2r(mm7, mm6);        /* wsptr[2,0],[2,1],[2,4],[2,5] */
 1634     movq_r2r(mm1, mm5);
 1635 
 1636     punpckhdq_r2r(mm4, mm7);        /* wsptr[2,6],[2,7],[2,2],[2,3] */
 1637     movq_r2r(mm6, mm2);
 1638     
 1639     movq_m2r(*(wsptr+7), mm4);  /* wsptr[3,4],[3,5],[3,6],[3,7] */
 1640     paddw_r2r(mm7, mm6);                /* wsptr[xxx],[2,z11],[xxx],[2,z13] */
 1641 
 1642     psubw_r2r(mm7, mm2);                /* wsptr[xxx],[2,z12],[xxx],[2,z10] */
 1643     punpckldq_r2r(mm4, mm1);        /* wsptr[3,0],[3,1],[3,4],[3,5] */
 1644 
 1645     punpckhdq_r2r(mm5, mm4);        /* wsptr[3,6],[3,7],[3,2],[3,3] */
 1646     movq_r2r(mm1, mm7);
 1647 
 1648     paddw_r2r(mm4, mm1);                /* wsptr[xxx],[3,z11],[xxx],[3,z13] */
 1649     psubw_r2r(mm4, mm7);                /* wsptr[xxx],[3,z12],[xxx],[3,z10] */
 1650 
 1651     movq_r2r(mm6, mm5);
 1652     punpcklwd_r2r(mm1, mm6);        /* wsptr[xxx],[xxx],[2,z11],[3,z11] */
 1653 
 1654     punpckhwd_r2r(mm1, mm5);        /* wsptr[xxx],[xxx],[2,z13],[3,z13] */
 1655     movq_r2r(mm2, mm4);
 1656 
 1657     punpcklwd_r2r(mm7, mm2);        /* wsptr[xxx],[xxx],[2,z12],[3,z12] */
 1658 
 1659     punpckhwd_r2r(mm7, mm4);        /* wsptr[xxx],[xxx],[2,z10],[3,z10] */
 1660 
 1661     punpckhdq_r2r(mm6, mm4);        /*/ wsptr[2,z10],[3,z10],[2,z11],[3,z11] */
 1662 
 1663     punpckhdq_r2r(mm5, mm2);        /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
 1664     movq_r2r(mm0, mm5);
 1665 
 1666     punpckldq_r2r(mm4, mm0);        /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */
 1667 
 1668     punpckhdq_r2r(mm4, mm5);        /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
 1669     movq_r2r(mm3, mm4);
 1670 
 1671     punpckhdq_r2r(mm2, mm4);        /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
 1672     movq_r2r(mm5, mm1);
 1673 
 1674     punpckldq_r2r(mm2, mm3);        /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
  1675 /*    tmp7 = z11 + z13;     (phase 5) */
  1676 /*    tmp8 = z11 - z13;     (phase 5) */
 1677     psubw_r2r(mm4, mm1);                /* tmp8 */
 1678 
 1679     paddw_r2r(mm4, mm5);                /* tmp7 */
  1680 /*    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); (2*c4) */
 1681     psllw_i2r(2, mm1);
 1682 
 1683     psllw_i2r(2, mm0);
 1684 
 1685     pmulhw_m2r(fix_141, mm1);       /* tmp21 */
  1686 /*    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  (2*(c2-c6)) */
  1687 /*          + MULTIPLY(z10, - FIX_1_847759065); (2*c2) */
 1688     psllw_i2r(2, mm3);
 1689     movq_r2r(mm0, mm7);
 1690 
 1691     pmulhw_m2r(fix_n184, mm7);
 1692     movq_r2r(mm3, mm6);
 1693 
 1694     movq_m2r(*(wsptr), mm2);        /* tmp0,final1 */
 1695 
 1696     pmulhw_m2r(fix_108n184, mm6);
  1697 /*   tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) (-2*(c2+c6)) */
  1698 /*          + MULTIPLY(z12, FIX_1_847759065); (2*c2) */
 1699     movq_r2r(mm2, mm4);             /* final1 */
 1700   
 1701     pmulhw_m2r(fix_184n261, mm0);
 1702     paddw_r2r(mm5, mm2);                /* tmp0+tmp7,final1 */
 1703 
 1704     pmulhw_m2r(fix_184, mm3);
 1705     psubw_r2r(mm5, mm4);                /* tmp0-tmp7,final1 */
 1706 
  1707 /*    tmp6 = tmp22 - tmp7;  (phase 2) */
 1708     psraw_i2r(3, mm2);              /* outptr[0,0],[1,0],[2,0],[3,0],final1 */
 1709 
 1710     paddw_r2r(mm6, mm7);                /* tmp20 */
 1711     psraw_i2r(3, mm4);              /* outptr[0,7],[1,7],[2,7],[3,7],final1 */
 1712 
 1713     paddw_r2r(mm0, mm3);                /* tmp22 */
 1714 
 1715 /*    tmp5 = tmp21 - tmp6; */
 1716     psubw_r2r(mm5, mm3);                /* tmp6 */
 1717 
 1718 /*    tmp4 = tmp20 + tmp5; */
 1719     movq_m2r(*(wsptr+1), mm0);      /* tmp1,final2 */
 1720     psubw_r2r(mm3, mm1);                /* tmp5 */
 1721 
 1722     movq_r2r(mm0, mm6);             /* final2 */
 1723     paddw_r2r(mm3, mm0);                /* tmp1+tmp6,final2 */
 1724 
 1725     /* Final output stage: scale down by a factor of 8 and range-limit */
 1726 
 1727 
 1728 /*    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
 1729 /*              & RANGE_MASK]; */
 1730 /*    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
 1731 /*              & RANGE_MASK];  final1 */
 1732 
 1733 
 1734 /*    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
 1735 /*              & RANGE_MASK]; */
 1736 /*    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
 1737 /*              & RANGE_MASK];  final2 */
 1738     psubw_r2r(mm3, mm6);                /* tmp1-tmp6,final2 */
 1739     psraw_i2r(3, mm0);              /* outptr[0,1],[1,1],[2,1],[3,1] */
 1740 
 1741     psraw_i2r(3, mm6);              /* outptr[0,6],[1,6],[2,6],[3,6] */
 1742     
 1743     packuswb_r2r(mm4, mm0);         /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */
 1744     
 1745     movq_m2r(*(wsptr+2), mm5);      /* tmp2,final3 */
 1746     packuswb_r2r(mm6, mm2);         /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */
 1747 
 1748 /*    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
 1749 /*              & RANGE_MASK]; */
 1750 /*    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
 1751 /*              & RANGE_MASK];  final3 */
 1752     paddw_r2r(mm1, mm7);                /* tmp4 */
 1753     movq_r2r(mm5, mm3);
 1754 
 1755     paddw_r2r(mm1, mm5);                /* tmp2+tmp5 */
 1756     psubw_r2r(mm1, mm3);                /* tmp2-tmp5 */
 1757 
 1758     psraw_i2r(3, mm5);              /* outptr[0,2],[1,2],[2,2],[3,2] */
 1759 
 1760     movq_m2r(*(wsptr+3), mm4);      /* tmp3,final4 */
 1761     psraw_i2r(3, mm3);              /* outptr[0,5],[1,5],[2,5],[3,5] */
 1762 
 1763 
 1764 
 1765 /*    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
 1766 /*              & RANGE_MASK]; */
 1767 /*    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
 1768 /*              & RANGE_MASK];  final4 */
 1769     movq_r2r(mm4, mm6);
 1770     paddw_r2r(mm7, mm4);                /* tmp3+tmp4 */
 1771 
 1772     psubw_r2r(mm7, mm6);                /* tmp3-tmp4 */
 1773     psraw_i2r(3, mm4);              /* outptr[0,4],[1,4],[2,4],[3,4] */
 1774 
 1775     /* mov          ecx, [dataptr] */
 1776 
 1777     psraw_i2r(3, mm6);              /* outptr[0,3],[1,3],[2,3],[3,3] */
 1778 
 1779     packuswb_r2r(mm4, mm5);         /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */
 1780 
 1781     packuswb_r2r(mm3, mm6);         /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
 1782     movq_r2r(mm2, mm4);
 1783 
 1784     movq_r2r(mm5, mm7);
 1785     punpcklbw_r2r(mm0, mm2);        /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */
 1786 
 1787     punpckhbw_r2r(mm0, mm4);        /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
 1788     movq_r2r(mm2, mm1);
 1789 
 1790     punpcklbw_r2r(mm6, mm5);        /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */
 1791 
 1792     /* add          dataptr, 4 */
 1793 
 1794     punpckhbw_r2r(mm6, mm7);        /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */
 1795 
 1796     punpcklwd_r2r(mm5, mm2);        /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */
 1797     
 1798     /* add          ecx, output_col */
 1799 
 1800     movq_r2r(mm7, mm6);
 1801     punpckhwd_r2r(mm5, mm1);        /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */
 1802 
 1803     movq_r2r(mm2, mm0);
 1804     punpcklwd_r2r(mm4, mm6);        /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */
 1805 
 1806     /* mov          idata, [dataptr] */
 1807     
 1808     punpckldq_r2r(mm6, mm2);        /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */
 1809 
 1810     /* add          dataptr, 4 */
 1811      
 1812     movq_r2r(mm1, mm3);
 1813 
 1814     /* add          idata, output_col  */
 1815     
 1816     punpckhwd_r2r(mm4, mm7);        /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */
 1817     
 1818     movq_r2m(mm2, *(dataptr));
 1819     
 1820     punpckhdq_r2r(mm6, mm0);        /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */
 1821 
 1822     dataptr += rskip;
 1823     movq_r2m(mm0, *(dataptr));
 1824 
 1825     punpckldq_r2r(mm7, mm1);        /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
 1826     punpckhdq_r2r(mm7, mm3);        /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */
 1827     
 1828     dataptr += rskip;
 1829     movq_r2m(mm1, *(dataptr));
 1830 
 1831     dataptr += rskip;
 1832     movq_r2m(mm3, *(dataptr));
 1833 
 1834 /*******************************************************************/
 1835 
 1836     wsptr += 8;
 1837 
 1838 /*******************************************************************/
 1839 
 1840 /*    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
 1841 /*    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
 1842 /*    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
 1843 /*    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
 1844     movq_m2r(*(wsptr), mm0);        /* wsptr[0,0],[0,1],[0,2],[0,3] */
 1845 
 1846     movq_m2r(*(wsptr+1), mm1);      /* wsptr[0,4],[0,5],[0,6],[0,7] */
 1847     movq_r2r(mm0, mm2);
 1848     
 1849     movq_m2r(*(wsptr+2), mm3);      /* wsptr[1,0],[1,1],[1,2],[1,3] */
 1850     paddw_r2r(mm1, mm0);                /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */
 1851 
 1852     movq_m2r(*(wsptr+3), mm4);      /* wsptr[1,4],[1,5],[1,6],[1,7] */
 1853     psubw_r2r(mm1, mm2);                /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */
 1854 
 1855     movq_r2r(mm0, mm6);
 1856     movq_r2r(mm3, mm5);
 1857     
 1858     paddw_r2r(mm4, mm3);                /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
 1859     movq_r2r(mm2, mm1);
 1860 
 1861     psubw_r2r(mm4, mm5);                /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
 1862     punpcklwd_r2r(mm3, mm0);        /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */
 1863 
 1864     movq_m2r(*(wsptr+7), mm7);  /* wsptr[3,4],[3,5],[3,6],[3,7] */
 1865     punpckhwd_r2r(mm3, mm6);        /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */
 1866 
 1867     movq_m2r(*(wsptr+4),    mm3);       /* wsptr[2,0],[2,1],[2,2],[2,3] */
 1868     punpckldq_r2r(mm6, mm0);        /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
 1869 
 1870     punpcklwd_r2r(mm5, mm1);        /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
 1871     movq_r2r(mm3, mm4);
 1872 
 1873     movq_m2r(*(wsptr+6), mm6);  /* wsptr[3,0],[3,1],[3,2],[3,3] */
 1874     punpckhwd_r2r(mm5, mm2);        /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */
 1875 
 1876     movq_m2r(*(wsptr+5), mm5);  /* wsptr[2,4],[2,5],[2,6],[2,7] */
 1877     punpckldq_r2r(mm2, mm1);        /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
 1878 
 1879     paddw_r2r(mm5, mm3);                /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
 1880     movq_r2r(mm6, mm2);
 1881 
 1882     psubw_r2r(mm5, mm4);                /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
 1883     paddw_r2r(mm7, mm6);                /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */
 1884 
 1885     movq_r2r(mm3, mm5);
 1886     punpcklwd_r2r(mm6, mm3);        /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */
 1887     
 1888     psubw_r2r(mm7, mm2);                /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
 1889     punpckhwd_r2r(mm6, mm5);        /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */
 1890 
 1891     movq_r2r(mm4, mm7);
 1892     punpckldq_r2r(mm5, mm3);        /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */
 1893 
 1894     punpcklwd_r2r(mm2, mm4);        /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */
 1895 
 1896     punpckhwd_r2r(mm2, mm7);        /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */
 1897 
 1898     punpckldq_r2r(mm7, mm4);        /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
 1899     movq_r2r(mm1, mm6);
 1900 
 1901     /*OK */
 1902 
 1903 /*  mm0 =   ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
 1904 /*  mm1 =   ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
 1905 
 1906     movq_r2r(mm0, mm2);
 1907     punpckhdq_r2r(mm4, mm6);        /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */
 1908 
 1909     punpckldq_r2r(mm4, mm1);        /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
 1910     psllw_i2r(2, mm6);
 1911 
 1912     pmulhw_m2r(fix_141, mm6);
 1913     punpckldq_r2r(mm3, mm0);        /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */
 1914 
 1915     punpckhdq_r2r(mm3, mm2);        /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
 1916     movq_r2r(mm0, mm7);
 1917 
 1918 /*    tmp0 = tmp10 + tmp13; */
 1919 /*    tmp3 = tmp10 - tmp13; */
 1920     paddw_r2r(mm2, mm0);                /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
 1921     psubw_r2r(mm2, mm7);                /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */
 1922 
 1923 /*    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
 1924     psubw_r2r(mm2, mm6);                /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
 1925 /*    tmp1 = tmp11 + tmp12; */
 1926 /*    tmp2 = tmp11 - tmp12; */
 1927     movq_r2r(mm1, mm5);
 1928 
 1929      /*OK */
 1930 
 1931 
 1932     /* Odd part */
 1933 
 1934 /*    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
 1935 /*    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
 1936 /*    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
 1937 /*    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
 1938     movq_m2r(*(wsptr), mm3);        /* wsptr[0,0],[0,1],[0,2],[0,3] */
 1939     paddw_r2r(mm6, mm1);                /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */
 1940 
 1941     movq_m2r(*(wsptr+1),    mm4);       /* wsptr[0,4],[0,5],[0,6],[0,7] */
 1942     psubw_r2r(mm6, mm5);                /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */
 1943 
 1944     movq_r2r(mm3, mm6);
 1945     punpckldq_r2r(mm4, mm3);        /* wsptr[0,0],[0,1],[0,4],[0,5] */
 1946 
 1947     punpckhdq_r2r(mm6, mm4);        /* wsptr[0,6],[0,7],[0,2],[0,3] */
 1948     movq_r2r(mm3, mm2);
 1949 
 1950 /*Save tmp0 and tmp1 in wsptr */
 1951     movq_r2m(mm0, *(wsptr));        /* save tmp0 */
 1952     paddw_r2r(mm4, mm2);                /* wsptr[xxx],[0,z11],[xxx],[0,z13] */
 1953 
 1954     
 1955 /*Continue with z10 --- z13 */
 1956     movq_m2r(*(wsptr+2), mm6);      /* wsptr[1,0],[1,1],[1,2],[1,3] */
 1957     psubw_r2r(mm4, mm3);                /* wsptr[xxx],[0,z12],[xxx],[0,z10] */
 1958 
 1959     movq_m2r(*(wsptr+3), mm0);      /* wsptr[1,4],[1,5],[1,6],[1,7] */
 1960     movq_r2r(mm6, mm4);
 1961 
 1962     movq_r2m(mm1, *(wsptr+1));      /* save tmp1 */
 1963     punpckldq_r2r(mm0, mm6);        /* wsptr[1,0],[1,1],[1,4],[1,5] */
 1964 
 1965     punpckhdq_r2r(mm4, mm0);        /* wsptr[1,6],[1,7],[1,2],[1,3] */
 1966     movq_r2r(mm6, mm1);
 1967     
 1968 /*Save tmp2 and tmp3 in wsptr */
 1969     paddw_r2r(mm0, mm6);                /* wsptr[xxx],[1,z11],[xxx],[1,z13] */
 1970     movq_r2r(mm2, mm4);
 1971     
 1972 /*Continue with z10 --- z13 */
 1973     movq_r2m(mm5, *(wsptr+2));      /* save tmp2 */
 1974     punpcklwd_r2r(mm6, mm2);        /* wsptr[xxx],[xxx],[0,z11],[1,z11] */
 1975 
 1976     psubw_r2r(mm0, mm1);                /* wsptr[xxx],[1,z12],[xxx],[1,z10] */
 1977     punpckhwd_r2r(mm6, mm4);        /* wsptr[xxx],[xxx],[0,z13],[1,z13] */
 1978 
 1979     movq_r2r(mm3, mm0);
 1980     punpcklwd_r2r(mm1, mm3);        /* wsptr[xxx],[xxx],[0,z12],[1,z12] */
 1981 
 1982     movq_r2m(mm7, *(wsptr+3));      /* save tmp3 */
 1983     punpckhwd_r2r(mm1, mm0);        /* wsptr[xxx],[xxx],[0,z10],[1,z10] */
 1984 
 1985     movq_m2r(*(wsptr+4), mm6);      /* wsptr[2,0],[2,1],[2,2],[2,3] */
 1986     punpckhdq_r2r(mm2, mm0);        /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */
 1987 
 1988     movq_m2r(*(wsptr+5), mm7);  /* wsptr[2,4],[2,5],[2,6],[2,7] */
 1989     punpckhdq_r2r(mm4, mm3);        /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */
 1990 
 1991     movq_m2r(*(wsptr+6), mm1);  /* wsptr[3,0],[3,1],[3,2],[3,3] */
 1992     movq_r2r(mm6, mm4);
 1993 
 1994     punpckldq_r2r(mm7, mm6);        /* wsptr[2,0],[2,1],[2,4],[2,5] */
 1995     movq_r2r(mm1, mm5);
 1996 
 1997     punpckhdq_r2r(mm4, mm7);        /* wsptr[2,6],[2,7],[2,2],[2,3] */
 1998     movq_r2r(mm6, mm2);
 1999     
 2000     movq_m2r(*(wsptr+7), mm4);  /* wsptr[3,4],[3,5],[3,6],[3,7] */
 2001     paddw_r2r(mm7, mm6);                /* wsptr[xxx],[2,z11],[xxx],[2,z13] */
 2002 
 2003     psubw_r2r(mm7, mm2);                /* wsptr[xxx],[2,z12],[xxx],[2,z10] */
 2004     punpckldq_r2r(mm4, mm1);        /* wsptr[3,0],[3,1],[3,4],[3,5] */
 2005 
 2006     punpckhdq_r2r(mm5, mm4);        /* wsptr[3,6],[3,7],[3,2],[3,3] */
 2007     movq_r2r(mm1, mm7);
 2008 
 2009     paddw_r2r(mm4, mm1);                /* wsptr[xxx],[3,z11],[xxx],[3,z13] */
 2010     psubw_r2r(mm4, mm7);                /* wsptr[xxx],[3,z12],[xxx],[3,z10] */
 2011 
 2012     movq_r2r(mm6, mm5);
 2013     punpcklwd_r2r(mm1, mm6);        /* wsptr[xxx],[xxx],[2,z11],[3,z11] */
 2014 
 2015     punpckhwd_r2r(mm1, mm5);        /* wsptr[xxx],[xxx],[2,z13],[3,z13] */
 2016     movq_r2r(mm2, mm4);
 2017 
 2018     punpcklwd_r2r(mm7, mm2);        /* wsptr[xxx],[xxx],[2,z12],[3,z12] */
 2019 
 2020     punpckhwd_r2r(mm7, mm4);        /* wsptr[xxx],[xxx],[2,z10],[3,z10] */
 2021 
 2022     punpckhdq_r2r(mm6, mm4);        /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */
 2023 
 2024     punpckhdq_r2r(mm5, mm2);        /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
 2025     movq_r2r(mm0, mm5);
 2026 
 2027     punpckldq_r2r(mm4, mm0);        /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */
 2028 
 2029     punpckhdq_r2r(mm4, mm5);        /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
 2030     movq_r2r(mm3, mm4);
 2031 
 2032     punpckhdq_r2r(mm2, mm4);        /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
 2033     movq_r2r(mm5, mm1);
 2034 
 2035     punpckldq_r2r(mm2, mm3);        /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
  2036 /*    tmp7 = z11 + z13;     (phase 5) */
  2037 /*    tmp8 = z11 - z13;     (phase 5) */
 2038     psubw_r2r(mm4, mm1);                /* tmp8 */
 2039 
 2040     paddw_r2r(mm4, mm5);                /* tmp7 */
  2041 /*    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); (2*c4) */
 2042     psllw_i2r(2, mm1);
 2043 
 2044     psllw_i2r(2, mm0);
 2045 
 2046     pmulhw_m2r(fix_141, mm1);       /* tmp21 */
  2047 /*    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  (2*(c2-c6)) */
  2048 /*          + MULTIPLY(z10, - FIX_1_847759065); (2*c2) */
 2049     psllw_i2r(2, mm3);
 2050     movq_r2r(mm0, mm7);
 2051 
 2052     pmulhw_m2r(fix_n184, mm7);
 2053     movq_r2r(mm3, mm6);
 2054 
 2055     movq_m2r(*(wsptr), mm2);        /* tmp0,final1 */
 2056 
 2057     pmulhw_m2r(fix_108n184, mm6);
  2058 /*   tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) (-2*(c2+c6)) */
  2059 /*          + MULTIPLY(z12, FIX_1_847759065); (2*c2) */
 2060     movq_r2r(mm2, mm4);             /* final1 */
 2061   
 2062     pmulhw_m2r(fix_184n261, mm0);
 2063     paddw_r2r(mm5, mm2);                /* tmp0+tmp7,final1 */
 2064 
 2065     pmulhw_m2r(fix_184, mm3);
 2066     psubw_r2r(mm5, mm4);                /* tmp0-tmp7,final1 */
 2067 
  2068 /*    tmp6 = tmp22 - tmp7;  (phase 2) */
 2069     psraw_i2r(3, mm2);              /* outptr[0,0],[1,0],[2,0],[3,0],final1 */
 2070 
 2071     paddw_r2r(mm6, mm7);                /* tmp20 */
 2072     psraw_i2r(3, mm4);              /* outptr[0,7],[1,7],[2,7],[3,7],final1 */
 2073 
 2074     paddw_r2r(mm0, mm3);                /* tmp22 */
 2075 
 2076 /*    tmp5 = tmp21 - tmp6; */
 2077     psubw_r2r(mm5, mm3);                /* tmp6 */
 2078 
 2079 /*    tmp4 = tmp20 + tmp5; */
 2080     movq_m2r(*(wsptr+1), mm0);      /* tmp1,final2 */
 2081     psubw_r2r(mm3, mm1);                /* tmp5 */
 2082 
 2083     movq_r2r(mm0, mm6);             /* final2 */
 2084     paddw_r2r(mm3, mm0);                /* tmp1+tmp6,final2 */
 2085 
 2086     /* Final output stage: scale down by a factor of 8 and range-limit */
 2087 
 2088 /*    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
 2089 /*              & RANGE_MASK]; */
 2090 /*    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
 2091 /*              & RANGE_MASK];  final1 */
 2092 
 2093 
 2094 /*    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
 2095 /*              & RANGE_MASK]; */
 2096 /*    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
 2097 /*              & RANGE_MASK];  final2 */
 2098     psubw_r2r(mm3, mm6);                /* tmp1-tmp6,final2 */
 2099     psraw_i2r(3, mm0);              /* outptr[0,1],[1,1],[2,1],[3,1] */
 2100 
 2101     psraw_i2r(3, mm6);              /* outptr[0,6],[1,6],[2,6],[3,6] */
 2102     
 2103     packuswb_r2r(mm4, mm0);         /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */
 2104     
 2105     movq_m2r(*(wsptr+2), mm5);      /* tmp2,final3 */
 2106     packuswb_r2r(mm6, mm2);         /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */
 2107 
 2108 /*    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
 2109 /*              & RANGE_MASK]; */
 2110 /*    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
 2111 /*              & RANGE_MASK];  final3 */
 2112     paddw_r2r(mm1, mm7);                /* tmp4 */
 2113     movq_r2r(mm5, mm3);
 2114 
 2115     paddw_r2r(mm1, mm5);                /* tmp2+tmp5 */
 2116     psubw_r2r(mm1, mm3);                /* tmp2-tmp5 */
 2117 
 2118     psraw_i2r(3, mm5);              /* outptr[0,2],[1,2],[2,2],[3,2] */
 2119 
 2120     movq_m2r(*(wsptr+3), mm4);      /* tmp3,final4 */
 2121     psraw_i2r(3, mm3);              /* outptr[0,5],[1,5],[2,5],[3,5] */
 2122 
 2123 
 2124 
 2125 /*    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
 2126 /*              & RANGE_MASK]; */
 2127 /*    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
 2128 /*              & RANGE_MASK];  final4 */
 2129     movq_r2r(mm4, mm6);
 2130     paddw_r2r(mm7, mm4);                /* tmp3+tmp4 */
 2131 
 2132     psubw_r2r(mm7, mm6);                /* tmp3-tmp4 */
 2133     psraw_i2r(3, mm4);              /* outptr[0,4],[1,4],[2,4],[3,4] */
 2134 
 2135     psraw_i2r(3, mm6);              /* outptr[0,3],[1,3],[2,3],[3,3] */
 2136 
 2137     /*
 2138    movq_r2m(mm4, *dummy);
 2139     fprintf(stderr, "3-4 %016llx\n", dummy);
 2140    movq_r2m(mm4, *dummy);
 2141     fprintf(stderr, "3+4 %016llx\n", dummy);
 2142     */
 2143     
 2144 
 2145     packuswb_r2r(mm4, mm5);         /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */
 2146 
 2147     packuswb_r2r(mm3, mm6);         /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
 2148     movq_r2r(mm2, mm4);
 2149 
 2150     movq_r2r(mm5, mm7);
 2151     punpcklbw_r2r(mm0, mm2);        /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */
 2152 
 2153     punpckhbw_r2r(mm0, mm4);        /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
 2154     movq_r2r(mm2, mm1);
 2155 
 2156     punpcklbw_r2r(mm6, mm5);        /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */
 2157     
 2158     punpckhbw_r2r(mm6, mm7);        /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */
 2159 
 2160     punpcklwd_r2r(mm5, mm2);        /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */
 2161     
 2162     movq_r2r(mm7, mm6);
 2163     punpckhwd_r2r(mm5, mm1);        /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */
 2164 
 2165     movq_r2r(mm2, mm0);
 2166     punpcklwd_r2r(mm4, mm6);        /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */
 2167 
 2168     punpckldq_r2r(mm6, mm2);        /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */
 2169 
 2170     movq_r2r(mm1, mm3);
 2171 
 2172     punpckhwd_r2r(mm4, mm7);        /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */
 2173     
 2174     dataptr += rskip;
 2175     movq_r2m(mm2, *(dataptr));
 2176 
 2177     punpckhdq_r2r(mm6, mm0);        /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */
 2178 
 2179     dataptr += rskip;
 2180     movq_r2m(mm0, *(dataptr));
 2181 
 2182     punpckldq_r2r(mm7, mm1);        /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
 2183     
 2184     punpckhdq_r2r(mm7, mm3);        /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */
 2185 
 2186     dataptr += rskip;
 2187     movq_r2m(mm1, *(dataptr));
 2188 
 2189     dataptr += rskip;
 2190     movq_r2m(mm3, *(dataptr));
 2191 
 2192 #else
 2193   __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 2194   __s32 tmp10, tmp11, tmp12, tmp13;
 2195   __s32 z5, z10, z11, z12, z13;
 2196   __s16 *inptr;
 2197   __s32 *wsptr;
 2198   __u8 *outptr;
 2199   int ctr;
 2200   __s32 dcval;
 2201   __s32 workspace[64];
 2202 
 2203   inptr = data;
 2204   wsptr = workspace;
 2205   for (ctr = 8; ctr > 0; ctr--) {
 2206     
 2207     if ((inptr[8] | inptr[16] | inptr[24] |
 2208      inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
 2209       dcval = inptr[0];
 2210       wsptr[0] = dcval;
 2211       wsptr[8] = dcval;
 2212       wsptr[16] = dcval;
 2213       wsptr[24] = dcval;
 2214       wsptr[32] = dcval;
 2215       wsptr[40] = dcval;
 2216       wsptr[48] = dcval;
 2217       wsptr[56] = dcval;
 2218       
 2219       inptr++;  
 2220       wsptr++;
 2221       continue;
 2222     } 
 2223     
 2224     tmp0 = inptr[0];
 2225     tmp1 = inptr[16];
 2226     tmp2 = inptr[32];
 2227     tmp3 = inptr[48];
 2228 
 2229     tmp10 = tmp0 + tmp2;
 2230     tmp11 = tmp0 - tmp2;
 2231 
 2232     tmp13 = tmp1 + tmp3;
 2233     tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
 2234 
 2235     tmp0 = tmp10 + tmp13;
 2236     tmp3 = tmp10 - tmp13;
 2237     tmp1 = tmp11 + tmp12;
 2238     tmp2 = tmp11 - tmp12;
 2239     
 2240     tmp4 = inptr[8];
 2241     tmp5 = inptr[24];
 2242     tmp6 = inptr[40];
 2243     tmp7 = inptr[56];
 2244 
 2245     z13 = tmp6 + tmp5;
 2246     z10 = tmp6 - tmp5;
 2247     z11 = tmp4 + tmp7;
 2248     z12 = tmp4 - tmp7;
 2249 
 2250     tmp7 = z11 + z13;
 2251     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
 2252 
 2253     z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
 2254     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
 2255     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
 2256 
 2257     tmp6 = tmp12 - tmp7;
 2258     tmp5 = tmp11 - tmp6;
 2259     tmp4 = tmp10 + tmp5;
 2260 
 2261     wsptr[0] = (__s32) (tmp0 + tmp7);
 2262     wsptr[56] = (__s32) (tmp0 - tmp7);
 2263     wsptr[8] = (__s32) (tmp1 + tmp6);
 2264     wsptr[48] = (__s32) (tmp1 - tmp6);
 2265     wsptr[16] = (__s32) (tmp2 + tmp5);
 2266     wsptr[40] = (__s32) (tmp2 - tmp5);
 2267     wsptr[32] = (__s32) (tmp3 + tmp4);
 2268     wsptr[24] = (__s32) (tmp3 - tmp4);
 2269 
 2270     inptr++;
 2271     wsptr++;
 2272   }
 2273 
 2274   wsptr = workspace;
 2275   for (ctr = 0; ctr < 8; ctr++) {
 2276     outptr = &(odata[ctr*rskip]);
 2277 
 2278     tmp10 = wsptr[0] + wsptr[4];
 2279     tmp11 = wsptr[0] - wsptr[4];
 2280 
 2281     tmp13 = wsptr[2] + wsptr[6];
 2282     tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
 2283 
 2284     tmp0 = tmp10 + tmp13;
 2285     tmp3 = tmp10 - tmp13;
 2286     tmp1 = tmp11 + tmp12;
 2287     tmp2 = tmp11 - tmp12;
 2288 
 2289     z13 = wsptr[5] + wsptr[3];
 2290     z10 = wsptr[5] - wsptr[3];
 2291     z11 = wsptr[1] + wsptr[7];
 2292     z12 = wsptr[1] - wsptr[7];
 2293 
 2294     tmp7 = z11 + z13;
 2295     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
 2296 
 2297     z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
 2298     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
 2299     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
 2300 
 2301     tmp6 = tmp12 - tmp7;
 2302     tmp5 = tmp11 - tmp6;
 2303     tmp4 = tmp10 + tmp5;
 2304 
 2305     outptr[0] = RL(DESCALE(tmp0 + tmp7));
 2306     outptr[7] = RL(DESCALE(tmp0 - tmp7));
 2307     outptr[1] = RL(DESCALE(tmp1 + tmp6));
 2308     outptr[6] = RL(DESCALE(tmp1 - tmp6));
 2309     outptr[2] = RL(DESCALE(tmp2 + tmp5));
 2310     outptr[5] = RL(DESCALE(tmp2 - tmp5));
 2311     outptr[4] = RL(DESCALE(tmp3 + tmp4));
 2312     outptr[3] = RL(DESCALE(tmp3 - tmp4));
 2313 
 2314     wsptr += 8;
 2315   }
 2316 #endif
 2317 }
 2318 /*
 2319 
 2320 Main Routines
 2321 
 2322 This file contains most of the initialisation and control functions
 2323 
 2324 (C) Justin Schoeman 1998
 2325 
 2326 */
 2327 
 2328 /*
 2329 
 2330 Private function
 2331 
 2332 Initialise all the cache-aliged data blocks
 2333 
 2334 */
 2335 
 2336 void RTjpeg_init_data(void)
 2337 {
 2338  unsigned long dptr;
 2339  
 2340  dptr=(unsigned long)&(RTjpeg_alldata[0]);
 2341  dptr+=32;
 2342  dptr=dptr>>5;
 2343  dptr=dptr<<5; /* cache align data */
 2344  
 2345  RTjpeg_block=(__s16 *)dptr;
 2346  dptr+=sizeof(__s16)*64;
 2347  RTjpeg_lqt=(__s32 *)dptr;
 2348  dptr+=sizeof(__s32)*64;
 2349  RTjpeg_cqt=(__s32 *)dptr;
 2350  dptr+=sizeof(__s32)*64;
 2351  RTjpeg_liqt=(__u32 *)dptr;
 2352  dptr+=sizeof(__u32)*64;
 2353  RTjpeg_ciqt=(__u32 *)dptr;
 2354 }
 2355 
 2356 /*
 2357 
 2358 External Function
 2359 
 2360 Re-set quality factor
 2361 
 2362 Input: buf -> pointer to 128 ints for quant values store to pass back to
 2363               init_decompress.
 2364        Q -> quality factor (192=best, 32=worst)
 2365 */
 2366 
 2367 void RTjpeg_init_Q(__u8 Q)
 2368 {
 2369  int i;
 2370  __u64 qual;
 2371  
 2372  qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */
 2373 
 2374  for(i=0; i<64; i++)
 2375  {
 2376   RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
 2377   if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
 2378   RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
 2379   if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
 2380   RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
 2381   RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
 2382   RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
 2383   RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 2384  }
 2385  
 2386  RTjpeg_lb8=0;
 2387  while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 2388  RTjpeg_lb8--;
 2389  RTjpeg_cb8=0;
 2390  while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 2391  RTjpeg_cb8--;
 2392 
 2393  RTjpeg_dct_init();
 2394  RTjpeg_idct_init();
 2395  RTjpeg_quant_init();
 2396 }
 2397 
 2398 /*
 2399 
 2400 External Function
 2401 
 2402 Initialise compression.
 2403 
 2404 Input: buf -> pointer to 128 ints for quant values store to pass back to 
 2405                 init_decompress.
 2406        width -> width of image
 2407        height -> height of image
 2408        Q -> quality factor (192=best, 32=worst)
 2409        
 2410 */
 2411 
 2412 void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
 2413 {
 2414  int i;
 2415  __u64 qual;
 2416  
 2417  RTjpeg_init_data();
 2418  
 2419  RTjpeg_width=width;
 2420  RTjpeg_height=height;
 2421  RTjpeg_Ywidth = RTjpeg_width>>3;
 2422  RTjpeg_Ysize=width * height;
 2423  RTjpeg_Cwidth = RTjpeg_width>>4;
 2424  RTjpeg_Csize= (width>>1) * height;
 2425 
 2426  qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */
 2427 
 2428  for(i=0; i<64; i++)
 2429  {
 2430   RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
 2431   if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
 2432   RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
 2433   if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
 2434   RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
 2435   RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
 2436   RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
 2437   RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 2438  }
 2439  
 2440  RTjpeg_lb8=0;
 2441  while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 2442  RTjpeg_lb8--;
 2443  RTjpeg_cb8=0;
 2444  while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 2445  RTjpeg_cb8--;
 2446  
 2447  RTjpeg_dct_init();
 2448  RTjpeg_quant_init();
 2449 
 2450  for(i=0; i<64; i++)
 2451   buf[i]=RTjpeg_liqt[i];
 2452  for(i=0; i<64; i++)
 2453   buf[64+i]=RTjpeg_ciqt[i];
 2454 }
 2455 
 2456 void RTjpeg_init_decompress(__u32 *buf, int width, int height)
 2457 {
 2458  int i;
 2459 
 2460  RTjpeg_init_data();
 2461  
 2462  RTjpeg_width=width;
 2463  RTjpeg_height=height;
 2464  RTjpeg_Ywidth = RTjpeg_width>>3;
 2465  RTjpeg_Ysize=width * height;
 2466  RTjpeg_Cwidth = RTjpeg_width>>4;
 2467  RTjpeg_Csize= (width>>1) * height;
 2468 
 2469  for(i=0; i<64; i++)
 2470  {
 2471   RTjpeg_liqt[i]=buf[i];
 2472   RTjpeg_ciqt[i]=buf[i+64];
 2473  }
 2474 
 2475  RTjpeg_lb8=0;
 2476  while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 2477  RTjpeg_lb8--;
 2478  RTjpeg_cb8=0;
 2479  while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 2480  RTjpeg_cb8--;
 2481 
 2482  RTjpeg_idct_init();
 2483 
 2484 /* RTjpeg_color_init(); */
 2485 }
 2486 
 2487 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
 2488 {
 2489  __s8 * sb;
 2490  register __s8 * bp1 = bp + (RTjpeg_width<<3);
 2491  register __s8 * bp2 = bp + RTjpeg_Ysize;
 2492  register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
 2493  register int i, j, k;
 2494 
 2495 #ifdef MMX
 2496  emms();
 2497 #endif
 2498  sb=sp;
 2499 /* Y */
 2500  for(i=RTjpeg_height>>1; i; i-=8)
 2501  {
 2502   for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
 2503   {
 2504    RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
 2505    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2506    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2507 
 2508    RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
 2509    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2510    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2511 
 2512    RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
 2513    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2514    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2515 
 2516    RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
 2517    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2518    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2519 
 2520    RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
 2521    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2522    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2523 
 2524    RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
 2525    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2526    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2527 
 2528   }
 2529   bp+=RTjpeg_width<<4;
 2530   bp1+=RTjpeg_width<<4;
 2531   bp2+=RTjpeg_width<<2;
 2532   bp3+=RTjpeg_width<<2;
 2533              
 2534  }
 2535 #ifdef MMX
 2536  emms();
 2537 #endif
 2538  return (sp-sb);
 2539 }
 2540 
 2541 int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp)
 2542 {
 2543  __s8 * sb;
 2544  register __s8 * bp2 = bp + RTjpeg_Ysize;
 2545  register __s8 * bp3 = bp2 + RTjpeg_Csize;
 2546  register int i, j, k;
 2547 
 2548 #ifdef MMX
 2549  emms();
 2550 #endif
 2551  sb=sp;
 2552 /* Y */
 2553  for(i=RTjpeg_height; i; i-=8)
 2554  {
 2555   for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
 2556   {
 2557    RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
 2558    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2559    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2560 
 2561    RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
 2562    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2563    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2564 
 2565    RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
 2566    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2567    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2568 
 2569    RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
 2570    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2571    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2572 
 2573   }
 2574   bp+=RTjpeg_width<<3;
 2575   bp2+=RTjpeg_width<<2;
 2576   bp3+=RTjpeg_width<<2;
 2577              
 2578  }
 2579 #ifdef MMX
 2580  emms();
 2581 #endif
 2582  return (sp-sb);
 2583 }
 2584 
 2585 int RTjpeg_compress8(__s8 *sp, unsigned char *bp)
 2586 {
 2587  __s8 * sb;
 2588  int i, j;
 2589 
 2590 #ifdef MMX
 2591  emms();
 2592 #endif
 2593  
 2594  sb=sp;
 2595 /* Y */
 2596  for(i=0; i<RTjpeg_height; i+=8)
 2597  {
 2598   for(j=0; j<RTjpeg_width; j+=8)
 2599   {
 2600    RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
 2601    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2602    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2603   }
 2604   bp+=RTjpeg_width;
 2605  }
 2606 
 2607 #ifdef MMX
 2608  emms();
 2609 #endif
 2610  return (sp-sb);
 2611 }
 2612 
 2613 void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp)
 2614 {
 2615  register __s8 * bp2 = bp + RTjpeg_Ysize;
 2616  register __s8 * bp3 = bp2 + (RTjpeg_Csize);
 2617  int i, j,k;
 2618 
 2619 #ifdef MMX
 2620  emms();
 2621 #endif
 2622 
 2623 /* Y */
 2624  for(i=RTjpeg_height; i; i-=8)
 2625  {
 2626   for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
 2627    if(*sp==-1)sp++;
 2628    else
 2629    { 
 2630     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2631     RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
 2632    }
 2633    if(*sp==-1)sp++;
 2634    else
 2635    { 
 2636     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2637     RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
 2638    }
 2639    if(*sp==-1)sp++;
 2640    else
 2641    { 
 2642     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 2643     RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
 2644    } 
 2645    if(*sp==-1)sp++;
 2646    else
 2647    { 
 2648     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 2649     RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
 2650    } 
 2651   }
 2652   bp+=RTjpeg_width<<3;
 2653   bp2+=RTjpeg_width<<2;
 2654   bp3+=RTjpeg_width<<2;
 2655  }
 2656 #ifdef MMX
 2657  emms();
 2658 #endif
 2659 }
 2660 
 2661 void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
 2662 {
 2663  register __s8 * bp1 = bp + (RTjpeg_width<<3);
 2664  register __s8 * bp2 = bp + RTjpeg_Ysize;
 2665  register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
 2666  int i, j,k;
 2667 
 2668 #ifdef MMX
 2669  emms();
 2670 #endif
 2671 
 2672 /* Y */
 2673  for(i=RTjpeg_height>>1; i; i-=8)
 2674  {
 2675   for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
 2676    if(*sp==-1)sp++;
 2677    else
 2678    { 
 2679     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2680     RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
 2681    }
 2682    if(*sp==-1)sp++;
 2683    else
 2684    { 
 2685     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2686     RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
 2687    }
 2688    if(*sp==-1)sp++;
 2689    else
 2690    { 
 2691     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2692     RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
 2693    }
 2694    if(*sp==-1)sp++;
 2695    else
 2696    { 
 2697     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2698     RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
 2699    }
 2700    if(*sp==-1)sp++;
 2701    else
 2702    { 
 2703     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 2704     RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
 2705    } 
 2706    if(*sp==-1)sp++;
 2707    else
 2708    { 
 2709     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 2710     RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
 2711    } 
 2712   }
 2713   bp+=RTjpeg_width<<4;
 2714   bp1+=RTjpeg_width<<4;
 2715   bp2+=RTjpeg_width<<2;
 2716   bp3+=RTjpeg_width<<2;
 2717  }
 2718 #ifdef MMX
 2719  emms();
 2720 #endif
 2721 }
 2722 
 2723 void RTjpeg_decompress8(__s8 *sp, __u8 *bp)
 2724 {
 2725  int i, j;
 2726 
 2727 #ifdef MMX
 2728  emms();
 2729 #endif
 2730 
 2731 /* Y */
 2732  for(i=0; i<RTjpeg_height; i+=8)
 2733  {
 2734   for(j=0; j<RTjpeg_width; j+=8)
 2735    if(*sp==-1)sp++;
 2736    else
 2737    { 
 2738     sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 2739     RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
 2740    }
 2741   bp+=RTjpeg_width<<3;
 2742  }
 2743 }
 2744 
 2745 /*
 2746 External Function
 2747 
 2748 Initialise additional data structures for motion compensation
 2749 
 2750 */
 2751 
 2752 void RTjpeg_init_mcompress(void)
 2753 {
 2754  unsigned long tmp;
 2755 
 2756  if(!RTjpeg_old)
 2757  {
 2758   RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32);
 2759   tmp=(unsigned long)RTjpeg_old;
 2760   tmp+=32;
 2761   tmp=tmp>>5;
 2762   RTjpeg_old=(__s16 *)(tmp<<5);
 2763  }
 2764  if (!RTjpeg_old)
 2765  {
 2766   fprintf(stderr, "RTjpeg: Could not allocate memory\n");
 2767   exit(-1);
 2768  }
 2769  bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height)));
 2770 }
 2771 
 2772 #ifdef MMX
 2773 
 2774 int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
 2775 {
 2776  int i;
 2777  mmx_t *mold=(mmx_t *)old;
 2778  mmx_t *mblock=(mmx_t *)RTjpeg_block;
 2779  mmx_t result;
 2780  static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL;
 2781  
 2782  movq_m2r(*mask, mm7);
 2783  movq_m2r(neg, mm6);
 2784  pxor_r2r(mm5, mm5);
 2785  
 2786  for(i=0; i<8; i++)
 2787  {
 2788   movq_m2r(*(mblock++), mm0);
 2789             movq_m2r(*(mblock++), mm2);
 2790   movq_m2r(*(mold++), mm1);
 2791             movq_m2r(*(mold++), mm3);
 2792   psubsw_r2r(mm1, mm0);
 2793             psubsw_r2r(mm3, mm2);
 2794   movq_r2r(mm0, mm1);
 2795             movq_r2r(mm2, mm3);
 2796   pcmpgtw_r2r(mm7, mm0);
 2797             pcmpgtw_r2r(mm7, mm2);
 2798   pxor_r2r(mm6, mm1);
 2799             pxor_r2r(mm6, mm3);
 2800   pcmpgtw_r2r(mm7, mm1);
 2801             pcmpgtw_r2r(mm7, mm3);
 2802   por_r2r(mm0, mm5);
 2803             por_r2r(mm2, mm5);
 2804   por_r2r(mm1, mm5);
 2805             por_r2r(mm3, mm5);
 2806  }
 2807  movq_r2m(mm5, result);
 2808  
 2809  if(result.q)
 2810  {
 2811   if(!RTjpeg_mtest)
 2812    for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
 2813   return 0;
 2814  }
 2815 /* printf("."); */
 2816  return 1;
 2817 }
 2818 
 2819 #else
 2820 int RTjpeg_bcomp(__s16 *old, __u16 *mask)
 2821 {
 2822  int i;
 2823 
 2824  for(i=0; i<64; i++)
 2825   if(abs(old[i]-RTjpeg_block[i])>*mask)
 2826   {
 2827    if(!RTjpeg_mtest)
 2828     for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
 2829    return 0;
 2830   }
 2831  return 1;
 2832 }
 2833 #endif
 2834 
 2835 void RTjpeg_set_test(int i)
 2836 {
 2837  RTjpeg_mtest=i;
 2838 }
 2839 
 2840 int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
 2841 {
 2842  __s8 * sb;
 2843  __s16 *block;
 2844  register __s8 * bp1 = bp + (RTjpeg_width<<3);
 2845  register __s8 * bp2 = bp + RTjpeg_Ysize;
 2846  register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
 2847  register int i, j, k;
 2848 
 2849 #ifdef MMX
 2850  emms();
 2851  RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
 2852  RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
 2853 #else
 2854  RTjpeg_lmask=lmask;
 2855  RTjpeg_cmask=cmask;
 2856 #endif
 2857  
 2858  sb=sp;
 2859  block=RTjpeg_old;
 2860 /* Y */
 2861  for(i=RTjpeg_height>>1; i; i-=8)
 2862  {
 2863   for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
 2864   {
 2865    RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
 2866    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2867    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 2868    {
 2869     *((__u8 *)sp++)=255;
 2870    } 
 2871     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2872    block+=64;
 2873 
 2874    RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
 2875    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2876    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 2877    {
 2878     *((__u8 *)sp++)=255;
 2879    } 
 2880     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2881    block+=64;
 2882 
 2883    RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
 2884    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2885    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 2886    {
 2887     *((__u8 *)sp++)=255;
 2888    } 
 2889     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2890    block+=64;
 2891 
 2892    RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
 2893    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2894    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 2895    {
 2896     *((__u8 *)sp++)=255;
 2897    } 
 2898     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2899    block+=64;
 2900 
 2901    RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
 2902    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2903    if(RTjpeg_bcomp(block, &RTjpeg_cmask))
 2904    {
 2905     *((__u8 *)sp++)=255;
 2906    } 
 2907     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2908    block+=64;
 2909 
 2910    RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
 2911    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2912    if(RTjpeg_bcomp(block, &RTjpeg_cmask))
 2913    {
 2914     *((__u8 *)sp++)=255;
 2915    } 
 2916     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2917    block+=64;
 2918   }
 2919   bp+=RTjpeg_width<<4;
 2920   bp1+=RTjpeg_width<<4;
 2921   bp2+=RTjpeg_width<<2;
 2922   bp3+=RTjpeg_width<<2;
 2923              
 2924  }
 2925 #ifdef MMX
 2926  emms();
 2927 #endif
 2928  return (sp-sb);
 2929 }
 2930 
 2931 
 2932 int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
 2933 {
 2934  __s8 * sb;
 2935  __s16 *block;
 2936  register __s8 * bp2;
 2937  register __s8 * bp3;
 2938  register int i, j, k;
 2939 
 2940 #ifdef MMX
 2941  emms();
 2942  RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
 2943  RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
 2944 #else
 2945  RTjpeg_lmask=lmask;
 2946  RTjpeg_cmask=cmask;
 2947 #endif
 2948  
 2949  bp = bp - RTjpeg_width*0;
 2950  bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0;
 2951  bp3 = bp2 + RTjpeg_Csize;
 2952 
 2953  sb=sp;
 2954  block=RTjpeg_old;
 2955 /* Y */
 2956  for(i=RTjpeg_height; i; i-=8)
 2957  {
 2958   for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
 2959   {
 2960    RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
 2961    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2962    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 2963    {
 2964     *((__u8 *)sp++)=255;
 2965    } 
 2966     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2967    block+=64;
 2968 
 2969    RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
 2970    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 2971    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 2972    {
 2973     *((__u8 *)sp++)=255;
 2974    } 
 2975     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 2976    block+=64;
 2977 
 2978    RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
 2979    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2980    if(RTjpeg_bcomp(block, &RTjpeg_cmask))
 2981    {
 2982     *((__u8 *)sp++)=255;
 2983    } 
 2984     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2985    block+=64;
 2986 
 2987    RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
 2988    RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
 2989    if(RTjpeg_bcomp(block, &RTjpeg_cmask))
 2990    {
 2991     *((__u8 *)sp++)=255;
 2992    } 
 2993     else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
 2994    block+=64;
 2995 
 2996   }
 2997   bp+=RTjpeg_width<<3;
 2998   bp2+=RTjpeg_width<<2;
 2999   bp3+=RTjpeg_width<<2;
 3000  }
 3001  printf ("%d\n", block - RTjpeg_old);
 3002 #ifdef MMX
 3003  emms();
 3004 #endif
 3005  return (sp-sb);
 3006 }
 3007 
 3008 int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask)
 3009 {
 3010  __s8 * sb;
 3011  __s16 *block;
 3012  int i, j;
 3013 
 3014 #ifdef MMX
 3015  emms();
 3016  RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
 3017 #else
 3018  RTjpeg_lmask=lmask;
 3019 #endif
 3020 
 3021  
 3022  sb=sp;
 3023  block=RTjpeg_old;
 3024 /* Y */
 3025  for(i=0; i<RTjpeg_height; i+=8)
 3026  {
 3027   for(j=0; j<RTjpeg_width; j+=8)
 3028   {
 3029    RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
 3030    RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
 3031    if(RTjpeg_bcomp(block, &RTjpeg_lmask))
 3032    {
 3033     *((__u8 *)sp++)=255;
 3034 /*    printf("* %d ", sp[-1]); */
 3035    } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
 3036    block+=64;
 3037   }
 3038   bp+=RTjpeg_width<<3;
 3039  }
 3040 #ifdef MMX
 3041  emms();
 3042 #endif
 3043  return (sp-sb);
 3044 }
 3045 
 3046 void RTjpeg_color_init(void)
 3047 {
 3048 }  
 3049 
 3050 #define KcrR 76284
 3051 #define KcrG 53281
 3052 #define KcbG 25625
 3053 #define KcbB 132252
 3054 #define Ky 76284
 3055 
 3056 void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride)
 3057 {
 3058  int tmp;
 3059  int i, j;
 3060  __s32 y, crR, crG, cbG, cbB;
 3061  __u8 *bufcr, *bufcb, *bufy, *bufoute;
 3062  int yskip;
 3063  
 3064  yskip=RTjpeg_width;
 3065  
 3066  bufcb=&buf[RTjpeg_width*RTjpeg_height];
 3067  bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
 3068  bufy=&buf[0];
 3069  bufoute=rgb;
 3070  
 3071  for(i=0; i<(RTjpeg_height); i++)
 3072  {
 3073   for(j=0; j<RTjpeg_width; j+=2)
 3074   {
 3075    crR=(*bufcr-128)*KcrR;
 3076    crG=(*(bufcr++)-128)*KcrG;
 3077    cbG=(*bufcb-128)*KcbG;
 3078    cbB=(*(bufcb++)-128)*KcbB;
 3079   
 3080    y=(bufy[j]-16)*Ky;
 3081    
 3082    tmp=(y+crR)>>16;
 3083    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3084    tmp=(y-crG-cbG)>>16;
 3085    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3086    tmp=(y+cbB)>>16;
 3087    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3088 
 3089    y=(bufy[j+1]-16)*Ky;
 3090 
 3091    tmp=(y+crR)>>16;
 3092    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3093    tmp=(y-crG-cbG)>>16;
 3094    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3095    tmp=(y+cbB)>>16;
 3096    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3097 
 3098   }
 3099   bufy+=yskip;
 3100  }
 3101 }
 3102 
 3103 
 3104 void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride)
 3105 {
 3106  int tmp;
 3107  int i, j;
 3108  __s32 y, crR, crG, cbG, cbB;
 3109  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 3110  int oskip, yskip;
 3111  
 3112  if(stride==0)
 3113     oskip=RTjpeg_width*3;
 3114  else
 3115     oskip=2*stride-RTjpeg_width*3;
 3116  
 3117  yskip=RTjpeg_width;
 3118  
 3119  bufcb=&buf[RTjpeg_width*RTjpeg_height];
 3120  bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 3121  bufy=&buf[0];
 3122  bufoute=rgb;
 3123  bufouto=rgb+RTjpeg_width*3;
 3124  
 3125  for(i=0; i<(RTjpeg_height>>1); i++)
 3126  {
 3127   for(j=0; j<RTjpeg_width; j+=2)
 3128   {
 3129    crR=(*bufcr-128)*KcrR;
 3130    crG=(*(bufcr++)-128)*KcrG;
 3131    cbG=(*bufcb-128)*KcbG;
 3132    cbB=(*(bufcb++)-128)*KcbB;
 3133   
 3134    y=(bufy[j]-16)*Ky;
 3135    
 3136    tmp=(y+crR)>>16;
 3137    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3138    tmp=(y-crG-cbG)>>16;
 3139    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3140    tmp=(y+cbB)>>16;
 3141    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3142 
 3143    y=(bufy[j+1]-16)*Ky;
 3144 
 3145    tmp=(y+crR)>>16;
 3146    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3147    tmp=(y-crG-cbG)>>16;
 3148    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3149    tmp=(y+cbB)>>16;
 3150    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3151 
 3152    y=(bufy[j+yskip]-16)*Ky;
 3153 
 3154    tmp=(y+crR)>>16;
 3155    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3156    tmp=(y-crG-cbG)>>16;
 3157    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3158    tmp=(y+cbB)>>16;
 3159    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3160 
 3161    y=(bufy[j+1+yskip]-16)*Ky;
 3162 
 3163    tmp=(y+crR)>>16;
 3164    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3165    tmp=(y-crG-cbG)>>16;
 3166    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3167    tmp=(y+cbB)>>16;
 3168    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3169    
 3170   }
 3171   bufoute+=oskip;
 3172   bufouto+=oskip;
 3173   bufy+=yskip<<1;
 3174  }
 3175 }
 3176 
 3177 
 3178 void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride)
 3179 {
 3180  int tmp;
 3181  int i, j;
 3182  __s32 y, crR, crG, cbG, cbB;
 3183  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 3184  int oskip, yskip;
 3185  
 3186  if(stride==0)
 3187     oskip=RTjpeg_width*4;
 3188  else
 3189     oskip = 2*stride-RTjpeg_width*4;
 3190  yskip=RTjpeg_width;
 3191  
 3192  bufcb=&buf[RTjpeg_width*RTjpeg_height];
 3193  bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
 3194  bufy=&buf[0];
 3195  bufoute=rgb;
 3196  bufouto=rgb+RTjpeg_width*4;
 3197  
 3198  for(i=0; i<(RTjpeg_height>>1); i++)
 3199  {
 3200   for(j=0; j<RTjpeg_width; j+=2)
 3201   {
 3202    crR=(*bufcr-128)*KcrR;
 3203    crG=(*(bufcr++)-128)*KcrG;
 3204    cbG=(*bufcb-128)*KcbG;
 3205    cbB=(*(bufcb++)-128)*KcbB;
 3206   
 3207    y=(bufy[j]-16)*Ky;
 3208    
 3209    tmp=(y+cbB)>>16;
 3210    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3211    tmp=(y-crG-cbG)>>16;
 3212    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3213    tmp=(y+crR)>>16;
 3214    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3215    bufoute++;
 3216 
 3217    y=(bufy[j+1]-16)*Ky;
 3218 
 3219    tmp=(y+cbB)>>16;
 3220    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3221    tmp=(y-crG-cbG)>>16;
 3222    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3223    tmp=(y+crR)>>16;
 3224    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3225    bufoute++;
 3226 
 3227    y=(bufy[j+yskip]-16)*Ky;
 3228 
 3229    tmp=(y+cbB)>>16;
 3230    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3231    tmp=(y-crG-cbG)>>16;
 3232    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3233    tmp=(y+crR)>>16;
 3234    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3235    bufouto++;
 3236 
 3237    y=(bufy[j+1+yskip]-16)*Ky;
 3238 
 3239    tmp=(y+cbB)>>16;
 3240    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3241    tmp=(y-crG-cbG)>>16;
 3242    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3243    tmp=(y+crR)>>16;
 3244    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3245    bufouto++;
 3246    
 3247   }
 3248   bufoute+=oskip;
 3249   bufouto+=oskip;
 3250   bufy+=yskip<<1;
 3251  }
 3252 }
 3253 
 3254 void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride)
 3255 {
 3256  int tmp;
 3257  int i, j;
 3258  __s32 y, crR, crG, cbG, cbB;
 3259  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 3260  int oskip, yskip;
 3261  
 3262  if(stride==0)
 3263     oskip=RTjpeg_width*3;
 3264  else
 3265     oskip=2*stride - RTjpeg_width*3;
 3266     
 3267  yskip=RTjpeg_width;
 3268  
 3269  bufcb=&buf[RTjpeg_width*RTjpeg_height];
 3270  bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 3271  bufy=&buf[0];
 3272  bufoute=rgb;
 3273  bufouto=rgb+RTjpeg_width*3;
 3274  
 3275  for(i=0; i<(RTjpeg_height>>1); i++)
 3276  {
 3277   for(j=0; j<RTjpeg_width; j+=2)
 3278   {
 3279    crR=(*bufcr-128)*KcrR;
 3280    crG=(*(bufcr++)-128)*KcrG;
 3281    cbG=(*bufcb-128)*KcbG;
 3282    cbB=(*(bufcb++)-128)*KcbB;
 3283   
 3284    y=(bufy[j]-16)*Ky;
 3285    
 3286    tmp=(y+cbB)>>16;
 3287    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3288    tmp=(y-crG-cbG)>>16;
 3289    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3290    tmp=(y+crR)>>16;
 3291    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3292 
 3293    y=(bufy[j+1]-16)*Ky;
 3294 
 3295    tmp=(y+cbB)>>16;
 3296    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3297    tmp=(y-crG-cbG)>>16;
 3298    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3299    tmp=(y+crR)>>16;
 3300    *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3301 
 3302    y=(bufy[j+yskip]-16)*Ky;
 3303 
 3304    tmp=(y+cbB)>>16;
 3305    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3306    tmp=(y-crG-cbG)>>16;
 3307    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3308    tmp=(y+crR)>>16;
 3309    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3310 
 3311    y=(bufy[j+1+yskip]-16)*Ky;
 3312 
 3313    tmp=(y+cbB)>>16;
 3314    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3315    tmp=(y-crG-cbG)>>16;
 3316    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3317    tmp=(y+crR)>>16;
 3318    *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
 3319    
 3320   }
 3321   bufoute+=oskip;
 3322   bufouto+=oskip;
 3323   bufy+=yskip<<1;
 3324  }
 3325 }
 3326 
 3327 void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride)
 3328 {
 3329  int tmp;
 3330  int i, j;
 3331  __s32 y, crR, crG, cbG, cbB;
 3332  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 3333  int oskip, yskip;
 3334  unsigned char r, g, b;
 3335  
 3336  if(stride==0)
 3337     oskip=RTjpeg_width*2;
 3338  else
 3339     oskip=2*stride-RTjpeg_width*2;
 3340     
 3341  yskip=RTjpeg_width;
 3342  
 3343  bufcb=&buf[RTjpeg_width*RTjpeg_height];
 3344  bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 3345  bufy=&buf[0];
 3346  bufoute=rgb;
 3347  bufouto=rgb+RTjpeg_width*2;
 3348  
 3349  for(i=0; i<(RTjpeg_height>>1); i++)
 3350  {
 3351   for(j=0; j<RTjpeg_width; j+=2)
 3352   {
 3353    crR=(*bufcr-128)*KcrR;
 3354    crG=(*(bufcr++)-128)*KcrG;
 3355    cbG=(*bufcb-128)*KcbG;
 3356    cbB=(*(bufcb++)-128)*KcbB;
 3357   
 3358    y=(bufy[j]-16)*Ky;
 3359    
 3360    tmp=(y+cbB)>>16;
 3361    b=(tmp>255)?255:((tmp<0)?0:tmp);
 3362    tmp=(y-crG-cbG)>>16;
 3363    g=(tmp>255)?255:((tmp<0)?0:tmp);
 3364    tmp=(y+crR)>>16;
 3365    r=(tmp>255)?255:((tmp<0)?0:tmp);
 3366    tmp=(int)((int)b >> 3);
 3367    tmp|=(int)(((int)g >> 2) << 5);
 3368    tmp|=(int)(((int)r >> 3) << 11);
 3369    *(bufoute++)=tmp&0xff;
 3370    *(bufoute++)=tmp>>8;
 3371    
 3372 
 3373    y=(bufy[j+1]-16)*Ky;
 3374 
 3375    tmp=(y+cbB)>>16;
 3376    b=(tmp>255)?255:((tmp<0)?0:tmp);
 3377    tmp=(y-crG-cbG)>>16;
 3378    g=(tmp>255)?255:((tmp<0)?0:tmp);
 3379    tmp=(y+crR)>>16;
 3380    r=(tmp>255)?255:((tmp<0)?0:tmp);
 3381    tmp=(int)((int)b >> 3);
 3382    tmp|=(int)(((int)g >> 2) << 5);
 3383    tmp|=(int)(((int)r >> 3) << 11);
 3384    *(bufoute++)=tmp&0xff;
 3385    *(bufoute++)=tmp>>8;
 3386 
 3387    y=(bufy[j+yskip]-16)*Ky;
 3388 
 3389    tmp=(y+cbB)>>16;
 3390    b=(tmp>255)?255:((tmp<0)?0:tmp);
 3391    tmp=(y-crG-cbG)>>16;
 3392    g=(tmp>255)?255:((tmp<0)?0:tmp);
 3393    tmp=(y+crR)>>16;
 3394    r=(tmp>255)?255:((tmp<0)?0:tmp);
 3395    tmp=(int)((int)b >> 3);
 3396    tmp|=(int)(((int)g >> 2) << 5);
 3397    tmp|=(int)(((int)r >> 3) << 11);
 3398    *(bufouto++)=tmp&0xff;
 3399    *(bufouto++)=tmp>>8;
 3400 
 3401    y=(bufy[j+1+yskip]-16)*Ky;
 3402 
 3403    tmp=(y+cbB)>>16;
 3404    b=(tmp>255)?255:((tmp<0)?0:tmp);
 3405    tmp=(y-crG-cbG)>>16;
 3406    g=(tmp>255)?255:((tmp<0)?0:tmp);
 3407    tmp=(y+crR)>>16;
 3408    r=(tmp>255)?255:((tmp<0)?0:tmp);
 3409    tmp=(int)((int)b >> 3);
 3410    tmp|=(int)(((int)g >> 2) << 5);
 3411    tmp|=(int)(((int)r >> 3) << 11);
 3412    *(bufouto++)=tmp&0xff;
 3413    *(bufouto++)=tmp>>8;
 3414 
 3415   }
 3416   bufoute+=oskip;
 3417   bufouto+=oskip;
 3418   bufy+=yskip<<1;
 3419  }
 3420 }
 3421 
 3422 /* fix stride */
 3423 
 3424 void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride)
 3425 {
 3426  bcopy(buf, rgb, RTjpeg_width*RTjpeg_height);
 3427 }
 3428