"Fossies" - the Fresh Open Source Software Archive

Member "quicktime4linux-2.3/mmx.h" (9 Jan 2007, 22919 Bytes) of package /linux/privat/old/quicktime4linux-2.3-src.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /*  mmx.h
    2 
    3     MultiMedia eXtensions GCC interface library for IA32.
    4 
    5     To use this library, simply include this header file
    6     and compile with GCC.  You MUST have inlining enabled
    7     in order for mmx_ok() to work; this can be done by
    8     simply using -O on the GCC command line.
    9 
   10     Compiling with -DMMX_TRACE will cause detailed trace
   11     output to be sent to stderr for each mmx operation.
   12     This adds lots of code, and obviously slows execution to
   13     a crawl, but can be very useful for debugging.
   14 
   15     THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
   16     EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
   17     LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
   18     AND FITNESS FOR ANY PARTICULAR PURPOSE.
   19 
   20     1997-98 by H. Dietz and R. Fisher
   21 
   22  History:
   23     97-98*  R.Fisher    Early versions
   24     980501  R.Fisher    Original Release
   25     980611* H.Dietz     Rewrite, correctly implementing inlines, and
   26         R.Fisher     including direct register accesses.
   27     980616  R.Fisher    Release of 980611 as 980616.
   28     980714  R.Fisher    Minor corrections to Makefile, etc.
   29     980715  R.Fisher    mmx_ok() now prevents optimizer from using
   30                  clobbered values.
   31                 mmx_ok() now checks if cpuid instruction is
   32                  available before trying to use it.
   33     980726* R.Fisher    mm_support() searches for AMD 3DNow, Cyrix
   34                  Extended MMX, and standard MMX.  It returns a
   35                  value which is positive if any of these are
   36                  supported, and can be masked with constants to
   37                  see which.  mmx_ok() is now a call to this
   38     980726* R.Fisher    Added i2r support for shift functions
   39     980919  R.Fisher    Fixed AMD extended feature recognition bug.
   40     980921  R.Fisher    Added definition/check for _MMX_H.
   41                 Added "float s[2]" to mmx_t for use with
   42                   3DNow and EMMX.  So same mmx_t can be used.
   43     981013  R.Fisher    Fixed cpuid function 1 bug (looked at wrong reg)
   44                 Fixed psllq_i2r error in mmxtest.c
   45 
   46     * Unreleased (internal or interim) versions
   47 
   48  Notes:
   49     It appears that the latest gas has the pand problem fixed, therefore
   50       I'll undefine BROKEN_PAND by default.
   51     String compares may be quicker than the multiple test/jumps in vendor
   52       test sequence in mmx_ok(), but I'm not concerned with that right now.
   53 
   54  Acknowledgments:
   55     Jussi Laako for pointing out the errors ultimately found to be
   56       connected to the failure to notify the optimizer of clobbered values.
   57     Roger Hardiman for reminding us that CPUID isn't everywhere, and that
   58       someone may actually try to use this on a machine without CPUID.
   59       Also for suggesting code for checking this.
   60     Robert Dale for pointing out the AMD recognition bug.
   61     Jimmy Mayfield and Carl Witty for pointing out the Intel recognition
   62       bug.
   63     Carl Witty for pointing out the psllq_i2r test bug.
   64 */
   65 
   66 #ifndef _MMX_H
   67 #define _MMX_H
   68 
   69 /*#define MMX_TRACE */
   70 
   71 /*  Warning:  at this writing, the version of GAS packaged
   72     with most Linux distributions does not handle the
   73     parallel AND operation mnemonic correctly.  If the
   74     symbol BROKEN_PAND is defined, a slower alternative
   75     coding will be used.  If execution of mmxtest results
   76     in an illegal instruction fault, define this symbol.
   77 */
   78 #undef  BROKEN_PAND
   79 
   80 
   81 /*  The type of an value that fits in an MMX register
   82     (note that long long constant values MUST be suffixed
   83      by LL and unsigned long long values by ULL, lest
   84      they be truncated by the compiler)
   85 */
typedef union {
    long long       q;  /* Quadword (64-bit) value */
    unsigned long long  uq; /* Unsigned Quadword */
    int         d[2];   /* 2 Doubleword (32-bit) values */
    unsigned int        ud[2];  /* 2 Unsigned Doubleword */
    short           w[4];   /* 4 Word (16-bit) values */
    unsigned short      uw[4];  /* 4 Unsigned Word */
    char            b[8];   /* 8 Byte (8-bit) values */
    unsigned char       ub[8];  /* 8 Unsigned Byte */
    float           s[2];   /* 2 Single-precision (32-bit) values,
                       for 3DNow!/EMMX use */
} mmx_t;
   97 
   98 
   99 
/*  Function to test if multimedia instructions are supported...
    Probes EFLAGS for CPUID availability, reads the vendor string,
    then checks the vendor-specific feature bits.

    NOTE(review): the clobber list names "eax" while the output is
    constrained to eax via "=a" (rval) -- modern GCC rejects
    clobbering an output register; confirm against the target
    compiler before reuse.
    NOTE(review): the asm uses global labels (TryAMD, Return, ...);
    if this "inline extern" function were inlined at more than one
    call site the labels would be emitted twice.
    NOTE(review): 32-bit only (pushf/popl pairs); will not assemble
    for x86-64.
    NOTE(review): several "movl $N, %0:" templates carry a trailing
    colon after the operand -- looks like a typo that old gas
    tolerated; verify with the target assembler.
*/
inline extern int
mm_support(void)
{
    /* Returns 1 if MMX instructions are supported,
       3 if Cyrix MMX and Extended MMX instructions are supported
       5 if AMD MMX and 3DNow! instructions are supported
       0 if hardware does not support any of these
    */
    register int rval = 0;

    __asm__ __volatile__ (
        /* See if CPUID instruction is supported ... */
        /* ... Get copies of EFLAGS into eax and ecx */
        "pushf\n\t"
        "popl %%eax\n\t"
        "movl %%eax, %%ecx\n\t"

        /* ... Toggle the ID bit in one copy and store */
        /*     to the EFLAGS reg */
        "xorl $0x200000, %%eax\n\t"
        "push %%eax\n\t"
        "popf\n\t"

        /* ... Get the (hopefully modified) EFLAGS */
        "pushf\n\t"
        "popl %%eax\n\t"

        /* ... Compare and test result */
        "xorl %%eax, %%ecx\n\t"
        "testl $0x200000, %%ecx\n\t"
        "jz NotSupported1\n\t"      /* Nothing supported */


        /* Get standard CPUID information, and
               go to a specific vendor section */
        "movl $0, %%eax\n\t"
        "cpuid\n\t"

        /* Check for Intel: ebx/edx/ecx spell "GenuineIntel"
           as little-endian dwords */
        "cmpl $0x756e6547, %%ebx\n\t"
        "jne TryAMD\n\t"
        "cmpl $0x49656e69, %%edx\n\t"
        "jne TryAMD\n\t"
        "cmpl $0x6c65746e, %%ecx\n"
        "jne TryAMD\n\t"
        "jmp Intel\n\t"

        /* Check for AMD: vendor string "AuthenticAMD" */
        "\nTryAMD:\n\t"
        "cmpl $0x68747541, %%ebx\n\t"
        "jne TryCyrix\n\t"
        "cmpl $0x69746e65, %%edx\n\t"
        "jne TryCyrix\n\t"
        "cmpl $0x444d4163, %%ecx\n"
        "jne TryCyrix\n\t"
        "jmp AMD\n\t"

        /* Check for Cyrix: vendor string "CyrixInstead" */
        "\nTryCyrix:\n\t"
        "cmpl $0x69727943, %%ebx\n\t"
        "jne NotSupported2\n\t"
        "cmpl $0x736e4978, %%edx\n\t"
        "jne NotSupported3\n\t"
        "cmpl $0x64616574, %%ecx\n\t"
        "jne NotSupported4\n\t"
        /* Drop through to Cyrix... */


        /* Cyrix Section */
        /* See if extended CPUID is supported */
        "movl $0x80000000, %%eax\n\t"
        "cpuid\n\t"
        "cmpl $0x80000000, %%eax\n\t"
        "jl MMXtest\n\t"    /* Try standard CPUID instead */

        /* Extended CPUID supported, so get extended features */
        /* NOTE(review): this section tests the feature bits in
           %eax, whereas the AMD/Intel sections below test %edx --
           verify against the Cyrix extended-CPUID documentation */
        "movl $0x80000001, %%eax\n\t"
        "cpuid\n\t"
        "testl $0x00800000, %%eax\n\t"  /* Test for MMX */
        "jz NotSupported5\n\t"      /* MMX not supported */
        "testl $0x01000000, %%eax\n\t"  /* Test for Ext'd MMX */
        "jnz EMMXSupported\n\t"
        "movl $1, %0:\n\n\t"        /* MMX Supported */
        "jmp Return\n\n"
        "EMMXSupported:\n\t"
        "movl $3, %0:\n\n\t"        /* EMMX and MMX Supported */
        "jmp Return\n\t"


        /* AMD Section */
        "AMD:\n\t"

        /* See if extended CPUID is supported */
        "movl $0x80000000, %%eax\n\t"
        "cpuid\n\t"
        "cmpl $0x80000000, %%eax\n\t"
        "jl MMXtest\n\t"    /* Try standard CPUID instead */

        /* Extended CPUID supported, so get extended features */
        "movl $0x80000001, %%eax\n\t"
        "cpuid\n\t"
        "testl $0x00800000, %%edx\n\t"  /* Test for MMX */
        "jz NotSupported6\n\t"      /* MMX not supported */
        "testl $0x80000000, %%edx\n\t"  /* Test for 3DNow! */
        "jnz ThreeDNowSupported\n\t"
        "movl $1, %0:\n\n\t"        /* MMX Supported */
        "jmp Return\n\n"
        "ThreeDNowSupported:\n\t"
        "movl $5, %0:\n\n\t"        /* 3DNow! and MMX Supported */
        "jmp Return\n\t"


        /* Intel Section */
        "Intel:\n\t"

        /* Check for MMX via standard CPUID function 1 */
        "MMXtest:\n\t"
        "movl $1, %%eax\n\t"
        "cpuid\n\t"
        "testl $0x00800000, %%edx\n\t"  /* Test for MMX */
        "jz NotSupported7\n\t"      /* MMX Not supported */
        "movl $1, %0:\n\n\t"        /* MMX Supported */
        "jmp Return\n\t"

        /* Nothing supported: the "#movl" lines are assembler
           comments, kept as debugging aids to distinguish which
           test failed; all paths fall through to rval = 0 */
        "\nNotSupported1:\n\t"
        "#movl $101, %0:\n\n\t"
        "\nNotSupported2:\n\t"
        "#movl $102, %0:\n\n\t"
        "\nNotSupported3:\n\t"
        "#movl $103, %0:\n\n\t"
        "\nNotSupported4:\n\t"
        "#movl $104, %0:\n\n\t"
        "\nNotSupported5:\n\t"
        "#movl $105, %0:\n\n\t"
        "\nNotSupported6:\n\t"
        "#movl $106, %0:\n\n\t"
        "\nNotSupported7:\n\t"
        "#movl $107, %0:\n\n\t"
        "movl $0, %0:\n\n\t"

        "Return:\n\t"
        : "=a" (rval)
        : /* no input */
        : "eax", "ebx", "ecx", "edx"
    );

    /* Return */
    return(rval);
}
  252 
  253 /*  Function to test if mmx instructions are supported...
  254 */
  255 inline extern int
  256 mmx_ok(void)
  257 {
  258     /* Returns 1 if MMX instructions are supported, 0 otherwise */
  259     return ( mm_support() & 0x1 );
  260 }
  261 
  262 
  263 /*  Helper functions for the instruction macros that follow...
  264     (note that memory-to-register, m2r, instructions are nearly
  265      as efficient as register-to-register, r2r, instructions;
  266      however, memory-to-memory instructions are really simulated
  267      as a convenience, and are only 1/3 as efficient)
  268 */
  269 #ifdef  MMX_TRACE
  270 
  271 /*  Include the stuff for printing a trace to stderr...
  272 */
  273 
  274 #include <stdio.h>
  275 
/*  Trace variant of mmx_i2r: logs the immediate and the register
    contents before and after performing "op $imm, %reg".
    NOTE(review): the "X" constraints allow GCC to pick any operand
    form; later derivatives of this header use "m" here -- confirm
    before relying on a -DMMX_TRACE build.  */
#define mmx_i2r(op, imm, reg) \
    { \
        mmx_t mmx_trace; \
        mmx_trace = (imm); \
        fprintf(stderr, #op "_i2r(" #imm "=0x%016llx, ", mmx_trace.q); \
        __asm__ __volatile__ ("movq %%" #reg ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \
        __asm__ __volatile__ (#op " %0, %%" #reg \
                      : /* nothing */ \
                      : "X" (imm)); \
        __asm__ __volatile__ ("movq %%" #reg ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \
    }

/*  Trace variant of mmx_m2r: logs operand and register before and
    after performing "op mem, %reg".  */
#define mmx_m2r(op, mem, reg) \
    { \
        mmx_t mmx_trace; \
        mmx_trace = (mem); \
        fprintf(stderr, #op "_m2r(" #mem "=0x%016llx, ", mmx_trace.q); \
        __asm__ __volatile__ ("movq %%" #reg ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \
        __asm__ __volatile__ (#op " %0, %%" #reg \
                      : /* nothing */ \
                      : "X" (mem)); \
        __asm__ __volatile__ ("movq %%" #reg ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \
    }

/*  Trace variant of mmx_r2m: logs register and destination before
    and after performing "op %reg, mem".  */
#define mmx_r2m(op, reg, mem) \
    { \
        mmx_t mmx_trace; \
        __asm__ __volatile__ ("movq %%" #reg ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #op "_r2m(" #reg "=0x%016llx, ", mmx_trace.q); \
        mmx_trace = (mem); \
        fprintf(stderr, #mem "=0x%016llx) => ", mmx_trace.q); \
        __asm__ __volatile__ (#op " %%" #reg ", %0" \
                      : "=X" (mem) \
                      : /* nothing */ ); \
        mmx_trace = (mem); \
        fprintf(stderr, #mem "=0x%016llx\n", mmx_trace.q); \
    }

/*  Trace variant of mmx_r2r: logs both registers before and after
    performing "op %regs, %regd".  */
#define mmx_r2r(op, regs, regd) \
    { \
        mmx_t mmx_trace; \
        __asm__ __volatile__ ("movq %%" #regs ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #op "_r2r(" #regs "=0x%016llx, ", mmx_trace.q); \
        __asm__ __volatile__ ("movq %%" #regd ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #regd "=0x%016llx) => ", mmx_trace.q); \
        __asm__ __volatile__ (#op " %" #regs ", %" #regd); \
        __asm__ __volatile__ ("movq %%" #regd ", %0" \
                      : "=X" (mmx_trace) \
                      : /* nothing */ ); \
        fprintf(stderr, #regd "=0x%016llx\n", mmx_trace.q); \
    }

/*  Trace variant of mmx_m2m: logs both memory operands before and
    after; the operation itself is simulated through mm0, which is
    therefore clobbered.  */
#define mmx_m2m(op, mems, memd) \
    { \
        mmx_t mmx_trace; \
        mmx_trace = (mems); \
        fprintf(stderr, #op "_m2m(" #mems "=0x%016llx, ", mmx_trace.q); \
        mmx_trace = (memd); \
        fprintf(stderr, #memd "=0x%016llx) => ", mmx_trace.q); \
        __asm__ __volatile__ ("movq %0, %%mm0\n\t" \
                      #op " %1, %%mm0\n\t" \
                      "movq %%mm0, %0" \
                      : "=X" (memd) \
                      : "X" (mems)); \
        mmx_trace = (memd); \
        fprintf(stderr, #memd "=0x%016llx\n", mmx_trace.q); \
    }
  361 
  362 #else
  363 
  364 /*  These macros are a lot simpler without the tracing...
  365 */
  366 
  367 #define mmx_i2r(op, imm, reg) \
  368     __asm__ __volatile__ (#op " $" #imm ", %%" #reg \
  369                   : /* nothing */ \
  370                   : /* nothing */);
  371 
/*  Memory/variable-to-register: emits "op mem, %reg".  */
#define mmx_m2r(op, mem, reg) \
    __asm__ __volatile__ (#op " %0, %%" #reg \
                  : /* nothing */ \
                  : "X" (mem))

/*  Register-to-memory/variable: emits "op %reg, mem".  */
#define mmx_r2m(op, reg, mem) \
    __asm__ __volatile__ (#op " %%" #reg ", %0" \
                  : "=X" (mem) \
                  : /* nothing */ )

/*  Register-to-register: basic asm (no operands), so the single
    "%" before each register name is emitted literally.  */
#define mmx_r2r(op, regs, regd) \
    __asm__ __volatile__ (#op " %" #regs ", %" #regd)

/*  Memory-to-memory, simulated through mm0 (mm0 is clobbered);
    roughly 1/3 the speed of the register forms.
    NOTE(review): memd (%0) is read by the first movq but is
    constrained write-only "=X"; a read-write "+" constraint would
    be strictly correct -- verify against the GCC asm docs.  */
#define mmx_m2m(op, mems, memd) \
    __asm__ __volatile__ ("movq %0, %%mm0\n\t" \
                  #op " %1, %%mm0\n\t" \
                  "movq %%mm0, %0" \
                  : "=X" (memd) \
                  : "X" (mems))
  391 
  392 #endif
  393 
  394 
  395 /*  1x64 MOVe Quadword
  396     (this is both a load and a store...
  397      in fact, it is the only way to store)
  398 */
#define movq_m2r(var, reg)  mmx_m2r(movq, var, reg)
#define movq_r2m(reg, var)  mmx_r2m(movq, reg, var)
#define movq_r2r(regs, regd)    mmx_r2r(movq, regs, regd)
/*  variable-to-variable move; goes through mm0, so mm0 is clobbered */
#define movq(vars, vard) \
    __asm__ __volatile__ ("movq %1, %%mm0\n\t" \
                  "movq %%mm0, %0" \
                  : "=X" (vard) \
                  : "X" (vars))


/*  1x32 MOVe Doubleword
    (like movq, this is both load and store...
     but is most useful for moving things between
     mmx registers and ordinary registers)
*/
#define movd_m2r(var, reg)  mmx_m2r(movd, var, reg)
#define movd_r2m(reg, var)  mmx_r2m(movd, reg, var)
#define movd_r2r(regs, regd)    mmx_r2r(movd, regs, regd)
/*  variable-to-variable move; also clobbers mm0 */
#define movd(vars, vard) \
    __asm__ __volatile__ ("movd %1, %%mm0\n\t" \
                  "movd %%mm0, %0" \
                  : "=X" (vard) \
                  : "X" (vars))
  422 
  423 
  424 /*  2x32, 4x16, and 8x8 Parallel ADDs
  425 */
/*  (the plain two-variable forms expand to mmx_m2m and clobber mm0) */
#define paddd_m2r(var, reg) mmx_m2r(paddd, var, reg)
#define paddd_r2r(regs, regd)   mmx_r2r(paddd, regs, regd)
#define paddd(vars, vard)   mmx_m2m(paddd, vars, vard)

#define paddw_m2r(var, reg) mmx_m2r(paddw, var, reg)
#define paddw_r2r(regs, regd)   mmx_r2r(paddw, regs, regd)
#define paddw(vars, vard)   mmx_m2m(paddw, vars, vard)

#define paddb_m2r(var, reg) mmx_m2r(paddb, var, reg)
#define paddb_r2r(regs, regd)   mmx_r2r(paddb, regs, regd)
#define paddb(vars, vard)   mmx_m2m(paddb, vars, vard)


/*  4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define paddsw_m2r(var, reg)    mmx_m2r(paddsw, var, reg)
#define paddsw_r2r(regs, regd)  mmx_r2r(paddsw, regs, regd)
#define paddsw(vars, vard)  mmx_m2m(paddsw, vars, vard)

#define paddsb_m2r(var, reg)    mmx_m2r(paddsb, var, reg)
#define paddsb_r2r(regs, regd)  mmx_r2r(paddsb, regs, regd)
#define paddsb(vars, vard)  mmx_m2m(paddsb, vars, vard)


/*  4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define paddusw_m2r(var, reg)   mmx_m2r(paddusw, var, reg)
#define paddusw_r2r(regs, regd) mmx_r2r(paddusw, regs, regd)
#define paddusw(vars, vard) mmx_m2m(paddusw, vars, vard)

#define paddusb_m2r(var, reg)   mmx_m2r(paddusb, var, reg)
#define paddusb_r2r(regs, regd) mmx_r2r(paddusb, regs, regd)
#define paddusb(vars, vard) mmx_m2m(paddusb, vars, vard)
  459 
  460 
  461 /*  2x32, 4x16, and 8x8 Parallel SUBs
  462 */
/*  (the plain two-variable forms expand to mmx_m2m and clobber mm0) */
#define psubd_m2r(var, reg) mmx_m2r(psubd, var, reg)
#define psubd_r2r(regs, regd)   mmx_r2r(psubd, regs, regd)
#define psubd(vars, vard)   mmx_m2m(psubd, vars, vard)

#define psubw_m2r(var, reg) mmx_m2r(psubw, var, reg)
#define psubw_r2r(regs, regd)   mmx_r2r(psubw, regs, regd)
#define psubw(vars, vard)   mmx_m2m(psubw, vars, vard)

#define psubb_m2r(var, reg) mmx_m2r(psubb, var, reg)
#define psubb_r2r(regs, regd)   mmx_r2r(psubb, regs, regd)
#define psubb(vars, vard)   mmx_m2m(psubb, vars, vard)


/*  4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define psubsw_m2r(var, reg)    mmx_m2r(psubsw, var, reg)
#define psubsw_r2r(regs, regd)  mmx_r2r(psubsw, regs, regd)
#define psubsw(vars, vard)  mmx_m2m(psubsw, vars, vard)

#define psubsb_m2r(var, reg)    mmx_m2r(psubsb, var, reg)
#define psubsb_r2r(regs, regd)  mmx_r2r(psubsb, regs, regd)
#define psubsb(vars, vard)  mmx_m2m(psubsb, vars, vard)


/*  4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define psubusw_m2r(var, reg)   mmx_m2r(psubusw, var, reg)
#define psubusw_r2r(regs, regd) mmx_r2r(psubusw, regs, regd)
#define psubusw(vars, vard) mmx_m2m(psubusw, vars, vard)

#define psubusb_m2r(var, reg)   mmx_m2r(psubusb, var, reg)
#define psubusb_r2r(regs, regd) mmx_r2r(psubusb, regs, regd)
#define psubusb(vars, vard) mmx_m2m(psubusb, vars, vard)
  496 
  497 
/*  4x16 Parallel MULs giving Low 4x16 portions of results
    (the plain two-variable forms expand to mmx_m2m and clobber mm0)
*/
#define pmullw_m2r(var, reg)    mmx_m2r(pmullw, var, reg)
#define pmullw_r2r(regs, regd)  mmx_r2r(pmullw, regs, regd)
#define pmullw(vars, vard)  mmx_m2m(pmullw, vars, vard)


/*  4x16 Parallel MULs giving High 4x16 portions of results
*/
#define pmulhw_m2r(var, reg)    mmx_m2r(pmulhw, var, reg)
#define pmulhw_r2r(regs, regd)  mmx_r2r(pmulhw, regs, regd)
#define pmulhw(vars, vard)  mmx_m2m(pmulhw, vars, vard)


/*  4x16->2x32 Parallel Mul-ADD
    (muls like pmullw, then adds adjacent 16-bit fields
     in the multiply result to make the final 2x32 result)
*/
#define pmaddwd_m2r(var, reg)   mmx_m2r(pmaddwd, var, reg)
#define pmaddwd_r2r(regs, regd) mmx_r2r(pmaddwd, regs, regd)
#define pmaddwd(vars, vard) mmx_m2m(pmaddwd, vars, vard)
  519 
  520 
  521 /*  1x64 bitwise AND
  522 */
  523 #ifdef  BROKEN_PAND
  524 #define pand_m2r(var, reg) \
  525     { \
  526         mmx_m2r(pandn, (mmx_t) -1LL, reg); \
  527         mmx_m2r(pandn, var, reg); \
  528     }
  529 #define pand_r2r(regs, regd) \
  530     { \
  531         mmx_m2r(pandn, (mmx_t) -1LL, regd); \
  532         mmx_r2r(pandn, regs, regd) \
  533     }
  534 #define pand(vars, vard) \
  535     { \
  536         movq_m2r(vard, mm0); \
  537         mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
  538         mmx_m2r(pandn, vars, mm0); \
  539         movq_r2m(mm0, vard); \
  540     }
  541 #else
  542 #define pand_m2r(var, reg)  mmx_m2r(pand, var, reg)
  543 #define pand_r2r(regs, regd)    mmx_r2r(pand, regs, regd)
  544 #define pand(vars, vard)    mmx_m2m(pand, vars, vard)
  545 #endif
  546 
  547 
/*  1x64 bitwise AND with Not the destination
    (regd = ~regd & regs; the plain forms clobber mm0 via mmx_m2m)
*/
#define pandn_m2r(var, reg) mmx_m2r(pandn, var, reg)
#define pandn_r2r(regs, regd)   mmx_r2r(pandn, regs, regd)
#define pandn(vars, vard)   mmx_m2m(pandn, vars, vard)


/*  1x64 bitwise OR
*/
#define por_m2r(var, reg)   mmx_m2r(por, var, reg)
#define por_r2r(regs, regd) mmx_r2r(por, regs, regd)
#define por(vars, vard) mmx_m2m(por, vars, vard)


/*  1x64 bitwise eXclusive OR
*/
#define pxor_m2r(var, reg)  mmx_m2r(pxor, var, reg)
#define pxor_r2r(regs, regd)    mmx_r2r(pxor, regs, regd)
#define pxor(vars, vard)    mmx_m2m(pxor, vars, vard)


/*  2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
    (resulting fields are either 0 or -1, i.e. all-ones masks)
*/
#define pcmpeqd_m2r(var, reg)   mmx_m2r(pcmpeqd, var, reg)
#define pcmpeqd_r2r(regs, regd) mmx_r2r(pcmpeqd, regs, regd)
#define pcmpeqd(vars, vard) mmx_m2m(pcmpeqd, vars, vard)

#define pcmpeqw_m2r(var, reg)   mmx_m2r(pcmpeqw, var, reg)
#define pcmpeqw_r2r(regs, regd) mmx_r2r(pcmpeqw, regs, regd)
#define pcmpeqw(vars, vard) mmx_m2m(pcmpeqw, vars, vard)

#define pcmpeqb_m2r(var, reg)   mmx_m2r(pcmpeqb, var, reg)
#define pcmpeqb_r2r(regs, regd) mmx_r2r(pcmpeqb, regs, regd)
#define pcmpeqb(vars, vard) mmx_m2m(pcmpeqb, vars, vard)


/*  2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
    (resulting fields are either 0 or -1)
*/
#define pcmpgtd_m2r(var, reg)   mmx_m2r(pcmpgtd, var, reg)
#define pcmpgtd_r2r(regs, regd) mmx_r2r(pcmpgtd, regs, regd)
#define pcmpgtd(vars, vard) mmx_m2m(pcmpgtd, vars, vard)

#define pcmpgtw_m2r(var, reg)   mmx_m2r(pcmpgtw, var, reg)
#define pcmpgtw_r2r(regs, regd) mmx_r2r(pcmpgtw, regs, regd)
#define pcmpgtw(vars, vard) mmx_m2m(pcmpgtw, vars, vard)

#define pcmpgtb_m2r(var, reg)   mmx_m2r(pcmpgtb, var, reg)
#define pcmpgtb_r2r(regs, regd) mmx_r2r(pcmpgtb, regs, regd)
#define pcmpgtb(vars, vard) mmx_m2m(pcmpgtb, vars, vard)
  599 
  600 
/*  1x64, 2x32, and 4x16 Parallel Shift Left Logical
    (the i2r forms take a literal shift count; the plain
     two-variable forms expand to mmx_m2m and clobber mm0)
*/
#define psllq_i2r(imm, reg) mmx_i2r(psllq, imm, reg)
#define psllq_m2r(var, reg) mmx_m2r(psllq, var, reg)
#define psllq_r2r(regs, regd)   mmx_r2r(psllq, regs, regd)
#define psllq(vars, vard)   mmx_m2m(psllq, vars, vard)

#define pslld_i2r(imm, reg) mmx_i2r(pslld, imm, reg)
#define pslld_m2r(var, reg) mmx_m2r(pslld, var, reg)
#define pslld_r2r(regs, regd)   mmx_r2r(pslld, regs, regd)
#define pslld(vars, vard)   mmx_m2m(pslld, vars, vard)

#define psllw_i2r(imm, reg) mmx_i2r(psllw, imm, reg)
#define psllw_m2r(var, reg) mmx_m2r(psllw, var, reg)
#define psllw_r2r(regs, regd)   mmx_r2r(psllw, regs, regd)
#define psllw(vars, vard)   mmx_m2m(psllw, vars, vard)


/*  1x64, 2x32, and 4x16 Parallel Shift Right Logical
*/
#define psrlq_i2r(imm, reg) mmx_i2r(psrlq, imm, reg)
#define psrlq_m2r(var, reg) mmx_m2r(psrlq, var, reg)
#define psrlq_r2r(regs, regd)   mmx_r2r(psrlq, regs, regd)
#define psrlq(vars, vard)   mmx_m2m(psrlq, vars, vard)

#define psrld_i2r(imm, reg) mmx_i2r(psrld, imm, reg)
#define psrld_m2r(var, reg) mmx_m2r(psrld, var, reg)
#define psrld_r2r(regs, regd)   mmx_r2r(psrld, regs, regd)
#define psrld(vars, vard)   mmx_m2m(psrld, vars, vard)

#define psrlw_i2r(imm, reg) mmx_i2r(psrlw, imm, reg)
#define psrlw_m2r(var, reg) mmx_m2r(psrlw, var, reg)
#define psrlw_r2r(regs, regd)   mmx_r2r(psrlw, regs, regd)
#define psrlw(vars, vard)   mmx_m2m(psrlw, vars, vard)


/*  2x32 and 4x16 Parallel Shift Right Arithmetic
    (sign-extending; note there is no 1x64 psraq in MMX)
*/
#define psrad_i2r(imm, reg) mmx_i2r(psrad, imm, reg)
#define psrad_m2r(var, reg) mmx_m2r(psrad, var, reg)
#define psrad_r2r(regs, regd)   mmx_r2r(psrad, regs, regd)
#define psrad(vars, vard)   mmx_m2m(psrad, vars, vard)

#define psraw_i2r(imm, reg) mmx_i2r(psraw, imm, reg)
#define psraw_m2r(var, reg) mmx_m2r(psraw, var, reg)
#define psraw_r2r(regs, regd)   mmx_r2r(psraw, regs, regd)
#define psraw(vars, vard)   mmx_m2m(psraw, vars, vard)
  648 
  649 
/*  2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
    (packs source and dest fields into dest in that order;
     the plain forms expand to mmx_m2m and clobber mm0)
*/
#define packssdw_m2r(var, reg)  mmx_m2r(packssdw, var, reg)
#define packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define packssdw(vars, vard)    mmx_m2m(packssdw, vars, vard)

#define packsswb_m2r(var, reg)  mmx_m2r(packsswb, var, reg)
#define packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define packsswb(vars, vard)    mmx_m2m(packsswb, vars, vard)


/*  4x16->8x8 PACK and Unsigned Saturate
    (packs source and dest fields into dest in that order)
*/
#define packuswb_m2r(var, reg)  mmx_m2r(packuswb, var, reg)
#define packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define packuswb(vars, vard)    mmx_m2m(packuswb, vars, vard)


/*  2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
    (interleaves low half of dest with low half of source
     as padding in each result field)
*/
#define punpckldq_m2r(var, reg) mmx_m2r(punpckldq, var, reg)
#define punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define punpckldq(vars, vard)   mmx_m2m(punpckldq, vars, vard)

#define punpcklwd_m2r(var, reg) mmx_m2r(punpcklwd, var, reg)
#define punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define punpcklwd(vars, vard)   mmx_m2m(punpcklwd, vars, vard)

#define punpcklbw_m2r(var, reg) mmx_m2r(punpcklbw, var, reg)
#define punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define punpcklbw(vars, vard)   mmx_m2m(punpcklbw, vars, vard)


/*  2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
    (interleaves high half of dest with high half of source
     as padding in each result field)
*/
#define punpckhdq_m2r(var, reg) mmx_m2r(punpckhdq, var, reg)
#define punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define punpckhdq(vars, vard)   mmx_m2m(punpckhdq, vars, vard)

#define punpckhwd_m2r(var, reg) mmx_m2r(punpckhwd, var, reg)
#define punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define punpckhwd(vars, vard)   mmx_m2m(punpckhwd, vars, vard)

#define punpckhbw_m2r(var, reg) mmx_m2r(punpckhbw, var, reg)
#define punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define punpckhbw(vars, vard)   mmx_m2m(punpckhbw, vars, vard)
  702 
  703 
  704 /*  Empty MMx State
  705     (used to clean-up when going from mmx to float use
  706      of the registers that are shared by both; note that
  707      there is no float-to-mmx operation needed, because
  708      only the float tag word info is corruptible)
  709 */
#ifdef  MMX_TRACE

/*  Trace variant: log the call before clearing the MMX state.
    NOTE(review): the bare-brace block (not do { } while (0))
    breaks "if (x) emms(); else ..." -- same caveat as the other
    trace macros.  */
#define emms() \
    { \
        fprintf(stderr, "emms()\n"); \
        __asm__ __volatile__ ("emms"); \
    }

#else

/*  Clear the FP tag word so the FPU is usable after MMX code
    (see the note above: only the tag word is corruptible).  */
#define emms()          __asm__ __volatile__ ("emms")

#endif
  723 
  724 #endif
  725 
  726