"Fossies" - the Fresh Open Source Software Archive

Member "NetPIPE-3.7.2/src/MP_memcpy.c" (19 Aug 2010, 6784 Bytes) of package /linux/privat/old/NetPIPE-3.7.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "MP_memcpy.c", see the Fossies "Dox" file reference documentation.

    1 /* This optimized memcpy requires gcc 3.x to be installed (needs xmmintrin.h) */
    2 
    3 /*#define _INTEL */
#if defined (_INTEL)
#include <xmmintrin.h>
#else 
#include "xmmintrin.h"
#endif

#include <stdint.h>
#include <stdio.h>
#include <string.h>
   13 
   14 /*construct a function that can do well on most bufferalignment */
   15 #define LONGMSGSIZE (2.5*131072)    /* Long message size */    
   16 /*#define BLOCKSIZE 131072 */
   17 #define BLOCKSIZE  131072     /* Needs to be divisible by 16 */ 
   18 #define PAGESIZE   4096
   19 #define NUMPERPAGE 512        /* Number of elements fit in a page */
   20 #define ALIGNMENT  16
   21 /* #define P4 */
   22 
   23 #if defined(P4)
   24 #define CACHELINE     128     /* on Pentimum 4 */
   25 #else
   26 #define CACHELINE     32      /* on Pentimum 3 */
   27 #endif
   28 
   29 #define NTCOPY        1       /* Use nontemporal copy */
   30 #define WACOPY        2       /* Write allocate copy  */ 
   31 #define CBCOPY        3       /* 
   32                                * mixed copy, small message use  
   33                                * write allocate copy, and long   
   34                                * message use nontemporal copy
   35                                */ 
   36 
   37 #define COPY_TYPE     CBCOPY
   38 #define small_memcpy(dst,src,n) \
   39     { register unsigned long int dummy; \
   40     asm volatile ( \
   41       "rep; movsb\n\t" \
   42       :"=&D"(dst), "=&S"(src), "=&c"(dummy) \
   43       :"0" (dst), "1" (src),"2" (n) \
   44       : "memory");  }
   45 
   46 
   47 extern int myproc; 
   48 void ntcopy(void *dst, const void *src, int size); 
   49 void memcpy_8(void *destination, const void *source, int nbytes);
   50 void memcpy_16(void *destination, const void *source, int nbytes);
   51 
   52 void MP_memcpy(void *dst, const void *src, int nbytes);
   53 
   54 int intlog2(int i)
   55 {
   56   float x = i;
   57   return (*(int*)&x >> 23) - 127;
   58 }
   59 
   60 /* 
   61  * This function optimize the memory copy if number of bytes
   62  * to transfer is not equal to 8   
   63  */
   64 void memcpy_8(void *destination, const void *source, int nbytes)
   65 {
   66   int nb_b4, nb_after;
   67   char *dest = (char *)destination, *src = (char *) source;
   68 
   69   nb_b4 = 8 - ((long int)src % 8);
   70 
   71   if( nb_b4 != 8 && nb_b4 <= nbytes) {  /* 
   72                      * Copy up to an 8-byte boundary first
   73                                          * considering that nbytes can be less
   74                                          * than nb_b4  
   75                      */
   76     memcpy( dest, src, nb_b4 );
   77 
   78     src += nb_b4;
   79     dest += nb_b4;
   80     nbytes -= nb_b4;
   81 
   82   }
   83 
   84   nb_after = nbytes % 8;
   85   nbytes -= nb_after;
   86 
   87   if( nbytes > 0 ) {      /* Copy the main data */
   88 
   89     memcpy( dest, src, nbytes );
   90   }
   91 
   92   if( nb_after > 0 ) {    /* Copy the last few bytes */
   93 
   94     src += nbytes;
   95     dest += nbytes;
   96 
   97     memcpy( dest, src, nb_after );
   98 
   99   }
  100 }
  101 
  102 void memcpy_16(void *destination, const void *source, int nbytes)
  103 {
  104   int nb_b4, nb_after; 
  105   char *dest = (char *)destination, *src = (char *)source; 
  106  
  107   nb_b4 = 16 - ((int) dest % 16); 
  108   if (nb_b4 != 16 && nb_b4 <= nbytes) 
  109   { 
  110     memcpy(dest, src, nb_b4);
  111     src += nb_b4;
  112     dest += nb_b4;
  113     nbytes -= nb_b4; 
  114   } 
  115 
  116   /*memcpy(dest, src, nbytes);  */
  117   nb_after = nbytes % 16;
  118   nbytes -= nb_after;
  119 
  120   if ( nbytes > 0) {
  121     memcpy(dest, src, nbytes);
  122   } 
  123 
  124   if( nb_after > 0 ) {    
  125     src += nbytes;
  126     dest += nbytes;
  127     memcpy( dest, src, nb_after );
  128   }  
  129 }
  130 
  131 //#if defined(_INTEL)
  132 void ntcopy(void *dst, const void *src, int size)
  133 {
  134   int ii, jj, kk, N, delta, LEFT, blocksize, size1;
  135 
  136   double *a, *b;
  137   double temp;
  138 
  139   /* copy the first few bytes to make dest divisible by 8 */
  140   if (size <= ALIGNMENT)
  141   {
  142     memcpy(dst, (void *)src, size);  
  143     return;
  144   }
  145 
  146   delta = ((int)dst) & (ALIGNMENT - 1);
  147   if (delta != 0)
  148   {
  149     delta = ALIGNMENT - delta;
  150     size -= delta;
  151     memcpy(dst, (void *)src, delta);
  152   } 
  153   a = (double *)(src + delta);
  154   b = (double *)(dst + delta);
  155   N  = 2 * (size / 16);   /* number of doubles  */      
  156   LEFT = size % 16;  
  157   blocksize = N; 
  158 
  159   if (blocksize > BLOCKSIZE / 8)
  160     blocksize = BLOCKSIZE / 8;
  161 
  162   for (X3;;či) 
  163   {
  164     if (N < blocksize) blocksize = N; 
  165     _mm_prefetch((char*)&a[0], _MM_HINT_NTA);
  166     /* prefetch a block of size blocksize */
  167     for (jj = 0; jj < blocksize; jj += NUMPERPAGE)  
  168     {
  169       /* prefetch one page of memory */  
  170       if (jj + NUMPERPAGE < blocksize ) 
  171       { 
  172         temp = a[jj + NUMPERPAGE]; /* TLB priming */
  173       }
  174 
  175       for (kk = jj + 16; kk < jj + NUMPERPAGE && kk < blocksize; kk += 16) {
  176         _mm_prefetch((char*)&a[kk], _MM_HINT_NTA);
  177       } 
  178     }
  179 
  180     if ( ((int) a) & (ALIGNMENT - 1) )
  181     {
  182       size1 = blocksize - blocksize % 16; 
  183       for (kk = 0; kk < size1; kk += 16) 
  184       {
  185         /* copy one cacheline (128 bytes) */  
  186         _mm_stream_ps((float*)&b[kk],
  187           _mm_loadu_ps((float*)&a[kk]));
  188         _mm_stream_ps((float*)&b[kk+2],
  189           _mm_loadu_ps((float*)&a[kk+2]));
  190         _mm_stream_ps((float*)&b[kk+4],
  191           _mm_loadu_ps((float*)&a[kk+4]));
  192         _mm_stream_ps((float*)&b[kk+6],
  193           _mm_loadu_ps((float*)&a[kk+6]));
  194         _mm_stream_ps((float*)&b[kk+8],
  195           _mm_loadu_ps((float*)&a[kk+8]));
  196         _mm_stream_ps((float*)&b[kk+10],
  197           _mm_loadu_ps((float*)&a[kk+10]));
  198         _mm_stream_ps((float*)&b[kk+12],
  199           _mm_loadu_ps((float*)&a[kk+12]));
  200         _mm_stream_ps((float*)&b[kk+14],
  201           _mm_loadu_ps((float*)&a[kk+14]));
  202       }
  203 
  204       for (kk = size1; kk <  blocksize; kk += 2)   
  205       {
  206         _mm_stream_ps((float*)&b[kk],
  207           _mm_loadu_ps((float*)&a[kk]));
  208       }
  209     }
  210 
  211     else 
  212     {
  213       size1 = blocksize - blocksize % 16;
  214       for (kk = 0; kk < size1; kk+=16) 
  215       {
  216         _mm_stream_ps((float*)&b[kk],
  217           _mm_load_ps((float*)&a[kk]));
  218         _mm_stream_ps((float*)&b[kk+2],
  219           _mm_load_ps((float*)&a[kk+2]));
  220         _mm_stream_ps((float*)&b[kk+4],
  221           _mm_load_ps((float*)&a[kk+4]));
  222         _mm_stream_ps((float*)&b[kk+6],
  223           _mm_load_ps((float*)&a[kk+6]));
  224         _mm_stream_ps((float*)&b[kk+8],
  225           _mm_load_ps((float*)&a[kk+8]));
  226         _mm_stream_ps((float*)&b[kk+10],
  227           _mm_load_ps((float*)&a[kk+10]));
  228         _mm_stream_ps((float*)&b[kk+12],
  229           _mm_load_ps((float*)&a[kk+12]));
  230         _mm_stream_ps((float*)&b[kk+14],
  231           _mm_load_ps((float*)&a[kk+14]));
  232       }
  233       for (kk = size1; kk < blocksize; kk += 2)
  234       {
  235         _mm_stream_ps((float*)&b[kk],
  236           _mm_load_ps((float*)&a[kk]));
  237       }
  238     } 
  239     /* finished copying one block  */
  240     a = a + blocksize;
  241     b = b + blocksize;
  242   } 
  243   _mm_sfence();
  244 
  245   
  246   if (LEFT > 0)
  247   {
  248     memcpy((char*)b, (char *)a, LEFT);  
  249     
  250   }
  251 } 
  252 //#endif
  253 
  254 void  MP_memcpy(void *dst, const void *src, int nbytes) 
  255 {
  256 #if COPY_TYPE == WACOPY
  257 
  258   memcpy_16(dst, (void *)src, nbytes);
  259 
  260 #elif COPY_TYPE == NTCOPY
  261 
  262   ntcopy(dst, src, nbytes); 
  263 
  264 #elif COPY_TYPE == CBCOPY
  265 
  266   if (nbytes > LONGMSGSIZE)
  267     ntcopy(dst, src, nbytes);
  268   else
  269     memcpy_16(dst, src, nbytes);
  270 
  271 #endif
  272 }
  273 
  274