"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/opt_memcpy.c" between
NetPIPE_4.x.tar.gz and NetPIPE-3.7.2.tar.gz

About: NetPIPE - a Network Protocol Independent Performance Evaluator

opt_memcpy.c (NetPIPE_4.x)  vs.  opt_memcpy.c (NetPIPE-3.7.2)

Where the two versions differ, each variant appears below under a "NetPIPE_4.x:" or "NetPIPE-3.7.2:" label.
NetPIPE_4.x:
/*#define AMDCOPY*/
/*#define NTCOPY*/
/* Requires gcc 3.x to be installed (needs xmmintrin.h) */
#if defined( NTCOPY )

NetPIPE-3.7.2:
/* This optimized memcpy requires gcc 3.x to be installed (needs xmmintrin.h) */
/*#define _INTEL */
#if defined (_INTEL)

#include <xmmintrin.h>

NetPIPE-3.7.2:
#else
#include "xmmintrin.h"

#endif
#include <stdio.h>
#include <string.h>
/* construct a function that can do well on most buffer alignments */
#define LONGMSGSIZE (2.5*131072) /* Long message size */
/*#define BLOCKSIZE 131072 */
#define BLOCKSIZE 131072 /* Needs to be divisible by 16 */
#define PAGESIZE 4096
#define NUMPERPAGE 512 /* Number of elements that fit in a page */
#define ALIGNMENT 16
NetPIPE_4.x:
#define small_memcpy(dest,src,n) \

NetPIPE-3.7.2:
/* #define P4 */
#if defined(P4)
#define CACHELINE 128 /* on Pentium 4 */
#else
#define CACHELINE 32 /* on Pentium 3 */
#endif
#define NTCOPY 1 /* Use nontemporal copy */
#define WACOPY 2 /* Write allocate copy */
#define CBCOPY 3 /*
* mixed copy: small messages use
* write allocate copy, and long
* messages use nontemporal copy
*/
#define COPY_TYPE CBCOPY
#define small_memcpy(dst,src,n) \
{ register unsigned long int dummy; \
asm volatile ( \
"rep; movsb\n\t" \

NetPIPE_4.x:
:"=&D"(dest), "=&S"(src), "=&c"(dummy) \
:"0" (dest), "1" (src),"2" (n) \

NetPIPE-3.7.2:
:"=&D"(dst), "=&S"(src), "=&c"(dummy) \
:"0" (dst), "1" (src),"2" (n) \

: "memory"); }
extern int myproc;

NetPIPE_4.x:
void ntcopy(void *dest, const void *src, int size);
void opt_memcpy(void *dest, const void *src, int nbytes);

NetPIPE-3.7.2:
void ntcopy(void *dst, const void *src, int size);
void memcpy_8(void *destination, const void *source, int nbytes);
void memcpy_16(void *destination, const void *source, int nbytes);
void MP_memcpy(void *dst, const void *src, int nbytes);
int intlog2(int i)
{
float x = i;
return (*(int*)&x >> 23) - 127;
}
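intlog2() above recovers an integer log base 2 by storing i into a float and reading the IEEE-754 exponent field (bits 23-30) out of its bit pattern, then subtracting the bias of 127. A small hedged check of that trick, assuming 32-bit int and IEEE-754 single-precision floats (the local function name and test values are illustrative):

    /* Quick check of the exponent-extraction idiom used by intlog2(). */
    #include <assert.h>

    static int ilog2f(int i)               /* same trick, local name */
    {
        float x = (float)i;
        return (*(int *)&x >> 23) - 127;   /* biased exponent minus 127 */
    }

    int main(void)
    {
        assert(ilog2f(1)    == 0);
        assert(ilog2f(8)    == 3);
        assert(ilog2f(1500) == 10);        /* truncates to floor(log2(i)) */
        return 0;
    }

The pointer cast relies on type punning that a strict-aliasing compiler may warn about, and the result is only meaningful for i > 0.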
/*
* This function optimizes the memory copy when the number of bytes
* to transfer is not a multiple of 8: it copies a short head up to an
* 8-byte source boundary, then the 8-byte-divisible middle, then the tail
*/
void memcpy_8(void *destination, const void *source, int nbytes)
{
int nb_b4, nb_after;
char *dest = (char *)destination, *src = (char *) source;
nb_b4 = 8 - ((long int)src % 8);
if( nb_b4 != 8 && nb_b4 <= nbytes) { /*
* Copy up to an 8-byte boundary first
* considering that nbytes can be less
* than nb_b4
*/
memcpy( dest, src, nb_b4 );
src += nb_b4;
dest += nb_b4;
nbytes -= nb_b4;
}
nb_after = nbytes % 8;
nbytes -= nb_after;
if( nbytes > 0 ) { /* Copy the main data */
memcpy( dest, src, nbytes );
}
if( nb_after > 0 ) { /* Copy the last few bytes */
src += nbytes;
dest += nbytes;
memcpy( dest, src, nb_after );
}
}
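memcpy_8() above breaks one copy into three pieces: a short head that advances the source to an 8-byte boundary, a middle whose length is a multiple of 8, and a leftover tail. A hedged usage sketch, assuming it is linked against this file and that the chosen offsets split as commented (buffer names, sizes and the aligned attribute are illustrative):

    /* Compile and link together with opt_memcpy.c. */
    #include <string.h>
    #include <assert.h>

    void memcpy_8(void *destination, const void *source, int nbytes);

    int main(void)
    {
        static char src[64] __attribute__((aligned(8)));  /* gcc, as in the rest of the file */
        char dst[64], ref[64];
        int  i;

        for (i = 0; i < 64; i++) src[i] = (char)i;

        /* misaligned source (offset 3), odd length: head = 5, middle = 24, tail = 0 */
        memcpy_8(dst, src + 3, 29);
        memcpy  (ref, src + 3, 29);
        assert(memcmp(dst, ref, 29) == 0);
        return 0;
    }

memcpy_16() directly below follows the same pattern, but aligns the destination to a 16-byte boundary instead of the source to an 8-byte one.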
NetPIPE_4.x:
#if defined(NTCOPY)
void ntcopy(void *dest, const void *src, int size)

NetPIPE-3.7.2:
void memcpy_16(void *destination, const void *source, int nbytes)
{
int nb_b4, nb_after;
char *dest = (char *)destination, *src = (char *)source;
nb_b4 = 16 - ((int) dest % 16);
if (nb_b4 != 16 && nb_b4 <= nbytes)
{
memcpy(dest, src, nb_b4);
src += nb_b4;
dest += nb_b4;
nbytes -= nb_b4;
}
/*memcpy(dest, src, nbytes); */
nb_after = nbytes % 16;
nbytes -= nb_after;
if ( nbytes > 0) {
memcpy(dest, src, nbytes);
}
if( nb_after > 0 ) {
src += nbytes;
dest += nbytes;
memcpy( dest, src, nb_after );
}
}
//#if defined(_INTEL)
void ntcopy(void *dst, const void *src, int size)
{
int ii, jj, kk, N, delta, LEFT, blocksize, size1;
double *a, *b;
double temp;
/* copy the first few bytes to make dest divisible by 8 */

NetPIPE_4.x:
if (size <= ALIGNMENT) {
memcpy(dest, (void *)src, size);

NetPIPE-3.7.2:
if (size <= ALIGNMENT)
{
memcpy(dst, (void *)src, size);

return;
}

NetPIPE_4.x:
delta = ((long)dest) & (ALIGNMENT - 1);
if (delta != 0) {

NetPIPE-3.7.2:
delta = ((int)dst) & (ALIGNMENT - 1);
if (delta != 0)
{

delta = ALIGNMENT - delta;
size -= delta;

NetPIPE_4.x:
memcpy(dest, (void *)src, delta);

NetPIPE-3.7.2:
memcpy(dst, (void *)src, delta);

}

NetPIPE_4.x:
a = (double *)((unsigned int)src + delta);
b = (double *)((unsigned int)dest + delta);

NetPIPE-3.7.2:
a = (double *)(src + delta);
b = (double *)(dst + delta);

N = 2 * (size / 16); /* number of doubles */
LEFT = size % 16;
blocksize = N;

NetPIPE_4.x:
if (blocksize > BLOCKSIZE / 8) blocksize = BLOCKSIZE / 8;
for(N=2*(size/16); N>0; N -= blocksize)

NetPIPE-3.7.2:
if (blocksize > BLOCKSIZE / 8)
blocksize = BLOCKSIZE / 8;
for (N = 2*(size/16); N > 0; N -= blocksize)
{
if (N < blocksize) blocksize = N;
_mm_prefetch((char*)&a[0], _MM_HINT_NTA);
/* prefetch a block of size blocksize */
for (jj = 0; jj < blocksize; jj += NUMPERPAGE)
{
/* prefetch one page of memory */
if (jj + NUMPERPAGE < blocksize )
{
temp = a[jj + NUMPERPAGE]; /* TLB priming */
}
for (kk = jj + 16; kk < jj + NUMPERPAGE && kk < blocksize; kk += 16) {
_mm_prefetch((char*)&a[kk], _MM_HINT_NTA);
}
}

NetPIPE_4.x:
if ( ((long) a) & (ALIGNMENT - 1) )

NetPIPE-3.7.2:
if ( ((int) a) & (ALIGNMENT - 1) )

{
size1 = blocksize - blocksize % 16;
for (kk = 0; kk < size1; kk += 16)
{
/* copy one cacheline (128 bytes) */
_mm_stream_ps((float*)&b[kk],
_mm_loadu_ps((float*)&a[kk]));
_mm_stream_ps((float*)&b[kk+2],
_mm_loadu_ps((float*)&a[kk+2]));
_mm_stream_ps((float*)&b[kk+4],
[... skipping to change at line 150 (NetPIPE_4.x) / line 248 (NetPIPE-3.7.2) ...]
b = b + blocksize;
}
_mm_sfence();
if (LEFT > 0)
{
memcpy((char*)b, (char *)a, LEFT);
}
}
//#endif
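ntcopy() above combines software prefetching, TLB priming and nontemporal (_mm_stream_ps) stores so that a large copy does not evict the working set from the cache. A minimal hedged sketch of just the streaming-store idiom it is built on, for 16-byte-aligned buffers (SSE intrinsics; the function name and loop shape are illustrative, not NetPIPE code):

    /* Core idiom only: stream 16 bytes per iteration, then fence. */
    #include <xmmintrin.h>

    static void stream_copy_aligned(float *dst, const float *src, int nfloats)
    {
        int i;
        for (i = 0; i + 4 <= nfloats; i += 4)
            _mm_stream_ps(&dst[i], _mm_load_ps(&src[i])); /* store bypasses the cache */
        _mm_sfence();                                     /* order the weakly-ordered stores */
        for (; i < nfloats; i++)                          /* scalar tail */
            dst[i] = src[i];
    }

The closing _mm_sfence() mirrors the one at the end of ntcopy(): streaming stores are weakly ordered, so a store fence is needed before the copied data can safely be handed to another processor.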
NetPIPE-3.7.2:
void MP_memcpy(void *dst, const void *src, int nbytes)

NetPIPE_4.x:
#elif defined(AMDCOPY)
/* This implementation is based on the code on
* pages 73-74 of the AMD Athlon(TM) Processor
* x86 Optimization Guide. The mmx_copy called by this function is in
* mmx_copy.s, which essentially converts the assembly code in the Guide
* from Intel syntax to AT&T syntax */
void *amdcopy(void *to, const void *from, size_t len)
{
void *dest, *src;
int nhead, nblock, nleft;
/* Copy several bytes at the beginning to
* make the destination pointer divisible by 16. */
if (len < 16) {
memcpy(to, from, len);
} else {
dest = to;
src = (void *)from;
if ( (int) dest % 16 ) {
nhead = 16 - ((int) dest % 16);
memcpy(dest, src, nhead);
dest = (void *)((int) dest + nhead);
src = (void *)((int) src + nhead);
len -= nhead;
}
/* Copy the main part, whose size is divisible by 128 */
nleft = len % 128;
nblock = len - nleft;
if (nblock > 0) { /* lock_copy(); */
mmx_copy(dest, src, nblock); /* In mmx_copy.s */
/* unlock_copy(); */
dest = (void *)((int) dest + nblock);
src = (void *)((int) src + nblock);
}
/* Copy the remaining part */
memcpy(dest, src, nleft);
}
return to;
}
#elif defined(MMXCOPY) || defined(PAGECOPY)
/* mmxcopy and pagecopy are modified from the Linux kernel - Dave Turner */
static char fpu_state[512];
#define save_fpu_state() \
__asm__ __volatile__ ( \
" fnsave %0\n" \
" fwait\n" \
: "=m" (fpu_state) ) \
#define restore_fpu_state() \
__asm__ __volatile__ ( \
" frstor %0\n" \
: "=m" (fpu_state) ); \
#if defined(MMXCOPY) /* The performance beats memcpy by just 5-10% */
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
void *p;
int i;
p = to;
i = len >> 6; /* len/64 */
if( len > 63 ) {
save_fpu_state(); /* Store the FPU state */
/* Prefetch 5 64-byte cache lines??? */
__asm__ __volatile__ (
"1: prefetcht0 (%0)\n"
" prefetcht0 64(%0)\n"
" prefetcht0 128(%0)\n"
" prefetcht0 192(%0)\n"
" prefetcht0 256(%0)\n"
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n"
" jmp 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b, 3b\n"
".previous"
: : "r" (from) );
/* Prefetch cache line then copy 64 bytes */
for(; i>5; i--) {
__asm__ __volatile__ (
"1: prefetcht0 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq %%mm0, (%1)\n"
" movq %%mm1, 8(%1)\n"
" movq %%mm2, 16(%1)\n"
" movq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm0\n"
" movq 40(%0), %%mm1\n"
" movq 48(%0), %%mm2\n"
" movq 56(%0), %%mm3\n"
" movq %%mm0, 32(%1)\n"
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b, 3b\n"
".previous"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
/* Move the last 5 blocks (5*64=320 Bytes) without prefetch */
for(; i>0; i--) {
__asm__ __volatile__ (
" movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq %%mm0, (%1)\n"
" movq %%mm1, 8(%1)\n"
" movq %%mm2, 16(%1)\n"
" movq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm0\n"
" movq 40(%0), %%mm1\n"
" movq 48(%0), %%mm2\n"
" movq 56(%0), %%mm3\n"
" movq %%mm0, 32(%1)\n"
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
restore_fpu_state(); /* Restore the FPU state */
}
/* Now do the tail of the block */
memcpy(to, from, len&63);
return p;
}
#else
int fast_page_copy( void *to, void *from, int nbytes)
{
char *dest = (char *) to, *src = (char *) from;
if( nbytes >= 4096 ) {
save_fpu_state();
while( nbytes >= 4096 ) {
fast_copy_page(dest, src);
src += 4096;
dest += 4096;
nbytes -= 4096;
}
restore_fpu_state();
}
if( nbytes > 0 ) memcpy( dest, src, nbytes);
}
#ifdef CONFIG_MK7
int fast_copy_page(void *to, void *from)
{
int i;
/* kernel_fpu_begin();*/
/* maybe the prefetcht0 stuff can go before the expensive fnsave...
* but that is for later. -AV */
__asm__ __volatile__ (
"1: prefetcht0 (%0)\n"
" prefetcht0 64(%0)\n"
" prefetcht0 128(%0)\n"
" prefetcht0 192(%0)\n"
" prefetcht0 256(%0)\n"
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b, 3b\n"
".previous"
: : "r" (from) );
for(i=0; i<(4096-320)/64; i++)
{
__asm__ __volatile__ (
"1: prefetcht0 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b, 3b\n"
".previous"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
for(i=(4096-320)/64; i<4096/64; i++)
{
__asm__ __volatile__ (
"2: movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
/* since movntq is weakly-ordered, a "sfence" is needed to become
* ordered again. */
__asm__ __volatile__ (
" sfence \n" : :
);
}
#else
/* Generic MMX implementation without K7 specific streaming */
int fast_copy_page(void *to, void *from)
{

NetPIPE-3.7.2:
#if COPY_TYPE == WACOPY

NetPIPE_4.x:
int i;
__asm__ __volatile__ (
"1: prefetcht0 (%0)\n"
" prefetcht0 64(%0)\n"
" prefetcht0 128(%0)\n"
" prefetcht0 192(%0)\n"
" prefetcht0 256(%0)\n"
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b, 3b\n"
".previous"
: : "r" (from) );
for(i=0; i<4096/64; i++)
{
__asm__ __volatile__ (
"1: prefetcht0 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq %%mm0, (%1)\n"
" movq %%mm1, 8(%1)\n"
" movq %%mm2, 16(%1)\n"
" movq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm0\n"
" movq 40(%0), %%mm1\n"
" movq 48(%0), %%mm2\n"
" movq 56(%0), %%mm3\n"
" movq %%mm0, 32(%1)\n"
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b, 3b\n"
".previous"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
}
#endif /* k7 or Intel */
#endif /* PAGECOPY */
#endif /* optimized copy type */
void opt_memcpy(void *to, const void *from, int nbytes)
{
char *dest = (char *) to, *src = (char *) from;
#if defined( NTCOPY )
ntcopy(dest, src, nbytes);
#elif defined( AMDCOPY )
NetPIPE_4.x:
amdcopy(dest, src, nbytes);
#elif defined( PAGECOPY )
fast_page_copy(dest, src, nbytes);
#elif defined( MMXCOPY )
_mmx_memcpy(dest, src, nbytes);

NetPIPE-3.7.2:
memcpy_16(dst, (void *)src, nbytes);
#elif COPY_TYPE == NTCOPY
ntcopy(dst, src, nbytes);
#elif COPY_TYPE == CBCOPY
if (nbytes > LONGMSGSIZE)
ntcopy(dst, src, nbytes);
else
memcpy_16(dst, src, nbytes);

#endif
}
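The routine that ends here is the module's public entry point (opt_memcpy() in NetPIPE_4.x, MP_memcpy() in NetPIPE-3.7.2); with COPY_TYPE set to CBCOPY, messages larger than LONGMSGSIZE go through the nontemporal path and everything else through memcpy_16(). A hedged usage sketch against the 3.7.2 interface; the buffer sizes are illustrative:

    /* Link together with opt_memcpy.c from NetPIPE-3.7.2. */
    #include <stdlib.h>
    #include <string.h>

    void MP_memcpy(void *dst, const void *src, int nbytes);

    int main(void)
    {
        int   n   = 1 << 20;                 /* 1 MB, well above LONGMSGSIZE */
        char *src = malloc(n), *dst = malloc(n);

        memset(src, 0xab, n);
        MP_memcpy(dst, src, n);              /* long message: nontemporal copy */
        MP_memcpy(dst, src, 1024);           /* short message: memcpy_16 path */

        free(src);
        free(dst);
        return 0;
    }

With CBCOPY selected, the first call takes the ntcopy() path while the second stays in memcpy_16(), the write-allocate path that keeps small messages in cache.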
End of changes. 26 change blocks; 388 lines changed or deleted, 135 lines changed or added.
