diff options
| author | Suren A. Chilingaryan <csa@suren.me> | 2015-11-20 05:55:36 +0100 | 
|---|---|---|
| committer | Suren A. Chilingaryan <csa@suren.me> | 2015-11-20 05:55:36 +0100 | 
| commit | 2bda41263f2464c271509b0bd9ea9062c239d851 (patch) | |
| tree | a3f96dea1c86ecef4f822bff4f94faa133497e11 | |
| parent | 8ee679f837aed09f2abe5d47186505f98ccb4b6d (diff) | |
Fix AVX memory copy
| -rw-r--r-- | pcilib/pagecpy.c | 78 | ||||
| -rw-r--r-- | pcilib/pagecpy.h | 2 | 
2 files changed, 40 insertions, 40 deletions
| diff --git a/pcilib/pagecpy.c b/pcilib/pagecpy.c index f474f9f..c1e7bbd 100644 --- a/pcilib/pagecpy.c +++ b/pcilib/pagecpy.c @@ -87,7 +87,7 @@ void *memcpy128(void * dst, void const * src, size_t len) {  }   */ -void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) { +void pcilib_memcpy4k_avx(void *dst, const void *src, size_t size) {      size_t sse_size = (size / 512);      __asm__ __volatile__ ( @@ -96,57 +96,57 @@ void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) {              "1:					\n\t" -            "vmovdqa 	   (%0,%%rax), %%ymm0	\n\t" -            "vmovdqa 	 32(%0,%%rax), %%ymm1	\n\t" -            "vmovdqa 	 64(%0,%%rax), %%ymm2	\n\t" -            "vmovdqa 	 96(%0,%%rax), %%ymm3	\n\t" -            "vmovdqa 	128(%0,%%rax), %%ymm4	\n\t" -            "vmovdqa 	160(%0,%%rax), %%ymm5	\n\t" -            "vmovdqa 	192(%0,%%rax), %%ymm6	\n\t" -            "vmovdqa 	224(%0,%%rax), %%ymm7	\n\t" - -            "vmovdqa 	256(%0,%%rax), %%ymm8	\n\t" -            "vmovdqa 	288(%0,%%rax), %%ymm9	\n\t" -            "vmovdqa 	320(%0,%%rax), %%ymm10	\n\t" -            "vmovdqa 	352(%0,%%rax), %%ymm11	\n\t" -            "vmovdqa 	384(%0,%%rax), %%ymm12	\n\t" -            "vmovdqa 	416(%0,%%rax), %%ymm13	\n\t" -            "vmovdqa 	448(%0,%%rax), %%ymm14	\n\t" -            "vmovdqa 	480(%0,%%rax), %%ymm15	\n\t" - -            "vmovntps	%%ymm0,    (%1,%%rax)	\n\t" -            "vmovntps	%%ymm1,  32(%1,%%rax)	\n\t" -            "vmovntps	%%ymm2,  64(%1,%%rax)	\n\t" -            "vmovntps	%%ymm3,  96(%1,%%rax)	\n\t" -            "vmovntps	%%ymm4, 128(%1,%%rax)	\n\t" -            "vmovntps	%%ymm5, 160(%1,%%rax)	\n\t" -            "vmovntps	%%ymm6, 192(%1,%%rax)	\n\t" -            "vmovntps	%%ymm7, 224(%1,%%rax)	\n\t" - -            "vmovntps	%%ymm8,  256(%1,%%rax)	\n\t" -            "vmovntps	%%ymm9,  288(%1,%%rax)	\n\t" -            "vmovntps	%%ymm10, 320(%1,%%rax)	\n\t" -            "vmovntps	%%ymm11, 352(%1,%%rax)	\n\t" -            "vmovntps	%%ymm12, 384(%1,%%rax)	\n\t" -            "vmovntps	%%ymm13, 416(%1,%%rax)	\n\t" -            "vmovntps	%%ymm14, 448(%1,%%rax)	\n\t" -            "vmovntps	%%ymm15, 480(%1,%%rax)	\n\t" +            "vmovdqa 	   (%1,%%rax), %%ymm0	\n\t" +            "vmovdqa 	 32(%1,%%rax), %%ymm1	\n\t" +            "vmovdqa 	 64(%1,%%rax), %%ymm2	\n\t" +            "vmovdqa 	 96(%1,%%rax), %%ymm3	\n\t" +            "vmovdqa 	128(%1,%%rax), %%ymm4	\n\t" +            "vmovdqa 	160(%1,%%rax), %%ymm5	\n\t" +            "vmovdqa 	192(%1,%%rax), %%ymm6	\n\t" +            "vmovdqa 	224(%1,%%rax), %%ymm7	\n\t" + +            "vmovdqa 	256(%1,%%rax), %%ymm8	\n\t" +            "vmovdqa 	288(%1,%%rax), %%ymm9	\n\t" +            "vmovdqa 	320(%1,%%rax), %%ymm10	\n\t" +            "vmovdqa 	352(%1,%%rax), %%ymm11	\n\t" +            "vmovdqa 	384(%1,%%rax), %%ymm12	\n\t" +            "vmovdqa 	416(%1,%%rax), %%ymm13	\n\t" +            "vmovdqa 	448(%1,%%rax), %%ymm14	\n\t" +            "vmovdqa 	480(%1,%%rax), %%ymm15	\n\t" + +            "vmovdqa	%%ymm0,    (%0,%%rax)	\n\t" +            "vmovdqa	%%ymm1,  32(%0,%%rax)	\n\t" +            "vmovntps	%%ymm2,  64(%0,%%rax)	\n\t" +            "vmovntps	%%ymm3,  96(%0,%%rax)	\n\t" +            "vmovntps	%%ymm4, 128(%0,%%rax)	\n\t" +            "vmovntps	%%ymm5, 160(%0,%%rax)	\n\t" +            "vmovntps	%%ymm6, 192(%0,%%rax)	\n\t" +            "vmovntps	%%ymm7, 224(%0,%%rax)	\n\t" + +            "vmovntps	%%ymm8,  256(%0,%%rax)	\n\t" +            "vmovntps	%%ymm9,  288(%0,%%rax)	\n\t" +            "vmovntps	%%ymm10, 320(%0,%%rax)	\n\t" +            "vmovntps	%%ymm11, 352(%0,%%rax)	\n\t" +            "vmovntps	%%ymm12, 384(%0,%%rax)	\n\t" +            "vmovntps	%%ymm13, 416(%0,%%rax)	\n\t" +            "vmovntps	%%ymm14, 448(%0,%%rax)	\n\t" +            "vmovntps	%%ymm15, 480(%0,%%rax)	\n\t"              "add	$512, %%rax		\n\t"              "dec	%2			\n\t"              "jnz 	1b			\n\t"              "pop 	%2			\n\t" -            "sfence" +            "mfence"      :      : "p" (dst), "p" (src), "r" (sse_size)      : "%rax"          );  } -void pcilib_pagecpy(void *dst, void *src, size_t size) { +void pcilib_pagecpy(void *dst, const void *src, size_t size) {      int gen = pcilib_get_cpu_gen(); -    if ((gen > 3)&&(size%4096==0)&&((uintptr_t)dst%32==0)&&((uintptr_t)src%32==0)) { +    if ((gen > 3)&&((size%4096)==0)&&(((uintptr_t)dst%32)==0)&&(((uintptr_t)src%32)==0)) {  	pcilib_memcpy4k_avx(dst, src, size);      } else  	memcpy(dst, src, size); diff --git a/pcilib/pagecpy.h b/pcilib/pagecpy.h index ef8636b..4bcf505 100644 --- a/pcilib/pagecpy.h +++ b/pcilib/pagecpy.h @@ -20,7 +20,7 @@ extern "C" {   * @param[in] size - size of memory region in bytes.   * @return - `dst` or NULL on error   */ -void pcilib_pagecpy(void *dst, void *src, size_t size); +void pcilib_pagecpy(void *dst, const void *src, size_t size);  #ifdef __cplusplus  } | 
