summaryrefslogtreecommitdiffstats
path: root/pcilib/pagecpy.c
diff options
context:
space:
mode:
Diffstat (limited to 'pcilib/pagecpy.c')
-rw-r--r--pcilib/pagecpy.c153
1 files changed, 153 insertions, 0 deletions
diff --git a/pcilib/pagecpy.c b/pcilib/pagecpy.c
new file mode 100644
index 0000000..f474f9f
--- /dev/null
+++ b/pcilib/pagecpy.c
@@ -0,0 +1,153 @@
+#define _POSIX_C_SOURCE 200112L
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <assert.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <arpa/inet.h>
+#include <sys/time.h>
+
+#include "cpu.h"
+#include "pci.h"
+#include "tools.h"
+#include "error.h"
+
+
+/* Disabled experiments: alternative memcpy128() copy loops kept for reference only (not compiled).
+void *memcpy128(void * dst, void const * src, size_t len) {
+
+ long pos = - (len>>2);
+ char * plDst = (char *) dst - 4 * pos;
+ char const * plSrc = (char const *) src - 4 * pos;
+
+ if (pos) {
+ __asm__ __volatile__ (
+ "1: \n\t"
+ "mov (%0,%2,4), %%edi \n\t"
+ "mov %%edi, (%1,%2,4) \n\t"
+ "inc %2 \n\t"
+ "jnz 1b \n\t"
+ :
+ : "r" (plSrc), "r" (plDst), "r" (pos)
+ : "%edi"
+ );
+ }
+
+
+
+ long pos = - ((len>>4)<<4);
+ char * plDst = (char *) dst - pos;
+ char const * plSrc = (char const *) src - pos;
+
+ if (pos) {
+ __asm__ __volatile__ (
+ "1: \n\t"
+// "movdqa (%0,%2), %%xmm0 \n\t"
+ "mov (%0,%2), %%esi \n\t"
+ "movd %%esi, %%xmm0 \n\t"
+ "mov 4(%0,%2), %%esi \n\t"
+ "movd %%esi, %%xmm1 \n\t"
+ "mov 8(%0,%2), %%esi \n\t"
+ "movd %%esi, %%xmm2 \n\t"
+ "mov 12(%0,%2), %%esi \n\t"
+ "movd %%esi, %%xmm3 \n\t"
+ "pslldq $4, %%xmm1 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "pslldq $8, %%xmm2 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "pslldq $12, %%xmm3 \n\t"
+ "por %%xmm3, %%xmm0 \n\t"
+
+ "movntdq %%xmm0, (%1,%2) \n\t"
+ "add $16, %2 \n\t"
+ "jnz 1b \n\t"
+ :
+ : "r" (plSrc), "r" (plDst), "r" (pos)
+ : "%rsi"
+ );
+ }
+
+
+
+ len &= 0x3;
+
+ char * pcDst = (char *) plDst;
+ char const * pcSrc = (char const *) plSrc;
+
+ while (len--) {
+ *pcDst++ = *pcSrc++;
+ }
+
+ return (dst);
+}
+*/
+
+/**
+ * Copy a buffer using 256-bit AVX loads and non-temporal (streaming) stores.
+ *
+ * The data is moved in 512-byte blocks: sixteen aligned `vmovdqa` loads from
+ * src followed by sixteen `vmovntps` stores to dst. A remainder of
+ * (size % 512) bytes, if any, is copied with memcpy().
+ *
+ * Requirements: dst and src must both be 32-byte aligned (the aligned vector
+ * moves fault otherwise), the CPU must support AVX, and the two regions must
+ * not overlap.
+ *
+ * @param dst  destination buffer
+ * @param src  source buffer
+ * @param size number of bytes to copy
+ */
+void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) {
+    size_t blocks = size / 512;
+    size_t tail = size % 512;
+    size_t offset = 0;
+
+    /* The loop below is a do/while counting down with dec/jnz, so it must
+     * never be entered with a zero block count. */
+    if (blocks) {
+        __asm__ __volatile__ (
+            "1: \n\t"
+
+            /* Load one 512-byte block from the source... */
+            "vmovdqa (%[src],%[off]), %%ymm0 \n\t"
+            "vmovdqa 32(%[src],%[off]), %%ymm1 \n\t"
+            "vmovdqa 64(%[src],%[off]), %%ymm2 \n\t"
+            "vmovdqa 96(%[src],%[off]), %%ymm3 \n\t"
+            "vmovdqa 128(%[src],%[off]), %%ymm4 \n\t"
+            "vmovdqa 160(%[src],%[off]), %%ymm5 \n\t"
+            "vmovdqa 192(%[src],%[off]), %%ymm6 \n\t"
+            "vmovdqa 224(%[src],%[off]), %%ymm7 \n\t"
+
+            "vmovdqa 256(%[src],%[off]), %%ymm8 \n\t"
+            "vmovdqa 288(%[src],%[off]), %%ymm9 \n\t"
+            "vmovdqa 320(%[src],%[off]), %%ymm10 \n\t"
+            "vmovdqa 352(%[src],%[off]), %%ymm11 \n\t"
+            "vmovdqa 384(%[src],%[off]), %%ymm12 \n\t"
+            "vmovdqa 416(%[src],%[off]), %%ymm13 \n\t"
+            "vmovdqa 448(%[src],%[off]), %%ymm14 \n\t"
+            "vmovdqa 480(%[src],%[off]), %%ymm15 \n\t"
+
+            /* ...and stream it to the destination, bypassing the cache. */
+            "vmovntps %%ymm0, (%[dst],%[off]) \n\t"
+            "vmovntps %%ymm1, 32(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm2, 64(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm3, 96(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm4, 128(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm5, 160(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm6, 192(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm7, 224(%[dst],%[off]) \n\t"
+
+            "vmovntps %%ymm8, 256(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm9, 288(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm10, 320(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm11, 352(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm12, 384(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm13, 416(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm14, 448(%[dst],%[off]) \n\t"
+            "vmovntps %%ymm15, 480(%[dst],%[off]) \n\t"
+
+            "add $512, %[off] \n\t"
+            "dec %[cnt] \n\t"
+            "jnz 1b \n\t"
+
+            /* Order the weakly-ordered streaming stores before later stores. */
+            "sfence"
+            /* Both counters are modified by the loop, so they are declared
+             * read-write instead of being saved with push/pop (touching the
+             * stack pointer inside inline asm corrupts the red zone). */
+            : [off] "+r" (offset), [cnt] "+r" (blocks)
+            : [src] "r" (src), [dst] "r" (dst)
+            /* xmmN names the low half of ymmN, i.e. the same register. */
+            : "memory", "cc",
+              "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+              "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+        );
+    }
+
+    /* After the loop, offset is the number of bytes already copied. */
+    if (tail) {
+        memcpy((char *)dst + offset, (char *)src + offset, tail);
+    }
+}
+
+/**
+ * Copy size bytes from src to dst, using the AVX streaming copy when possible.
+ *
+ * The AVX routine is chosen only when pcilib_get_cpu_gen() returns a value
+ * above 3 (presumably the first generation with AVX support -- confirm
+ * against cpu.h), size is a non-zero multiple of 4096 and both pointers are
+ * 32-byte aligned. Every other request is served by memcpy().
+ *
+ * @param dst  destination buffer
+ * @param src  source buffer
+ * @param size number of bytes to copy
+ */
+void pcilib_pagecpy(void *dst, void *src, size_t size) {
+    int gen = pcilib_get_cpu_gen();
+    int aligned = ((uintptr_t)dst % 32 == 0) && ((uintptr_t)src % 32 == 0);
+
+    /* Zero-length requests go to memcpy(): 0 % 4096 == 0 would otherwise
+     * send an empty copy into the block-oriented AVX routine. */
+    if ((gen > 3) && (size > 0) && (size % 4096 == 0) && aligned) {
+        pcilib_memcpy4k_avx(dst, src, size);
+    } else {
+        memcpy(dst, src, size);
+    }
+}