bzr branch
http://suren.me/webbzr/alps/pcitool
330
by Suren A. Chilingaryan
Support for 64-bit registes |
1 |
#define _POSIX_C_SOURCE 200112L
|
2 |
#define _GNU_SOURCE
|
|
3 |
||
4 |
#include <stdio.h> |
|
5 |
#include <string.h> |
|
6 |
#include <unistd.h> |
|
7 |
#include <stdint.h> |
|
8 |
#include <assert.h> |
|
9 |
#include <ctype.h> |
|
10 |
#include <time.h> |
|
11 |
#include <sched.h> |
|
12 |
#include <arpa/inet.h> |
|
13 |
#include <sys/time.h> |
|
14 |
||
15 |
#include "cpu.h" |
|
16 |
#include "pci.h" |
|
17 |
#include "tools.h" |
|
18 |
#include "error.h" |
|
19 |
||
20 |
||
21 |
/*
|
|
22 |
void *memcpy128(void * dst, void const * src, size_t len) {
|
|
23 |
||
24 |
long pos = - (len>>2);
|
|
25 |
char * plDst = (char *) dst - 4 * pos;
|
|
26 |
char const * plSrc = (char const *) src - 4 * pos;
|
|
27 |
||
28 |
if (pos) {
|
|
29 |
__asm__ __volatile__ (
|
|
30 |
"1: \n\t"
|
|
31 |
"mov (%0,%2,4), %%edi \n\t"
|
|
32 |
"mov %%edi, (%1,%2,4) \n\t"
|
|
33 |
"inc %2 \n\t"
|
|
34 |
"jnz 1b \n\t"
|
|
35 |
:
|
|
36 |
: "r" (plSrc), "r" (plDst), "r" (pos)
|
|
37 |
: "%edi"
|
|
38 |
);
|
|
39 |
}
|
|
40 |
||
41 |
||
42 |
||
43 |
long pos = - ((len>>4)<<4);
|
|
44 |
char * plDst = (char *) dst - pos;
|
|
45 |
char const * plSrc = (char const *) src - pos;
|
|
46 |
||
47 |
if (pos) {
|
|
48 |
__asm__ __volatile__ (
|
|
49 |
"1: \n\t"
|
|
50 |
// "movdqa (%0,%2), %%xmm0 \n\t"
|
|
51 |
"mov (%0,%2), %%esi \n\t"
|
|
52 |
"movd %%esi, %%xmm0 \n\t"
|
|
53 |
"mov 4(%0,%2), %%esi \n\t"
|
|
54 |
"movd %%esi, %%xmm1 \n\t"
|
|
55 |
"mov 8(%0,%2), %%esi \n\t"
|
|
56 |
"movd %%esi, %%xmm2 \n\t"
|
|
57 |
"mov 12(%0,%2), %%esi \n\t"
|
|
58 |
"movd %%esi, %%xmm3 \n\t"
|
|
59 |
"pslldq $4, %%xmm1 \n\t"
|
|
60 |
"por %%xmm1, %%xmm0 \n\t"
|
|
61 |
"pslldq $8, %%xmm2 \n\t"
|
|
62 |
"por %%xmm2, %%xmm0 \n\t"
|
|
63 |
"pslldq $12, %%xmm3 \n\t"
|
|
64 |
"por %%xmm3, %%xmm0 \n\t"
|
|
65 |
|
|
66 |
"movntdq %%xmm0, (%1,%2) \n\t"
|
|
67 |
"add $16, %2 \n\t"
|
|
68 |
"jnz 1b \n\t"
|
|
69 |
:
|
|
70 |
: "r" (plSrc), "r" (plDst), "r" (pos)
|
|
71 |
: "%rsi"
|
|
72 |
);
|
|
73 |
}
|
|
74 |
||
75 |
||
76 |
||
77 |
len &= 0x3;
|
|
78 |
||
79 |
char * pcDst = (char *) plDst;
|
|
80 |
char const * pcSrc = (char const *) plSrc;
|
|
81 |
||
82 |
while (len--) {
|
|
83 |
*pcDst++ = *pcSrc++;
|
|
84 |
}
|
|
85 |
||
86 |
return (dst);
|
|
87 |
}
|
|
88 |
*/
|
|
89 |
||
341
by Suren A. Chilingaryan
Fix AVX memory copy |
90 |
/**
 * Copy memory in 512-byte chunks using AVX loads and non-temporal stores.
 *
 * Copies (size / 512) * 512 bytes from src to dst; any remainder below
 * 512 bytes is NOT copied — callers (see pcilib_pagecpy) pass a multiple
 * of the 4096-byte page size.
 *
 * @param dst   destination buffer; must be 32-byte aligned (vmovdqa/vmovntps fault otherwise)
 * @param src   source buffer; must be 32-byte aligned
 * @param size  number of bytes to copy; should be a multiple of 512
 */
void pcilib_memcpy4k_avx(void *dst, const void *src, size_t size) {
    size_t sse_size = (size / 512);

	// The asm loop below is do-while style (dec; jnz): entering it with a
	// zero counter would underflow and copy ~2^64 bytes. Bail out early.
    if (!sse_size) return;

    __asm__ __volatile__ (
	"push %2			\n\t"
	"mov $0, %%rax			\n\t"

	"1:				\n\t"

	"vmovdqa (%1,%%rax), %%ymm0	\n\t"
	"vmovdqa 32(%1,%%rax), %%ymm1	\n\t"
	"vmovdqa 64(%1,%%rax), %%ymm2	\n\t"
	"vmovdqa 96(%1,%%rax), %%ymm3	\n\t"
	"vmovdqa 128(%1,%%rax), %%ymm4	\n\t"
	"vmovdqa 160(%1,%%rax), %%ymm5	\n\t"
	"vmovdqa 192(%1,%%rax), %%ymm6	\n\t"
	"vmovdqa 224(%1,%%rax), %%ymm7	\n\t"

	"vmovdqa 256(%1,%%rax), %%ymm8	\n\t"
	"vmovdqa 288(%1,%%rax), %%ymm9	\n\t"
	"vmovdqa 320(%1,%%rax), %%ymm10	\n\t"
	"vmovdqa 352(%1,%%rax), %%ymm11	\n\t"
	"vmovdqa 384(%1,%%rax), %%ymm12	\n\t"
	"vmovdqa 416(%1,%%rax), %%ymm13	\n\t"
	"vmovdqa 448(%1,%%rax), %%ymm14	\n\t"
	"vmovdqa 480(%1,%%rax), %%ymm15	\n\t"

	"vmovdqa %%ymm0, (%0,%%rax)	\n\t"
	"vmovdqa %%ymm1, 32(%0,%%rax)	\n\t"
	"vmovntps %%ymm2, 64(%0,%%rax)	\n\t"
	"vmovntps %%ymm3, 96(%0,%%rax)	\n\t"
	"vmovntps %%ymm4, 128(%0,%%rax)	\n\t"
	"vmovntps %%ymm5, 160(%0,%%rax)	\n\t"
	"vmovntps %%ymm6, 192(%0,%%rax)	\n\t"
	"vmovntps %%ymm7, 224(%0,%%rax)	\n\t"

	"vmovntps %%ymm8, 256(%0,%%rax)	\n\t"
	"vmovntps %%ymm9, 288(%0,%%rax)	\n\t"
	"vmovntps %%ymm10, 320(%0,%%rax)	\n\t"
	"vmovntps %%ymm11, 352(%0,%%rax)	\n\t"
	"vmovntps %%ymm12, 384(%0,%%rax)	\n\t"
	"vmovntps %%ymm13, 416(%0,%%rax)	\n\t"
	"vmovntps %%ymm14, 448(%0,%%rax)	\n\t"
	"vmovntps %%ymm15, 480(%0,%%rax)	\n\t"

	"add $512, %%rax		\n\t"
	"dec %2				\n\t"
	"jnz 1b				\n\t"
	"pop %2				\n\t"

	// mfence makes the non-temporal (weakly-ordered) stores globally
	// visible before the function returns.
	"mfence"
    :
	// Was "p" (dst), "p" (src): "p" is a valid-address constraint and is
	// unreliable for operands used as base registers in addressing modes;
	// "r" forces them into general registers as the asm requires.
    : "r" (dst), "r" (src), "r" (sse_size)
	// "memory": the asm reads *src and writes *dst behind the compiler's
	// back — without it the compiler may cache values across the copy or
	// reorder surrounding accesses. "cc": dec modifies the flags.
    : "%rax", "memory", "cc"
    );
}
|
|
146 |
||
341
by Suren A. Chilingaryan
Fix AVX memory copy |
147 |
/**
 * Copy memory, dispatching to the AVX fast path when it is usable.
 *
 * The AVX path is taken only when pcilib_get_cpu_gen() reports a
 * sufficiently recent CPU (> 3), the size is a whole number of 4096-byte
 * pages, and both buffers are 32-byte aligned; otherwise plain memcpy
 * is used.
 *
 * @param dst   destination buffer
 * @param src   source buffer
 * @param size  number of bytes to copy
 */
void pcilib_pagecpy(void *dst, const void *src, size_t size) {
    int cpu_gen = pcilib_get_cpu_gen();

    int whole_pages = ((size % 4096) == 0);
    int avx_aligned = (((uintptr_t)dst % 32) == 0) && (((uintptr_t)src % 32) == 0);

    if ((cpu_gen > 3) && whole_pages && avx_aligned) {
	pcilib_memcpy4k_avx(dst, src, size);
    } else {
	memcpy(dst, src, size);
    }
}
|