27
27
if (ps->image) free(ps->image);
31
// Needs an extra 32 bytes at the end of the array; this is do... (rest of comment truncated)
32
static inline void reduct_ssse3(unsigned char *dst, unsigned char *src, int size) {
34
volatile int j; // to prevent reduction of the loop
37
ALGNW char shuf[64] ALGNL;
39
char *shuf2 = shuf+16;
40
char *shuf3 = shuf+32;
41
uint16_t *mul = (uint16_t*)(shuf+48);
43
memset(shuf, 0xFF, 48);
51
shuf3[i + 8] = 2 * i + 2;
57
long aligned_size = calc_alloc(size, 32);
58
long val = -aligned_size;
59
unsigned char *src2 = src + 4 * aligned_size;
60
unsigned char *dst2 = dst + aligned_size;
62
__asm__ __volatile__ (
63
"prefetchnta (%1) \n\t"
65
"movdqa (%3), %%xmm15 \n\t"
66
"movdqa 16(%3), %%xmm14 \n\t"
67
"movdqa 32(%3), %%xmm13 \n\t"
68
"movdqa 48(%3), %%xmm12 \n\t"
70
"movdqa (%1,%2,4), %%xmm0 \n\t"
71
"movdqa 16(%1,%2,4), %%xmm1 \n\t"
73
"prefetchnta 64(%1) \n\t"
75
"movdqa %%xmm0, %%xmm5 \n\t"
76
"pshufb %%xmm15, %%xmm0 \n\t"
78
"movdqa %%xmm1, %%xmm6 \n\t"
79
"pshufb %%xmm15, %%xmm1 \n\t"
81
"movdqa 32(%1,%2,4), %%xmm2 \n\t"
82
"movdqa 48(%1,%2,4), %%xmm3 \n\t"
84
"pshufb %%xmm14, %%xmm5 \n\t"
85
"pshufb %%xmm13, %%xmm6 \n\t"
86
"por %%xmm6, %%xmm5 \n\t"
89
"phaddw %%xmm1, %%xmm0 \n\t"
90
"paddw %%xmm5, %%xmm0 \n\t"
91
"pmulhrsw %%xmm12, %%xmm0 \n\t"
94
"movdqa %%xmm2, %%xmm5 \n\t"
95
"pshufb %%xmm15, %%xmm2 \n\t"
96
"movdqa %%xmm3, %%xmm6 \n\t"
97
"pshufb %%xmm15, %%xmm3 \n\t"
99
"movdqa 64(%1,%2,4), %%xmm7 \n\t"
100
"movdqa 80(%1,%2,4), %%xmm8 \n\t"
102
"pshufb %%xmm14, %%xmm5 \n\t"
103
"pshufb %%xmm13, %%xmm6 \n\t"
104
"por %%xmm6, %%xmm5 \n\t"
106
"phaddw %%xmm3, %%xmm2 \n\t"
107
"paddw %%xmm5, %%xmm2 \n\t"
108
"pmulhrsw %%xmm12, %%xmm2 \n\t"
111
"packuswb %%xmm2, %%xmm0 \n\t"
112
"movntps %%xmm0, (%0,%2) \n\t"
114
"prefetchnta 128(%1) \n\t"
116
"movdqa %%xmm7, %%xmm5 \n\t"
117
"pshufb %%xmm15, %%xmm7 \n\t"
118
"movdqa %%xmm8, %%xmm6 \n\t"
119
"pshufb %%xmm15, %%xmm8 \n\t"
121
"movdqa 96(%1,%2,4), %%xmm9 \n\t"
122
"movdqa 112(%1,%2,4), %%xmm10 \n\t"
124
"pshufb %%xmm14, %%xmm5 \n\t"
125
"pshufb %%xmm13, %%xmm6 \n\t"
126
"por %%xmm6, %%xmm5 \n\t"
128
"phaddw %%xmm8, %%xmm7 \n\t"
129
"paddw %%xmm5, %%xmm7 \n\t"
130
"pmulhrsw %%xmm12, %%xmm7 \n\t"
133
"movdqa %%xmm9, %%xmm5 \n\t"
134
"pshufb %%xmm15, %%xmm9 \n\t"
135
"movdqa %%xmm10, %%xmm6 \n\t"
136
"pshufb %%xmm15, %%xmm10 \n\t"
138
"pshufb %%xmm14, %%xmm5 \n\t"
139
"pshufb %%xmm13, %%xmm6 \n\t"
140
"por %%xmm6, %%xmm5 \n\t"
142
"movdqa 128(%1,%2,4), %%xmm0 \n\t"
143
"movdqa 144(%1,%2,4), %%xmm1 \n\t"
145
"phaddw %%xmm10, %%xmm9 \n\t"
146
"paddw %%xmm5, %%xmm9 \n\t"
147
"pmulhrsw %%xmm12, %%xmm9 \n\t"
150
"packuswb %%xmm9, %%xmm7 \n\t"
151
"movntps %%xmm7, 16(%0,%2) \n\t"
158
: "r" (dst2), "r" (src2), "r" (val), "r" (&shuf)
30
164
int dictReduceImage(DICTContext ps, void *img, int width, int height) {
31
165
int size = width * height;
32
166
unsigned char *res = ps->image;
39
173
#ifdef DICT_TIFF_SUPPORT
40
174
case DICT_IMAGE_TIFF:
41
175
if (ps->matlab_mode) {
42
177
for (int i = 0; i < size; ++i) {
43
178
int col = i % height;
44
179
int lin = i / height;
45
int pos = 4*(col * width + lin);
46
res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);
180
res[i] = rintf(((float)(image[pos] + image[pos+1] + image[pos+2]))/3);
184
for (int col = 0; col < height; col++) {
185
for (int lin = 0; lin < width; lin++) {
186
int i = lin*height + col;
187
int pos = 4*(col * width + lin);
188
unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
189
res[i] = (90 + sum + (sum>>8)) >> 8;
194
// We do this to benefit from CPU caches
195
int aligned_width = (width / TRANSPOSE_SIZE) * TRANSPOSE_SIZE;
196
int aligned_height = (height / TRANSPOSE_SIZE) * TRANSPOSE_SIZE;
198
for (int col = 0; col < aligned_height; col+=TRANSPOSE_SIZE) {
199
for (int lin = 0; lin < aligned_width; lin+=TRANSPOSE_SIZE) {
201
for (int col2 = 0; col2 < TRANSPOSE_SIZE; col2++) {
202
for (int lin2 = 0; lin2 < TRANSPOSE_SIZE; lin2++) {
203
int i = (lin+lin2)*height + (col+col2);
204
int pos = 4*((col+col2) * width + (lin+lin2));
206
unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
207
res[i] = (90 + sum + (sum>>8)) >> 8;
213
for (int col = aligned_height; col < height; col++) {
214
for (int lin = aligned_width; lin < width; lin++) {
215
int i = lin*height + col;
216
int pos = 4*(col * width + lin);
217
unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
218
res[i] = (90 + sum + (sum>>8)) >> 8;
223
reduct_ssse3(res, image, size);
49
225
for (int i = 0; i < size; ++i) {
51
res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);
227
unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
228
res[i] = (90 + sum + (sum>>8)) >> 8;
229
// res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);
55
234
#endif /* DICT_TIFF_SUPPORT */