/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/dict_image.cpp

  • Committer: Suren A. Chilingaryan
  • Date: 2009-12-15 19:23:39 UTC
  • Revision ID: csa@dside.dyndns.org-20091215192339-lzuciep3c2u99uc0
Optimize image reduction

Show diffs side-by-side

added added

removed removed

Lines of Context:
27
27
    if (ps->image) free(ps->image);
28
28
}
29
29
 
 
30
#ifdef USE_SSE
 
31
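    // src needs 32 extra readable bytes past the end of the processed range: the loop preloads the first 32 bytes of the next iteration before the exit test (this is do20)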
 
32
static inline void reduct_ssse3(unsigned char *dst, unsigned char *src, int size) {
 
33
    int i;
 
34
    volatile int j;             // to prevent reduction of the loop
 
35
    unsigned char *end;
 
36
 
 
37
    ALGNW char shuf[64] ALGNL;
 
38
    char *shuf1 = shuf;
 
39
    char *shuf2 = shuf+16;
 
40
    char *shuf3 = shuf+32;
 
41
    uint16_t *mul = (uint16_t*)(shuf+48);
 
42
 
 
43
    memset(shuf, 0xFF, 48);
 
44
    for (i=0;i<16;i+=4) {
 
45
        shuf1[i] = i;
 
46
        shuf1[i+2] = i + 1;
 
47
    }
 
48
    for (i=0;i<8;i+=2) {
 
49
        shuf2[i] = 2 * i + 2;
 
50
        
 
51
        shuf3[i + 8] = 2 * i + 2;
 
52
    }
 
53
    for (i=0;i<8;i++) {
 
54
        mul[i] = 0x2AAA;
 
55
    }
 
56
 
 
57
    long aligned_size = calc_alloc(size, 32);
 
58
    long val = -aligned_size;
 
59
    unsigned char *src2 = src + 4 * aligned_size;
 
60
    unsigned char *dst2 = dst + aligned_size;
 
61
 
 
62
        __asm__ __volatile__ (
 
63
            "prefetchnta (%1)                           \n\t"
 
64
            "push       %2                              \n\t"
 
65
            "movdqa     (%3), %%xmm15                   \n\t"
 
66
            "movdqa     16(%3), %%xmm14                 \n\t"
 
67
            "movdqa     32(%3), %%xmm13                 \n\t"
 
68
            "movdqa     48(%3), %%xmm12                 \n\t"
 
69
 
 
70
            "movdqa     (%1,%2,4), %%xmm0               \n\t"
 
71
            "movdqa     16(%1,%2,4), %%xmm1             \n\t"
 
72
        "1:                                             \n\t"
 
73
            "prefetchnta 64(%1)                         \n\t"
 
74
            
 
75
            "movdqa     %%xmm0, %%xmm5                  \n\t"
 
76
            "pshufb     %%xmm15, %%xmm0                 \n\t"
 
77
 
 
78
            "movdqa     %%xmm1, %%xmm6                  \n\t"
 
79
            "pshufb     %%xmm15, %%xmm1                 \n\t"
 
80
 
 
81
            "movdqa     32(%1,%2,4), %%xmm2             \n\t"
 
82
            "movdqa     48(%1,%2,4), %%xmm3             \n\t"
 
83
 
 
84
            "pshufb     %%xmm14, %%xmm5                 \n\t"
 
85
            "pshufb     %%xmm13, %%xmm6                 \n\t"
 
86
            "por        %%xmm6, %%xmm5                  \n\t"
 
87
 
 
88
 
 
89
            "phaddw     %%xmm1, %%xmm0                  \n\t"
 
90
            "paddw      %%xmm5, %%xmm0                  \n\t"
 
91
            "pmulhrsw   %%xmm12, %%xmm0                 \n\t"
 
92
//------
 
93
            
 
94
            "movdqa     %%xmm2, %%xmm5                  \n\t"
 
95
            "pshufb     %%xmm15, %%xmm2                 \n\t"
 
96
            "movdqa     %%xmm3, %%xmm6                  \n\t"
 
97
            "pshufb     %%xmm15, %%xmm3                 \n\t"
 
98
 
 
99
            "movdqa     64(%1,%2,4), %%xmm7             \n\t"
 
100
            "movdqa     80(%1,%2,4), %%xmm8             \n\t"
 
101
 
 
102
            "pshufb     %%xmm14, %%xmm5                 \n\t"
 
103
            "pshufb     %%xmm13, %%xmm6                 \n\t"
 
104
            "por        %%xmm6, %%xmm5                  \n\t"
 
105
 
 
106
            "phaddw     %%xmm3, %%xmm2                  \n\t"
 
107
            "paddw      %%xmm5, %%xmm2                  \n\t"
 
108
            "pmulhrsw   %%xmm12, %%xmm2                 \n\t"
 
109
 
 
110
//-----
 
111
            "packuswb   %%xmm2, %%xmm0                  \n\t"
 
112
            "movntps    %%xmm0, (%0,%2)                 \n\t"
 
113
 
 
114
            "prefetchnta 128(%1)                        \n\t"
 
115
            
 
116
            "movdqa     %%xmm7, %%xmm5                  \n\t"
 
117
            "pshufb     %%xmm15, %%xmm7                 \n\t"
 
118
            "movdqa     %%xmm8, %%xmm6                  \n\t"
 
119
            "pshufb     %%xmm15, %%xmm8                 \n\t"
 
120
 
 
121
            "movdqa     96(%1,%2,4), %%xmm9             \n\t"
 
122
            "movdqa     112(%1,%2,4), %%xmm10           \n\t"
 
123
 
 
124
            "pshufb     %%xmm14, %%xmm5                 \n\t"
 
125
            "pshufb     %%xmm13, %%xmm6                 \n\t"
 
126
            "por        %%xmm6, %%xmm5                  \n\t"
 
127
 
 
128
            "phaddw     %%xmm8, %%xmm7                  \n\t"
 
129
            "paddw      %%xmm5, %%xmm7                  \n\t"
 
130
            "pmulhrsw   %%xmm12, %%xmm7                 \n\t"
 
131
//------
 
132
            
 
133
            "movdqa     %%xmm9, %%xmm5                  \n\t"
 
134
            "pshufb     %%xmm15, %%xmm9                 \n\t"
 
135
            "movdqa     %%xmm10, %%xmm6                 \n\t"
 
136
            "pshufb     %%xmm15, %%xmm10                \n\t"
 
137
 
 
138
            "pshufb     %%xmm14, %%xmm5                 \n\t"
 
139
            "pshufb     %%xmm13, %%xmm6                 \n\t"
 
140
            "por        %%xmm6, %%xmm5                  \n\t"
 
141
 
 
142
            "movdqa     128(%1,%2,4), %%xmm0            \n\t"
 
143
            "movdqa     144(%1,%2,4), %%xmm1            \n\t"
 
144
 
 
145
            "phaddw     %%xmm10, %%xmm9                 \n\t"
 
146
            "paddw      %%xmm5, %%xmm9                  \n\t"
 
147
            "pmulhrsw   %%xmm12, %%xmm9                 \n\t"
 
148
//-----
 
149
 
 
150
            "packuswb   %%xmm9, %%xmm7                  \n\t"
 
151
            "movntps    %%xmm7, 16(%0,%2)               \n\t"
 
152
 
 
153
            "add        $32, %2                         \n\t"
 
154
            "jnz        1b                              \n\t"
 
155
            "pop        %2                              \n\t"
 
156
 
 
157
    :
 
158
    : "r" (dst2), "r" (src2), "r" (val), "r" (&shuf)
 
159
    );
 
160
}
 
161
#endif /*USE_SSE */
 
162
 
 
163
 
30
164
int dictReduceImage(DICTContext ps, void *img, int width, int height) {
31
165
    int size = width * height;
32
166
    unsigned char *res = ps->image;
39
173
#ifdef DICT_TIFF_SUPPORT
40
174
        case DICT_IMAGE_TIFF:
41
175
            if (ps->matlab_mode) {
 
176
/*
42
177
                for (int i = 0; i < size; ++i) {
43
178
                    int col = i % height;
44
179
                    int lin = i / height;
45
 
                    int pos = 4*(col * width + lin);
46
 
                    res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);
 
180
                    res[i] = rintf(((float)(image[pos] + image[pos+1] + image[pos+2]))/3);
 
181
                }
 
182
*/
 
183
/*
 
184
                for (int col = 0; col < height; col++) {
 
185
                    for (int lin = 0; lin < width; lin++) {
 
186
                        int i = lin*height + col;
 
187
                        int pos = 4*(col * width + lin);
 
188
                        unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
 
189
                        res[i] = (90 + sum + (sum>>8)) >> 8;
 
190
                    }
 
191
                }
 
192
*/
 
193
 
 
194
                    // We process the image in TRANSPOSE_SIZE x TRANSPOSE_SIZE tiles to benefit from CPU caches
 
195
                int aligned_width = (width / TRANSPOSE_SIZE) * TRANSPOSE_SIZE;
 
196
                int aligned_height = (height / TRANSPOSE_SIZE) * TRANSPOSE_SIZE;
 
197
 
 
198
                for (int col = 0; col < aligned_height; col+=TRANSPOSE_SIZE) {
 
199
                    for (int lin = 0; lin < aligned_width; lin+=TRANSPOSE_SIZE) {
 
200
 
 
201
                        for (int col2 = 0; col2 < TRANSPOSE_SIZE; col2++) {
 
202
                            for (int lin2 = 0; lin2 < TRANSPOSE_SIZE; lin2++) {
 
203
                                int i = (lin+lin2)*height + (col+col2);
 
204
                                int pos = 4*((col+col2) * width + (lin+lin2));
 
205
                                
 
206
                                unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
 
207
                                res[i] = (90 + sum + (sum>>8)) >> 8;
 
208
                            }
 
209
                        }
 
210
                    }
 
211
                }
 
212
                
 
213
                for (int col = aligned_height; col < height; col++) {
 
214
                    for (int lin = aligned_width; lin < width; lin++) {
 
215
                        int i = lin*height + col;
 
216
                        int pos = 4*(col * width + lin);
 
217
                        unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
 
218
                        res[i] = (90 + sum + (sum>>8)) >> 8;
 
219
                    }
47
220
                }
48
221
            } else {
 
222
#ifdef USE_SSE
 
223
                reduct_ssse3(res, image, size);
 
224
#else /* USE_SSE */
49
225
                for (int i = 0; i < size; ++i) {
50
226
                    int pos = 4 * i;
51
 
                    res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);
 
227
                    unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);
 
228
                    res[i] = (90 + sum + (sum>>8)) >> 8;
 
229
//                  res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);
52
230
                }
 
231
#endif /* USE_SSE */
53
232
            }
54
233
        break;
55
234
#endif /* DICT_TIFF_SUPPORT */
81
260
    }
82
261
 
83
262
    ps->image_type = DICT_IMAGE_TIFF;
84
 
    ps->image_buf = _TIFFmalloc(w * h * sizeof (uint32_t));
85
 
    ps->image = (unsigned char*)malloc(w * h * sizeof(unsigned char));
 
263
        // we need an extra 32 bytes at the end for the do20 SSE code
 
264
    int alloc_size = 32 + calc_alloc(w * h, 32);
 
265
    ps->image_buf = _TIFFmalloc(alloc_size * sizeof (uint32_t));
 
266
    ps->image = (unsigned char*)malloc(alloc_size * sizeof(unsigned char));
86
267
    
87
268
    ps->width = w;
88
269
    ps->height = h;