Commit | Line | Data |
---|---|---|
80f575fc DM |
1 | /* |
2 | * motiondetect_opt.c | |
3 | * | |
4 | * Copyright (C) Georg Martius - February 1007-2012 | |
5 | * georg dot martius at web dot de | |
6 | * Copyright (C) Alexey Osipov - Jule 2011 | |
7 | * simba at lerlan dot ru | |
8 | * speed optimizations (threshold, spiral, SSE, asm) | |
9 | * | |
10 | * This file is part of vid.stab video stabilization library | |
11 | * | |
12 | * vid.stab is free software; you can redistribute it and/or modify | |
13 | * it under the terms of the GNU General Public License, | |
14 | * as published by the Free Software Foundation; either version 2, or | |
15 | * (at your option) any later version. | |
16 | * | |
17 | * vid.stab is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | * GNU General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU General Public License | |
23 | * along with GNU Make; see the file COPYING. If not, write to | |
24 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | |
25 | * | |
26 | */ | |
27 | #include "motiondetect_opt.h" | |
28 | ||
29 | #ifdef USE_ORC | |
30 | #include "orc/motiondetectorc.h" | |
31 | #endif | |
32 | ||
33 | #ifdef USE_SSE2 | |
34 | #include <emmintrin.h> | |
35 | ||
36 | #define USE_SSE2_CMP_HOR | |
37 | #define SSE2_CMP_SUM_ROWS 8 | |
38 | #endif | |
39 | ||
40 | #ifdef USE_SSE2 | |
41 | /** | |
42 | \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only | |
43 | */ | |
44 | double contrastSubImg1_SSE(unsigned char* const I, const Field* field, | |
45 | int width, int height) | |
46 | { | |
47 | int k, j; | |
48 | unsigned char* p = NULL; | |
49 | int s2 = field->size / 2; | |
50 | ||
51 | static unsigned char full[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; | |
52 | ||
53 | p = I + ((field->x - s2) + (field->y - s2)*width); | |
54 | ||
55 | __m128i mmin, mmax; | |
56 | ||
57 | mmin = _mm_loadu_si128((__m128i const*)full); | |
58 | mmax = _mm_setzero_si128(); | |
59 | ||
60 | for (j = 0; j < field->size; j++){ | |
61 | for (k = 0; k < field->size; k += 16) { | |
62 | __m128i xmm0; | |
63 | xmm0 = _mm_loadu_si128((__m128i const*)p); | |
64 | mmin = _mm_min_epu8(mmin, xmm0); | |
65 | mmax = _mm_max_epu8(mmax, xmm0); | |
66 | p += 16; | |
67 | } | |
68 | p += (width - field->size); | |
69 | } | |
70 | ||
71 | __m128i xmm1; | |
72 | xmm1 = _mm_srli_si128(mmin, 8); | |
73 | mmin = _mm_min_epu8(mmin, xmm1); | |
74 | xmm1 = _mm_srli_si128(mmin, 4); | |
75 | mmin = _mm_min_epu8(mmin, xmm1); | |
76 | xmm1 = _mm_srli_si128(mmin, 2); | |
77 | mmin = _mm_min_epu8(mmin, xmm1); | |
78 | xmm1 = _mm_srli_si128(mmin, 1); | |
79 | mmin = _mm_min_epu8(mmin, xmm1); | |
80 | unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0); | |
81 | ||
82 | xmm1 = _mm_srli_si128(mmax, 8); | |
83 | mmax = _mm_max_epu8(mmax, xmm1); | |
84 | xmm1 = _mm_srli_si128(mmax, 4); | |
85 | mmax = _mm_max_epu8(mmax, xmm1); | |
86 | xmm1 = _mm_srli_si128(mmax, 2); | |
87 | mmax = _mm_max_epu8(mmax, xmm1); | |
88 | xmm1 = _mm_srli_si128(mmax, 1); | |
89 | mmax = _mm_max_epu8(mmax, xmm1); | |
90 | unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0); | |
91 | ||
92 | return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0 | |
93 | } | |
94 | #endif | |
95 | ||
96 | #ifdef USE_ORC | |
97 | /** | |
98 | calculates the contrast in the given small part of the given image | |
99 | using the absolute difference from mean luminance (like Root-Mean-Square, | |
100 | but with abs() (Manhattan-Norm)) | |
101 | For multichannel images use contrastSubImg_Michelson() | |
102 | ||
103 | \param I pointer to framebuffer | |
104 | \param field Field specifies position(center) and size of subimage | |
105 | \param width width of frame | |
106 | \param height height of frame | |
107 | */ | |
108 | double contrastSubImg_variance_orc(unsigned char* const I, const Field* field, | |
109 | int width, int height) { | |
110 | unsigned char* p = NULL; | |
111 | int s2 = field->size / 2; | |
112 | int numpixel = field->size*field->size; | |
113 | ||
114 | p = I + ((field->x - s2) + (field->y - s2) * width); | |
115 | ||
116 | unsigned int sum=0; | |
117 | image_sum_optimized((signed int*)&sum, p, width, field->size, field->size); | |
118 | unsigned char mean = sum / numpixel; | |
119 | int var=0; | |
120 | image_variance_optimized(&var, p, width, mean, field->size, field->size); | |
121 | return (double)var/numpixel/255.0; | |
122 | } | |
123 | ||
124 | /// plain C implementation of variance based contrastSubImg (without ORC) | |
125 | double contrastSubImg_variance_C(unsigned char* const I, | |
126 | const Field* field, int width, int height) { | |
127 | int k, j; | |
128 | unsigned char* p = NULL; | |
129 | unsigned char* pstart = NULL; | |
130 | int s2 = field->size / 2; | |
131 | unsigned int sum=0; | |
132 | int mean; | |
133 | int var=0; | |
134 | int numpixel = field->size*field->size; | |
135 | ||
136 | pstart = I + ((field->x - s2) + (field->y - s2) * width); | |
137 | p = pstart; | |
138 | for (j = 0; j < field->size; j++) { | |
139 | for (k = 0; k < field->size; k++, p++) { | |
140 | sum+=*p; | |
141 | } | |
142 | p += (width - field->size); | |
143 | } | |
144 | mean=sum/numpixel; | |
145 | p = pstart; | |
146 | for (j = 0; j < field->size; j++) { | |
147 | for (k = 0; k < field->size; k++, p++) { | |
148 | var+=abs(*p-mean); | |
149 | } | |
150 | p += (width - field->size); | |
151 | } | |
152 | return (double)var/numpixel/255.0; | |
153 | } | |
154 | #endif | |
155 | ||
156 | ||
157 | ||
158 | ||
159 | ||
160 | ||
161 | #ifdef USE_ORC | |
162 | /** | |
163 | compares a small part of two given images | |
164 | and returns the average absolute difference. | |
165 | Field center, size and shift have to be choosen, | |
166 | so that no clipping is required. | |
167 | Uses optimized inner loops by ORC. | |
168 | ||
169 | \param field Field specifies position(center) and size of subimage | |
170 | \param d_x shift in x direction | |
171 | \param d_y shift in y direction | |
172 | */ | |
173 | unsigned int compareSubImg_thr_orc(unsigned char* const I1, unsigned char* const I2, | |
174 | const Field* field, int width1, int width2, int height, | |
175 | int bytesPerPixel, int d_x, int d_y, | |
176 | unsigned int threshold) { | |
177 | unsigned char* p1 = NULL; | |
178 | unsigned char* p2 = NULL; | |
179 | int s2 = field->size / 2; | |
180 | int j; | |
181 | unsigned int sum = 0; | |
182 | p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel; | |
183 | p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) * bytesPerPixel; | |
184 | ||
185 | for (j = 0; j < field->size; j++) { | |
186 | unsigned int s = 0; | |
187 | image_line_difference_optimized(&s, p1, p2, field->size* bytesPerPixel); | |
188 | sum += s; | |
189 | if( sum > threshold) // no need to calculate any longer: worse than the best match | |
190 | break; | |
191 | p1 += width1 * bytesPerPixel; | |
192 | p2 += width2 * bytesPerPixel; | |
193 | } | |
194 | ||
195 | ||
196 | return sum; | |
197 | } | |
198 | ||
199 | // implementation with 1 orc function, but no threshold | |
200 | unsigned int compareSubImg_orc(unsigned char* const I1, unsigned char* const I2, | |
201 | const Field* field, int width1, int width2, int height, | |
202 | int bytesPerPixel, int d_x, int d_y, | |
203 | unsigned int threshold) { | |
204 | unsigned char* p1 = NULL; | |
205 | unsigned char* p2 = NULL; | |
206 | int s2 = field->size / 2; | |
207 | unsigned int sum=0; | |
208 | p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel; | |
209 | p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) | |
210 | * bytesPerPixel; | |
211 | ||
212 | image_difference_optimized(&sum, p1, width1 * bytesPerPixel, p2, width2 * bytesPerPixel, | |
213 | field->size* bytesPerPixel , field->size); | |
214 | return sum; | |
215 | } | |
216 | #endif | |
217 | ||
218 | #ifdef USE_SSE2 | |
219 | unsigned int compareSubImg_thr_sse2(unsigned char* const I1, unsigned char* const I2, | |
220 | const Field* field, | |
221 | int width1, int width2, int height, | |
222 | int bytesPerPixel, int d_x, int d_y, | |
223 | unsigned int treshold) { | |
224 | int k, j; | |
225 | unsigned char* p1 = NULL; | |
226 | unsigned char* p2 = NULL; | |
227 | int s2 = field->size / 2; | |
228 | unsigned int sum = 0; | |
229 | ||
230 | static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00}; | |
231 | unsigned char row = 0; | |
232 | #ifndef USE_SSE2_CMP_HOR | |
233 | unsigned char summes[16]; | |
234 | int i; | |
235 | #endif | |
236 | __m128i xmmsum, xmmmask; | |
237 | xmmsum = _mm_setzero_si128(); | |
238 | xmmmask = _mm_loadu_si128((__m128i const*)mask); | |
239 | ||
240 | p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel; | |
241 | p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel; | |
242 | for (j = 0; j < field->size; j++){ | |
243 | for (k = 0; k < field->size * bytesPerPixel; k+=16){ | |
244 | { | |
245 | __m128i xmm0, xmm1, xmm2; | |
246 | xmm0 = _mm_loadu_si128((__m128i const *)p1); | |
247 | xmm1 = _mm_loadu_si128((__m128i const *)p2); | |
248 | ||
249 | xmm2 = _mm_subs_epu8(xmm0, xmm1); | |
250 | xmm0 = _mm_subs_epu8(xmm1, xmm0); | |
251 | xmm0 = _mm_adds_epu8(xmm0, xmm2); | |
252 | ||
253 | xmm1 = _mm_and_si128(xmm0, xmmmask); | |
254 | xmm0 = _mm_srli_si128(xmm0, 1); | |
255 | xmm0 = _mm_and_si128(xmm0, xmmmask); | |
256 | ||
257 | xmmsum = _mm_adds_epu16(xmmsum, xmm0); | |
258 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
259 | } | |
260 | ||
261 | p1+=16; | |
262 | p2+=16; | |
263 | ||
264 | row++; | |
265 | if (row == SSE2_CMP_SUM_ROWS) { | |
266 | row = 0; | |
267 | #ifdef USE_SSE2_CMP_HOR | |
268 | { | |
269 | __m128i xmm1; | |
270 | ||
271 | xmm1 = _mm_srli_si128(xmmsum, 8); | |
272 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
273 | ||
274 | xmm1 = _mm_srli_si128(xmmsum, 4); | |
275 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
276 | ||
277 | xmm1 = _mm_srli_si128(xmmsum, 2); | |
278 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
279 | ||
280 | sum += _mm_extract_epi16(xmmsum, 0); | |
281 | } | |
282 | #else | |
283 | _mm_storeu_si128((__m128i*)summes, xmmsum); | |
284 | for(i = 0; i < 16; i+=2) | |
285 | sum += summes[i] + summes[i+1]*256; | |
286 | #endif | |
287 | xmmsum = _mm_setzero_si128(); | |
288 | } | |
289 | } | |
290 | if (sum > treshold) | |
291 | break; | |
292 | p1 += (width1 - field->size) * bytesPerPixel; | |
293 | p2 += (width2 - field->size) * bytesPerPixel; | |
294 | } | |
295 | ||
296 | #if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \ | |
297 | && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16) | |
298 | //process all data left unprocessed | |
299 | //this part can be safely ignored if | |
300 | //SSE_SUM_ROWS = {1, 2, 4, 8, 16} | |
301 | #ifdef USE_SSE2_CMP_HOR | |
302 | { | |
303 | __m128i xmm1; | |
304 | ||
305 | xmm1 = _mm_srli_si128(xmmsum, 8); | |
306 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
307 | ||
308 | xmm1 = _mm_srli_si128(xmmsum, 4); | |
309 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
310 | ||
311 | xmm1 = _mm_srli_si128(xmmsum, 2); | |
312 | xmmsum = _mm_adds_epu16(xmmsum, xmm1); | |
313 | ||
314 | sum += _mm_extract_epi16(xmmsum, 0); | |
315 | } | |
316 | #else | |
317 | _mm_storeu_si128((__m128i*)summes, xmmsum); | |
318 | for(i = 0; i < 16; i+=2) | |
319 | sum += summes[i] + summes[i+1]*256; | |
320 | #endif | |
321 | #endif | |
322 | ||
323 | return sum; | |
324 | } | |
325 | #endif // USE_SSE2 | |
326 | ||
327 | #ifdef USE_SSE2_ASM | |
328 | unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1, unsigned char* const I2, | |
329 | const Field* field, | |
330 | int width1, int width2, int height, | |
331 | int bytesPerPixel, int d_x, int d_y, | |
332 | unsigned int treshold) { | |
333 | unsigned char* p1 = NULL; | |
334 | unsigned char* p2 = NULL; | |
335 | int s2 = field->size / 2; | |
336 | unsigned int sum = 0; | |
337 | ||
338 | static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00}; | |
339 | p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel; | |
340 | p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel; | |
341 | asm ( | |
342 | "xor %0,%0\n" | |
343 | "pxor %%xmm4,%%xmm4\n" //8 x 16bit partial sums | |
344 | "movdqu (%3),%%xmm3\n" //mask | |
345 | ||
346 | //main loop | |
347 | "movl %4,%%edx\n" //edx = field->size * bytesPerPixel / 16 | |
348 | "mov $8,%%ecx\n" //cx = 8 | |
349 | "1:\n" | |
350 | ||
351 | //calc intermediate sum of abs differences for 16 bytes | |
352 | "movdqu (%1),%%xmm0\n" //p1 | |
353 | "movdqu (%2),%%xmm1\n" //p2 | |
354 | "movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0 | |
355 | "psubusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 - xmm1 (by bytes) | |
356 | "psubusb %%xmm2,%%xmm1\n" //xmm1 = xmm1 - xmm2 (by bytes) | |
357 | "paddusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 + xmm1 (absolute difference) | |
358 | "movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0 | |
359 | "pand %%xmm3,%%xmm2\n" //xmm2 = xmm2 & xmm3 (apply mask) | |
360 | "psrldq $1,%%xmm0\n" //xmm0 = xmm0 >> 8 (shift by 1 byte) | |
361 | "pand %%xmm3,%%xmm0\n" //xmm0 = xmm0 & xmm3 (apply mask) | |
362 | "paddusw %%xmm0,%%xmm4\n" //xmm4 = xmm4 + xmm0 (by words) | |
363 | "paddusw %%xmm2,%%xmm4\n" //xmm4 = xmm4 + xmm2 (by words) | |
364 | ||
365 | "add $16,%1\n" //move to next 16 bytes (p1) | |
366 | "add $16,%2\n" //move to next 16 bytes (p2) | |
367 | ||
368 | //check if we need flush sum (i.e. xmm4 is about to saturate) | |
369 | "dec %%ecx\n" | |
370 | "jnz 2f\n" //skip flushing if not | |
371 | //flushing... | |
372 | "movdqu %%xmm4,%%xmm0\n" | |
373 | "psrldq $8,%%xmm0\n" | |
374 | "paddusw %%xmm0,%%xmm4\n" | |
375 | "movdqu %%xmm4,%%xmm0\n" | |
376 | "psrldq $4,%%xmm0\n" | |
377 | "paddusw %%xmm0,%%xmm4\n" | |
378 | "movdqu %%xmm4,%%xmm0\n" | |
379 | "psrldq $2,%%xmm0\n" | |
380 | "paddusw %%xmm0,%%xmm4\n" | |
381 | "movd %%xmm4,%%ecx\n" | |
382 | "and $0xFFFF,%%ecx\n" | |
383 | "addl %%ecx,%0\n" | |
384 | "pxor %%xmm4,%%xmm4\n" //clearing xmm4 | |
385 | "mov $8,%%ecx\n" //cx = 8 | |
386 | ||
387 | //check if we need to go to another line | |
388 | "2:\n" | |
389 | "dec %%edx\n" | |
390 | "jnz 1b\n" //skip if not | |
391 | ||
392 | //move p1 and p2 to the next line | |
393 | "add %5,%1\n" | |
394 | "add %5,%2\n" | |
395 | "cmp %7,%0\n" //if (sum > treshold) | |
396 | "ja 3f\n" // break; | |
397 | "movl %4,%%edx\n" | |
398 | ||
399 | //check if all lines done | |
400 | "decl %6\n" | |
401 | "jnz 1b\n" //if not, continue looping | |
402 | "3:\n" | |
403 | :"=r"(sum) | |
404 | :"r"(p1),"r"(p2),"r"(mask),"g"(field->size * bytesPerPixel / 16),"g"((unsigned char*)((width1 - field->size) * bytesPerPixel)),"g"(field->size), "g"(treshold), "0"(sum) | |
405 | :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx" | |
406 | ); | |
407 | // TODO width2 is not properly used here | |
408 | return sum; | |
409 | } | |
410 | #endif // USE_SSE2_ASM | |
411 | ||
412 | /* | |
413 | * Local variables: | |
414 | * c-file-style: "stroustrup" | |
415 | * c-file-offsets: ((case-label . *) (statement-case-intro . *)) | |
416 | * indent-tabs-mode: nil | |
417 | * tab-width: 2 | |
418 | * c-basic-offset: 2 t | |
419 | * End: | |
420 | * | |
421 | * vim: expandtab shiftwidth=2: | |
422 | */ |