/*
 * motiondetect_opt.c
 *
 * Copyright (C) Georg Martius - February 2007-2012
 *   georg dot martius at web dot de
 * Copyright (C) Alexey Osipov - July 2011
 *   simba at lerlan dot ru
 *   speed optimizations (threshold, spiral, SSE, asm)
 *
 * This file is part of vid.stab video stabilization library
 *
 * vid.stab is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License,
 * as published by the Free Software Foundation; either version 2, or
 * (at your option) any later version.
 *
 * vid.stab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with vid.stab; see the file COPYING.  If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
#include "motiondetect_opt.h"

#ifdef USE_ORC
#include "orc/motiondetectorc.h"
#endif

#ifdef USE_SSE2
#include <emmintrin.h>

#define USE_SSE2_CMP_HOR
#define SSE2_CMP_SUM_ROWS 8
#endif
#ifdef USE_SSE2
/**
   \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only
*/
double contrastSubImg1_SSE(unsigned char* const I, const Field* field,
                           int width, int height)
{
  int k, j;
  unsigned char* p = NULL;
  int s2 = field->size / 2;

  static unsigned char full[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};

  p = I + ((field->x - s2) + (field->y - s2)*width);

  __m128i mmin, mmax;

  mmin = _mm_loadu_si128((__m128i const*)full);
  mmax = _mm_setzero_si128();

  // running byte-wise min/max over the field
  // (field->size is assumed to be a multiple of 16)
  for (j = 0; j < field->size; j++){
    for (k = 0; k < field->size; k += 16) {
      __m128i xmm0;
      xmm0 = _mm_loadu_si128((__m128i const*)p);
      mmin = _mm_min_epu8(mmin, xmm0);
      mmax = _mm_max_epu8(mmax, xmm0);
      p += 16;
    }
    p += (width - field->size);
  }

  // horizontal reduction of the 16 byte lanes to a single minimum
  __m128i xmm1;
  xmm1 = _mm_srli_si128(mmin, 8);
  mmin = _mm_min_epu8(mmin, xmm1);
  xmm1 = _mm_srli_si128(mmin, 4);
  mmin = _mm_min_epu8(mmin, xmm1);
  xmm1 = _mm_srli_si128(mmin, 2);
  mmin = _mm_min_epu8(mmin, xmm1);
  xmm1 = _mm_srli_si128(mmin, 1);
  mmin = _mm_min_epu8(mmin, xmm1);
  unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0);

  // horizontal reduction of the 16 byte lanes to a single maximum
  xmm1 = _mm_srli_si128(mmax, 8);
  mmax = _mm_max_epu8(mmax, xmm1);
  xmm1 = _mm_srli_si128(mmax, 4);
  mmax = _mm_max_epu8(mmax, xmm1);
  xmm1 = _mm_srli_si128(mmax, 2);
  mmax = _mm_max_epu8(mmax, xmm1);
  xmm1 = _mm_srli_si128(mmax, 1);
  mmax = _mm_max_epu8(mmax, xmm1);
  unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0);

  // Michelson contrast; +0.1 avoids division by zero
  return (maxi-mini)/(maxi+mini+0.1);
}
#endif
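
/*
  Reference sketch (not compiled): a plain-C version of the Michelson
  contrast computed by contrastSubImg1_SSE above.  It assumes the same
  Field layout and, unlike the SSE2 code, does not require field->size
  to be a multiple of 16.  The function name is illustrative only.
*/
#if 0
static double contrastSubImg1_plainC_sketch(unsigned char* const I, const Field* field,
                                            int width, int height)
{
  int s2 = field->size / 2;
  unsigned char* p = I + ((field->x - s2) + (field->y - s2) * width);
  unsigned char mini = 255, maxi = 0;
  int j, k;
  for (j = 0; j < field->size; j++) {
    for (k = 0; k < field->size; k++, p++) {
      if (*p < mini) mini = *p;
      if (*p > maxi) maxi = *p;
    }
    p += (width - field->size);
  }
  return (maxi - mini) / (maxi + mini + 0.1); // same +0.1 guard as the SSE2 version
}
#endif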

#ifdef USE_ORC
/**
   calculates the contrast in the given small part of the given image
   using the absolute difference from the mean luminance (like root mean
   square, but with abs() instead of squaring, i.e. the Manhattan norm).
   For multichannel images use contrastSubImg_Michelson().

   \param I pointer to framebuffer
   \param field Field specifies position (center) and size of subimage
   \param width width of frame
   \param height height of frame
*/
double contrastSubImg_variance_orc(unsigned char* const I, const Field* field,
                                   int width, int height) {
  unsigned char* p = NULL;
  int s2 = field->size / 2;
  int numpixel = field->size*field->size;

  p = I + ((field->x - s2) + (field->y - s2) * width);

  unsigned int sum=0;
  image_sum_optimized((signed int*)&sum, p, width, field->size, field->size);
  unsigned char mean = sum / numpixel;
  int var=0;
  image_variance_optimized(&var, p, width, mean, field->size, field->size);
  return (double)var/numpixel/255.0;
}

/// plain C implementation of the variance-based contrastSubImg (without ORC)
double contrastSubImg_variance_C(unsigned char* const I,
                                 const Field* field, int width, int height) {
  int k, j;
  unsigned char* p = NULL;
  unsigned char* pstart = NULL;
  int s2 = field->size / 2;
  unsigned int sum=0;
  int mean;
  int var=0;
  int numpixel = field->size*field->size;

  pstart = I + ((field->x - s2) + (field->y - s2) * width);
  p = pstart;
  // first pass: mean luminance of the field
  for (j = 0; j < field->size; j++) {
    for (k = 0; k < field->size; k++, p++) {
      sum+=*p;
    }
    p += (width - field->size);
  }
  mean=sum/numpixel;
  p = pstart;
  // second pass: mean absolute deviation from the mean, normalized to [0,1]
  for (j = 0; j < field->size; j++) {
    for (k = 0; k < field->size; k++, p++) {
      var+=abs(*p-mean);
    }
    p += (width - field->size);
  }
  return (double)var/numpixel/255.0;
}
#endif


#ifdef USE_ORC
/**
   compares a small part of two given images
   and returns the sum of absolute differences.
   Field center, size and shift have to be chosen
   so that no clipping is required.
   Uses inner loops optimized by ORC.

   \param field Field specifies position (center) and size of subimage
   \param d_x shift in x direction
   \param d_y shift in y direction
*/
unsigned int compareSubImg_thr_orc(unsigned char* const I1, unsigned char* const I2,
                                   const Field* field, int width1, int width2, int height,
                                   int bytesPerPixel, int d_x, int d_y,
                                   unsigned int threshold) {
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  int j;
  unsigned int sum = 0;
  p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel;
  p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) * bytesPerPixel;

  for (j = 0; j < field->size; j++) {
    unsigned int s = 0;
    image_line_difference_optimized(&s, p1, p2, field->size * bytesPerPixel);
    sum += s;
    if (sum > threshold) // no need to calculate any longer: worse than the best match
      break;
    p1 += width1 * bytesPerPixel;
    p2 += width2 * bytesPerPixel;
  }

  return sum;
}

// implementation with one ORC function, but without threshold (no early exit)
unsigned int compareSubImg_orc(unsigned char* const I1, unsigned char* const I2,
                               const Field* field, int width1, int width2, int height,
                               int bytesPerPixel, int d_x, int d_y,
                               unsigned int threshold) {
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum=0;
  p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel;
  p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2)
    * bytesPerPixel;

  image_difference_optimized(&sum, p1, width1 * bytesPerPixel, p2, width2 * bytesPerPixel,
                             field->size * bytesPerPixel, field->size);
  return sum;
}
#endif
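
/*
  Reference sketch (not compiled): a plain-C equivalent of the comparison
  performed by the compareSubImg_* variants in this file -- the sum of
  absolute differences (SAD) over the shifted field, with the same per-row
  early exit once the running sum exceeds the best match found so far.
  It assumes the usual Field layout; the function name is illustrative only.
*/
#if 0
static unsigned int compareSubImg_thr_plainC_sketch(unsigned char* const I1, unsigned char* const I2,
                                                    const Field* field, int width1, int width2,
                                                    int height, int bytesPerPixel,
                                                    int d_x, int d_y, unsigned int threshold)
{
  int s2 = field->size / 2;
  unsigned char* p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel;
  unsigned char* p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) * bytesPerPixel;
  unsigned int sum = 0;
  int j, k;
  for (j = 0; j < field->size; j++) {
    for (k = 0; k < field->size * bytesPerPixel; k++) {
      int d = (int)p1[k] - (int)p2[k];
      sum += (d < 0) ? -d : d;       // |p1 - p2|
    }
    if (sum > threshold)             // already worse than the best match: stop early
      break;
    p1 += width1 * bytesPerPixel;    // advance both pointers to the next row
    p2 += width2 * bytesPerPixel;
  }
  return sum;
}
#endif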

#ifdef USE_SSE2
unsigned int compareSubImg_thr_sse2(unsigned char* const I1, unsigned char* const I2,
                                    const Field* field,
                                    int width1, int width2, int height,
                                    int bytesPerPixel, int d_x, int d_y,
                                    unsigned int threshold) {
  int k, j;
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                   0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  unsigned char row = 0;
#ifndef USE_SSE2_CMP_HOR
  unsigned char summes[16];
  int i;
#endif
  __m128i xmmsum, xmmmask;
  xmmsum = _mm_setzero_si128();
  xmmmask = _mm_loadu_si128((__m128i const*)mask);

  p1 = I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  for (j = 0; j < field->size; j++){
    for (k = 0; k < field->size * bytesPerPixel; k+=16){
      {
        // absolute difference of 16 bytes, accumulated into 8 x 16-bit lanes
        // (even and odd bytes separately, selected via the mask)
        __m128i xmm0, xmm1, xmm2;
        xmm0 = _mm_loadu_si128((__m128i const *)p1);
        xmm1 = _mm_loadu_si128((__m128i const *)p2);

        xmm2 = _mm_subs_epu8(xmm0, xmm1);
        xmm0 = _mm_subs_epu8(xmm1, xmm0);
        xmm0 = _mm_adds_epu8(xmm0, xmm2);

        xmm1 = _mm_and_si128(xmm0, xmmmask);
        xmm0 = _mm_srli_si128(xmm0, 1);
        xmm0 = _mm_and_si128(xmm0, xmmmask);

        xmmsum = _mm_adds_epu16(xmmsum, xmm0);
        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
      }

      p1 += 16;
      p2 += 16;

      row++;
      if (row == SSE2_CMP_SUM_ROWS) {
        // flush the 16-bit partial sums into `sum` before they can saturate
        row = 0;
#ifdef USE_SSE2_CMP_HOR
        {
          __m128i xmm1;

          xmm1 = _mm_srli_si128(xmmsum, 8);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          xmm1 = _mm_srli_si128(xmmsum, 4);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          xmm1 = _mm_srli_si128(xmmsum, 2);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          sum += _mm_extract_epi16(xmmsum, 0);
        }
#else
        _mm_storeu_si128((__m128i*)summes, xmmsum);
        for(i = 0; i < 16; i+=2)
          sum += summes[i] + summes[i+1]*256;
#endif
        xmmsum = _mm_setzero_si128();
      }
    }
    if (sum > threshold)
      break;
    p1 += (width1 - field->size) * bytesPerPixel;
    p2 += (width2 - field->size) * bytesPerPixel;
  }

#if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
  && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
  // process any data left unprocessed;
  // this part can be safely skipped if
  // SSE2_CMP_SUM_ROWS is one of {1, 2, 4, 8, 16}
#ifdef USE_SSE2_CMP_HOR
  {
    __m128i xmm1;

    xmm1 = _mm_srli_si128(xmmsum, 8);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    xmm1 = _mm_srli_si128(xmmsum, 4);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    xmm1 = _mm_srli_si128(xmmsum, 2);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    sum += _mm_extract_epi16(xmmsum, 0);
  }
#else
  _mm_storeu_si128((__m128i*)summes, xmmsum);
  for(i = 0; i < 16; i+=2)
    sum += summes[i] + summes[i+1]*256;
#endif
#endif

  return sum;
}
#endif // USE_SSE2
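
/*
  Usage sketch (hypothetical caller, not part of this file): during block
  matching, the best (lowest) SAD found so far is passed as `threshold`, so
  the *_thr variants above can abandon a candidate shift as soon as it is
  already worse than the current best.  The library's actual search also
  walks the shifts in a spiral order; this is only a simplified illustration,
  and bestShift_sketch is an invented name.
*/
#if 0
static void bestShift_sketch(unsigned char* I1, unsigned char* I2, const Field* field,
                             int width, int height, int bytesPerPixel,
                             int maxShift, int* best_dx, int* best_dy)
{
  unsigned int best = (unsigned int)-1;  // i.e. UINT_MAX; doubles as early-exit threshold
  int dx, dy;
  for (dy = -maxShift; dy <= maxShift; dy++) {
    for (dx = -maxShift; dx <= maxShift; dx++) {
      unsigned int sad = compareSubImg_thr_sse2(I1, I2, field, width, width, height,
                                                bytesPerPixel, dx, dy, best);
      if (sad < best) {
        best = sad;
        *best_dx = dx;
        *best_dy = dy;
      }
    }
  }
}
#endif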

#ifdef USE_SSE2_ASM
unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1, unsigned char* const I2,
                                        const Field* field,
                                        int width1, int width2, int height,
                                        int bytesPerPixel, int d_x, int d_y,
                                        unsigned int threshold) {
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                   0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  p1 = I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  asm (
    "xor %0,%0\n"
    "pxor %%xmm4,%%xmm4\n"      //8 x 16bit partial sums
    "movdqu (%3),%%xmm3\n"      //mask

    //main loop
    "movl %4,%%edx\n"           //edx = field->size * bytesPerPixel / 16
    "mov $8,%%ecx\n"            //cx = 8
    "1:\n"

    //calc intermediate sum of abs differences for 16 bytes
    "movdqu (%1),%%xmm0\n"      //p1
    "movdqu (%2),%%xmm1\n"      //p2
    "movdqu %%xmm0,%%xmm2\n"    //xmm2 = xmm0
    "psubusb %%xmm1,%%xmm0\n"   //xmm0 = xmm0 - xmm1 (by bytes)
    "psubusb %%xmm2,%%xmm1\n"   //xmm1 = xmm1 - xmm2 (by bytes)
    "paddusb %%xmm1,%%xmm0\n"   //xmm0 = xmm0 + xmm1 (absolute difference)
    "movdqu %%xmm0,%%xmm2\n"    //xmm2 = xmm0
    "pand %%xmm3,%%xmm2\n"      //xmm2 = xmm2 & xmm3 (apply mask)
    "psrldq $1,%%xmm0\n"        //xmm0 = xmm0 >> 8 (shift by 1 byte)
    "pand %%xmm3,%%xmm0\n"      //xmm0 = xmm0 & xmm3 (apply mask)
    "paddusw %%xmm0,%%xmm4\n"   //xmm4 = xmm4 + xmm0 (by words)
    "paddusw %%xmm2,%%xmm4\n"   //xmm4 = xmm4 + xmm2 (by words)

    "add $16,%1\n"              //move to next 16 bytes (p1)
    "add $16,%2\n"              //move to next 16 bytes (p2)

    //check if we need to flush the sum (i.e. xmm4 is about to saturate)
    "dec %%ecx\n"
    "jnz 2f\n"                  //skip flushing if not
    //flushing...
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $8,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $4,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $2,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movd %%xmm4,%%ecx\n"
    "and $0xFFFF,%%ecx\n"
    "addl %%ecx,%0\n"
    "pxor %%xmm4,%%xmm4\n"      //clear xmm4
    "mov $8,%%ecx\n"            //cx = 8

    //check if we need to go to another line
    "2:\n"
    "dec %%edx\n"
    "jnz 1b\n"                  //skip if not

    //move p1 and p2 to the next line
    "add %5,%1\n"
    "add %5,%2\n"
    "cmp %7,%0\n"               //if (sum > threshold)
    "ja 3f\n"                   //  break;
    "movl %4,%%edx\n"

    //check if all lines are done
    "decl %6\n"
    "jnz 1b\n"                  //if not, continue looping
    "3:\n"
    :"=r"(sum)
    :"r"(p1),"r"(p2),"r"(mask),"g"(field->size * bytesPerPixel / 16),"g"((unsigned char*)((width1 - field->size) * bytesPerPixel)),"g"(field->size), "g"(threshold), "0"(sum)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
    );
  // TODO width2 is not properly used here
  return sum;
}
#endif // USE_SSE2_ASM

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 *   tab-width: 2
 *   c-basic-offset: 2 t
 * End:
 *
 * vim: expandtab shiftwidth=2:
 */