de63e3bfa7b76f2806025cff18895a8f6c8483b7
/*
 * Copyright (C) Georg Martius - February 2007-2012
 *   georg dot martius at web dot de
 * Copyright (C) Alexey Osipov - July 2011
 *   simba at lerlan dot ru
 *   speed optimizations (threshold, spiral, SSE, asm)
 *
 * This file is part of vid.stab video stabilization library
 *
 * vid.stab is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License,
 * as published by the Free Software Foundation; either version 2, or
 * (at your option) any later version.
 *
 * vid.stab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "motiondetect_opt.h"

#include "orc/motiondetectorc.h"

#include <stdlib.h>     /* abs() */
#include <emmintrin.h>
36 #define USE_SSE2_CMP_HOR
37 #define SSE2_CMP_SUM_ROWS 8
42 \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only
44 double contrastSubImg1_SSE(unsigned char* const I
, const Field
* field
,
45 int width
, int height
)
48 unsigned char* p
= NULL
;
49 int s2
= field
->size
/ 2;
51 static unsigned char full
[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
53 p
= I
+ ((field
->x
- s2
) + (field
->y
- s2
)*width
);
57 mmin
= _mm_loadu_si128((__m128i
const*)full
);
58 mmax
= _mm_setzero_si128();
60 for (j
= 0; j
< field
->size
; j
++){
61 for (k
= 0; k
< field
->size
; k
+= 16) {
63 xmm0
= _mm_loadu_si128((__m128i
const*)p
);
64 mmin
= _mm_min_epu8(mmin
, xmm0
);
65 mmax
= _mm_max_epu8(mmax
, xmm0
);
68 p
+= (width
- field
->size
);
72 xmm1
= _mm_srli_si128(mmin
, 8);
73 mmin
= _mm_min_epu8(mmin
, xmm1
);
74 xmm1
= _mm_srli_si128(mmin
, 4);
75 mmin
= _mm_min_epu8(mmin
, xmm1
);
76 xmm1
= _mm_srli_si128(mmin
, 2);
77 mmin
= _mm_min_epu8(mmin
, xmm1
);
78 xmm1
= _mm_srli_si128(mmin
, 1);
79 mmin
= _mm_min_epu8(mmin
, xmm1
);
80 unsigned char mini
= (unsigned char)_mm_extract_epi16(mmin
, 0);
82 xmm1
= _mm_srli_si128(mmax
, 8);
83 mmax
= _mm_max_epu8(mmax
, xmm1
);
84 xmm1
= _mm_srli_si128(mmax
, 4);
85 mmax
= _mm_max_epu8(mmax
, xmm1
);
86 xmm1
= _mm_srli_si128(mmax
, 2);
87 mmax
= _mm_max_epu8(mmax
, xmm1
);
88 xmm1
= _mm_srli_si128(mmax
, 1);
89 mmax
= _mm_max_epu8(mmax
, xmm1
);
90 unsigned char maxi
= (unsigned char)_mm_extract_epi16(mmax
, 0);
92 return (maxi
-mini
)/(maxi
+mini
+0.1); // +0.1 to avoid division by 0
98 calculates the contrast in the given small part of the given image
99 using the absolute difference from mean luminance (like Root-Mean-Square,
100 but with abs() (Manhattan-Norm))
101 For multichannel images use contrastSubImg_Michelson()
103 \param I pointer to framebuffer
104 \param field Field specifies position(center) and size of subimage
105 \param width width of frame
106 \param height height of frame
108 double contrastSubImg_variance_orc(unsigned char* const I
, const Field
* field
,
109 int width
, int height
) {
110 unsigned char* p
= NULL
;
111 int s2
= field
->size
/ 2;
112 int numpixel
= field
->size
*field
->size
;
114 p
= I
+ ((field
->x
- s2
) + (field
->y
- s2
) * width
);
117 image_sum_optimized((signed int*)&sum
, p
, width
, field
->size
, field
->size
);
118 unsigned char mean
= sum
/ numpixel
;
120 image_variance_optimized(&var
, p
, width
, mean
, field
->size
, field
->size
);
121 return (double)var
/numpixel
/255.0;
124 /// plain C implementation of variance based contrastSubImg (without ORC)
125 double contrastSubImg_variance_C(unsigned char* const I
,
126 const Field
* field
, int width
, int height
) {
128 unsigned char* p
= NULL
;
129 unsigned char* pstart
= NULL
;
130 int s2
= field
->size
/ 2;
134 int numpixel
= field
->size
*field
->size
;
136 pstart
= I
+ ((field
->x
- s2
) + (field
->y
- s2
) * width
);
138 for (j
= 0; j
< field
->size
; j
++) {
139 for (k
= 0; k
< field
->size
; k
++, p
++) {
142 p
+= (width
- field
->size
);
146 for (j
= 0; j
< field
->size
; j
++) {
147 for (k
= 0; k
< field
->size
; k
++, p
++) {
150 p
+= (width
- field
->size
);
152 return (double)var
/numpixel
/255.0;
163 compares a small part of two given images
164 and returns the average absolute difference.
165 Field center, size and shift have to be choosen,
166 so that no clipping is required.
167 Uses optimized inner loops by ORC.
169 \param field Field specifies position(center) and size of subimage
170 \param d_x shift in x direction
171 \param d_y shift in y direction
173 unsigned int compareSubImg_thr_orc(unsigned char* const I1
, unsigned char* const I2
,
174 const Field
* field
, int width1
, int width2
, int height
,
175 int bytesPerPixel
, int d_x
, int d_y
,
176 unsigned int threshold
) {
177 unsigned char* p1
= NULL
;
178 unsigned char* p2
= NULL
;
179 int s2
= field
->size
/ 2;
181 unsigned int sum
= 0;
182 p1
= I1
+ ((field
->x
- s2
) + (field
->y
- s2
) * width1
) * bytesPerPixel
;
183 p2
= I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
) * width2
) * bytesPerPixel
;
185 for (j
= 0; j
< field
->size
; j
++) {
187 image_line_difference_optimized(&s
, p1
, p2
, field
->size
* bytesPerPixel
);
189 if( sum
> threshold
) // no need to calculate any longer: worse than the best match
191 p1
+= width1
* bytesPerPixel
;
192 p2
+= width2
* bytesPerPixel
;
199 // implementation with 1 orc function, but no threshold
200 unsigned int compareSubImg_orc(unsigned char* const I1
, unsigned char* const I2
,
201 const Field
* field
, int width1
, int width2
, int height
,
202 int bytesPerPixel
, int d_x
, int d_y
,
203 unsigned int threshold
) {
204 unsigned char* p1
= NULL
;
205 unsigned char* p2
= NULL
;
206 int s2
= field
->size
/ 2;
208 p1
= I1
+ ((field
->x
- s2
) + (field
->y
- s2
) * width1
) * bytesPerPixel
;
209 p2
= I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
) * width2
)
212 image_difference_optimized(&sum
, p1
, width1
* bytesPerPixel
, p2
, width2
* bytesPerPixel
,
213 field
->size
* bytesPerPixel
, field
->size
);
219 unsigned int compareSubImg_thr_sse2(unsigned char* const I1
, unsigned char* const I2
,
221 int width1
, int width2
, int height
,
222 int bytesPerPixel
, int d_x
, int d_y
,
223 unsigned int treshold
) {
225 unsigned char* p1
= NULL
;
226 unsigned char* p2
= NULL
;
227 int s2
= field
->size
/ 2;
228 unsigned int sum
= 0;
230 static unsigned char mask
[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
231 unsigned char row
= 0;
232 #ifndef USE_SSE2_CMP_HOR
233 unsigned char summes
[16];
236 __m128i xmmsum
, xmmmask
;
237 xmmsum
= _mm_setzero_si128();
238 xmmmask
= _mm_loadu_si128((__m128i
const*)mask
);
240 p1
=I1
+ ((field
->x
- s2
) + (field
->y
- s2
)*width1
)*bytesPerPixel
;
241 p2
=I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
)*width2
)*bytesPerPixel
;
242 for (j
= 0; j
< field
->size
; j
++){
243 for (k
= 0; k
< field
->size
* bytesPerPixel
; k
+=16){
245 __m128i xmm0
, xmm1
, xmm2
;
246 xmm0
= _mm_loadu_si128((__m128i
const *)p1
);
247 xmm1
= _mm_loadu_si128((__m128i
const *)p2
);
249 xmm2
= _mm_subs_epu8(xmm0
, xmm1
);
250 xmm0
= _mm_subs_epu8(xmm1
, xmm0
);
251 xmm0
= _mm_adds_epu8(xmm0
, xmm2
);
253 xmm1
= _mm_and_si128(xmm0
, xmmmask
);
254 xmm0
= _mm_srli_si128(xmm0
, 1);
255 xmm0
= _mm_and_si128(xmm0
, xmmmask
);
257 xmmsum
= _mm_adds_epu16(xmmsum
, xmm0
);
258 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
265 if (row
== SSE2_CMP_SUM_ROWS
) {
267 #ifdef USE_SSE2_CMP_HOR
271 xmm1
= _mm_srli_si128(xmmsum
, 8);
272 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
274 xmm1
= _mm_srli_si128(xmmsum
, 4);
275 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
277 xmm1
= _mm_srli_si128(xmmsum
, 2);
278 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
280 sum
+= _mm_extract_epi16(xmmsum
, 0);
283 _mm_storeu_si128((__m128i
*)summes
, xmmsum
);
284 for(i
= 0; i
< 16; i
+=2)
285 sum
+= summes
[i
] + summes
[i
+1]*256;
287 xmmsum
= _mm_setzero_si128();
292 p1
+= (width1
- field
->size
) * bytesPerPixel
;
293 p2
+= (width2
- field
->size
) * bytesPerPixel
;
296 #if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
297 && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
298 //process all data left unprocessed
299 //this part can be safely ignored if
300 //SSE_SUM_ROWS = {1, 2, 4, 8, 16}
301 #ifdef USE_SSE2_CMP_HOR
305 xmm1
= _mm_srli_si128(xmmsum
, 8);
306 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
308 xmm1
= _mm_srli_si128(xmmsum
, 4);
309 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
311 xmm1
= _mm_srli_si128(xmmsum
, 2);
312 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
314 sum
+= _mm_extract_epi16(xmmsum
, 0);
317 _mm_storeu_si128((__m128i
*)summes
, xmmsum
);
318 for(i
= 0; i
< 16; i
+=2)
319 sum
+= summes
[i
] + summes
[i
+1]*256;
/*
 * NOTE(review): this block is extraction-damaged and kept byte-identical
 * below.  Relative to a compilable function, the visible fragments are
 * missing at least: the `const Field* field` parameter in the signature,
 * the `asm (` opener and closing `);`, the numeric local labels and the
 * dec/jnz counter instructions the visible comments refer to ("jnz 2f",
 * "jnz 1b"), the output-operand line preceding the input-operand list,
 * the p1/p2 line-advance instructions, and the final `return sum;`.
 * Restore from the original sources rather than guessing the label layout.
 *
 * Intent, grounded in the visible fragments: inline-assembly (GCC extended
 * asm, SSE2) variant of compareSubImg_thr_sse2 -- accumulates absolute byte
 * differences of a subimage of I1 against a (d_x,d_y)-shifted subimage of I2
 * into 8 x 16bit partial sums in xmm4, periodically flushing them into `sum`
 * before saturation, with an early exit once sum exceeds `treshold`.
 */
328 unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1
, unsigned char* const I2
,
330 int width1
, int width2
, int height
,
331 int bytesPerPixel
, int d_x
, int d_y
,
332 unsigned int treshold
) {
333 unsigned char* p1
= NULL
;
334 unsigned char* p2
= NULL
;
335 int s2
= field
->size
/ 2;
336 unsigned int sum
= 0;
// mask selecting the even bytes of a 16-byte vector (as in compareSubImg_thr_sse2)
338 static unsigned char mask
[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
// top-left corners: reference subimage in I1, shifted subimage in I2
339 p1
=I1
+ ((field
->x
- s2
) + (field
->y
- s2
)*width1
)*bytesPerPixel
;
340 p2
=I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
)*width2
)*bytesPerPixel
;
// asm body (fragmentary -- see NOTE above):
343 "pxor %%xmm4,%%xmm4\n" //8 x 16bit partial sums
344 "movdqu (%3),%%xmm3\n" //mask
347 "movl %4,%%edx\n" //edx = field->size * bytesPerPixel / 16
348 "mov $8,%%ecx\n" //cx = 8
351 //calc intermediate sum of abs differences for 16 bytes
352 "movdqu (%1),%%xmm0\n" //p1
353 "movdqu (%2),%%xmm1\n" //p2
354 "movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0
355 "psubusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 - xmm1 (by bytes)
356 "psubusb %%xmm2,%%xmm1\n" //xmm1 = xmm1 - xmm2 (by bytes)
357 "paddusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 + xmm1 (absolute difference)
358 "movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0
359 "pand %%xmm3,%%xmm2\n" //xmm2 = xmm2 & xmm3 (apply mask)
360 "psrldq $1,%%xmm0\n" //xmm0 = xmm0 >> 8 (shift by 1 byte)
361 "pand %%xmm3,%%xmm0\n" //xmm0 = xmm0 & xmm3 (apply mask)
362 "paddusw %%xmm0,%%xmm4\n" //xmm4 = xmm4 + xmm0 (by words)
363 "paddusw %%xmm2,%%xmm4\n" //xmm4 = xmm4 + xmm2 (by words)
365 "add $16,%1\n" //move to next 16 bytes (p1)
366 "add $16,%2\n" //move to next 16 bytes (p2)
368 //check if we need flush sum (i.e. xmm4 is about to saturate)
370 "jnz 2f\n" //skip flushing if not
// flush: horizontal add of xmm4's 16-bit lanes into ecx, then into sum
// (the psrldq $8/$4/$2 shift lines between the movdqu/paddusw pairs are missing)
372 "movdqu %%xmm4,%%xmm0\n"
374 "paddusw %%xmm0,%%xmm4\n"
375 "movdqu %%xmm4,%%xmm0\n"
377 "paddusw %%xmm0,%%xmm4\n"
378 "movdqu %%xmm4,%%xmm0\n"
380 "paddusw %%xmm0,%%xmm4\n"
381 "movd %%xmm4,%%ecx\n"
382 "and $0xFFFF,%%ecx\n"
384 "pxor %%xmm4,%%xmm4\n" //clearing xmm4
385 "mov $8,%%ecx\n" //cx = 8
387 //check if we need to go to another line
390 "jnz 1b\n" //skip if not
392 //move p1 and p2 to the next line
395 "cmp %7,%0\n" //if (sum > treshold)
399 //check if all lines done
401 "jnz 1b\n" //if not, continue looping
// operand lists (the output-operand line, presumably tying %0 to sum, is missing)
404 :"r"(p1
),"r"(p2
),"r"(mask
),"g"(field
->size
* bytesPerPixel
/ 16),"g"((unsigned char*)((width1
- field
->size
) * bytesPerPixel
)),"g"(field
->size
), "g"(treshold
), "0"(sum
)
405 :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
407 // TODO width2 is not properly used here
410 #endif // USE_SSE2_ASM
/*
 * Local variables:
 *  c-file-style: "stroustrup"
 *  c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *  indent-tabs-mode: nil
 *  c-basic-offset: 2 t
 * End:
 *
 * vim: expandtab shiftwidth=2:
 */