| 1 | /* |
| 2 | * motiondetect_opt.c |
| 3 | * |
 * Copyright (C) Georg Martius - February 2007-2012
| 5 | * georg dot martius at web dot de |
 * Copyright (C) Alexey Osipov - July 2011
| 7 | * simba at lerlan dot ru |
| 8 | * speed optimizations (threshold, spiral, SSE, asm) |
| 9 | * |
| 10 | * This file is part of vid.stab video stabilization library |
| 11 | * |
| 12 | * vid.stab is free software; you can redistribute it and/or modify |
| 13 | * it under the terms of the GNU General Public License, |
| 14 | * as published by the Free Software Foundation; either version 2, or |
| 15 | * (at your option) any later version. |
| 16 | * |
| 17 | * vid.stab is distributed in the hope that it will be useful, |
| 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | * GNU General Public License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU General Public License |
| 23 | * along with GNU Make; see the file COPYING. If not, write to |
| 24 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. |
| 25 | * |
| 26 | */ |
| 27 | #include "motiondetect_opt.h" |
| 28 | |
| 29 | #ifdef USE_ORC |
| 30 | #include "orc/motiondetectorc.h" |
| 31 | #endif |
| 32 | |
| 33 | #ifdef USE_SSE2 |
| 34 | #include <emmintrin.h> |
| 35 | |
| 36 | #define USE_SSE2_CMP_HOR |
| 37 | #define SSE2_CMP_SUM_ROWS 8 |
| 38 | #endif |
| 39 | |
| 40 | #ifdef USE_SSE2 |
| 41 | /** |
| 42 | \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only |
| 43 | */ |
| 44 | double contrastSubImg1_SSE(unsigned char* const I, const Field* field, |
| 45 | int width, int height) |
| 46 | { |
| 47 | int k, j; |
| 48 | unsigned char* p = NULL; |
| 49 | int s2 = field->size / 2; |
| 50 | |
| 51 | static unsigned char full[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; |
| 52 | |
| 53 | p = I + ((field->x - s2) + (field->y - s2)*width); |
| 54 | |
| 55 | __m128i mmin, mmax; |
| 56 | |
| 57 | mmin = _mm_loadu_si128((__m128i const*)full); |
| 58 | mmax = _mm_setzero_si128(); |
| 59 | |
| 60 | for (j = 0; j < field->size; j++){ |
| 61 | for (k = 0; k < field->size; k += 16) { |
| 62 | __m128i xmm0; |
| 63 | xmm0 = _mm_loadu_si128((__m128i const*)p); |
| 64 | mmin = _mm_min_epu8(mmin, xmm0); |
| 65 | mmax = _mm_max_epu8(mmax, xmm0); |
| 66 | p += 16; |
| 67 | } |
| 68 | p += (width - field->size); |
| 69 | } |
| 70 | |
| 71 | __m128i xmm1; |
| 72 | xmm1 = _mm_srli_si128(mmin, 8); |
| 73 | mmin = _mm_min_epu8(mmin, xmm1); |
| 74 | xmm1 = _mm_srli_si128(mmin, 4); |
| 75 | mmin = _mm_min_epu8(mmin, xmm1); |
| 76 | xmm1 = _mm_srli_si128(mmin, 2); |
| 77 | mmin = _mm_min_epu8(mmin, xmm1); |
| 78 | xmm1 = _mm_srli_si128(mmin, 1); |
| 79 | mmin = _mm_min_epu8(mmin, xmm1); |
| 80 | unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0); |
| 81 | |
| 82 | xmm1 = _mm_srli_si128(mmax, 8); |
| 83 | mmax = _mm_max_epu8(mmax, xmm1); |
| 84 | xmm1 = _mm_srli_si128(mmax, 4); |
| 85 | mmax = _mm_max_epu8(mmax, xmm1); |
| 86 | xmm1 = _mm_srli_si128(mmax, 2); |
| 87 | mmax = _mm_max_epu8(mmax, xmm1); |
| 88 | xmm1 = _mm_srli_si128(mmax, 1); |
| 89 | mmax = _mm_max_epu8(mmax, xmm1); |
| 90 | unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0); |
| 91 | |
| 92 | return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0 |
| 93 | } |
| 94 | #endif |
| 95 | |
| 96 | #ifdef USE_ORC |
| 97 | /** |
| 98 | calculates the contrast in the given small part of the given image |
| 99 | using the absolute difference from mean luminance (like Root-Mean-Square, |
| 100 | but with abs() (Manhattan-Norm)) |
| 101 | For multichannel images use contrastSubImg_Michelson() |
| 102 | |
| 103 | \param I pointer to framebuffer |
| 104 | \param field Field specifies position(center) and size of subimage |
| 105 | \param width width of frame |
| 106 | \param height height of frame |
| 107 | */ |
| 108 | double contrastSubImg_variance_orc(unsigned char* const I, const Field* field, |
| 109 | int width, int height) { |
| 110 | unsigned char* p = NULL; |
| 111 | int s2 = field->size / 2; |
| 112 | int numpixel = field->size*field->size; |
| 113 | |
| 114 | p = I + ((field->x - s2) + (field->y - s2) * width); |
| 115 | |
| 116 | unsigned int sum=0; |
| 117 | image_sum_optimized((signed int*)&sum, p, width, field->size, field->size); |
| 118 | unsigned char mean = sum / numpixel; |
| 119 | int var=0; |
| 120 | image_variance_optimized(&var, p, width, mean, field->size, field->size); |
| 121 | return (double)var/numpixel/255.0; |
| 122 | } |
| 123 | |
| 124 | /// plain C implementation of variance based contrastSubImg (without ORC) |
| 125 | double contrastSubImg_variance_C(unsigned char* const I, |
| 126 | const Field* field, int width, int height) { |
| 127 | int k, j; |
| 128 | unsigned char* p = NULL; |
| 129 | unsigned char* pstart = NULL; |
| 130 | int s2 = field->size / 2; |
| 131 | unsigned int sum=0; |
| 132 | int mean; |
| 133 | int var=0; |
| 134 | int numpixel = field->size*field->size; |
| 135 | |
| 136 | pstart = I + ((field->x - s2) + (field->y - s2) * width); |
| 137 | p = pstart; |
| 138 | for (j = 0; j < field->size; j++) { |
| 139 | for (k = 0; k < field->size; k++, p++) { |
| 140 | sum+=*p; |
| 141 | } |
| 142 | p += (width - field->size); |
| 143 | } |
| 144 | mean=sum/numpixel; |
| 145 | p = pstart; |
| 146 | for (j = 0; j < field->size; j++) { |
| 147 | for (k = 0; k < field->size; k++, p++) { |
| 148 | var+=abs(*p-mean); |
| 149 | } |
| 150 | p += (width - field->size); |
| 151 | } |
| 152 | return (double)var/numpixel/255.0; |
| 153 | } |
| 154 | #endif |
| 155 | |
| 156 | |
| 157 | |
| 158 | |
| 159 | |
| 160 | |
| 161 | #ifdef USE_ORC |
| 162 | /** |
| 163 | compares a small part of two given images |
| 164 | and returns the average absolute difference. |
| 165 | Field center, size and shift have to be choosen, |
| 166 | so that no clipping is required. |
| 167 | Uses optimized inner loops by ORC. |
| 168 | |
| 169 | \param field Field specifies position(center) and size of subimage |
| 170 | \param d_x shift in x direction |
| 171 | \param d_y shift in y direction |
| 172 | */ |
| 173 | unsigned int compareSubImg_thr_orc(unsigned char* const I1, unsigned char* const I2, |
| 174 | const Field* field, int width1, int width2, int height, |
| 175 | int bytesPerPixel, int d_x, int d_y, |
| 176 | unsigned int threshold) { |
| 177 | unsigned char* p1 = NULL; |
| 178 | unsigned char* p2 = NULL; |
| 179 | int s2 = field->size / 2; |
| 180 | int j; |
| 181 | unsigned int sum = 0; |
| 182 | p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel; |
| 183 | p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) * bytesPerPixel; |
| 184 | |
| 185 | for (j = 0; j < field->size; j++) { |
| 186 | unsigned int s = 0; |
| 187 | image_line_difference_optimized(&s, p1, p2, field->size* bytesPerPixel); |
| 188 | sum += s; |
| 189 | if( sum > threshold) // no need to calculate any longer: worse than the best match |
| 190 | break; |
| 191 | p1 += width1 * bytesPerPixel; |
| 192 | p2 += width2 * bytesPerPixel; |
| 193 | } |
| 194 | |
| 195 | |
| 196 | return sum; |
| 197 | } |
| 198 | |
| 199 | // implementation with 1 orc function, but no threshold |
| 200 | unsigned int compareSubImg_orc(unsigned char* const I1, unsigned char* const I2, |
| 201 | const Field* field, int width1, int width2, int height, |
| 202 | int bytesPerPixel, int d_x, int d_y, |
| 203 | unsigned int threshold) { |
| 204 | unsigned char* p1 = NULL; |
| 205 | unsigned char* p2 = NULL; |
| 206 | int s2 = field->size / 2; |
| 207 | unsigned int sum=0; |
| 208 | p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel; |
| 209 | p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) |
| 210 | * bytesPerPixel; |
| 211 | |
| 212 | image_difference_optimized(&sum, p1, width1 * bytesPerPixel, p2, width2 * bytesPerPixel, |
| 213 | field->size* bytesPerPixel , field->size); |
| 214 | return sum; |
| 215 | } |
| 216 | #endif |
| 217 | |
| 218 | #ifdef USE_SSE2 |
/**
   compares a small part of two given images and returns the sum of
   absolute byte differences; SSE2-intrinsics implementation.
   After each finished line the running sum is compared against
   `treshold` ([sic] -- name kept as-is) and the scan aborts early when
   this candidate is already worse than the best match so far.

   NOTE(review): the inner loop consumes 16 bytes per step with no tail
   handling, so field->size * bytesPerPixel is presumably a multiple of
   16, and the field (including the d_x/d_y shift) must lie fully inside
   both frames -- confirm with callers.

   \param field Field specifies position(center) and size of subimage
   \param d_x shift in x direction
   \param d_y shift in y direction
   \param treshold early-abort limit for the accumulated difference
*/
unsigned int compareSubImg_thr_sse2(unsigned char* const I1, unsigned char* const I2,
                                    const Field* field,
                                    int width1, int width2, int height,
                                    int bytesPerPixel, int d_x, int d_y,
                                    unsigned int treshold) {
  int k, j;
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  /* 0x00FF pattern: isolates every other byte so the 8-bit absolute
     differences can be accumulated in eight 16-bit lanes */
  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  unsigned char row = 0;  /* 16-byte chunks processed since the last flush */
#ifndef USE_SSE2_CMP_HOR
  unsigned char summes[16];
  int i;
#endif
  __m128i xmmsum, xmmmask;
  xmmsum = _mm_setzero_si128();                    /* 8 x 16-bit partial sums */
  xmmmask = _mm_loadu_si128((__m128i const*)mask);

  p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  for (j = 0; j < field->size; j++){
    for (k = 0; k < field->size * bytesPerPixel; k+=16){
      {
        __m128i xmm0, xmm1, xmm2;
        xmm0 = _mm_loadu_si128((__m128i const *)p1);
        xmm1 = _mm_loadu_si128((__m128i const *)p2);

        /* |a-b| for unsigned bytes: (a-b) and (b-a) with saturating
           subtraction -- one side clamps to 0, the other keeps the
           magnitude -- then add the two halves together */
        xmm2 = _mm_subs_epu8(xmm0, xmm1);
        xmm0 = _mm_subs_epu8(xmm1, xmm0);
        xmm0 = _mm_adds_epu8(xmm0, xmm2);

        /* split even and odd bytes into separate word lanes via the mask
           (odd bytes are shifted down by one byte first) */
        xmm1 = _mm_and_si128(xmm0, xmmmask);
        xmm0 = _mm_srli_si128(xmm0, 1);
        xmm0 = _mm_and_si128(xmm0, xmmmask);

        /* accumulate both halves as 16-bit words */
        xmmsum = _mm_adds_epu16(xmmsum, xmm0);
        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
      }

      p1+=16;
      p2+=16;

      /* flush the word lanes into the scalar `sum` every
         SSE2_CMP_SUM_ROWS chunks, before the saturating word adds
         could clip the partial sums */
      row++;
      if (row == SSE2_CMP_SUM_ROWS) {
        row = 0;
#ifdef USE_SSE2_CMP_HOR
        {
          /* horizontal add: fold the 8 words down into the low word */
          __m128i xmm1;

          xmm1 = _mm_srli_si128(xmmsum, 8);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          xmm1 = _mm_srli_si128(xmmsum, 4);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          xmm1 = _mm_srli_si128(xmmsum, 2);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          sum += _mm_extract_epi16(xmmsum, 0);
        }
#else
        /* scalar fallback: spill the vector and add the words by hand */
        _mm_storeu_si128((__m128i*)summes, xmmsum);
        for(i = 0; i < 16; i+=2)
          sum += summes[i] + summes[i+1]*256;
#endif
        xmmsum = _mm_setzero_si128();
      }
    }
    if (sum > treshold)  /* no need to calculate any longer: worse than the best match */
      break;
    p1 += (width1 - field->size) * bytesPerPixel;
    p2 += (width2 - field->size) * bytesPerPixel;
  }

#if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
  && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
  //process all data left unprocessed
  //this part can be safely ignored if
  //SSE_SUM_ROWS = {1, 2, 4, 8, 16}
#ifdef USE_SSE2_CMP_HOR
  {
    __m128i xmm1;

    xmm1 = _mm_srli_si128(xmmsum, 8);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    xmm1 = _mm_srli_si128(xmmsum, 4);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    xmm1 = _mm_srli_si128(xmmsum, 2);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    sum += _mm_extract_epi16(xmmsum, 0);
  }
#else
  _mm_storeu_si128((__m128i*)summes, xmmsum);
  for(i = 0; i < 16; i+=2)
    sum += summes[i] + summes[i+1]*256;
#endif
#endif

  return sum;
}
| 325 | #endif // USE_SSE2 |
| 326 | |
| 327 | #ifdef USE_SSE2_ASM |
/**
   \see compareSubImg_thr_sse2 -- the same sum-of-absolute-differences
   computation with per-line early abort on `treshold`, hand-written in
   GCC extended inline assembly (SSE2).

   NOTE(review): as the TODO below says, width2 is never used: the
   line-advance operand %5 is computed from width1 only but added to
   BOTH p1 and p2, so results are only correct when width1 == width2 --
   confirm with callers before relying on differing strides.
   Also assumes field->size * bytesPerPixel is a multiple of 16 (the
   loop processes whole 16-byte chunks with no tail handling).
*/
unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1, unsigned char* const I2,
                                        const Field* field,
                                        int width1, int width2, int height,
                                        int bytesPerPixel, int d_x, int d_y,
                                        unsigned int treshold) {
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  /* 0x00FF pattern: isolates alternating bytes so 8-bit absolute
     differences can accumulate in eight 16-bit lanes (xmm4) */
  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  asm (
    "xor %0,%0\n"
    "pxor %%xmm4,%%xmm4\n"          //8 x 16bit partial sums
    "movdqu (%3),%%xmm3\n"          //mask

    //main loop
    "movl %4,%%edx\n"               //edx = field->size * bytesPerPixel / 16
    "mov $8,%%ecx\n"                //cx = 8
    "1:\n"

    //calc intermediate sum of abs differences for 16 bytes
    "movdqu (%1),%%xmm0\n"          //p1
    "movdqu (%2),%%xmm1\n"          //p2
    "movdqu %%xmm0,%%xmm2\n"        //xmm2 = xmm0
    "psubusb %%xmm1,%%xmm0\n"       //xmm0 = xmm0 - xmm1 (by bytes)
    "psubusb %%xmm2,%%xmm1\n"       //xmm1 = xmm1 - xmm2 (by bytes)
    "paddusb %%xmm1,%%xmm0\n"       //xmm0 = xmm0 + xmm1 (absolute difference)
    "movdqu %%xmm0,%%xmm2\n"        //xmm2 = xmm0
    "pand %%xmm3,%%xmm2\n"          //xmm2 = xmm2 & xmm3 (apply mask)
    "psrldq $1,%%xmm0\n"            //xmm0 = xmm0 >> 8 (shift by 1 byte)
    "pand %%xmm3,%%xmm0\n"          //xmm0 = xmm0 & xmm3 (apply mask)
    "paddusw %%xmm0,%%xmm4\n"       //xmm4 = xmm4 + xmm0 (by words)
    "paddusw %%xmm2,%%xmm4\n"       //xmm4 = xmm4 + xmm2 (by words)

    "add $16,%1\n"                  //move to next 16 bytes (p1)
    "add $16,%2\n"                  //move to next 16 bytes (p2)

    //check if we need flush sum (i.e. xmm4 is about to saturate)
    "dec %%ecx\n"
    "jnz 2f\n"                      //skip flushing if not
    //flushing: horizontal-add the eight word lanes into ecx, fold into sum
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $8,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $4,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $2,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movd %%xmm4,%%ecx\n"
    "and $0xFFFF,%%ecx\n"
    "addl %%ecx,%0\n"
    "pxor %%xmm4,%%xmm4\n"          //clearing xmm4
    "mov $8,%%ecx\n"                //cx = 8

    //check if we need to go to another line
    "2:\n"
    "dec %%edx\n"
    "jnz 1b\n"                      //skip if not

    //move p1 and p2 to the next line
    //(NOTE: %5 is derived from width1 but applied to both pointers)
    "add %5,%1\n"
    "add %5,%2\n"
    "cmp %7,%0\n"                   //if (sum > treshold)
    "ja 3f\n"                       //    break;
    "movl %4,%%edx\n"

    //check if all lines done
    "decl %6\n"
    "jnz 1b\n"                      //if not, continue looping
    "3:\n"
    :"=r"(sum)
    :"r"(p1),"r"(p2),"r"(mask),"g"(field->size * bytesPerPixel / 16),"g"((unsigned char*)((width1 - field->size) * bytesPerPixel)),"g"(field->size), "g"(treshold), "0"(sum)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
  );
  // TODO width2 is not properly used here
  return sum;
}
| 410 | #endif // USE_SSE2_ASM |
| 411 | |
| 412 | /* |
| 413 | * Local variables: |
| 414 | * c-file-style: "stroustrup" |
| 415 | * c-file-offsets: ((case-label . *) (statement-case-intro . *)) |
| 416 | * indent-tabs-mode: nil |
| 417 | * tab-width: 2 |
| 418 | * c-basic-offset: 2 t |
| 419 | * End: |
| 420 | * |
| 421 | * vim: expandtab shiftwidth=2: |
| 422 | */ |