de63e3bfa7b76f2806025cff18895a8f6c8483b7
/*
 * Copyright (C) Georg Martius - February 2007-2012
 *   georg dot martius at web dot de
 * Copyright (C) Alexey Osipov - July 2011
 *   simba at lerlan dot ru
 *   speed optimizations (threshold, spiral, SSE, asm)
 *
 * This file is part of vid.stab video stabilization library
 *
 * vid.stab is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License,
 * as published by the Free Software Foundation; either version 2, or
 * (at your option) any later version.
 *
 * vid.stab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "motiondetect_opt.h"

#include "orc/motiondetectorc.h"

#include <stdlib.h>     /* abs() */
#include <emmintrin.h>
36 #define USE_SSE2_CMP_HOR
37 #define SSE2_CMP_SUM_ROWS 8
42 \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only
44 double contrastSubImg1_SSE(unsigned char* const I
, const Field
* field
,
45 int width
, int height
)
48 unsigned char* p
= NULL
;
49 int s2
= field
->size
/ 2;
51 static unsigned char full
[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
53 p
= I
+ ((field
->x
- s2
) + (field
->y
- s2
)*width
);
57 mmin
= _mm_loadu_si128((__m128i
const*)full
);
58 mmax
= _mm_setzero_si128();
60 for (j
= 0; j
< field
->size
; j
++){
61 for (k
= 0; k
< field
->size
; k
+= 16) {
63 xmm0
= _mm_loadu_si128((__m128i
const*)p
);
64 mmin
= _mm_min_epu8(mmin
, xmm0
);
65 mmax
= _mm_max_epu8(mmax
, xmm0
);
68 p
+= (width
- field
->size
);
72 xmm1
= _mm_srli_si128(mmin
, 8);
73 mmin
= _mm_min_epu8(mmin
, xmm1
);
74 xmm1
= _mm_srli_si128(mmin
, 4);
75 mmin
= _mm_min_epu8(mmin
, xmm1
);
76 xmm1
= _mm_srli_si128(mmin
, 2);
77 mmin
= _mm_min_epu8(mmin
, xmm1
);
78 xmm1
= _mm_srli_si128(mmin
, 1);
79 mmin
= _mm_min_epu8(mmin
, xmm1
);
80 unsigned char mini
= (unsigned char)_mm_extract_epi16(mmin
, 0);
82 xmm1
= _mm_srli_si128(mmax
, 8);
83 mmax
= _mm_max_epu8(mmax
, xmm1
);
84 xmm1
= _mm_srli_si128(mmax
, 4);
85 mmax
= _mm_max_epu8(mmax
, xmm1
);
86 xmm1
= _mm_srli_si128(mmax
, 2);
87 mmax
= _mm_max_epu8(mmax
, xmm1
);
88 xmm1
= _mm_srli_si128(mmax
, 1);
89 mmax
= _mm_max_epu8(mmax
, xmm1
);
90 unsigned char maxi
= (unsigned char)_mm_extract_epi16(mmax
, 0);
92 return (maxi
-mini
)/(maxi
+mini
+0.1); // +0.1 to avoid division by 0
98 calculates the contrast in the given small part of the given image
99 using the absolute difference from mean luminance (like Root-Mean-Square,
100 but with abs() (Manhattan-Norm))
101 For multichannel images use contrastSubImg_Michelson()
103 \param I pointer to framebuffer
104 \param field Field specifies position(center) and size of subimage
105 \param width width of frame
106 \param height height of frame
108 double contrastSubImg_variance_orc(unsigned char* const I
, const Field
* field
,
109 int width
, int height
) {
110 unsigned char* p
= NULL
;
111 int s2
= field
->size
/ 2;
112 int numpixel
= field
->size
*field
->size
;
114 p
= I
+ ((field
->x
- s2
) + (field
->y
- s2
) * width
);
117 image_sum_optimized((signed int*)&sum
, p
, width
, field
->size
, field
->size
);
118 unsigned char mean
= sum
/ numpixel
;
120 image_variance_optimized(&var
, p
, width
, mean
, field
->size
, field
->size
);
121 return (double)var
/numpixel
/255.0;
124 /// plain C implementation of variance based contrastSubImg (without ORC)
125 double contrastSubImg_variance_C(unsigned char* const I
,
126 const Field
* field
, int width
, int height
) {
128 unsigned char* p
= NULL
;
129 unsigned char* pstart
= NULL
;
130 int s2
= field
->size
/ 2;
134 int numpixel
= field
->size
*field
->size
;
136 pstart
= I
+ ((field
->x
- s2
) + (field
->y
- s2
) * width
);
138 for (j
= 0; j
< field
->size
; j
++) {
139 for (k
= 0; k
< field
->size
; k
++, p
++) {
142 p
+= (width
- field
->size
);
146 for (j
= 0; j
< field
->size
; j
++) {
147 for (k
= 0; k
< field
->size
; k
++, p
++) {
150 p
+= (width
- field
->size
);
152 return (double)var
/numpixel
/255.0;
163 compares a small part of two given images
164 and returns the average absolute difference.
165 Field center, size and shift have to be choosen,
166 so that no clipping is required.
167 Uses optimized inner loops by ORC.
169 \param field Field specifies position(center) and size of subimage
170 \param d_x shift in x direction
171 \param d_y shift in y direction
173 unsigned int compareSubImg_thr_orc(unsigned char* const I1
, unsigned char* const I2
,
174 const Field
* field
, int width1
, int width2
, int height
,
175 int bytesPerPixel
, int d_x
, int d_y
,
176 unsigned int threshold
) {
177 unsigned char* p1
= NULL
;
178 unsigned char* p2
= NULL
;
179 int s2
= field
->size
/ 2;
181 unsigned int sum
= 0;
182 p1
= I1
+ ((field
->x
- s2
) + (field
->y
- s2
) * width1
) * bytesPerPixel
;
183 p2
= I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
) * width2
) * bytesPerPixel
;
185 for (j
= 0; j
< field
->size
; j
++) {
187 image_line_difference_optimized(&s
, p1
, p2
, field
->size
* bytesPerPixel
);
189 if( sum
> threshold
) // no need to calculate any longer: worse than the best match
191 p1
+= width1
* bytesPerPixel
;
192 p2
+= width2
* bytesPerPixel
;
199 // implementation with 1 orc function, but no threshold
200 unsigned int compareSubImg_orc(unsigned char* const I1
, unsigned char* const I2
,
201 const Field
* field
, int width1
, int width2
, int height
,
202 int bytesPerPixel
, int d_x
, int d_y
,
203 unsigned int threshold
) {
204 unsigned char* p1
= NULL
;
205 unsigned char* p2
= NULL
;
206 int s2
= field
->size
/ 2;
208 p1
= I1
+ ((field
->x
- s2
) + (field
->y
- s2
) * width1
) * bytesPerPixel
;
209 p2
= I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
) * width2
)
212 image_difference_optimized(&sum
, p1
, width1
* bytesPerPixel
, p2
, width2
* bytesPerPixel
,
213 field
->size
* bytesPerPixel
, field
->size
);
219 unsigned int compareSubImg_thr_sse2(unsigned char* const I1
, unsigned char* const I2
,
221 int width1
, int width2
, int height
,
222 int bytesPerPixel
, int d_x
, int d_y
,
223 unsigned int treshold
) {
225 unsigned char* p1
= NULL
;
226 unsigned char* p2
= NULL
;
227 int s2
= field
->size
/ 2;
228 unsigned int sum
= 0;
230 static unsigned char mask
[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
231 unsigned char row
= 0;
232 #ifndef USE_SSE2_CMP_HOR
233 unsigned char summes
[16];
236 __m128i xmmsum
, xmmmask
;
237 xmmsum
= _mm_setzero_si128();
238 xmmmask
= _mm_loadu_si128((__m128i
const*)mask
);
240 p1
=I1
+ ((field
->x
- s2
) + (field
->y
- s2
)*width1
)*bytesPerPixel
;
241 p2
=I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
)*width2
)*bytesPerPixel
;
242 for (j
= 0; j
< field
->size
; j
++){
243 for (k
= 0; k
< field
->size
* bytesPerPixel
; k
+=16){
245 __m128i xmm0
, xmm1
, xmm2
;
246 xmm0
= _mm_loadu_si128((__m128i
const *)p1
);
247 xmm1
= _mm_loadu_si128((__m128i
const *)p2
);
249 xmm2
= _mm_subs_epu8(xmm0
, xmm1
);
250 xmm0
= _mm_subs_epu8(xmm1
, xmm0
);
251 xmm0
= _mm_adds_epu8(xmm0
, xmm2
);
253 xmm1
= _mm_and_si128(xmm0
, xmmmask
);
254 xmm0
= _mm_srli_si128(xmm0
, 1);
255 xmm0
= _mm_and_si128(xmm0
, xmmmask
);
257 xmmsum
= _mm_adds_epu16(xmmsum
, xmm0
);
258 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
265 if (row
== SSE2_CMP_SUM_ROWS
) {
267 #ifdef USE_SSE2_CMP_HOR
271 xmm1
= _mm_srli_si128(xmmsum
, 8);
272 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
274 xmm1
= _mm_srli_si128(xmmsum
, 4);
275 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
277 xmm1
= _mm_srli_si128(xmmsum
, 2);
278 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
280 sum
+= _mm_extract_epi16(xmmsum
, 0);
283 _mm_storeu_si128((__m128i
*)summes
, xmmsum
);
284 for(i
= 0; i
< 16; i
+=2)
285 sum
+= summes
[i
] + summes
[i
+1]*256;
287 xmmsum
= _mm_setzero_si128();
292 p1
+= (width1
- field
->size
) * bytesPerPixel
;
293 p2
+= (width2
- field
->size
) * bytesPerPixel
;
296 #if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
297 && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
298 //process all data left unprocessed
299 //this part can be safely ignored if
300 //SSE_SUM_ROWS = {1, 2, 4, 8, 16}
301 #ifdef USE_SSE2_CMP_HOR
305 xmm1
= _mm_srli_si128(xmmsum
, 8);
306 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
308 xmm1
= _mm_srli_si128(xmmsum
, 4);
309 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
311 xmm1
= _mm_srli_si128(xmmsum
, 2);
312 xmmsum
= _mm_adds_epu16(xmmsum
, xmm1
);
314 sum
+= _mm_extract_epi16(xmmsum
, 0);
317 _mm_storeu_si128((__m128i
*)summes
, xmmsum
);
318 for(i
= 0; i
< 16; i
+=2)
319 sum
+= summes
[i
] + summes
[i
+1]*256;
/*
 * NOTE(review): this block is extraction-damaged and kept byte-identical
 * below.  Relative to a compilable function, the visible fragments are
 * missing at least: the `const Field* field` parameter in the signature,
 * the `asm (` opener and closing `);`, the numeric local labels and the
 * dec/jnz counter instructions the visible comments refer to ("jnz 2f",
 * "jnz 1b"), the output-operand line preceding the input-operand list,
 * the p1/p2 line-advance instructions, and the final `return sum;`.
 * Restore from the original sources rather than guessing the label layout.
 *
 * Intent, grounded in the visible fragments: inline-assembly (GCC extended
 * asm, SSE2) variant of compareSubImg_thr_sse2 -- accumulates absolute byte
 * differences of a subimage of I1 against a (d_x,d_y)-shifted subimage of I2
 * into 8 x 16bit partial sums in xmm4, periodically flushing them into `sum`
 * before saturation, with an early exit once sum exceeds `treshold`.
 */
328 unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1
, unsigned char* const I2
,
330 int width1
, int width2
, int height
,
331 int bytesPerPixel
, int d_x
, int d_y
,
332 unsigned int treshold
) {
333 unsigned char* p1
= NULL
;
334 unsigned char* p2
= NULL
;
335 int s2
= field
->size
/ 2;
336 unsigned int sum
= 0;
// mask selecting the even bytes of a 16-byte vector (as in compareSubImg_thr_sse2)
338 static unsigned char mask
[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
// top-left corners: reference subimage in I1, shifted subimage in I2
339 p1
=I1
+ ((field
->x
- s2
) + (field
->y
- s2
)*width1
)*bytesPerPixel
;
340 p2
=I2
+ ((field
->x
- s2
+ d_x
) + (field
->y
- s2
+ d_y
)*width2
)*bytesPerPixel
;
// asm body (fragmentary -- see NOTE above):
343 "pxor %%xmm4,%%xmm4\n" //8 x 16bit partial sums
344 "movdqu (%3),%%xmm3\n" //mask
347 "movl %4,%%edx\n" //edx = field->size * bytesPerPixel / 16
348 "mov $8,%%ecx\n" //cx = 8
351 //calc intermediate sum of abs differences for 16 bytes
352 "movdqu (%1),%%xmm0\n" //p1
353 "movdqu (%2),%%xmm1\n" //p2
354 "movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0
355 "psubusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 - xmm1 (by bytes)
356 "psubusb %%xmm2,%%xmm1\n" //xmm1 = xmm1 - xmm2 (by bytes)
357 "paddusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 + xmm1 (absolute difference)
358 "movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0
359 "pand %%xmm3,%%xmm2\n" //xmm2 = xmm2 & xmm3 (apply mask)
360 "psrldq $1,%%xmm0\n" //xmm0 = xmm0 >> 8 (shift by 1 byte)
361 "pand %%xmm3,%%xmm0\n" //xmm0 = xmm0 & xmm3 (apply mask)
362 "paddusw %%xmm0,%%xmm4\n" //xmm4 = xmm4 + xmm0 (by words)
363 "paddusw %%xmm2,%%xmm4\n" //xmm4 = xmm4 + xmm2 (by words)
365 "add $16,%1\n" //move to next 16 bytes (p1)
366 "add $16,%2\n" //move to next 16 bytes (p2)
368 //check if we need flush sum (i.e. xmm4 is about to saturate)
370 "jnz 2f\n" //skip flushing if not
// flush: horizontal add of xmm4's 16-bit lanes into ecx, then into sum
// (the psrldq $8/$4/$2 shift lines between the movdqu/paddusw pairs are missing)
372 "movdqu %%xmm4,%%xmm0\n"
374 "paddusw %%xmm0,%%xmm4\n"
375 "movdqu %%xmm4,%%xmm0\n"
377 "paddusw %%xmm0,%%xmm4\n"
378 "movdqu %%xmm4,%%xmm0\n"
380 "paddusw %%xmm0,%%xmm4\n"
381 "movd %%xmm4,%%ecx\n"
382 "and $0xFFFF,%%ecx\n"
384 "pxor %%xmm4,%%xmm4\n" //clearing xmm4
385 "mov $8,%%ecx\n" //cx = 8
387 //check if we need to go to another line
390 "jnz 1b\n" //skip if not
392 //move p1 and p2 to the next line
395 "cmp %7,%0\n" //if (sum > treshold)
399 //check if all lines done
401 "jnz 1b\n" //if not, continue looping
// operand lists (the output-operand line, presumably tying %0 to sum, is missing)
404 :"r"(p1
),"r"(p2
),"r"(mask
),"g"(field
->size
* bytesPerPixel
/ 16),"g"((unsigned char*)((width1
- field
->size
) * bytesPerPixel
)),"g"(field
->size
), "g"(treshold
), "0"(sum
)
405 :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
407 // TODO width2 is not properly used here
410 #endif // USE_SSE2_ASM
/*
 * Local variables:
 *  c-file-style: "stroustrup"
 *  c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *  indent-tabs-mode: nil
 *  c-basic-offset: 2 t
 * End:
 *
 * vim: expandtab shiftwidth=2:
 */