[deb_vid.stab.git] / src / motiondetect_opt.c

/*
 * motiondetect_opt.c
 *
 *  Copyright (C) Georg Martius - February 2007-2012
 *   georg dot martius at web dot de
 *  Copyright (C) Alexey Osipov - July 2011
 *   simba at lerlan dot ru
 *   speed optimizations (threshold, spiral, SSE, asm)
 *
 *  This file is part of vid.stab video stabilization library
 *
 *  vid.stab is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License,
 *  as published by the Free Software Foundation; either version 2, or
 *  (at your option) any later version.
 *
 *  vid.stab is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
#include "motiondetect_opt.h"

#ifdef USE_ORC
#include "orc/motiondetectorc.h"
#endif

#ifdef USE_SSE2
#include <emmintrin.h>

#define USE_SSE2_CMP_HOR
#define SSE2_CMP_SUM_ROWS 8
#endif

#ifdef USE_SSE2
/**
   \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only

   Scans the square sub-image described by \p field for its smallest and
   largest byte value and returns (max - min) / (max + min + 0.1).

   Each row is processed in 16-byte steps with unaligned SSE2 loads. Bytes
   left over when field->size is not a multiple of 16 are handled by a scalar
   loop, so only pixels inside the field are read and every row starts
   exactly \p width bytes after the previous one.

   \param I pointer to framebuffer
   \param field Field specifies position(center) and size of subimage
   \param width width of frame, used as distance in bytes between rows
   \param height height of frame (not used)
   \return contrast of the sub-image
*/
double contrastSubImg1_SSE(unsigned char* const I, const Field* field,
                           int width, int height)
{
  const int size = field->size;
  const int s2 = size / 2;
  /* part of each row that can be consumed in whole 16-byte blocks */
  const int vecLen = (size > 0) ? (size & ~15) : 0;
  /* top-left corner of the field */
  const unsigned char* const start =
    I + ((field->x - s2) + (field->y - s2) * width);

  __m128i mmin = _mm_set1_epi8((char)-1); /* every byte 0xFF */
  __m128i mmax = _mm_setzero_si128();
  unsigned char mini = 0xFF; /* running minimum of the scalar tail */
  unsigned char maxi = 0;    /* running maximum of the scalar tail */
  int j, k;

  (void)height;

  for (j = 0; j < size; j++) {
    const unsigned char* const row = start + j * width;

    /* vector part: byte-wise min/max over 16 pixels at once */
    for (k = 0; k < vecLen; k += 16) {
      const __m128i px = _mm_loadu_si128((__m128i const*)(row + k));
      mmin = _mm_min_epu8(mmin, px);
      mmax = _mm_max_epu8(mmax, px);
    }
    /* scalar tail: the (size % 16) pixels the vector loop cannot cover */
    for (; k < size; k++) {
      const unsigned char v = row[k];
      if (v < mini)
        mini = v;
      if (v > maxi)
        maxi = v;
    }
  }

  /* fold the 16 byte lanes down into lane 0 */
  mmin = _mm_min_epu8(mmin, _mm_srli_si128(mmin, 8));
  mmin = _mm_min_epu8(mmin, _mm_srli_si128(mmin, 4));
  mmin = _mm_min_epu8(mmin, _mm_srli_si128(mmin, 2));
  mmin = _mm_min_epu8(mmin, _mm_srli_si128(mmin, 1));

  mmax = _mm_max_epu8(mmax, _mm_srli_si128(mmax, 8));
  mmax = _mm_max_epu8(mmax, _mm_srli_si128(mmax, 4));
  mmax = _mm_max_epu8(mmax, _mm_srli_si128(mmax, 2));
  mmax = _mm_max_epu8(mmax, _mm_srli_si128(mmax, 1));

  {
    /* the cast keeps only the low byte of the extracted 16-bit lane */
    const unsigned char vecMin = (unsigned char)_mm_extract_epi16(mmin, 0);
    const unsigned char vecMax = (unsigned char)_mm_extract_epi16(mmax, 0);
    if (vecMin < mini)
      mini = vecMin;
    if (vecMax > maxi)
      maxi = vecMax;
  }

  return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0
}
#endif

#ifdef USE_ORC
/**
   Contrast of a small square part of the given image, computed with the
   ORC kernels image_sum_optimized() and image_variance_optimized().

   The pixel sum of the sub-image is divided by the number of pixels to get
   the mean; that mean is handed to image_variance_optimized(), whose result
   is divided by the number of pixels and by 255.
   (The original description states that this is the absolute difference
   from mean luminance, i.e. like Root-Mean-Square but with abs()
   (Manhattan-Norm); the kernel itself is not defined in this file.)
   For multichannel images use contrastSubImg_Michelson()

   \param I pointer to framebuffer
   \param field Field specifies position(center) and size of subimage
   \param width width of frame
   \param height height of frame (not used)
   \return kernel result / number of pixels / 255
*/
double contrastSubImg_variance_orc(unsigned char* const I, const Field* field,
                                   int width, int height) {
  const int edge = field->size;
  const int half = edge / 2;
  const int pixelCount = edge * edge;
  /* top-left corner of the field */
  unsigned char* const origin =
    I + ((field->x - half) + (field->y - half) * width);
  unsigned int total = 0;
  int deviation = 0;
  unsigned char average;

  image_sum_optimized((signed int*)&total, origin, width, edge, edge);
  average = total / pixelCount;
  image_variance_optimized(&deviation, origin, width, average, edge, edge);

  return (double)deviation / pixelCount / 255.0;
}

/// plain C implementation of variance based contrastSubImg (without ORC)
///
/// Two passes over the square field: the first sums all pixel values to get
/// the (integer) mean, the second sums the absolute differences from that
/// mean. The result is that sum divided by the pixel count and by 255.
/// \param I pointer to framebuffer
/// \param field Field specifies position(center) and size of subimage
/// \param width width of frame, used as distance in bytes between rows
/// \param height height of frame (not used)
double contrastSubImg_variance_C(unsigned char* const I,
                                 const Field* field, int width, int height) {
  const int edge = field->size;
  const int half = edge / 2;
  const int pixelCount = edge * edge;
  /* top-left corner of the field */
  const unsigned char* const origin =
    I + ((field->x - half) + (field->y - half) * width);
  unsigned int total = 0;
  int deviation = 0;
  int average;
  int row, col;

  /* pass 1: sum of all pixel values */
  for (row = 0; row < edge; row++) {
    const unsigned char* const line = origin + row * width;
    for (col = 0; col < edge; col++) {
      total += line[col];
    }
  }
  average = total / pixelCount;

  /* pass 2: sum of absolute differences from the mean */
  for (row = 0; row < edge; row++) {
    const unsigned char* const line = origin + row * width;
    for (col = 0; col < edge; col++) {
      deviation += abs(line[col] - average);
    }
  }

  return (double)deviation / pixelCount / 255.0;
}
#endif


#ifdef USE_ORC
/**
   Compares a small part of two given images line by line and returns the
   accumulated result of image_line_difference_optimized() over the lines
   processed (a sum, not an average).
   Field center, size and shift have to be chosen
   so that no clipping is required.
   Uses optimized inner loops by ORC.

   The comparison stops after the first line at which the running sum
   exceeds \p threshold; the sum reached so far is returned in that case.

   \param I1 first image
   \param I2 second image, read at the position shifted by (d_x, d_y)
   \param field Field specifies position(center) and size of subimage
   \param width1 width of I1 in pixels
   \param width2 width of I2 in pixels
   \param height height of frame (not used)
   \param bytesPerPixel number of bytes per pixel
   \param d_x shift in x direction
   \param d_y shift in y direction
   \param threshold abort limit for the running sum
*/
unsigned int compareSubImg_thr_orc(unsigned char* const I1, unsigned char* const I2,
                                   const Field* field, int width1, int width2, int height,
                                   int bytesPerPixel, int d_x, int d_y,
                                   unsigned int threshold) {
  const int edge = field->size;
  const int half = edge / 2;
  const int left = field->x - half;
  const int top = field->y - half;
  unsigned char* line1 = I1 + (left + top * width1) * bytesPerPixel;
  unsigned char* line2 = I2 + ((left + d_x) + (top + d_y) * width2) * bytesPerPixel;
  unsigned int total = 0;
  int row = 0;

  while (row < edge) {
    unsigned int lineDiff = 0;
    image_line_difference_optimized(&lineDiff, line1, line2, edge * bytesPerPixel);
    total += lineDiff;
    if (total > threshold) {
      // no need to calculate any longer: worse than the best match
      break;
    }
    line1 += width1 * bytesPerPixel;
    line2 += width2 * bytesPerPixel;
    row++;
  }

  return total;
}

// Implementation with a single ORC call for the whole sub-image; there is no
// early exit, the threshold parameter is accepted but not used.
// The sub-image of I2 is taken at the field position shifted by (d_x, d_y).
// Returns the value written by image_difference_optimized().
unsigned int compareSubImg_orc(unsigned char* const I1, unsigned char* const I2,
                               const Field* field, int width1, int width2, int height,
                               int bytesPerPixel, int d_x, int d_y,
                               unsigned int threshold) {
  const int edge = field->size;
  const int half = edge / 2;
  const int left = field->x - half;
  const int top = field->y - half;
  const int stride1 = width1 * bytesPerPixel;
  const int stride2 = width2 * bytesPerPixel;
  unsigned char* const block1 = I1 + (left + top * width1) * bytesPerPixel;
  unsigned char* const block2 =
    I2 + ((left + d_x) + (top + d_y) * width2) * bytesPerPixel;
  unsigned int total = 0;

  image_difference_optimized(&total, block1, stride1, block2, stride2,
                             edge * bytesPerPixel, edge);
  return total;
}
#endif

#ifdef USE_SSE2
/**
   Adds up the eight 16-bit lanes of \p acc and returns the total.
   With USE_SSE2_CMP_HOR the lanes are folded with saturating 16-bit adds
   and lane 0 is extracted; otherwise the register is stored to memory and
   the lanes are assembled from their low/high bytes.
*/
static unsigned int compareSubImg_sse2_flush(__m128i acc)
{
#ifdef USE_SSE2_CMP_HOR
  acc = _mm_adds_epu16(acc, _mm_srli_si128(acc, 8));
  acc = _mm_adds_epu16(acc, _mm_srli_si128(acc, 4));
  acc = _mm_adds_epu16(acc, _mm_srli_si128(acc, 2));
  return (unsigned int)_mm_extract_epi16(acc, 0);
#else
  unsigned char bytes[16];
  unsigned int total = 0;
  int i;

  _mm_storeu_si128((__m128i*)bytes, acc);
  for (i = 0; i < 16; i += 2)
    total += bytes[i] + bytes[i + 1] * 256;
  return total;
#endif
}

/**
   SSE2 version of the sub-image comparison with early exit.

   Walks both sub-images in 16-byte steps, computes the per-byte absolute
   difference and accumulates it in eight 16-bit partial sums. After every
   SSE2_CMP_SUM_ROWS 16-byte steps (counted across line boundaries) the
   partial sums are added to the scalar result and cleared.

   After each line the scalar result is compared with \p treshold and the
   loop is left when it is larger. Partial sums that have not been flushed
   yet are not part of that comparison. For SSE2_CMP_SUM_ROWS in
   {1, 2, 4, 8, 16} (this file defines 8) there is no final flush either.

   \param I1 first image
   \param I2 second image, read at the position shifted by (d_x, d_y)
   \param field Field specifies position(center) and size of subimage
   \param width1 width of I1 in pixels
   \param width2 width of I2 in pixels
   \param height height of frame (not used)
   \param bytesPerPixel number of bytes per pixel
   \param d_x shift in x direction
   \param d_y shift in y direction
   \param treshold abort limit for the accumulated sum
   \return accumulated sum of absolute byte differences (flushed part)
*/
unsigned int compareSubImg_thr_sse2(unsigned char* const I1, unsigned char* const I2,
                                    const Field* field,
                                    int width1, int width2, int height,
                                    int bytesPerPixel, int d_x, int d_y,
                                    unsigned int treshold) {
  const int edge = field->size;
  const int half = edge / 2;
  const int rowBytes = edge * bytesPerPixel;
  /* 0x00FF in every 16-bit lane: keeps the low byte of each lane */
  const __m128i lowByteMask = _mm_set1_epi16(0x00FF);
  __m128i partial = _mm_setzero_si128();
  unsigned char pending = 0; /* 16-byte steps since the last flush */
  unsigned int total = 0;
  const unsigned char* a =
    I1 + ((field->x - half) + (field->y - half) * width1) * bytesPerPixel;
  const unsigned char* b =
    I2 + ((field->x - half + d_x) + (field->y - half + d_y) * width2) * bytesPerPixel;
  int line, offset;

  for (line = 0; line < edge; line++) {
    for (offset = 0; offset < rowBytes; offset += 16) {
      const __m128i va = _mm_loadu_si128((__m128i const*)a);
      const __m128i vb = _mm_loadu_si128((__m128i const*)b);
      /* |a - b| per byte: one of the two saturated differences is zero */
      const __m128i absdiff =
        _mm_adds_epu8(_mm_subs_epu8(vb, va), _mm_subs_epu8(va, vb));
      /* widen to 16 bit: odd bytes (shifted down) and even bytes */
      const __m128i oddBytes =
        _mm_and_si128(_mm_srli_si128(absdiff, 1), lowByteMask);
      const __m128i evenBytes = _mm_and_si128(absdiff, lowByteMask);

      partial = _mm_adds_epu16(partial, oddBytes);
      partial = _mm_adds_epu16(partial, evenBytes);

      a += 16;
      b += 16;

      if (++pending == SSE2_CMP_SUM_ROWS) {
        pending = 0;
        total += compareSubImg_sse2_flush(partial);
        partial = _mm_setzero_si128();
      }
    }
    if (total > treshold)
      break;
    /* skip the rest of the image line to reach the next field line */
    a += (width1 - edge) * bytesPerPixel;
    b += (width2 - edge) * bytesPerPixel;
  }

#if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
  && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
  //process all data left unprocessed
  //this part can be safely ignored if
  //SSE_SUM_ROWS = {1, 2, 4, 8, 16}
  total += compareSubImg_sse2_flush(partial);
#endif

  return total;
}
#endif // USE_SSE2

#ifdef USE_SSE2_ASM
/**
   Inline-assembly (GCC syntax, SSE2) version of the sub-image comparison
   with early exit.

   The asm block walks both sub-images in 16-byte steps, computes the
   per-byte absolute difference and accumulates it in eight 16-bit partial
   sums (xmm4). Every 8 steps (ecx counter, not reset at line ends) the
   partial sums are folded together and the low 16 bits are added to the
   result. After each line the result is compared with \p treshold and the
   loop is left if it is above. Partial sums still in xmm4 when the loop
   ends are not added to the result.

   Operand map of the asm statement:
     %0 sum, %1 p1, %2 p2, %3 mask, %4 16-byte steps per line,
     %5 bytes to skip at the end of a line, %6 field->size (line counter),
     %7 treshold.

   Both pointers are advanced by %5, which is computed from width1 only
   (see the TODO below), so width2 is not used for the line stride.

   NOTE(review): %1, %2 and %6 are declared as input operands but are
   written by the asm ("add", "decl"). GCC's extended-asm documentation says
   input operands must not be modified, and with the "g" constraint %6 could
   be a memory operand referring to field->size itself - confirm and
   consider "+r" operands / a local copy.
   NOTE(review): %4 is used with the 32-bit "movl" and %5 with the
   pointer-width "add"; presumably this is why %5 is cast to a pointer type
   - confirm.

   \param I1 first image
   \param I2 second image, read at the position shifted by (d_x, d_y)
   \param field Field specifies position(center) and size of subimage
   \param width1 width of I1 in pixels
   \param width2 width of I2 in pixels (only used for the start offset)
   \param height height of frame (not used)
   \param bytesPerPixel number of bytes per pixel
   \param d_x shift in x direction
   \param d_y shift in y direction
   \param treshold abort limit for the accumulated sum
   \return accumulated sum of absolute byte differences (flushed part)
*/
unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1, unsigned char* const I2,
                                        const Field* field,
                                        int width1, int width2, int height,
                                        int bytesPerPixel, int d_x, int d_y,
                                        unsigned int treshold) {
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  // 0xFF in every even byte: keeps the low byte of each 16-bit word
  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  // top-left corner of the field in I1, and of the shifted field in I2
  p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  asm (
    "xor %0,%0\n"                  //sum = 0
    "pxor %%xmm4,%%xmm4\n"         //8 x 16bit partial sums
    "movdqu (%3),%%xmm3\n"         //mask

    //main loop
    "movl %4,%%edx\n"              //edx = field->size * bytesPerPixel / 16
    "mov $8,%%ecx\n"               //cx = 8
    "1:\n"

    //calc intermediate sum of abs differences for 16 bytes
    "movdqu (%1),%%xmm0\n"       //p1
    "movdqu (%2),%%xmm1\n"       //p2
    "movdqu %%xmm0,%%xmm2\n"     //xmm2 = xmm0
    "psubusb %%xmm1,%%xmm0\n"    //xmm0 = xmm0 - xmm1 (by bytes)
    "psubusb %%xmm2,%%xmm1\n"    //xmm1 = xmm1 - xmm2 (by bytes)
    "paddusb %%xmm1,%%xmm0\n"    //xmm0 = xmm0 + xmm1 (absolute difference)
    "movdqu %%xmm0,%%xmm2\n"     //xmm2 = xmm0
    "pand %%xmm3,%%xmm2\n"       //xmm2 = xmm2 & xmm3 (apply mask)
    "psrldq $1,%%xmm0\n"         //xmm0 = xmm0 >> 8 (shift by 1 byte)
    "pand %%xmm3,%%xmm0\n"       //xmm0 = xmm0 & xmm3 (apply mask)
    "paddusw %%xmm0,%%xmm4\n"    //xmm4 = xmm4 + xmm0 (by words)
    "paddusw %%xmm2,%%xmm4\n"    //xmm4 = xmm4 + xmm2 (by words)

    "add $16,%1\n"               //move to next 16 bytes (p1)
    "add $16,%2\n"               //move to next 16 bytes (p2)

    //check if we need flush sum (i.e. xmm4 is about to saturate)
    "dec %%ecx\n"
    "jnz 2f\n"                   //skip flushing if not
    //flushing...
    //fold the 8 words of xmm4 into word 0 (saturating adds)
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $8,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $4,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $2,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    //add word 0 of xmm4 to sum
    "movd %%xmm4,%%ecx\n"
    "and $0xFFFF,%%ecx\n"
    "addl %%ecx,%0\n"
    "pxor %%xmm4,%%xmm4\n"       //clearing xmm4
    "mov $8,%%ecx\n"             //cx = 8

    //check if we need to go to another line
    "2:\n"
    "dec %%edx\n"
    "jnz 1b\n"                   //skip if not

    //move p1 and p2 to the next line
    //(the same width1-based skip %5 is applied to both pointers)
    "add %5,%1\n"
    "add %5,%2\n"
    "cmp %7,%0\n"                //if (sum > treshold)
    "ja 3f\n"                    //    break;
    "movl %4,%%edx\n"            //reload the per-line step counter

    //check if all lines done
    "decl %6\n"
    "jnz 1b\n"                   //if not, continue looping
    "3:\n"
    :"=r"(sum)
    :"r"(p1),"r"(p2),"r"(mask),"g"(field->size * bytesPerPixel / 16),"g"((unsigned char*)((width1 - field->size) * bytesPerPixel)),"g"(field->size), "g"(treshold), "0"(sum)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
    );
  // TODO width2 is not properly used here
  return sum;
}
#endif // USE_SSE2_ASM

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 *   tab-width:  2
 *   c-basic-offset: 2 t
 * End:
 *
 * vim: expandtab shiftwidth=2:
 */
Commit	Line	Data
80f575fc DM	1	/*
	2	* motiondetect_opt.c
	3	*
	4	* Copyright (C) Georg Martius - February 1007-2012
	5	* georg dot martius at web dot de
	6	* Copyright (C) Alexey Osipov - Jule 2011
	7	* simba at lerlan dot ru
	8	* speed optimizations (threshold, spiral, SSE, asm)
	9	*
	10	* This file is part of vid.stab video stabilization library
	11	*
	12	* vid.stab is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU General Public License,
	14	* as published by the Free Software Foundation; either version 2, or
	15	* (at your option) any later version.
	16	*
	17	* vid.stab is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU General Public License
	23	* along with GNU Make; see the file COPYING. If not, write to
	24	* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
	25	*
	26	*/
	27	#include "motiondetect_opt.h"
	28
	29	#ifdef USE_ORC
	30	#include "orc/motiondetectorc.h"
	31	#endif
	32
	33	#ifdef USE_SSE2
	34	#include <emmintrin.h>
	35
	36	#define USE_SSE2_CMP_HOR
	37	#define SSE2_CMP_SUM_ROWS 8
	38	#endif
	39
	40	#ifdef USE_SSE2
	41	/**
	42	\see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only
	43	*/
	44	double contrastSubImg1_SSE(unsigned char* const I, const Field* field,
	45	int width, int height)
	46	{
	47	int k, j;
	48	unsigned char* p = NULL;
	49	int s2 = field->size / 2;
	50
	51	static unsigned char full[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
	52
	53	p = I + ((field->x - s2) + (field->y - s2)*width);
	54
	55	__m128i mmin, mmax;
	56
	57	mmin = _mm_loadu_si128((__m128i const*)full);
	58	mmax = _mm_setzero_si128();
	59
	60	for (j = 0; j < field->size; j++){
	61	for (k = 0; k < field->size; k += 16) {
	62	__m128i xmm0;
	63	xmm0 = _mm_loadu_si128((__m128i const*)p);
	64	mmin = _mm_min_epu8(mmin, xmm0);
65	mmax = _mm_max_epu8(mmax, xmm0);
66	p += 16;
67	}
68	p += (width - field->size);
69	}
70
71	__m128i xmm1;
72	xmm1 = _mm_srli_si128(mmin, 8);
73	mmin = _mm_min_epu8(mmin, xmm1);
74	xmm1 = _mm_srli_si128(mmin, 4);
75	mmin = _mm_min_epu8(mmin, xmm1);
76	xmm1 = _mm_srli_si128(mmin, 2);
77	mmin = _mm_min_epu8(mmin, xmm1);
78	xmm1 = _mm_srli_si128(mmin, 1);
79	mmin = _mm_min_epu8(mmin, xmm1);
80	unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0);
81
82	xmm1 = _mm_srli_si128(mmax, 8);
83	mmax = _mm_max_epu8(mmax, xmm1);
84	xmm1 = _mm_srli_si128(mmax, 4);
85	mmax = _mm_max_epu8(mmax, xmm1);
86	xmm1 = _mm_srli_si128(mmax, 2);
87	mmax = _mm_max_epu8(mmax, xmm1);
88	xmm1 = _mm_srli_si128(mmax, 1);
89	mmax = _mm_max_epu8(mmax, xmm1);
90	unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0);
91
92	return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0
93	}
94	#endif
95
96	#ifdef USE_ORC
97	/**
98	calculates the contrast in the given small part of the given image
99	using the absolute difference from mean luminance (like Root-Mean-Square,
100	but with abs() (Manhattan-Norm))
101	For multichannel images use contrastSubImg_Michelson()
102
103	\param I pointer to framebuffer
104	\param field Field specifies position(center) and size of subimage
105	\param width width of frame
106	\param height height of frame
107	*/
108	double contrastSubImg_variance_orc(unsigned char* const I, const Field* field,
109	int width, int height) {
110	unsigned char* p = NULL;
111	int s2 = field->size / 2;
112	int numpixel = field->size*field->size;
113
114	p = I + ((field->x - s2) + (field->y - s2) * width);
115
116	unsigned int sum=0;
117	image_sum_optimized((signed int*)&sum, p, width, field->size, field->size);
118	unsigned char mean = sum / numpixel;
119	int var=0;
120	image_variance_optimized(&var, p, width, mean, field->size, field->size);
121	return (double)var/numpixel/255.0;
122	}
123
124	/// plain C implementation of variance based contrastSubImg (without ORC)
125	double contrastSubImg_variance_C(unsigned char* const I,
126	const Field* field, int width, int height) {
127	int k, j;
128	unsigned char* p = NULL;
129	unsigned char* pstart = NULL;
130	int s2 = field->size / 2;
131	unsigned int sum=0;
132	int mean;
133	int var=0;
134	int numpixel = field->size*field->size;
135
136	pstart = I + ((field->x - s2) + (field->y - s2) * width);
137	p = pstart;
138	for (j = 0; j < field->size; j++) {
139	for (k = 0; k < field->size; k++, p++) {
140	sum+=*p;
141	}
142	p += (width - field->size);
143	}
144	mean=sum/numpixel;
145	p = pstart;
146	for (j = 0; j < field->size; j++) {
147	for (k = 0; k < field->size; k++, p++) {
148	var+=abs(*p-mean);
149	}
150	p += (width - field->size);
151	}
152	return (double)var/numpixel/255.0;
153	}
154	#endif
155
156
157
158
159
160
161	#ifdef USE_ORC
162	/**
163	compares a small part of two given images
164	and returns the average absolute difference.
165	Field center, size and shift have to be choosen,
166	so that no clipping is required.
167	Uses optimized inner loops by ORC.
168
169	\param field Field specifies position(center) and size of subimage
170	\param d_x shift in x direction
171	\param d_y shift in y direction
172	*/
173	unsigned int compareSubImg_thr_orc(unsigned char* const I1, unsigned char* const I2,
174	const Field* field, int width1, int width2, int height,
175	int bytesPerPixel, int d_x, int d_y,
176	unsigned int threshold) {
177	unsigned char* p1 = NULL;
178	unsigned char* p2 = NULL;
179	int s2 = field->size / 2;
180	int j;
181	unsigned int sum = 0;
182	p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel;
183	p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) * bytesPerPixel;
184
185	for (j = 0; j < field->size; j++) {
186	unsigned int s = 0;
187	image_line_difference_optimized(&s, p1, p2, field->size* bytesPerPixel);
188	sum += s;
189	if( sum > threshold) // no need to calculate any longer: worse than the best match
190	break;
191	p1 += width1 * bytesPerPixel;
192	p2 += width2 * bytesPerPixel;
193	}
194
195
196	return sum;
197	}
198
199	// implementation with 1 orc function, but no threshold
200	unsigned int compareSubImg_orc(unsigned char* const I1, unsigned char* const I2,
201	const Field* field, int width1, int width2, int height,
202	int bytesPerPixel, int d_x, int d_y,
203	unsigned int threshold) {
204	unsigned char* p1 = NULL;
205	unsigned char* p2 = NULL;
206	int s2 = field->size / 2;
207	unsigned int sum=0;
208	p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel;
209	p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2)
210	* bytesPerPixel;
211
212	image_difference_optimized(&sum, p1, width1 * bytesPerPixel, p2, width2 * bytesPerPixel,
213	field->size* bytesPerPixel , field->size);
214	return sum;
215	}
216	#endif
217
218	#ifdef USE_SSE2
219	unsigned int compareSubImg_thr_sse2(unsigned char* const I1, unsigned char* const I2,
220	const Field* field,
221	int width1, int width2, int height,
222	int bytesPerPixel, int d_x, int d_y,
223	unsigned int treshold) {
224	int k, j;
225	unsigned char* p1 = NULL;
226	unsigned char* p2 = NULL;
227	int s2 = field->size / 2;
228	unsigned int sum = 0;
229
230	static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
231	unsigned char row = 0;
232	#ifndef USE_SSE2_CMP_HOR
233	unsigned char summes[16];
234	int i;
235	#endif
236	__m128i xmmsum, xmmmask;
237	xmmsum = _mm_setzero_si128();
238	xmmmask = _mm_loadu_si128((__m128i const*)mask);
239
	240	p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
	241	p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
242	for (j = 0; j < field->size; j++){
243	for (k = 0; k < field->size * bytesPerPixel; k+=16){
244	{
245	__m128i xmm0, xmm1, xmm2;
246	xmm0 = _mm_loadu_si128((__m128i const *)p1);
247	xmm1 = _mm_loadu_si128((__m128i const *)p2);
248
249	xmm2 = _mm_subs_epu8(xmm0, xmm1);
250	xmm0 = _mm_subs_epu8(xmm1, xmm0);
251	xmm0 = _mm_adds_epu8(xmm0, xmm2);
252
253	xmm1 = _mm_and_si128(xmm0, xmmmask);
254	xmm0 = _mm_srli_si128(xmm0, 1);
255	xmm0 = _mm_and_si128(xmm0, xmmmask);
256
257	xmmsum = _mm_adds_epu16(xmmsum, xmm0);
258	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
259	}
260
261	p1+=16;
262	p2+=16;
263
264	row++;
265	if (row == SSE2_CMP_SUM_ROWS) {
266	row = 0;
267	#ifdef USE_SSE2_CMP_HOR
268	{
269	__m128i xmm1;
270
271	xmm1 = _mm_srli_si128(xmmsum, 8);
272	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
273
274	xmm1 = _mm_srli_si128(xmmsum, 4);
275	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
276
277	xmm1 = _mm_srli_si128(xmmsum, 2);
278	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
279
280	sum += _mm_extract_epi16(xmmsum, 0);
281	}
282	#else
283	_mm_storeu_si128((__m128i*)summes, xmmsum);
284	for(i = 0; i < 16; i+=2)
285	sum += summes[i] + summes[i+1]*256;
286	#endif
287	xmmsum = _mm_setzero_si128();
288	}
289	}
290	if (sum > treshold)
291	break;
292	p1 += (width1 - field->size) * bytesPerPixel;
293	p2 += (width2 - field->size) * bytesPerPixel;
294	}
295
296	#if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
297	&& (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
298	//process all data left unprocessed
299	//this part can be safely ignored if
300	//SSE_SUM_ROWS = {1, 2, 4, 8, 16}
301	#ifdef USE_SSE2_CMP_HOR
302	{
303	__m128i xmm1;
304
305	xmm1 = _mm_srli_si128(xmmsum, 8);
306	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
307
308	xmm1 = _mm_srli_si128(xmmsum, 4);
309	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
310
311	xmm1 = _mm_srli_si128(xmmsum, 2);
312	xmmsum = _mm_adds_epu16(xmmsum, xmm1);
313
314	sum += _mm_extract_epi16(xmmsum, 0);
315	}
316	#else
317	_mm_storeu_si128((__m128i*)summes, xmmsum);
318	for(i = 0; i < 16; i+=2)
319	sum += summes[i] + summes[i+1]*256;
320	#endif
321	#endif
322
323	return sum;
324	}
325	#endif // USE_SSE2
326
327	#ifdef USE_SSE2_ASM
328	unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1, unsigned char* const I2,
329	const Field* field,
330	int width1, int width2, int height,
331	int bytesPerPixel, int d_x, int d_y,
332	unsigned int treshold) {
333	unsigned char* p1 = NULL;
334	unsigned char* p2 = NULL;
335	int s2 = field->size / 2;
336	unsigned int sum = 0;
337
338	static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
	339	p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
	340	p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
341	asm (
342	"xor %0,%0\n"
343	"pxor %%xmm4,%%xmm4\n" //8 x 16bit partial sums
344	"movdqu (%3),%%xmm3\n" //mask
345
346	//main loop
347	"movl %4,%%edx\n" //edx = field->size * bytesPerPixel / 16
348	"mov $8,%%ecx\n" //cx = 8
349	"1:\n"
350
351	//calc intermediate sum of abs differences for 16 bytes
352	"movdqu (%1),%%xmm0\n" //p1
353	"movdqu (%2),%%xmm1\n" //p2
354	"movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0
355	"psubusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 - xmm1 (by bytes)
356	"psubusb %%xmm2,%%xmm1\n" //xmm1 = xmm1 - xmm2 (by bytes)
357	"paddusb %%xmm1,%%xmm0\n" //xmm0 = xmm0 + xmm1 (absolute difference)
358	"movdqu %%xmm0,%%xmm2\n" //xmm2 = xmm0
359	"pand %%xmm3,%%xmm2\n" //xmm2 = xmm2 & xmm3 (apply mask)
360	"psrldq $1,%%xmm0\n" //xmm0 = xmm0 >> 8 (shift by 1 byte)
361	"pand %%xmm3,%%xmm0\n" //xmm0 = xmm0 & xmm3 (apply mask)
362	"paddusw %%xmm0,%%xmm4\n" //xmm4 = xmm4 + xmm0 (by words)
363	"paddusw %%xmm2,%%xmm4\n" //xmm4 = xmm4 + xmm2 (by words)
364
365	"add $16,%1\n" //move to next 16 bytes (p1)
366	"add $16,%2\n" //move to next 16 bytes (p2)
367
368	//check if we need flush sum (i.e. xmm4 is about to saturate)
369	"dec %%ecx\n"
370	"jnz 2f\n" //skip flushing if not
371	//flushing...
372	"movdqu %%xmm4,%%xmm0\n"
373	"psrldq $8,%%xmm0\n"
374	"paddusw %%xmm0,%%xmm4\n"
375	"movdqu %%xmm4,%%xmm0\n"
376	"psrldq $4,%%xmm0\n"
377	"paddusw %%xmm0,%%xmm4\n"
378	"movdqu %%xmm4,%%xmm0\n"
379	"psrldq $2,%%xmm0\n"
380	"paddusw %%xmm0,%%xmm4\n"
381	"movd %%xmm4,%%ecx\n"
382	"and $0xFFFF,%%ecx\n"
383	"addl %%ecx,%0\n"
384	"pxor %%xmm4,%%xmm4\n" //clearing xmm4
385	"mov $8,%%ecx\n" //cx = 8
386
387	//check if we need to go to another line
388	"2:\n"
389	"dec %%edx\n"
390	"jnz 1b\n" //skip if not
391
392	//move p1 and p2 to the next line
393	"add %5,%1\n"
394	"add %5,%2\n"
395	"cmp %7,%0\n" //if (sum > treshold)
396	"ja 3f\n" // break;
397	"movl %4,%%edx\n"
398
399	//check if all lines done
400	"decl %6\n"
401	"jnz 1b\n" //if not, continue looping
402	"3:\n"
403	:"=r"(sum)
	404	:"r"(p1),"r"(p2),"r"(mask),"g"(field->size * bytesPerPixel / 16),"g"((unsigned char*)((width1 - field->size) * bytesPerPixel)),"g"(field->size), "g"(treshold), "0"(sum)
405	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
406	);
407	// TODO width2 is not properly used here
408	return sum;
409	}
410	#endif // USE_SSE2_ASM
411
412	/*
413	* Local variables:
414	* c-file-style: "stroustrup"
	415	* c-file-offsets: ((case-label . *) (statement-case-intro . *))
416	* indent-tabs-mode: nil
417	* tab-width: 2
418	* c-basic-offset: 2 t
419	* End:
420	*
421	* vim: expandtab shiftwidth=2:
422	*/