| 1 | /* |
| 2 | * motiondetect_opt.c |
| 3 | * |
 * Copyright (C) Georg Martius - February 2007-2012
| 5 | * georg dot martius at web dot de |
 * Copyright (C) Alexey Osipov - July 2011
| 7 | * simba at lerlan dot ru |
| 8 | * speed optimizations (threshold, spiral, SSE, asm) |
| 9 | * |
| 10 | * This file is part of vid.stab video stabilization library |
| 11 | * |
| 12 | * vid.stab is free software; you can redistribute it and/or modify |
| 13 | * it under the terms of the GNU General Public License, |
| 14 | * as published by the Free Software Foundation; either version 2, or |
| 15 | * (at your option) any later version. |
| 16 | * |
| 17 | * vid.stab is distributed in the hope that it will be useful, |
| 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | * GNU General Public License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU General Public License |
| 23 | * along with GNU Make; see the file COPYING. If not, write to |
| 24 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. |
| 25 | * |
| 26 | */ |
| 27 | #include "motiondetect_opt.h" |
| 28 | |
| 29 | #ifdef USE_ORC |
| 30 | #include "orc/motiondetectorc.h" |
| 31 | #endif |
| 32 | |
| 33 | #ifdef USE_SSE2 |
| 34 | #include <emmintrin.h> |
| 35 | |
| 36 | #define USE_SSE2_CMP_HOR |
| 37 | #define SSE2_CMP_SUM_ROWS 8 |
| 38 | #endif |
| 39 | |
| 40 | #ifdef USE_SSE2 |
| 41 | /** |
| 42 | \see contrastSubImg using SSE2 optimization, Planar (1 byte per channel) only |
| 43 | */ |
| 44 | double contrastSubImg1_SSE(unsigned char* const I, const Field* field, |
| 45 | int width, int height) |
| 46 | { |
| 47 | int k, j; |
| 48 | unsigned char* p = NULL; |
| 49 | int s2 = field->size / 2; |
| 50 | |
| 51 | static unsigned char full[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; |
| 52 | |
| 53 | p = I + ((field->x - s2) + (field->y - s2)*width); |
| 54 | |
| 55 | __m128i mmin, mmax; |
| 56 | |
| 57 | mmin = _mm_loadu_si128((__m128i const*)full); |
| 58 | mmax = _mm_setzero_si128(); |
| 59 | |
| 60 | for (j = 0; j < field->size; j++){ |
| 61 | for (k = 0; k < field->size; k += 16) { |
| 62 | __m128i xmm0; |
| 63 | xmm0 = _mm_loadu_si128((__m128i const*)p); |
| 64 | mmin = _mm_min_epu8(mmin, xmm0); |
| 65 | mmax = _mm_max_epu8(mmax, xmm0); |
| 66 | p += 16; |
| 67 | } |
| 68 | p += (width - field->size); |
| 69 | } |
| 70 | |
| 71 | __m128i xmm1; |
| 72 | xmm1 = _mm_srli_si128(mmin, 8); |
| 73 | mmin = _mm_min_epu8(mmin, xmm1); |
| 74 | xmm1 = _mm_srli_si128(mmin, 4); |
| 75 | mmin = _mm_min_epu8(mmin, xmm1); |
| 76 | xmm1 = _mm_srli_si128(mmin, 2); |
| 77 | mmin = _mm_min_epu8(mmin, xmm1); |
| 78 | xmm1 = _mm_srli_si128(mmin, 1); |
| 79 | mmin = _mm_min_epu8(mmin, xmm1); |
| 80 | unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0); |
| 81 | |
| 82 | xmm1 = _mm_srli_si128(mmax, 8); |
| 83 | mmax = _mm_max_epu8(mmax, xmm1); |
| 84 | xmm1 = _mm_srli_si128(mmax, 4); |
| 85 | mmax = _mm_max_epu8(mmax, xmm1); |
| 86 | xmm1 = _mm_srli_si128(mmax, 2); |
| 87 | mmax = _mm_max_epu8(mmax, xmm1); |
| 88 | xmm1 = _mm_srli_si128(mmax, 1); |
| 89 | mmax = _mm_max_epu8(mmax, xmm1); |
| 90 | unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0); |
| 91 | |
| 92 | return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0 |
| 93 | } |
| 94 | #endif |
| 95 | |
| 96 | #ifdef USE_ORC |
| 97 | /** |
| 98 | calculates the contrast in the given small part of the given image |
| 99 | using the absolute difference from mean luminance (like Root-Mean-Square, |
| 100 | but with abs() (Manhattan-Norm)) |
| 101 | For multichannel images use contrastSubImg_Michelson() |
| 102 | |
| 103 | \param I pointer to framebuffer |
| 104 | \param field Field specifies position(center) and size of subimage |
| 105 | \param width width of frame |
| 106 | \param height height of frame |
| 107 | */ |
| 108 | double contrastSubImg_variance_orc(unsigned char* const I, const Field* field, |
| 109 | int width, int height) { |
| 110 | unsigned char* p = NULL; |
| 111 | int s2 = field->size / 2; |
| 112 | int numpixel = field->size*field->size; |
| 113 | |
| 114 | p = I + ((field->x - s2) + (field->y - s2) * width); |
| 115 | |
| 116 | unsigned int sum=0; |
| 117 | image_sum_optimized((signed int*)&sum, p, width, field->size, field->size); |
| 118 | unsigned char mean = sum / numpixel; |
| 119 | int var=0; |
| 120 | image_variance_optimized(&var, p, width, mean, field->size, field->size); |
| 121 | return (double)var/numpixel/255.0; |
| 122 | } |
| 123 | |
| 124 | /// plain C implementation of variance based contrastSubImg (without ORC) |
| 125 | double contrastSubImg_variance_C(unsigned char* const I, |
| 126 | const Field* field, int width, int height) { |
| 127 | int k, j; |
| 128 | unsigned char* p = NULL; |
| 129 | unsigned char* pstart = NULL; |
| 130 | int s2 = field->size / 2; |
| 131 | unsigned int sum=0; |
| 132 | int mean; |
| 133 | int var=0; |
| 134 | int numpixel = field->size*field->size; |
| 135 | |
| 136 | pstart = I + ((field->x - s2) + (field->y - s2) * width); |
| 137 | p = pstart; |
| 138 | for (j = 0; j < field->size; j++) { |
| 139 | for (k = 0; k < field->size; k++, p++) { |
| 140 | sum+=*p; |
| 141 | } |
| 142 | p += (width - field->size); |
| 143 | } |
| 144 | mean=sum/numpixel; |
| 145 | p = pstart; |
| 146 | for (j = 0; j < field->size; j++) { |
| 147 | for (k = 0; k < field->size; k++, p++) { |
| 148 | var+=abs(*p-mean); |
| 149 | } |
| 150 | p += (width - field->size); |
| 151 | } |
| 152 | return (double)var/numpixel/255.0; |
| 153 | } |
| 154 | #endif |
| 155 | |
| 156 | |
| 157 | |
| 158 | |
| 159 | |
| 160 | |
| 161 | #ifdef USE_ORC |
| 162 | /** |
| 163 | compares a small part of two given images |
| 164 | and returns the average absolute difference. |
| 165 | Field center, size and shift have to be choosen, |
| 166 | so that no clipping is required. |
| 167 | Uses optimized inner loops by ORC. |
| 168 | |
| 169 | \param field Field specifies position(center) and size of subimage |
| 170 | \param d_x shift in x direction |
| 171 | \param d_y shift in y direction |
| 172 | */ |
| 173 | unsigned int compareSubImg_thr_orc(unsigned char* const I1, unsigned char* const I2, |
| 174 | const Field* field, int width1, int width2, int height, |
| 175 | int bytesPerPixel, int d_x, int d_y, |
| 176 | unsigned int threshold) { |
| 177 | unsigned char* p1 = NULL; |
| 178 | unsigned char* p2 = NULL; |
| 179 | int s2 = field->size / 2; |
| 180 | int j; |
| 181 | unsigned int sum = 0; |
| 182 | p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel; |
| 183 | p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) * bytesPerPixel; |
| 184 | |
| 185 | for (j = 0; j < field->size; j++) { |
| 186 | unsigned int s = 0; |
| 187 | image_line_difference_optimized(&s, p1, p2, field->size* bytesPerPixel); |
| 188 | sum += s; |
| 189 | if( sum > threshold) // no need to calculate any longer: worse than the best match |
| 190 | break; |
| 191 | p1 += width1 * bytesPerPixel; |
| 192 | p2 += width2 * bytesPerPixel; |
| 193 | } |
| 194 | |
| 195 | |
| 196 | return sum; |
| 197 | } |
| 198 | |
| 199 | // implementation with 1 orc function, but no threshold |
| 200 | unsigned int compareSubImg_orc(unsigned char* const I1, unsigned char* const I2, |
| 201 | const Field* field, int width1, int width2, int height, |
| 202 | int bytesPerPixel, int d_x, int d_y, |
| 203 | unsigned int threshold) { |
| 204 | unsigned char* p1 = NULL; |
| 205 | unsigned char* p2 = NULL; |
| 206 | int s2 = field->size / 2; |
| 207 | unsigned int sum=0; |
| 208 | p1 = I1 + ((field->x - s2) + (field->y - s2) * width1) * bytesPerPixel; |
| 209 | p2 = I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y) * width2) |
| 210 | * bytesPerPixel; |
| 211 | |
| 212 | image_difference_optimized(&sum, p1, width1 * bytesPerPixel, p2, width2 * bytesPerPixel, |
| 213 | field->size* bytesPerPixel , field->size); |
| 214 | return sum; |
| 215 | } |
| 216 | #endif |
| 217 | |
| 218 | #ifdef USE_SSE2 |
/**
   compares a small part of two given images and returns the sum of
   absolute byte differences; SSE2-intrinsics implementation.
   After each finished line the running sum is compared against
   `treshold` ([sic] -- name kept as-is) and the scan aborts early when
   this candidate is already worse than the best match so far.

   NOTE(review): the inner loop consumes 16 bytes per step with no tail
   handling, so field->size * bytesPerPixel is presumably a multiple of
   16, and the field (including the d_x/d_y shift) must lie fully inside
   both frames -- confirm with callers.

   \param field Field specifies position(center) and size of subimage
   \param d_x shift in x direction
   \param d_y shift in y direction
   \param treshold early-abort limit for the accumulated difference
*/
unsigned int compareSubImg_thr_sse2(unsigned char* const I1, unsigned char* const I2,
                                    const Field* field,
                                    int width1, int width2, int height,
                                    int bytesPerPixel, int d_x, int d_y,
                                    unsigned int treshold) {
  int k, j;
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  /* 0x00FF pattern: isolates every other byte so the 8-bit absolute
     differences can be accumulated in eight 16-bit lanes */
  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  unsigned char row = 0;  /* 16-byte chunks processed since the last flush */
#ifndef USE_SSE2_CMP_HOR
  unsigned char summes[16];
  int i;
#endif
  __m128i xmmsum, xmmmask;
  xmmsum = _mm_setzero_si128();                    /* 8 x 16-bit partial sums */
  xmmmask = _mm_loadu_si128((__m128i const*)mask);

  p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  for (j = 0; j < field->size; j++){
    for (k = 0; k < field->size * bytesPerPixel; k+=16){
      {
        __m128i xmm0, xmm1, xmm2;
        xmm0 = _mm_loadu_si128((__m128i const *)p1);
        xmm1 = _mm_loadu_si128((__m128i const *)p2);

        /* |a-b| for unsigned bytes: (a-b) and (b-a) with saturating
           subtraction -- one side clamps to 0, the other keeps the
           magnitude -- then add the two halves together */
        xmm2 = _mm_subs_epu8(xmm0, xmm1);
        xmm0 = _mm_subs_epu8(xmm1, xmm0);
        xmm0 = _mm_adds_epu8(xmm0, xmm2);

        /* split even and odd bytes into separate word lanes via the mask
           (odd bytes are shifted down by one byte first) */
        xmm1 = _mm_and_si128(xmm0, xmmmask);
        xmm0 = _mm_srli_si128(xmm0, 1);
        xmm0 = _mm_and_si128(xmm0, xmmmask);

        /* accumulate both halves as 16-bit words */
        xmmsum = _mm_adds_epu16(xmmsum, xmm0);
        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
      }

      p1+=16;
      p2+=16;

      /* flush the word lanes into the scalar `sum` every
         SSE2_CMP_SUM_ROWS chunks, before the saturating word adds
         could clip the partial sums */
      row++;
      if (row == SSE2_CMP_SUM_ROWS) {
        row = 0;
#ifdef USE_SSE2_CMP_HOR
        {
          /* horizontal add: fold the 8 words down into the low word */
          __m128i xmm1;

          xmm1 = _mm_srli_si128(xmmsum, 8);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          xmm1 = _mm_srli_si128(xmmsum, 4);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          xmm1 = _mm_srli_si128(xmmsum, 2);
          xmmsum = _mm_adds_epu16(xmmsum, xmm1);

          sum += _mm_extract_epi16(xmmsum, 0);
        }
#else
        /* scalar fallback: spill the vector and add the words by hand */
        _mm_storeu_si128((__m128i*)summes, xmmsum);
        for(i = 0; i < 16; i+=2)
          sum += summes[i] + summes[i+1]*256;
#endif
        xmmsum = _mm_setzero_si128();
      }
    }
    if (sum > treshold)  /* no need to calculate any longer: worse than the best match */
      break;
    p1 += (width1 - field->size) * bytesPerPixel;
    p2 += (width2 - field->size) * bytesPerPixel;
  }

#if (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) \
  && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
  //process all data left unprocessed
  //this part can be safely ignored if
  //SSE_SUM_ROWS = {1, 2, 4, 8, 16}
#ifdef USE_SSE2_CMP_HOR
  {
    __m128i xmm1;

    xmm1 = _mm_srli_si128(xmmsum, 8);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    xmm1 = _mm_srli_si128(xmmsum, 4);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    xmm1 = _mm_srli_si128(xmmsum, 2);
    xmmsum = _mm_adds_epu16(xmmsum, xmm1);

    sum += _mm_extract_epi16(xmmsum, 0);
  }
#else
  _mm_storeu_si128((__m128i*)summes, xmmsum);
  for(i = 0; i < 16; i+=2)
    sum += summes[i] + summes[i+1]*256;
#endif
#endif

  return sum;
}
| 325 | #endif // USE_SSE2 |
| 326 | |
| 327 | #ifdef USE_SSE2_ASM |
/**
   \see compareSubImg_thr_sse2 -- the same sum-of-absolute-differences
   computation with per-line early abort on `treshold`, hand-written in
   GCC extended inline assembly (SSE2).

   NOTE(review): as the TODO below says, width2 is never used: the
   line-advance operand %5 is computed from width1 only but added to
   BOTH p1 and p2, so results are only correct when width1 == width2 --
   confirm with callers before relying on differing strides.
   Also assumes field->size * bytesPerPixel is a multiple of 16 (the
   loop processes whole 16-byte chunks with no tail handling).
*/
unsigned int compareSubImg_thr_sse2_asm(unsigned char* const I1, unsigned char* const I2,
                                        const Field* field,
                                        int width1, int width2, int height,
                                        int bytesPerPixel, int d_x, int d_y,
                                        unsigned int treshold) {
  unsigned char* p1 = NULL;
  unsigned char* p2 = NULL;
  int s2 = field->size / 2;
  unsigned int sum = 0;

  /* 0x00FF pattern: isolates alternating bytes so 8-bit absolute
     differences can accumulate in eight 16-bit lanes (xmm4) */
  static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
  p1=I1 + ((field->x - s2) + (field->y - s2)*width1)*bytesPerPixel;
  p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 + d_y)*width2)*bytesPerPixel;
  asm (
    "xor %0,%0\n"
    "pxor %%xmm4,%%xmm4\n"          //8 x 16bit partial sums
    "movdqu (%3),%%xmm3\n"          //mask

    //main loop
    "movl %4,%%edx\n"               //edx = field->size * bytesPerPixel / 16
    "mov $8,%%ecx\n"                //cx = 8
    "1:\n"

    //calc intermediate sum of abs differences for 16 bytes
    "movdqu (%1),%%xmm0\n"          //p1
    "movdqu (%2),%%xmm1\n"          //p2
    "movdqu %%xmm0,%%xmm2\n"        //xmm2 = xmm0
    "psubusb %%xmm1,%%xmm0\n"       //xmm0 = xmm0 - xmm1 (by bytes)
    "psubusb %%xmm2,%%xmm1\n"       //xmm1 = xmm1 - xmm2 (by bytes)
    "paddusb %%xmm1,%%xmm0\n"       //xmm0 = xmm0 + xmm1 (absolute difference)
    "movdqu %%xmm0,%%xmm2\n"        //xmm2 = xmm0
    "pand %%xmm3,%%xmm2\n"          //xmm2 = xmm2 & xmm3 (apply mask)
    "psrldq $1,%%xmm0\n"            //xmm0 = xmm0 >> 8 (shift by 1 byte)
    "pand %%xmm3,%%xmm0\n"          //xmm0 = xmm0 & xmm3 (apply mask)
    "paddusw %%xmm0,%%xmm4\n"       //xmm4 = xmm4 + xmm0 (by words)
    "paddusw %%xmm2,%%xmm4\n"       //xmm4 = xmm4 + xmm2 (by words)

    "add $16,%1\n"                  //move to next 16 bytes (p1)
    "add $16,%2\n"                  //move to next 16 bytes (p2)

    //check if we need flush sum (i.e. xmm4 is about to saturate)
    "dec %%ecx\n"
    "jnz 2f\n"                      //skip flushing if not
    //flushing: horizontal-add the eight word lanes into ecx, fold into sum
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $8,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $4,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movdqu %%xmm4,%%xmm0\n"
    "psrldq $2,%%xmm0\n"
    "paddusw %%xmm0,%%xmm4\n"
    "movd %%xmm4,%%ecx\n"
    "and $0xFFFF,%%ecx\n"
    "addl %%ecx,%0\n"
    "pxor %%xmm4,%%xmm4\n"          //clearing xmm4
    "mov $8,%%ecx\n"                //cx = 8

    //check if we need to go to another line
    "2:\n"
    "dec %%edx\n"
    "jnz 1b\n"                      //skip if not

    //move p1 and p2 to the next line
    //(NOTE: %5 is derived from width1 but applied to both pointers)
    "add %5,%1\n"
    "add %5,%2\n"
    "cmp %7,%0\n"                   //if (sum > treshold)
    "ja 3f\n"                       //    break;
    "movl %4,%%edx\n"

    //check if all lines done
    "decl %6\n"
    "jnz 1b\n"                      //if not, continue looping
    "3:\n"
    :"=r"(sum)
    :"r"(p1),"r"(p2),"r"(mask),"g"(field->size * bytesPerPixel / 16),"g"((unsigned char*)((width1 - field->size) * bytesPerPixel)),"g"(field->size), "g"(treshold), "0"(sum)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%ecx","%edx"
  );
  // TODO width2 is not properly used here
  return sum;
}
| 410 | #endif // USE_SSE2_ASM |
| 411 | |
| 412 | /* |
| 413 | * Local variables: |
| 414 | * c-file-style: "stroustrup" |
| 415 | * c-file-offsets: ((case-label . *) (statement-case-intro . *)) |
| 416 | * indent-tabs-mode: nil |
| 417 | * tab-width: 2 |
| 418 | * c-basic-offset: 2 t |
| 419 | * End: |
| 420 | * |
| 421 | * vim: expandtab shiftwidth=2: |
| 422 | */ |