Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / ssd-a.asm
CommitLineData
72b9787e
JB
1;*****************************************************************************
2;* ssd-a.asm: x86 ssd functions
3;*****************************************************************************
4;* Copyright (C) 2003-2013 x264 project
5;*
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Fiona Glaser <fiona@x264.com>
8;* Laurent Aimar <fenrir@via.ecp.fr>
9;* Alex Izvorski <aizvorksi@gmail.com>
10;*
11;* This program is free software; you can redistribute it and/or modify
12;* it under the terms of the GNU General Public License as published by
13;* the Free Software Foundation; either version 2 of the License, or
14;* (at your option) any later version.
15;*
16;* This program is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;* GNU General Public License for more details.
20;*
21;* You should have received a copy of the GNU General Public License
22;* along with this program; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24;*
25;* This program is also available under a commercial proprietary license.
26;* For more information, contact us at license @ x265.com.
27;*****************************************************************************
28
29%include "x86inc.asm"
30%include "x86util.asm"
31
32SECTION_RODATA 32
33
34SECTION .text
35
36cextern pw_00ff
37cextern hsub_mul
38
39;=============================================================================
40; SSD
41;=============================================================================
42
43%if HIGH_BIT_DEPTH
44;-----------------------------------------------------------------------------
45; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
46;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; SSD_ONE width, height
;   Emits pixel_ssd_ss_<width>x<height>: sum of squared differences between two
;   blocks of 16-bit samples.
;   In:  r0 = block 0, r1 = stride 0, r2 = block 1, r3 = stride 1
;        (strides pass through FIX_STRIDES, defined outside this file, before
;        being used as byte offsets)
;   Out: eax = SSD
;   Each pass of the loop consumes four vectors from each block; how those four
;   vectors map onto rows depends on the vector size relative to the row size.
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
%if mmsize == %1*2
    ; one vector per row -> four consecutive rows per pass
    lea         r5, [3*r1]
    lea         r6, [3*r3]
    %define offset0_1 r1
    %define offset0_2 r1*2
    %define offset0_3 r5
    %define offset1_1 r3
    %define offset1_2 r3*2
    %define offset1_3 r6
%elif mmsize == %1
    ; two vectors per row -> two rows per pass
    %define offset0_1 mmsize
    %define offset0_2 r1
    %define offset0_3 r1+mmsize
    %define offset1_1 mmsize
    %define offset1_2 r3
    %define offset1_3 r3+mmsize
%elif mmsize == %1/2
    ; four vectors per row -> one row per pass
    %define offset0_1 mmsize
    %define offset0_2 mmsize*2
    %define offset0_3 mmsize*3
    %define offset1_1 mmsize
    %define offset1_2 mmsize*2
    %define offset1_3 mmsize*3
%endif
    %assign %%n %2/(2*mmsize/%1)        ; number of passes
    pxor        m0, m0                  ; m0 = running dword sums
%if %%n > 1
    mov         r4d, %%n
%endif
.loop:
    ; differences of the four vector pairs
    movu        m1, [r0]
    movu        m6, [r2]
    psubw       m1, m6
    movu        m2, [r0+offset0_1]
    movu        m7, [r2+offset1_1]
    psubw       m2, m7
    movu        m3, [r0+offset0_2]
    movu        m6, [r2+offset1_2]
    psubw       m3, m6
    movu        m4, [r0+offset0_3]
    movu        m7, [r2+offset1_3]
    psubw       m4, m7
%if %%n > 1
    ; step both pointers past the rows just read
    lea         r0, [r0+r1*(%2/%%n)]
    lea         r2, [r2+r3*(%2/%%n)]
%endif
    ; square and pairwise-add the differences, then accumulate
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1
%if %%n > 1
    dec         r4d
    jg .loop
%endif
    HADDD       m0, m5                  ; horizontal sum, m5 is scratch
    movd        eax, xm0
%ifidn movu,movq ; detect MMX
    EMMS
%endif
    RET
%endmacro
115
;-----------------------------------------------------------------------------
; SSD_TWO width, height
;   Emits pixel_ssd_ss_<width>x<height> for wide rows of 16-bit samples
;   (instantiated in this file for widths 48 and 64).
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows are handled per loop pass; each row is walked in steps of two
;   16-byte vectors (16 samples).
;-----------------------------------------------------------------------------
%macro SSD_TWO 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    lea         r5, [r1 * 2]            ; two-row step, block 0
    lea         r6, [r3 * 2]            ; two-row step, block 1
    mov         r4d, %2/2               ; passes = height / 2
    pxor        m0, m0                  ; m0 = running dword sums
.loop:
%assign %%row 0
%rep 2
  %if %%row == 0
    %define %%src0 r0
    %define %%src1 r2
  %else
    %define %%src0 r0 + r1
    %define %%src1 r2 + r3
  %endif
  %assign %%x 0
  %rep %1/16
    movu        m1, [%%src0 + %%x]
    movu        m2, [%%src0 + %%x + 16]
    movu        m6, [%%src1 + %%x]
    movu        m7, [%%src1 + %%x + 16]
    psubw       m1, m6
    psubw       m2, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    paddd       m1, m2
    paddd       m0, m1
    %assign %%x %%x + 32
  %endrep
  %assign %%row %%row + 1
%endrep
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz .loop
    HADDD       m0, m5                  ; horizontal sum, m5 is scratch
    movd        eax, xm0
    RET
%endmacro
;-----------------------------------------------------------------------------
; SSD_24 width, height
;   Emits pixel_ssd_ss_<width>x<height> for rows of 24 16-bit samples
;   (three 16-byte vectors per row).
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows are handled per loop pass.
;-----------------------------------------------------------------------------
%macro SSD_24 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    lea         r5, [r1 * 2]            ; two-row step, block 0
    lea         r6, [r3 * 2]            ; two-row step, block 1
    mov         r4d, %2/2               ; passes = height / 2
    pxor        m0, m0                  ; m0 = running dword sums
.loop:
%assign %%row 0
%rep 2
  %if %%row == 0
    %define %%src0 r0
    %define %%src1 r2
  %else
    %define %%src0 r0 + r1
    %define %%src1 r2 + r3
  %endif
    movu        m1, [%%src0]
    movu        m2, [%%src0 + 16]
    movu        m3, [%%src0 + 32]
    movu        m5, [%%src1]
    movu        m6, [%%src1 + 16]
    movu        m7, [%%src1 + 32]
    psubw       m1, m5
    psubw       m2, m6
    psubw       m3, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    paddd       m1, m2
    paddd       m1, m3
    paddd       m0, m1
  %assign %%row %%row + 1
%endrep
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz .loop
    HADDD       m0, m5                  ; horizontal sum, m5 is scratch
    movd        eax, xm0
    RET
%endmacro
;-----------------------------------------------------------------------------
; SSD_12 width, height
;   Emits pixel_ssd_ss_<width>x<height> for rows of 12 16-bit samples.
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Four rows are handled per loop pass.  Samples 0-7 of a row fill one
;   vector; samples 8-11 of two adjacent rows are packed into the low and high
;   halves of a shared vector.
;
;   The high half is loaded with movhps rather than "punpcklqdq reg, [mem]":
;   the legacy-SSE punpcklqdq memory form requires a 16-byte aligned address
;   and nominally reads 16 bytes, whereas every other load in this routine is
;   unaligned-safe.  movhps reads exactly the 8 bytes that are needed and has
;   no alignment requirement; the resulting register contents are identical.
;-----------------------------------------------------------------------------
%macro SSD_12 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; m0 = running dword sums
    mov         r4d, %2/4               ; passes = height / 4
    lea         r5, [r1 * 2]            ; two-row step, block 0
    lea         r6, [r3 * 2]            ; two-row step, block 1
.loop:
    ; ---- rows 0 and 1 ----
    movu        m1, [r0]                ; row 0, samples 0-7
    movh        m2, [r0 + 16]           ; row 0, samples 8-11 (low half)
    movu        m3, [r0 + r1]           ; row 1, samples 0-7
    movhps      m2, [r0 + r1 + 16]      ; row 1, samples 8-11 (high half)
    movu        m7, [r2]
    psubw       m1, m7
    movh        m4, [r2 + 16]
    movu        m7, [r2 + r3]
    psubw       m3, m7
    movhps      m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3                  ; m3 is folded into m0 after rows 2/3
    paddd       m1, m2
    paddd       m0, m1

    ; ---- rows 2 and 3 (pointers advance by two rows on the way) ----
    movu        m1, [r0 + r5]           ; row 2, samples 0-7
    movh        m2, [r0 + r5 + 16]      ; row 2, samples 8-11
    lea         r0, [r0 + r5]
    movu        m6, [r0 + r1]           ; row 3, samples 0-7
    movhps      m2, [r0 + r1 + 16]      ; row 3, samples 8-11
    movu        m7, [r2 + r6]
    psubw       m1, m7
    movh        m4, [r2 + r6 + 16]
    lea         r2, [r2 + r6]
    movu        m7, [r2 + r3]
    psubw       m6, m7
    movhps      m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m6, m6
    paddd       m1, m2
    paddd       m3, m6                  ; row 1 + row 3 partial sums
    paddd       m0, m1
    paddd       m0, m3
    lea         r0, [r0 + r5]           ; on to row 4
    lea         r2, [r2 + r6]
    dec         r4d
    jnz .loop
    HADDD       m0, m5                  ; horizontal sum, m5 is scratch
    movd        eax, xm0
    RET
%endmacro
; Instantiate the high-bit-depth pixel_ssd_ss_WxH functions.
; Each INIT_* line selects the register set / cpu suffix used by the macro
; invocations that follow it.

; MMX-register versions (mmx2 suffix)
315INIT_MMX mmx2
316SSD_ONE 4, 4
317SSD_ONE 4, 8
318SSD_ONE 4, 16
319SSD_ONE 8, 4
320SSD_ONE 8, 8
321SSD_ONE 8, 16
322SSD_ONE 16, 8
323SSD_ONE 16, 16
; XMM-register versions (sse2 suffix); widths 12, 24, 48 and 64 use the
; dedicated SSD_12 / SSD_24 / SSD_TWO macros
324INIT_XMM sse2
325SSD_ONE 8, 4
326SSD_ONE 8, 8
327SSD_ONE 8, 16
328SSD_ONE 8, 32
329SSD_12 12, 16
330SSD_ONE 16, 4
331SSD_ONE 16, 8
332SSD_ONE 16, 12
333SSD_ONE 16, 16
334SSD_ONE 16, 32
335SSD_ONE 16, 64
336SSD_24 24, 32
337SSD_ONE 32, 8
338SSD_ONE 32, 16
339SSD_ONE 32, 24
340SSD_ONE 32, 32
341SSD_ONE 32, 64
342SSD_TWO 48, 64
343SSD_TWO 64, 16
344SSD_TWO 64, 32
345SSD_TWO 64, 48
346SSD_TWO 64, 64
; YMM-register versions (avx2 suffix)
347INIT_YMM avx2
348SSD_ONE 16, 8
349SSD_ONE 16, 16
350%endif ; HIGH_BIT_DEPTH
351
352;-----------------------------------------------------------------------------
353; int pixel_ssd_ss_WxH( int16_t *, intptr_t, int16_t *, intptr_t )
354;-----------------------------------------------------------------------------
355%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; SSD_SS width, height
;   Emits pixel_ssd_ss_<width>x<height> (8-bit build): sum of squared
;   differences between two blocks of 16-bit values.
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;        (a row step is 2*stride bytes, so strides are used as counts of
;        16-bit elements here)
;   Out: eax = SSD
;   Four loads per block are issued per loop pass.  Width 4 uses half-vector
;   loads (movh), every other width full unaligned loads (movu).
;-----------------------------------------------------------------------------
%macro SSD_SS 2
cglobal pixel_ssd_ss_%1x%2, 4,7,6
    FIX_STRIDES r1, r3
%if mmsize == %1*4 || mmsize == %1*2
    ; one load per row -> rows 0..3 at byte offsets 0, 2*s, 4*s, 6*s
    lea         r5, [4*r1]
    lea         r5, [r5 + 2*r1]         ; r5 = 6*r1
    lea         r6, [4*r3]
    lea         r6, [r6 + 2*r3]         ; r6 = 6*r3
    %define offset0_1 r1*2
    %define offset0_2 r1*4
    %define offset0_3 r5
    %define offset1_1 r3*2
    %define offset1_2 r3*4
    %define offset1_3 r6
%elif mmsize == %1
    ; two loads per row -> rows 0..1
    %define offset0_1 16
    %define offset0_2 r1*2
    %define offset0_3 r1*2+16
    %define offset1_1 16
    %define offset1_2 r3*2
    %define offset1_3 r3*2+16
%endif
%if %1 == 4
    %assign %%n %2/(mmsize/%1)
    %define %%load movh
%else
    %assign %%n %2/(2*mmsize/%1)
    %define %%load movu
%endif
    pxor        m0, m0                  ; m0 = running dword sums
%if %%n > 1
    mov         r4d, %%n                ; number of passes
%endif
.loop:
    %%load      m1, [r0]
    %%load      m2, [r2]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %%load      m1, [r0 + offset0_1]
    %%load      m2, [r2 + offset1_1]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %%load      m1, [r0 + offset0_2]
    %%load      m2, [r2 + offset1_2]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %%load      m1, [r0 + offset0_3]
    %%load      m2, [r2 + offset1_3]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    ; advance both pointers by the rows consumed in this pass
    lea         r0, [r0+r1*(%2/%%n)*2]
    lea         r2, [r2+r3*(%2/%%n)*2]
%if %%n > 1
    dec         r4d
    jg .loop
%endif
%if %1 != 4
    HADDD       m0, m1
%else
    ; movh loads leave only the two low dwords populated
  %if cpuflag(ssse3)
    phaddd      m0, m0
  %else
    pshufd      m1, m0, 1
    paddd       m0, m1
  %endif
%endif
    movd        eax, m0
    RET
%endmacro
; SSD_SS_ONE: instantiate SSD_SS for every block size of width 4, 8 and 16
; (for whichever cpu flavour is currently selected by INIT_*).
450%macro SSD_SS_ONE 0
451SSD_SS 4, 4
452SSD_SS 4, 8
453SSD_SS 4, 16
454SSD_SS 8, 4
455SSD_SS 8, 8
456SSD_SS 8, 16
457SSD_SS 8, 32
458SSD_SS 16, 4
459SSD_SS 16, 8
460SSD_SS 16, 12
461SSD_SS 16, 16
462SSD_SS 16, 32
463SSD_SS 16, 64
464%endmacro
465
;-----------------------------------------------------------------------------
; SSD_SS_12x16
;   Emits pixel_ssd_ss_12x16: SSD of two 12x16 blocks of 16-bit values.
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows per loop pass.  The second vector of each row is loaded in full
;   but only its low half (values 8-11) is kept.
;-----------------------------------------------------------------------------
%macro SSD_SS_12x16 0
cglobal pixel_ssd_ss_12x16, 4,7,6
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; m0 = running dword sums
    mov         r4d, 8                  ; 16 rows, two per pass
.loop:
%rep 2
    ; values 0-7
    movu        m1, [r0]
    movu        m2, [r2]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    ; values 8-11: shift left then right by 8 bytes to clear the upper half
    movu        m1, [r0 + 16]
    movu        m2, [r2 + 16]
    psubw       m1, m2
    pmaddwd     m1, m1
    pslldq      m1, 8
    psrldq      m1, 8
    paddd       m0, m1
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
506
;-----------------------------------------------------------------------------
; SSD_SS_32 height
;   Emits pixel_ssd_ss_32x<height>: SSD of two blocks of 16-bit values,
;   32 values (four 16-byte vectors) per row.
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows per loop pass.
;-----------------------------------------------------------------------------
%macro SSD_SS_32 1
cglobal pixel_ssd_ss_32x%1, 4,7,6
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; m0 = running dword sums
    mov         r4d, %1/2               ; passes = height / 2
.loop:
%rep 2
  %assign %%off 0
  %rep 4
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
563
; SSD_SS_32xN: instantiate SSD_SS_32 for every supported height of a
; 32-wide block.
564%macro SSD_SS_32xN 0
565SSD_SS_32 8
566SSD_SS_32 16
567SSD_SS_32 24
568SSD_SS_32 32
569SSD_SS_32 64
570%endmacro
571
;-----------------------------------------------------------------------------
; SSD_SS_24
;   Emits pixel_ssd_ss_24x32: SSD of two 24x32 blocks of 16-bit values
;   (three 16-byte vectors per row).
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows per loop pass.
;-----------------------------------------------------------------------------
%macro SSD_SS_24 0
cglobal pixel_ssd_ss_24x32, 4,7,6
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; m0 = running dword sums
    mov         r4d, 16                 ; 32 rows, two per pass
.loop:
%rep 2
  %assign %%off 0
  %rep 3
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
618
;-----------------------------------------------------------------------------
; SSD_SS_48
;   Emits pixel_ssd_ss_48x64: SSD of two 48x64 blocks of 16-bit values
;   (six 16-byte vectors per row).
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows per loop pass.
;-----------------------------------------------------------------------------
%macro SSD_SS_48 0
cglobal pixel_ssd_ss_48x64, 4,7,6
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; m0 = running dword sums
    mov         r4d, 32                 ; 64 rows, two per pass
.loop:
%rep 2
  %assign %%off 0
  %rep 6
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
695
;-----------------------------------------------------------------------------
; SSD_SS_64 height
;   Emits pixel_ssd_ss_64x<height>: SSD of two blocks of 16-bit values,
;   64 values (eight 16-byte vectors) per row.
;   In:  r0/r1 = block 0 / stride, r2/r3 = block 1 / stride
;   Out: eax = SSD
;   Two rows per loop pass.
;-----------------------------------------------------------------------------
%macro SSD_SS_64 1
cglobal pixel_ssd_ss_64x%1, 4,7,6
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; m0 = running dword sums
    mov         r4d, %1/2               ; passes = height / 2
.loop:
%rep 2
  %assign %%off 0
  %rep 8
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
792
; SSD_SS_64xN: instantiate SSD_SS_64 for every supported height of a
; 64-wide block.
793%macro SSD_SS_64xN 0
794SSD_SS_64 16
795SSD_SS_64 32
796SSD_SS_64 48
797SSD_SS_64 64
798%endmacro
799
; Instantiate the complete set of pixel_ssd_ss_WxH functions (8-bit build)
; once per cpu flavour: sse2, sse4 and avx.  The macro list is identical for
; all three; only the INIT_XMM selection differs.
800INIT_XMM sse2
801SSD_SS_ONE
802SSD_SS_12x16
803SSD_SS_24
804SSD_SS_32xN
805SSD_SS_48
806SSD_SS_64xN
807INIT_XMM sse4
808SSD_SS_ONE
809SSD_SS_12x16
810SSD_SS_24
811SSD_SS_32xN
812SSD_SS_48
813SSD_SS_64xN
814INIT_XMM avx
815SSD_SS_ONE
816SSD_SS_12x16
817SSD_SS_24
818SSD_SS_32xN
819SSD_SS_48
820SSD_SS_64xN
821%endif ; !HIGH_BIT_DEPTH
822
823%if HIGH_BIT_DEPTH == 0
; SSD_LOAD_FULL off0a, off2a, off0b, off2b, advance
;   Loads two full vectors from each source: m1 = [t0+%1], m2 = [t2+%2],
;   m3 = [t0+%3], m4 = [t2+%4].
;   %5 selects the pointer step afterwards: 1 = one stride (t1 / t3),
;   2 = two strides, any other value = no step.
;   NOTE(review): the loads use mova, so both sources are presumably required
;   to be vector-aligned — confirm against callers.
824%macro SSD_LOAD_FULL 5
825 mova m1, [t0+%1]
826 mova m2, [t2+%2]
827 mova m3, [t0+%3]
828 mova m4, [t2+%4]
829%if %5==1
830 add t0, t1
831 add t2, t3
832%elif %5==2
833 lea t0, [t0+2*t1]
834 lea t2, [t2+2*t3]
835%endif
836%endmacro
837
; LOAD dst_a, dst_b, mem_a, mem_b, advance
;   Half-vector loads (movh) of %3 into m%1 and %4 into m%2.
;   If %5 is non-zero, t0 is then stepped forward by two strides (2*t1).
838%macro LOAD 5
839 movh m%1, %3
840 movh m%2, %4
841%if %5
842 lea t0, [t0+2*t1]
843%endif
844%endmacro
845
; JOIN a, b, tmp_a, tmp_b, mem_a, mem_b, advance
;   Loads the second-source halves %5 / %6 into m%3 / m%4, optionally steps t2
;   by two strides (%7 non-zero), then widens all four registers to words by
;   interleaving with m7 and leaves the word differences in m%1 and m%2.
;   Both operands of each psubw receive the same m7 bytes in their high byte
;   lanes, so those bytes cancel in the subtraction.
846%macro JOIN 7
847 movh m%3, %5
848 movh m%4, %6
849%if %7
850 lea t2, [t2+2*t3]
851%endif
852 punpcklbw m%1, m7
853 punpcklbw m%3, m7
854 psubw m%1, m%3
855 punpcklbw m%2, m7
856 punpcklbw m%4, m7
857 psubw m%2, m%4
858%endmacro
859
; JOIN_SSE2 a, b, tmp_a, tmp_b, mem_a, mem_b, advance
;   SSE2 replacement for JOIN.  Loads %5 / %6 into m%3 / m%4, optionally steps
;   t2 by two strides (%7 non-zero), packs each pair of half rows into one
;   register (punpcklqdq), then runs DEINTB with register 7 and subtracts,
;   leaving the differences in m%2 and m%1.
;   NOTE(review): DEINTB is defined outside this file; presumably it splits
;   even/odd bytes into word lanes using the mask in m7 — confirm.
860%macro JOIN_SSE2 7
861 movh m%3, %5
862 movh m%4, %6
863%if %7
864 lea t2, [t2+2*t3]
865%endif
866 punpcklqdq m%1, m%2
867 punpcklqdq m%3, m%4
868 DEINTB %2, %1, %4, %3, 7
869 psubw m%2, m%4
870 psubw m%1, m%3
871%endmacro
872
; JOIN_SSSE3 a, b, tmp_a, tmp_b, mem_a, mem_b, advance
;   SSSE3 replacement for JOIN.  Loads %5 / %6 into m%3 / m%4, optionally
;   steps t2 by two strides (%7 non-zero), then byte-interleaves each first-
;   source register with its second-source counterpart.  No subtraction is
;   done here; SSD_CORE_SSSE3 consumes the interleaved bytes with pmaddubsw.
873%macro JOIN_SSSE3 7
874 movh m%3, %5
875 movh m%4, %6
876%if %7
877 lea t2, [t2+2*t3]
878%endif
879 punpcklbw m%1, m%3
880 punpcklbw m%2, m%4
881%endmacro
882
; LOAD_AVX2 dst, unused, mem_lo, mem_hi, advance
;   AVX2 replacement for LOAD.  Loads %3 into the low 128 bits of m%1 and
;   inserts %4 as the high 128 bits.  %2 is not referenced.
;   If %5 is non-zero, t0 is then stepped forward by two strides (2*t1).
883%macro LOAD_AVX2 5
884 mova xm%1, %3
885 vinserti128 m%1, m%1, %4, 1
886%if %5
887 lea t0, [t0+2*t1]
888%endif
889%endmacro
890
; JOIN_AVX2 a, b, tmp, unused, mem_lo, mem_hi, advance
;   AVX2 replacement for JOIN.  Loads %5 / %6 into the low / high 128 bits of
;   m%2, optionally steps t2 by two strides (%7 non-zero), then applies
;   SBUTTERFLY bw to m%1 and m%2 with m%3 as scratch.  %4 is not referenced.
;   NOTE(review): SBUTTERFLY is defined outside this file; presumably it
;   byte-interleaves the two registers into low/high halves — confirm.
891%macro JOIN_AVX2 7
892 mova xm%2, %5
893 vinserti128 m%2, m%2, %6, 1
894%if %7
895 lea t2, [t2+2*t3]
896%endif
897 SBUTTERFLY bw, %1, %2, %3
898%endmacro
899
; SSD_LOAD_HALF off0a, off2a, off0b, off2b, advance
;   Half-vector load path: two LOAD/JOIN rounds fill m1..m4 (m3..m6 serve as
;   scratch for the JOINs).  The first round always steps t0 and t2 by two
;   strides; the second round steps them only if %5 is non-zero.
;   LOAD and JOIN may be re-%defined to cpu-specific variants before this
;   macro is expanded.
900%macro SSD_LOAD_HALF 5
901 LOAD 1, 2, [t0+%1], [t0+%3], 1
902 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
903 LOAD 3, 4, [t0+%1], [t0+%3], %5
904 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
905%endmacro
906
; SSD_CORE a, b, c, d, zero, tmp0, tmp1 [, mode]
;   Baseline squaring step.  With mode == FULL, m%1/m%2 and m%3/m%4 hold raw
;   byte vectors from the two sources: the per-byte absolute difference is
;   formed (saturating subtract in both directions, OR-ed together) and then
;   widened to words by unpacking against m%5, low halves into m%2 / m%4 and
;   high halves into m%1 / m%3.
;   For any other mode the four registers are used as they arrive.
;   Finally each register is squared and pairwise-summed with pmaddwd.
907%macro SSD_CORE 7-8
908%ifidn %8, FULL
909 mova m%6, m%2
910 mova m%7, m%4
911 psubusb m%2, m%1
912 psubusb m%4, m%3
913 psubusb m%1, m%6
914 psubusb m%3, m%7
915 por m%1, m%2
916 por m%3, m%4
917 punpcklbw m%2, m%1, m%5
918 punpckhbw m%1, m%5
919 punpcklbw m%4, m%3, m%5
920 punpckhbw m%3, m%5
921%endif
922 pmaddwd m%1, m%1
923 pmaddwd m%2, m%2
924 pmaddwd m%3, m%3
925 pmaddwd m%4, m%4
926%endmacro
927
; SSD_CORE_SSE2 a, b, c, d, mask, tmp0, tmp1 [, mode]
;   SSE2 replacement for SSD_CORE.  With mode == FULL, each source pair
;   (m%1,m%2) and (m%3,m%4) is run through DEINTB with mask register %5 and
;   subtracted as words; SWAP then renames the registers so the differences
;   end up under the names %1..%4.
;   For any other mode the four registers are used as they arrive.
;   Finally each register is squared and pairwise-summed with pmaddwd.
;   NOTE(review): DEINTB is defined outside this file; presumably it splits
;   even/odd bytes into word lanes — confirm.
928%macro SSD_CORE_SSE2 7-8
929%ifidn %8, FULL
930 DEINTB %6, %1, %7, %2, %5
931 psubw m%6, m%7
932 psubw m%1, m%2
933 SWAP %6, %2, %1
934 DEINTB %6, %3, %7, %4, %5
935 psubw m%6, m%7
936 psubw m%3, m%4
937 SWAP %6, %4, %3
938%endif
939 pmaddwd m%1, m%1
940 pmaddwd m%2, m%2
941 pmaddwd m%3, m%3
942 pmaddwd m%4, m%4
943%endmacro
944
; SSD_CORE_SSSE3 a, b, c, d, coef, tmp0, tmp1 [, mode]
;   SSSE3 replacement for SSD_CORE.  With mode == FULL, the byte vectors of
;   each source pair are interleaved (high halves into m%6 / m%7, low halves
;   in place) and SWAP renames the registers so the four interleaved vectors
;   carry the names %1..%4.
;   For any other mode the registers are expected to arrive already
;   interleaved.
;   pmaddubsw with m%5 then combines each interleaved byte pair into one word,
;   and pmaddwd squares and pairwise-sums those words.
;   NOTE(review): m%5 is loaded from hsub_mul by the caller; presumably it
;   holds +1/-1 byte pairs so pmaddubsw yields the differences — confirm.
945%macro SSD_CORE_SSSE3 7-8
946%ifidn %8, FULL
947 punpckhbw m%6, m%1, m%2
948 punpckhbw m%7, m%3, m%4
949 punpcklbw m%1, m%2
950 punpcklbw m%3, m%4
951 SWAP %6, %2, %3
952 SWAP %7, %4
953%endif
954 pmaddubsw m%1, m%5
955 pmaddubsw m%2, m%5
956 pmaddubsw m%3, m%5
957 pmaddubsw m%4, m%5
958 pmaddwd m%1, m%1
959 pmaddwd m%2, m%2
960 pmaddwd m%3, m%3
961 pmaddwd m%4, m%4
962%endmacro
963
; SSD_ITER mode, off0a, off2a, off0b, off2b, advance
;   One accumulation step: loads via SSD_LOAD_<mode> (FULL or HALF), squares
;   through SSD_CORE (m7 = constant register, m5/m6 = scratch) and adds the
;   four partial sums m1..m4 into the accumulator m0.
;   SSD_CORE may be re-%defined to a cpu-specific variant before expansion.
964%macro SSD_ITER 6
965 SSD_LOAD_%1 %2,%3,%4,%5,%6
966 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
967 paddd m1, m2
968 paddd m3, m4
969 paddd m0, m1
970 paddd m0, m3
971%endmacro
972
973;-----------------------------------------------------------------------------
974; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
975;-----------------------------------------------------------------------------
; SSD width, height
;   Emits pixel_ssd_<width>x<height> for 8-bit pixels.
;   The entry point only loads the loop trip count into al.  Non-square sizes
;   then jump to the .startloop label of the square <width>x<width> variant
;   with the same cpu suffix, so every height of a given width shares one loop
;   body; only the square variant contains the loop itself.
;   Result is returned in eax.
976%macro SSD 2
977%if %1 != %2
978 %assign function_align 8
979%else
980 %assign function_align 16
981%endif
982cglobal pixel_ssd_%1x%2, 0,0,0
; al = number of loop passes (each pass consumes 2*mmsize pixels per source)
983 mov al, %1*%2/mmsize/2
984
985%if %1 != %2
986 jmp mangle(x265_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
987%else
988
989.startloop:
990%if ARCH_X86_64
; t0..t3 alias r0..r3 directly
991 DECLARE_REG_TMP 0,1,2,3
992 PROLOGUE 0,0,8
993%else
; t0..t3 alias r1..r4 and are filled from the r0m..r3m argument operands
994 PROLOGUE 0,5
995 DECLARE_REG_TMP 1,2,3,4
996 mov t0, r0m
997 mov t1, r1m
998 mov t2, r2m
999 mov t3, r3m
1000%endif
1001
; m7 = constant used by the selected JOIN / SSD_CORE variant
1002%if cpuflag(ssse3)
1003 mova m7, [hsub_mul]
1004%elifidn cpuname, sse2
1005 mova m7, [pw_00ff]
1006%elif %1 >= mmsize
1007 pxor m7, m7
1008%endif
; m0 = running dword sums
1009 pxor m0, m0
1010
1011ALIGN 16
1012.loop:
; row wider than one vector: two vectors of one row per pass
1013%if %1 > mmsize
1014 SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
; row exactly one vector: two rows per pass
1015%elif %1 == mmsize
1016 SSD_ITER FULL, 0, 0, t1, t3, 2
; row narrower than one vector: half-vector loads
1017%else
1018 SSD_ITER HALF, 0, 0, t1, t3, 2
1019%endif
1020 dec al
1021 jg .loop
; fold the upper 128-bit lane first when using 256-bit registers
1022%if mmsize==32
1023 vextracti128 xm1, m0, 1
1024 paddd xm0, xm1
1025 HADDD xm0, xm1
1026 movd eax, xm0
1027%else
1028 HADDD m0, m1
1029 movd eax, m0
1030%endif
1031%if (mmsize == 8)
1032 emms
1033%endif
1034 RET
1035%endif
1036%endmacro
1037
; HEVC_SSD: instantiate SSD for the block sizes listed below, for whichever
; cpu flavour is currently selected.  Every non-square size jumps into the
; square variant of the same width, so each width used here also has its
; square size in the list.
1038%macro HEVC_SSD 0
1039SSD 32, 64
1040SSD 16, 64
1041SSD 32, 32
1042SSD 32, 16
1043SSD 16, 32
1044SSD 32, 8
1045SSD 8, 32
1046SSD 32, 24
1047SSD 24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol
1048SSD 8, 4
1049SSD 8, 8
1050SSD 16, 16
1051SSD 16, 12
1052SSD 16, 8
1053SSD 8, 16
1054SSD 16, 4
1055%endmacro
1056
; Instantiate pixel_ssd_WxH (8-bit pixels) per cpu flavour.
; The %define lines swap in cpu-specific SSD_CORE / JOIN / LOAD helpers; a
; redefinition stays in force for every later instantiation until it is
; redefined again.

; baseline helpers, MMX registers
1057INIT_MMX mmx
1058SSD 16, 16
1059SSD 16, 8
1060SSD 8, 8
1061SSD 8, 16
1062SSD 4, 4
1063SSD 8, 4
1064SSD 4, 8
1065SSD 4, 16
; baseline helpers, XMM registers
1066INIT_XMM sse2slow
1067SSD 16, 16
1068SSD 8, 8
1069SSD 16, 8
1070SSD 8, 16
1071SSD 8, 4
; SSE2 helpers
1072INIT_XMM sse2
1073%define SSD_CORE SSD_CORE_SSE2
1074%define JOIN JOIN_SSE2
1075HEVC_SSD
; SSSE3 helpers (also used by the avx, mmx/ssse3, xop and avx2 sets below)
1076INIT_XMM ssse3
1077%define SSD_CORE SSD_CORE_SSSE3
1078%define JOIN JOIN_SSSE3
1079HEVC_SSD
1080INIT_XMM avx
1081HEVC_SSD
1082INIT_MMX ssse3
1083SSD 4, 4
1084SSD 4, 8
1085SSD 4, 16
1086INIT_XMM xop
1087SSD 16, 16
1088SSD 8, 8
1089SSD 16, 8
1090SSD 8, 16
1091SSD 8, 4
; AVX2: 256-bit load/join helpers
1092%define LOAD LOAD_AVX2
1093%define JOIN JOIN_AVX2
1094INIT_YMM avx2
1095SSD 16, 16
1096SSD 16, 8
; the SSD macro changes function_align per size; set it back to 16
1097%assign function_align 16
1098%endif ; !HIGH_BIT_DEPTH
1099
1100;-----------------------------------------------------------------------------
1101; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1102;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 12x16 block
; Each loop pass covers four rows as two identical two-row steps.  Pixels 0-7
; of a row are widened directly; pixels 8-11 of the two rows of a step are
; first gathered into the low half of one register (punpckhdq).
cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2

    mov         r4d, 4                  ; 16 rows, four per pass
    pxor        m6, m6                  ; m6 = running dword sums

.loop:
%rep 2
    movu        m0, [r0]
    movu        m2, [r0 + r1]
    movu        m1, [r2]
    movu        m3, [r2 + r3]

    ; low 8 bytes of m4/m5 = pixels 8-11 of both rows
    punpckhdq   m4, m0, m2
    punpckhdq   m5, m1, m3

    pmovzxbw    m0, m0
    pmovzxbw    m2, m2
    pmovzxbw    m4, m4
    pmovzxbw    m1, m1
    pmovzxbw    m3, m3
    pmovzxbw    m5, m5

    psubw       m0, m1
    psubw       m2, m3
    psubw       m4, m5

    pmaddwd     m0, m0
    pmaddwd     m2, m2
    pmaddwd     m4, m4

    paddd       m0, m2
    paddd       m0, m4
    paddd       m6, m0

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz .loop

    HADDD       m6, m1
    movd        eax, m6

    RET
1175
1176;-----------------------------------------------------------------------------
1177; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
1178;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 24x32 block
; Each loop pass covers two rows, one row per unrolled step: pixels 0-15 come
; from one 16-byte load (low half zero-extended, high half unpacked against
; zero), pixels 16-23 from an 8-byte zero-extending load.
cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2

    pxor        m6, m6                  ; m6 = zero, for punpckhbw
    pxor        m7, m7                  ; m7 = running dword sums
    mov         r4d, 16                 ; 32 rows, two per pass

.loop:
%rep 2
    movu        m1, [r0]
    movu        m4, [r2]
    pmovzxbw    m0, m1
    pmovzxbw    m3, m4
    punpckhbw   m1, m6
    punpckhbw   m4, m6
    pmovzxbw    m2, [r0 + 16]
    pmovzxbw    m5, [r2 + 16]

    psubw       m0, m3
    psubw       m1, m4
    psubw       m2, m5

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2

    paddd       m0, m1
    paddd       m0, m2
    paddd       m7, m0

    lea         r0, [r0 + r1]
    lea         r2, [r2 + r3]
%endrep

    dec         r4d
    jnz .loop

    HADDD       m7, m1
    movd        eax, m7

    RET
1238
; PIXEL_SSD_16x4
;   Adds the SSD of a 16-pixel-wide, 4-row strip of 8-bit pixels to m7.
;   In:  r0/r1 = src1 / stride1, r2/r3 = src2 / stride2,
;        r6 = value added to r0 to move down two rows (callers in this file
;        load it with 2*r1), m6 = zero, m7 = accumulator
;   Out: m7 updated; r0 and r2 are left pointing at the THIRD row of the strip
;        (advanced by two rows), which callers rely on.
;   Clobbers m0-m5.
%macro PIXEL_SSD_16x4 0
%assign %%pair 0
%rep 2
  %if %%pair == 1
    ; move on to rows 2 and 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
  %endif
    ; first row of the pair
    movu        m1, [r0]
    movu        m3, [r2]
    pmovzxbw    m0, m1
    pmovzxbw    m2, m3
    punpckhbw   m1, m6
    punpckhbw   m3, m6
    psubw       m0, m2
    psubw       m1, m3

    ; second row of the pair
    movu        m5, [r0 + r1]
    movu        m3, [r2 + r3]
    pmovzxbw    m4, m5
    pmovzxbw    m2, m3
    punpckhbw   m5, m6
    punpckhbw   m3, m6
    psubw       m4, m2
    psubw       m5, m3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5

    paddd       m0, m1
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
  %assign %%pair %%pair + 1
%endrep
%endmacro
1302
; pixel_ssd_16x16_internal
;   Helper reached via "call" from the wide pixel_ssd_* functions: adds the SSD
;   of one 16x16 block of 8-bit pixels to m7 using four PIXEL_SSD_16x4 strips.
;   In:  r0/r1 = src1 / stride1, r2/r3 = src2 / stride2, r6 = two-row step
;        for r0, m6 = zero, m7 = accumulator
;   Out: m7 updated; r0 and r2 are left on row 14 of the block.
cglobal pixel_ssd_16x16_internal
    PIXEL_SSD_16x4
%rep 3
    ; the strip leaves the pointers on its third row; skip two more rows
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_16x4
%endrep
    ret
1315
1316;-----------------------------------------------------------------------------
1317; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
1318;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 48x64 block
; The block is walked as three 16-pixel columns of four 16x16 tiles each.
cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m6, m6                  ; zero register for the helper
    pxor        m7, m7                  ; m7 = running dword sums
    mov         r4, r0                  ; keep the block origins
    mov         r5, r2
    lea         r6, [r1 * 2]            ; two-row step for src1

%assign ssd48x64_col 0
%rep 3
  %if ssd48x64_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd48x64_col]
    lea         r2, [r5 + ssd48x64_col]
  %endif
    call pixel_ssd_16x16_internal
  %rep 3
    ; the helper stops on row 14 of its tile; step to the next tile
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call pixel_ssd_16x16_internal
  %endrep
  %assign ssd48x64_col ssd48x64_col + 16
%endrep
%undef ssd48x64_col

    HADDD       m7, m1
    movd        eax, m7

    RET
1367
1368;-----------------------------------------------------------------------------
1369; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1370;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 64x16 block
; The block is walked as four side-by-side 16x16 tiles.
cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m6, m6                  ; zero register for the helper
    pxor        m7, m7                  ; m7 = running dword sums
    mov         r4, r0                  ; keep the block origins
    mov         r5, r2
    lea         r6, [r1 * 2]            ; two-row step for src1

%assign ssd64x16_col 0
%rep 4
  %if ssd64x16_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd64x16_col]
    lea         r2, [r5 + ssd64x16_col]
  %endif
    call pixel_ssd_16x16_internal
  %assign ssd64x16_col ssd64x16_col + 16
%endrep
%undef ssd64x16_col

    HADDD       m7, m1
    movd        eax, m7

    RET
1395
1396;-----------------------------------------------------------------------------
1397; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
1398;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 64x32 block
; The block is walked as four 16-pixel columns of two 16x16 tiles each.
cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m6, m6                  ; zero register for the helper
    pxor        m7, m7                  ; m7 = running dword sums
    mov         r4, r0                  ; keep the block origins
    mov         r5, r2
    lea         r6, [r1 * 2]            ; two-row step for src1

%assign ssd64x32_col 0
%rep 4
  %if ssd64x32_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd64x32_col]
    lea         r2, [r5 + ssd64x32_col]
  %endif
    call pixel_ssd_16x16_internal
    ; the helper stops on row 14 of its tile; step to the next tile
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call pixel_ssd_16x16_internal
  %assign ssd64x32_col ssd64x32_col + 16
%endrep
%undef ssd64x32_col

    HADDD       m7, m1
    movd        eax, m7

    RET
1435
1436;-----------------------------------------------------------------------------
1437; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
1438;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 64x48 block
; The block is walked as four 16-pixel columns of three 16x16 tiles each.
cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m6, m6                  ; zero register for the helper
    pxor        m7, m7                  ; m7 = running dword sums
    mov         r4, r0                  ; keep the block origins
    mov         r5, r2
    lea         r6, [r1 * 2]            ; two-row step for src1

%assign ssd64x48_col 0
%rep 4
  %if ssd64x48_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd64x48_col]
    lea         r2, [r5 + ssd64x48_col]
  %endif
    call pixel_ssd_16x16_internal
  %rep 2
    ; the helper stops on row 14 of its tile; step to the next tile
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call pixel_ssd_16x16_internal
  %endrep
  %assign ssd64x48_col ssd64x48_col + 16
%endrep
%undef ssd64x48_col

    HADDD       m7, m1
    movd        eax, m7

    RET
1487
1488;-----------------------------------------------------------------------------
1489; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
1490;-----------------------------------------------------------------------------
INIT_XMM sse4
; In:  r0 = src1, r1 = stride1, r2 = src2, r3 = stride2 (8-bit pixels)
; Out: eax = SSD over a 64x64 block
; The block is walked as four 16-pixel columns of four 16x16 tiles each.
cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m6, m6                  ; zero register for the helper
    pxor        m7, m7                  ; m7 = running dword sums
    mov         r4, r0                  ; keep the block origins
    mov         r5, r2
    lea         r6, [r1 * 2]            ; two-row step for src1

%assign ssd64x64_col 0
%rep 4
  %if ssd64x64_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd64x64_col]
    lea         r2, [r5 + ssd64x64_col]
  %endif
    call pixel_ssd_16x16_internal
  %rep 3
    ; the helper stops on row 14 of its tile; step to the next tile
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call pixel_ssd_16x16_internal
  %endrep
  %assign ssd64x64_col ssd64x64_col + 16
%endrep
%undef ssd64x64_col

    HADDD       m7, m1
    movd        eax, m7

    RET
1551
1552;-----------------------------------------------------------------------------
1553; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t )
1554;-----------------------------------------------------------------------------
1555
cglobal pixel_ssd_sp_4x4_internal
    ; Helper: adds the squared differences of one 4x4 tile to m7.
    ; In:   r0 = src1 (16-bit samples), r1 = src1 stride in bytes, r4 = 3 * r1
    ;       r2 = src2 (8-bit samples),  r3 = src2 stride in bytes
    ; Out:  m7 += per-lane dword sums; r2 advanced by 2 * r3; r0 unchanged
    ; Clobbers: m0-m6

    ; rows 0 and 1 packed into one register
    movd        m2, [r2]
    movd        m3, [r2 + r3]
    movh        m0, [r0]
    movh        m1, [r0 + r1]
    punpckldq   m2, m3                  ; 8 src2 bytes (row 0 | row 1)
    punpcklqdq  m0, m1                  ; 8 src1 words (row 0 | row 1)
    pmovzxbw    m2, m2
    psubw       m0, m2
    pmaddwd     m0, m0

    ; rows 2 and 3
    lea         r2, [r2 + 2 * r3]
    movd        m6, [r2]
    movd        m1, [r2 + r3]
    movh        m4, [r0 + 2 * r1]
    movh        m5, [r0 + r4]
    punpckldq   m6, m1
    punpcklqdq  m4, m5
    pmovzxbw    m6, m6
    psubw       m4, m6
    pmaddwd     m4, m4

    paddd       m0, m4
    paddd       m7, m0
    ret
1579
1580;-----------------------------------------------------------------------------
1581; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1582;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2
    ; SSD between a 4x4 block of 16-bit samples (src1) and 8-bit samples (src2).
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1, needed by the helper
    pxor    m7, m7                      ; m7 = accumulator
    call    pixel_ssd_sp_4x4_internal
    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1592
1593;-----------------------------------------------------------------------------
1594; int pixel_ssd_sp_4x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1595;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2
    ; SSD between a 4x8 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as two stacked 4x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1, needed by the helper
    pxor    m7, m7                      ; m7 = accumulator

    call    pixel_ssd_sp_4x4_internal
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 2 * r3]           ; helper already moved r2 down two rows
    call    pixel_ssd_sp_4x4_internal

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1608
1609;-----------------------------------------------------------------------------
1610; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1611;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2
    ; SSD between a 4x16 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as four stacked 4x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1, needed by the helper
    pxor    m7, m7                      ; m7 = accumulator

    call    pixel_ssd_sp_4x4_internal
%rep 3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 2 * r3]           ; helper already moved r2 down two rows
    call    pixel_ssd_sp_4x4_internal
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1630
cglobal pixel_ssd_sp_8x4_internal
    ; Helper: adds the squared differences of one 8x4 tile to m7.
    ; In:   r0 = src1 (16-bit samples), r1 = src1 stride in bytes, r4 = 3 * r1
    ;       r2 = src2 (8-bit samples),  r3 = src2 stride in bytes, r5 = 3 * r3
    ; Out:  m7 += per-lane dword sums; r0 and r2 are left unchanged
    ; Clobbers: m0-m5

    ; row 0
    movh        m2, [r2]
    movu        m0, [r0]
    pmovzxbw    m2, m2
    ; row 1
    movh        m3, [r2 + r3]
    movu        m1, [r0 + r1]
    pmovzxbw    m3, m3
    psubw       m0, m2
    psubw       m1, m3

    ; row 2 (m2/m3 are reused for the widened src2 rows)
    movh        m2, [r2 + 2 * r3]
    movu        m4, [r0 + 2 * r1]
    pmovzxbw    m2, m2
    ; row 3
    movh        m3, [r2 + r5]
    movu        m5, [r0 + r4]
    pmovzxbw    m3, m3
    psubw       m4, m2
    psubw       m5, m3

    ; square and fold the four rows together
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m0, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
    ret
1662
1663;-----------------------------------------------------------------------------
1664; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1665;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between an 8x4 block of 16-bit samples (src1) and 8-bit samples (src2).
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1 (bytes)
    lea     r5, [r3 + 2 * r3]           ; r5 = 3 * stride2
    pxor    m7, m7                      ; m7 = accumulator
    call    pixel_ssd_sp_8x4_internal
    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1676
1677;-----------------------------------------------------------------------------
1678; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1679;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between an 8x8 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as two stacked 8x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1 (bytes)
    lea     r5, [r3 + 2 * r3]           ; r5 = 3 * stride2
    pxor    m7, m7                      ; m7 = accumulator

    call    pixel_ssd_sp_8x4_internal
    lea     r0, [r0 + 4 * r1]           ; the helper does not move r0/r2
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1693
1694;-----------------------------------------------------------------------------
1695; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1696;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between an 8x16 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as four stacked 8x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1 (bytes)
    lea     r5, [r3 + 2 * r3]           ; r5 = 3 * stride2
    pxor    m7, m7                      ; m7 = accumulator

    call    pixel_ssd_sp_8x4_internal
%rep 3
    lea     r0, [r0 + 4 * r1]           ; the helper does not move r0/r2
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1716
1717;-----------------------------------------------------------------------------
1718; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1719;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between an 8x32 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as eight stacked 8x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1 (bytes)
    lea     r5, [r3 + 2 * r3]           ; r5 = 3 * stride2
    pxor    m7, m7                      ; m7 = accumulator

    call    pixel_ssd_sp_8x4_internal
%rep 7
    lea     r0, [r0 + 4 * r1]           ; the helper does not move r0/r2
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1751
1752;-----------------------------------------------------------------------------
1753; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1754;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 12x16 block of 16-bit samples (src1) and 8-bit samples (src2).
    ; Columns 0-3 are covered by four 4x4 tiles, columns 4-11 by four 8x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 + 2 * r1]           ; r4 = 3 * stride1 (bytes)
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    pxor    m7, m7                      ; m7 = accumulator

    ; left strip: 4 samples wide
    call    pixel_ssd_sp_4x4_internal
%rep 3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 2 * r3]           ; 4x4 helper already moved r2 down two rows
    call    pixel_ssd_sp_4x4_internal
%endrep

    ; right strip: 8 samples wide, starting at column 4
    lea     r0, [r5 + 8]                ; 4 samples * 2 bytes
    lea     r2, [r6 + 4]                ; 4 samples * 1 byte
    lea     r5, [r3 + 2 * r3]           ; r5 is now 3 * stride2 for the 8x4 helper
    call    pixel_ssd_sp_8x4_internal
%rep 3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1788
%macro PIXEL_SSD_SP_16x4 0
    ; Adds the squared differences of a 16x4 tile to m7.
    ; In:   r0 = src1 (16-bit samples), r1 = src1 stride in bytes
    ;       r2 = src2 (8-bit samples),  r3 = src2 stride in bytes
    ;       m6 must be zero (used to widen the high 8 bytes of each src2 row)
    ; Out:  m7 += per-lane dword sums; r0 and r2 end up advanced by TWO rows
    ; Clobbers: m0-m5

    ; ---- row 0 ----
    movu        m3, [r2]
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    pmovzxbw    m2, m3                  ; low 8 bytes  -> words
    punpckhbw   m3, m6                  ; high 8 bytes -> words
    psubw       m0, m2
    psubw       m1, m3

    ; ---- row 1 ----
    movu        m3, [r2 + r3]
    movu        m4, [r0 + r1]
    movu        m5, [r0 + r1 + 16]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6
    psubw       m4, m2
    psubw       m5, m3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m0, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4

    ; step both pointers down two rows; rows 2/3 are addressed from there
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    ; ---- row 2 ----
    movu        m3, [r2]
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6
    psubw       m0, m2
    psubw       m1, m3

    ; ---- row 3 ----
    movu        m3, [r2 + r3]
    movu        m4, [r0 + r1]
    movu        m5, [r0 + r1 + 16]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6
    psubw       m4, m2
    psubw       m5, m3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m0, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
%endmacro
1848
1849;-----------------------------------------------------------------------------
1850; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1851;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x4, 4, 4, 8, src1, stride1, src2, stride2
    ; SSD between a 16x4 block of 16-bit samples (src1) and 8-bit samples (src2).
    ; In:  r0 = src1, r1 = stride1 (in 16-bit elements), r2 = src2, r3 = stride2
    ; Out: eax = sum of squared differences
    ;
    ; Only r0-r3 are touched (PIXEL_SSD_SP_16x4 uses no other GPR), so four
    ; general-purpose registers are requested, matching pixel_ssd_sp_16x8.
    ; Requesting six made the prologue reserve two registers nobody used.
    pxor    m6, m6                      ; m6 = 0, required by PIXEL_SSD_SP_16x4
    pxor    m7, m7                      ; m7 = accumulator
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    PIXEL_SSD_SP_16x4
    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7

    RET
1863
1864;-----------------------------------------------------------------------------
1865; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1866;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2
    ; SSD between a 16x8 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as two stacked 16x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    pxor    m7, m7                      ; m7 = accumulator
    pxor    m6, m6                      ; m6 = 0, required by PIXEL_SSD_SP_16x4

    PIXEL_SSD_SP_16x4
    lea     r0, [r0 + 2 * r1]           ; macro advanced two rows; skip two more
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1880
1881;-----------------------------------------------------------------------------
1882; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t )
1883;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between a 16x12 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as three stacked 16x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows (bytes)
    lea     r5, [r3 * 2]                ; r5 = two src2 rows
    pxor    m7, m7                      ; m7 = accumulator
    pxor    m6, m6                      ; m6 = 0, required by PIXEL_SSD_SP_16x4

    PIXEL_SSD_SP_16x4
%rep 2
    lea     r0, [r0 + r4]               ; macro advanced two rows; skip two more
    lea     r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1902
1903;-----------------------------------------------------------------------------
1904; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1905;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between a 16x16 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as four stacked 16x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows (bytes)
    lea     r5, [r3 * 2]                ; r5 = two src2 rows
    pxor    m7, m7                      ; m7 = accumulator
    pxor    m6, m6                      ; m6 = 0, required by PIXEL_SSD_SP_16x4

    PIXEL_SSD_SP_16x4
%rep 3
    lea     r0, [r0 + r4]               ; macro advanced two rows; skip two more
    lea     r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1927
cglobal pixel_ssd_sp_16x16_internal
    ; Helper: adds the squared differences of one 16x16 tile to m7.
    ; In:   r0 = src1 (16-bit samples), r1 = src1 stride in bytes, r4 = 2 * r1
    ;       r2 = src2 (8-bit samples),  r3 = src2 stride in bytes
    ;       m6 = 0, m7 = running accumulator
    ; Out:  r0/r2 are left 14 rows below the tile top, so callers add two more
    ;       rows to reach the next tile.
    ; Clobbers: m0-m5
    PIXEL_SSD_SP_16x4
%rep 3
    lea     r0, [r0 + r4]               ; macro advanced two rows; skip two more
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep
    ret
1940
1941;-----------------------------------------------------------------------------
1942; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1943;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2
    ; SSD between a 16x32 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as two stacked 16x16 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper
    pxor    m7, m7                      ; m7 = accumulator
    pxor    m6, m6                      ; m6 = 0, required by the helper

    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1958
1959;-----------------------------------------------------------------------------
1960; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t )
1961;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2
    ; SSD between a 16x64 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as four stacked 16x16 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper
    lea     r5, [r3 * 2]                ; r5 = two src2 rows
    pxor    m7, m7                      ; m7 = accumulator
    pxor    m6, m6                      ; m6 = 0, required by the helper

    call    pixel_ssd_sp_16x16_internal
%rep 3
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + r5]
    call    pixel_ssd_sp_16x16_internal
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
1984
1985;-----------------------------------------------------------------------------
1986; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1987;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 24x32 block of 16-bit samples (src1) and 8-bit samples (src2).
    ; Columns 0-15 are covered by two 16x16 tiles, columns 16-23 by eight 8x4 tiles.
    ; Out: eax = sum of squared differences
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows for the 16x16 helper
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    pxor    m7, m7                      ; m7 = accumulator
    pxor    m6, m6                      ; m6 = 0, required by the 16x16 helper

    ; left strip: 16 samples wide
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal

    ; right strip: 8 samples wide, starting at column 16
    lea     r0, [r5 + 32]               ; 16 samples * 2 bytes
    lea     r2, [r6 + 16]               ; 16 samples * 1 byte
    lea     r4, [r1 + 2 * r1]           ; r4/r5 become 3 * stride for the 8x4 helper
    lea     r5, [r3 + 2 * r3]
    call    pixel_ssd_sp_8x4_internal
%rep 7
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2029
2030;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x8( int16_t *, intptr_t, uint8_t *, intptr_t )
2032;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 32x8 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as two 16-sample column strips of two 16x4 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by PIXEL_SSD_SP_16x4
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows (bytes)

%assign ssd_col 0
%rep 2
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    PIXEL_SSD_SP_16x4
    lea     r0, [r0 + r4]               ; macro advanced two rows; skip two more
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2055
2056;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x16( int16_t *, intptr_t, uint8_t *, intptr_t )
2058;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 32x16 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as two side-by-side 16x16 tiles.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2

    call    pixel_ssd_sp_16x16_internal ; columns 0-15
    lea     r0, [r5 + 32]               ; 16 samples * 2 bytes
    lea     r2, [r6 + 16]               ; 16 samples * 1 byte
    call    pixel_ssd_sp_16x16_internal ; columns 16-31

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2075
2076;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x24( int16_t *, intptr_t, uint8_t *, intptr_t )
2078;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 32x24 block of 16-bit samples (src1) and 8-bit samples (src2).
    ; Each of the two 16-sample column strips is one 16x16 tile followed by
    ; two 16x4 tiles.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the 16-wide kernels
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows (bytes)

%assign ssd_col 0
%rep 2
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal ; rows 0-15
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4                   ; rows 16-19
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4                   ; rows 20-23
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2107
2108;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2110;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 32x32 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as two 16-sample column strips of two 16x16 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 2
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2133
2134;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2136;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 32x64 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as two 16-sample column strips of four 16x16 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 2
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2171
2172;-----------------------------------------------------------------------------
; int pixel_ssd_sp_48x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2174;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 48x64 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as three 16-sample column strips of four 16x16 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 3
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2221
2222;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x16( int16_t *, intptr_t, uint8_t *, intptr_t )
2224;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 64x16 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; computed as four side-by-side 16x16 tiles.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 4
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2247
2248;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2250;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 64x32 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as four 16-sample column strips of two 16x16 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 4
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2285
2286;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x48( int16_t *, intptr_t, uint8_t *, intptr_t )
2288;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x48, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 64x48 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as four 16-sample column strips of three 16x16 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 4
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2335
2336;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2338;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2
    ; SSD between a 64x64 block of 16-bit samples (src1) and 8-bit samples (src2),
    ; walked as four 16-sample column strips of four 16x16 tiles each.
    ; Out: eax = sum of squared differences
    pxor    m6, m6                      ; m6 = 0, required by the helper
    pxor    m7, m7                      ; m7 = accumulator
    mov     r5, r0                      ; r5/r6 remember the block origin
    mov     r6, r2
    add     r1, r1                      ; src1 stride: 16-bit elements -> bytes
    lea     r4, [r1 * 2]                ; r4 = two src1 rows, used by the helper

%assign ssd_col 0
%rep 4
%if ssd_col > 0
    lea     r0, [r5 + 2 * ssd_col]      ; src1 holds 2 bytes per sample
    lea     r2, [r6 + ssd_col]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]               ; helper stops two rows short of the next tile
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                      ; m1 is scratch for the reduction
    movd    eax, m7
    RET
2397
2398
2399;-----------------------------------------------------------------------------
2400; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
2401;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixel_ssd_s_4, 2,2,2
    ; Sum of squares of a 4x4 block of 16-bit samples.
    ; In:  r0 = block pointer, r1 = stride in 16-bit elements
    ; Out: eax = sum of squares
    add     r1, r1                      ; stride: elements -> bytes

    ; rows 0-1 share one register (4 words in each half)
    movh    m0, [r0]
    movhps  m0, [r0 + r1]
    pmaddwd m0, m0

    ; rows 2-3
    lea     r0, [r0 + r1 * 2]
    movh    m1, [r0]
    movhps  m1, [r0 + r1]
    pmaddwd m1, m1

    paddd   m0, m1

    ; reduce the dword lanes and return
    HADDD   m0, m1
    movd    eax, m0
    RET
2420
2421
INIT_XMM sse2
cglobal pixel_ssd_s_8, 2,3,5
    ; Sum of squares of an 8x8 block of 16-bit samples.
    ; In:  r0 = block pointer, r1 = stride in 16-bit elements
    ; Out: eax = sum of squares
    add     r1, r1                      ; stride: elements -> bytes
    lea     r2, [r1 + r1 * 2]           ; r2 = 3 * stride (bytes)

    ; rows 0-3 -> m0
    movu    m0, [r0]
    pmaddwd m0, m0
    movu    m1, [r0 + r1]
    pmaddwd m1, m1
    paddd   m0, m1
    movu    m2, [r0 + r1 * 2]
    pmaddwd m2, m2
    movu    m3, [r0 + r2]
    pmaddwd m3, m3
    paddd   m2, m3
    paddd   m0, m2

    ; rows 4-7 -> m4, then folded into m0
    lea     r0, [r0 + r1 * 4]
    movu    m4, [r0]
    pmaddwd m4, m4
    movu    m1, [r0 + r1]
    pmaddwd m1, m1
    paddd   m4, m1
    movu    m2, [r0 + r1 * 2]
    pmaddwd m2, m2
    movu    m3, [r0 + r2]
    pmaddwd m3, m3
    paddd   m2, m3
    paddd   m4, m2
    paddd   m0, m4

    ; reduce the dword lanes and return
    HADDD   m0, m1
    movd    eax, m0
    RET
2458
2459
INIT_XMM sse2
cglobal pixel_ssd_s_16, 2,3,5
    ; Sum of squares of a 16x16 block of 16-bit samples.
    ; In:  r0 = block pointer, r1 = stride in 16-bit elements
    ; Out: eax = sum of squares
    add     r1, r1                      ; stride: elements -> bytes

    mov     r2d, 4                      ; 4 iterations * 4 rows
    pxor    m0, m0                      ; m0 = accumulator
.loop:
%rep 2
    ; two rows per repetition, two registers per row
    movu    m1, [r0]
    movu    m2, [r0 + mmsize]
    movu    m3, [r0 + r1]
    movu    m4, [r0 + r1 + mmsize]
    lea     r0, [r0 + r1 * 2]

    pmaddwd m1, m1
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

%endrep
    dec     r2d
    jnz     .loop

    ; reduce the dword lanes and return
    HADDD   m0, m1
    movd    eax, m0
    RET
2504
2505
INIT_XMM sse2
cglobal pixel_ssd_s_32, 2,3,5
    ; Sum of squares of a 32x32 block of 16-bit samples (SSE2 version).
    ; In:  r0 = block pointer, r1 = stride in 16-bit elements
    ; Out: eax = sum of squares
    add     r1, r1                      ; stride: elements -> bytes

    mov     r2d, 16                     ; 16 iterations * 2 rows
    pxor    m0, m0                      ; m0 = accumulator
.loop:
%rep 2
    ; one full row per repetition: 32 words = 4 registers
    movu    m1, [r0 + 0 * mmsize]
    movu    m2, [r0 + 1 * mmsize]
    movu    m3, [r0 + 2 * mmsize]
    movu    m4, [r0 + 3 * mmsize]
    add     r0, r1

    pmaddwd m1, m1
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

%endrep
    dec     r2d
    jnz     .loop

    ; reduce the dword lanes and return
    HADDD   m0, m1
    movd    eax, m0
    RET
2550
2551
INIT_YMM avx2
cglobal pixel_ssd_s_32, 2,4,5
    ; Sum of squares of a 32x32 block of 16-bit samples (AVX2 version).
    ; In:  r0 = block pointer, r1 = stride in 16-bit elements
    ; Out: eax = sum of squares
    add     r1, r1                      ; stride: elements -> bytes
    lea     r3, [r1 + r1 * 2]           ; r3 = 3 * stride (bytes)

    mov     r2d, 8                      ; 8 iterations * 4 rows
    pxor    m0, m0                      ; m0 = accumulator
.loop:
    ; row 0
    movu    m1, [r0 + 0 * mmsize]
    movu    m2, [r0 + 1 * mmsize]
    pmaddwd m1, m1
    pmaddwd m2, m2
    paddd   m1, m2
    ; row 1
    movu    m3, [r0 + r1 + 0 * mmsize]
    movu    m4, [r0 + r1 + 1 * mmsize]
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

    ; row 2
    movu    m1, [r0 + r1 * 2 + 0 * mmsize]
    movu    m2, [r0 + r1 * 2 + 1 * mmsize]
    pmaddwd m1, m1
    pmaddwd m2, m2
    paddd   m1, m2
    ; row 3
    movu    m3, [r0 + r3 + 0 * mmsize]
    movu    m4, [r0 + r3 + 1 * mmsize]
    lea     r0, [r0 + 4 * r1]           ; lea leaves the flags alone
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

    dec     r2d
    jnz     .loop

    ; reduce the dword lanes and return
    HADDD   m0, m1
    movd    eax, xm0
    RET