1 ;*****************************************************************************
2 ;* mc-a.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
5 ;*
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Dylan Yudaken <dyudaken@gmail.com>
10 ;* Holger Lubitz <holger@lubitz.org>
11 ;* Min Chen <chenm001@163.com>
12 ;* Oskar Arvidsson <oskar@irock.se>
13 ;*
14 ;* This program is free software; you can redistribute it and/or modify
15 ;* it under the terms of the GNU General Public License as published by
16 ;* the Free Software Foundation; either version 2 of the License, or
17 ;* (at your option) any later version.
18 ;*
19 ;* This program is distributed in the hope that it will be useful,
20 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;* GNU General Public License for more details.
23 ;*
24 ;* You should have received a copy of the GNU General Public License
25 ;* along with this program; if not, write to the Free Software
26 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 ;*
28 ;* This program is also available under a commercial proprietary license.
29 ;* For more information, contact us at license @ x265.com.
30 ;*****************************************************************************
31
32 %include "x86inc.asm"
33 %include "x86util.asm"
34
35 SECTION_RODATA 32
36
37 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
38 ch_shuf_adj: times 8 db 0
39 times 8 db 2
40 times 8 db 4
41 times 8 db 6
42 sq_1: times 1 dq 1
43
44 SECTION .text
45
46 cextern pb_0
47 cextern pw_1
48 cextern pw_4
49 cextern pw_8
50 cextern pw_32
51 cextern pw_64
52 cextern pw_128
53 cextern pw_256
54 cextern pw_512
55 cextern pw_1023
56 cextern pw_1024
57 cextern pw_00ff
58 cextern pw_pixel_max
59 cextern sw_64
60 cextern pd_32
61 cextern deinterleave_shufd
62
63 ;====================================================================================================================
64 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
65 ;====================================================================================================================
66 ; r0 = pSrc0, r1 = pSrc1
67 ; r2 = pDst, r3 = iStride0
68 ; r4 = iStride1, r5 = iDstStride
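; For reference, a scalar sketch of what every addAvg kernel below computes: a
; rounded bi-prediction average of two int16_t intermediates, recentred and
; clamped to the pixel range. (Illustrative only; the helper name, the w/h
; parameters and the 8-bit output type are assumptions, not part of this file.)
;
;   static inline int clip3(int v, int lo, int hi)
;   {
;       return v < lo ? lo : (v > hi ? hi : v);
;   }
;
;   // 8-bit build; the 10-bit build uses ((s0 + s1 + 16) >> 5) + 512, clamped to [0,1023]
;   void addAvg_ref(const int16_t* src0, const int16_t* src1, uint8_t* dst,
;                   intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
;                   int w, int h)
;   {
;       for (int y = 0; y < h; y++, src0 += src0Stride, src1 += src1Stride, dst += dstStride)
;           for (int x = 0; x < w; x++)
;               dst[x] = (uint8_t)clip3(((src0[x] + src1[x] + 64) >> 7) + 128, 0, 255);
;   }
;
; In the SIMD code the rounded shift is done with pmulhrsw ((v*pw_256 + 0x4000) >> 15
; equals (v + 64) >> 7; pw_1024 gives the 10-bit (v + 16) >> 5), and the +128/+512
; recentring happens before the final packuswb or pmaxsw/pminsw clamp.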
69 %if HIGH_BIT_DEPTH
70 INIT_XMM sse4
71 cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
72 add r3, r3
73 add r4, r4
74 add r5, r5
75
76 movd m1, [r0]
77 movd m2, [r0 + r3]
78 movd m3, [r1]
79 movd m4, [r1 + r4]
80
81 punpckldq m1, m2
82 punpckldq m3, m4
83
84 lea r0, [r0 + 2 * r3]
85 lea r1, [r1 + 2 * r4]
86
87 movd m2, [r0]
88 movd m4, [r0 + r3]
89 movd m5, [r1]
90 movd m0, [r1 + r4]
91 punpckldq m2, m4
92 punpckldq m5, m0
93 punpcklqdq m1, m2
94 punpcklqdq m3, m5
95 paddw m1, m3
96 pmulhrsw m1, [pw_1024]
97 paddw m1, [pw_512]
98
99 pxor m0, m0
100 pmaxsw m1, m0
101 pminsw m1, [pw_1023]
102 movd [r2], m1
103 pextrd [r2 + r5], m1, 1
104 lea r2, [r2 + 2 * r5]
105 pextrd [r2], m1, 2
106 pextrd [r2 + r5], m1, 3
107
108 RET
109 ;-----------------------------------------------------------------------------
110 INIT_XMM sse4
111 cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
112 mova m0, [pw_512]
113 pxor m7, m7
114 add r3, r3
115 add r4, r4
116 add r5, r5
117
118 %rep 2
119 movd m1, [r0]
120 movd m2, [r0 + r3]
121 movd m3, [r1]
122 movd m4, [r1 + r4]
123
124 punpckldq m1, m2
125 punpckldq m3, m4
126
127 lea r0, [r0 + 2 * r3]
128 lea r1, [r1 + 2 * r4]
129
130 movd m2, [r0]
131 movd m4, [r0 + r3]
132 movd m5, [r1]
133 movd m6, [r1 + r4]
134
135 punpckldq m2, m4
136 punpckldq m5, m6
137 punpcklqdq m1, m2
138 punpcklqdq m3, m5
139 paddw m1, m3
140 pmulhrsw m1, [pw_1024]
141 paddw m1, m0
142
143 pmaxsw m1, m7
144 pminsw m1, [pw_1023]
145 movd [r2], m1
146 pextrd [r2 + r5], m1, 1
147 lea r2, [r2 + 2 * r5]
148 pextrd [r2], m1, 2
149 pextrd [r2 + r5], m1, 3
150
151 lea r0, [r0 + 2 * r3]
152 lea r1, [r1 + 2 * r4]
153 lea r2, [r2 + 2 * r5]
154 %endrep
155 RET
156
157 ;-----------------------------------------------------------------------------
158 INIT_XMM sse4
159 cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
160 mova m6, [pw_1023]
161 mova m7, [pw_1024]
162 mov r6d, 16/4
163 add r3, r3
164 add r4, r4
165 add r5, r5
166 .loop:
167 movd m1, [r0]
168 movd m2, [r0 + r3]
169 movd m3, [r1]
170 movd m4, [r1 + r4]
171 lea r0, [r0 + r3 * 2]
172 lea r1, [r1 + r4 * 2]
173 punpckldq m1, m2
174 punpckldq m3, m4
175 movd m2, [r0]
176 movd m4, [r0 + r3]
177 movd m5, [r1]
178 movd m0, [r1 + r4]
179 lea r0, [r0 + r3 * 2]
180 lea r1, [r1 + r4 * 2]
181 punpckldq m2, m4
182 punpckldq m5, m0
183 punpcklqdq m1, m2
184 punpcklqdq m3, m5
185 paddw m1, m3
186 pmulhrsw m1, m7
187 paddw m1, [pw_512]
188 pxor m0, m0
189 pmaxsw m1, m0
190 pminsw m1, m6
191 movd [r2], m1
192 pextrd [r2 + r5], m1, 1
193 lea r2, [r2 + r5 * 2]
194 pextrd [r2], m1, 2
195 pextrd [r2 + r5], m1, 3
196 lea r2, [r2 + r5 * 2]
197 dec r6d
198 jnz .loop
199 RET
200 ;-----------------------------------------------------------------------------
201
202 ;-----------------------------------------------------------------------------
203 INIT_XMM sse4
204 cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
205 add r3, r3
206 add r4, r4
207 add r5, r5
208
209 movh m0, [r0]
210 movh m1, [r0 + r3]
211 movh m2, [r1]
212 movh m3, [r1 + r4]
213
214 punpcklqdq m0, m1
215 punpcklqdq m2, m3
216 paddw m0, m2
217 pmulhrsw m0, [pw_1024]
218 paddw m0, [pw_512]
219
220 pxor m6, m6
221 pmaxsw m0, m6
222 pminsw m0, [pw_1023]
223 movh [r2], m0
224 movhps [r2 + r5], m0
225 RET
226 ;-----------------------------------------------------------------------------
227 INIT_XMM sse4
228 cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
229 mova m4, [pw_512]
230 mova m5, [pw_1023]
231 mova m7, [pw_1024]
232 pxor m6, m6
233 add r3, r3
234 add r4, r4
235 add r5, r5
236
237 %rep 4
238 movu m0, [r0]
239 movu m2, [r1]
240 paddw m0, m2
241 pmulhrsw m0, m7
242 paddw m0, m4
243
244 pmaxsw m0, m6
245 pminsw m0, m5
246 movh [r2], m0
247 pextrd [r2 + 8], m0, 2
248
249 movu m1, [r0 + r3]
250 movu m3, [r1 + r4]
251 paddw m1, m3
252 pmulhrsw m1, m7
253 paddw m1, m4
254
255 pmaxsw m1, m6
256 pminsw m1, m5
257 movh [r2 + r5], m1
258 pextrd [r2 + r5 + 8], m1, 2
259
260 lea r2, [r2 + 2 * r5]
261 lea r0, [r0 + 2 * r3]
262 lea r1, [r1 + 2 * r4]
263 %endrep
264 RET
265 ;-----------------------------------------------------------------------------
266 INIT_XMM sse4
267 cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
268 mova m4, [pw_512]
269 mova m5, [pw_1023]
270 mova m7, [pw_1024]
271 pxor m6, m6
272 mov r6d, 16/2
273 add r3, r3
274 add r4, r4
275 add r5, r5
276 .loop:
277 movu m0, [r0]
278 movu m2, [r1]
279 movu m1, [r0 + r3]
280 movu m3, [r1 + r4]
281 dec r6d
282 lea r0, [r0 + r3 * 2]
283 lea r1, [r1 + r4 * 2]
284 paddw m0, m2
285 paddw m1, m3
286 pmulhrsw m0, m7
287 pmulhrsw m1, m7
288 paddw m0, m4
289 paddw m1, m4
290 pmaxsw m0, m6
291 pmaxsw m1, m6
292 pminsw m0, m5
293 pminsw m1, m5
294 movh [r2], m0
295 pextrd [r2 + 8], m0, 2
296 movh [r2 + r5], m1
297 pextrd [r2 + r5 + 8], m1, 2
298 lea r2, [r2 + r5 * 2]
299 jnz .loop
300 RET
301 ;-----------------------------------------------------------------------------
302 INIT_XMM sse4
303 cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
304 mova m4, [pw_512]
305 mova m5, [pw_1023]
306 mova m7, [pw_1024]
307 pxor m6, m6
308 add r3, r3
309 add r4, r4
310 add r5, r5
311
312 movu m0, [r0]
313 movu m2, [r1]
314 paddw m0, m2
315 pmulhrsw m0, m7
316 paddw m0, m4
317
318 pmaxsw m0, m6
319 pminsw m0, m5
320 movu [r2], m0
321
322 movu m1, [r0 + r3]
323 movu m3, [r1 + r4]
324 paddw m1, m3
325 pmulhrsw m1, m7
326 paddw m1, m4
327
328 pmaxsw m1, m6
329 pminsw m1, m5
330 movu [r2 + r5], m1
331 RET
332 ;-----------------------------------------------------------------------------
333 INIT_XMM sse4
334 cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
335 mova m4, [pw_512]
336 mova m5, [pw_1023]
337 mova m7, [pw_1024]
338 pxor m6, m6
339 add r3, r3
340 add r4, r4
341 add r5, r5
342
343 %rep 3
344 movu m0, [r0]
345 movu m2, [r1]
346 paddw m0, m2
347 pmulhrsw m0, m7
348 paddw m0, m4
349
350 pmaxsw m0, m6
351 pminsw m0, m5
352 movu [r2], m0
353
354 movu m1, [r0 + r3]
355 movu m3, [r1 + r4]
356 paddw m1, m3
357 pmulhrsw m1, m7
358 paddw m1, m4
359
360 pmaxsw m1, m6
361 pminsw m1, m5
362 movu [r2 + r5], m1
363
364 lea r2, [r2 + 2 * r5]
365 lea r0, [r0 + 2 * r3]
366 lea r1, [r1 + 2 * r4]
367 %endrep
368 RET
369
370 ;-----------------------------------------------------------------------------
371 %macro ADDAVG_W4_H4 1
372 INIT_XMM sse4
373 cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
374 mova m4, [pw_512]
375 mova m5, [pw_1023]
376 mova m7, [pw_1024]
377 pxor m6, m6
378 add r3, r3
379 add r4, r4
380 add r5, r5
381
382 mov r6d, %1/4
383
384 .loop:
385 %rep 2
386 movh m0, [r0]
387 movh m1, [r0 + r3]
388 movh m2, [r1]
389 movh m3, [r1 + r4]
390
391 punpcklqdq m0, m1
392 punpcklqdq m2, m3
393
394 paddw m0, m2
395 pmulhrsw m0, m7
396 paddw m0, m4
397
398 pmaxsw m0, m6
399 pminsw m0, m5
400
401 movh [r2], m0
402 movhps [r2 + r5], m0
403
404 lea r2, [r2 + 2 * r5]
405 lea r0, [r0 + 2 * r3]
406 lea r1, [r1 + 2 * r4]
407 %endrep
408
409 dec r6d
410 jnz .loop
411 RET
412 %endmacro
413
414 ADDAVG_W4_H4 4
415 ADDAVG_W4_H4 8
416 ADDAVG_W4_H4 16
417
418 ADDAVG_W4_H4 32
419
420 ;-----------------------------------------------------------------------------
421 %macro ADDAVG_W8_H4 1
422 INIT_XMM sse4
423 cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
424 mova m4, [pw_512]
425 mova m5, [pw_1023]
426 mova m7, [pw_1024]
427 pxor m6, m6
428 add r3, r3
429 add r4, r4
430 add r5, r5
431 mov r6d, %1/4
432
433 .loop:
434 %rep 2
435 movu m0, [r0]
436 movu m2, [r1]
437 paddw m0, m2
438 pmulhrsw m0, m7
439 paddw m0, m4
440 pmaxsw m0, m6
441 pminsw m0, m5
442 movu [r2], m0
443
444 movu m1, [r0 + r3]
445 movu m3, [r1 + r4]
446 paddw m1, m3
447 pmulhrsw m1, m7
448 paddw m1, m4
449 pmaxsw m1, m6
450 pminsw m1, m5
451 movu [r2 + r5], m1
452
453 lea r2, [r2 + 2 * r5]
454 lea r0, [r0 + 2 * r3]
455 lea r1, [r1 + 2 * r4]
456 %endrep
457 dec r6d
458 jnz .loop
459 RET
460 %endmacro
461
462 ADDAVG_W8_H4 4
463 ADDAVG_W8_H4 8
464 ADDAVG_W8_H4 16
465 ADDAVG_W8_H4 32
466
467 ADDAVG_W8_H4 12
468 ADDAVG_W8_H4 64
469
470 ;-----------------------------------------------------------------------------
471 %macro ADDAVG_W12_H4 1
472 INIT_XMM sse4
473 cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
474 mova m4, [pw_512]
475 mova m5, [pw_1023]
476 mova m7, [pw_1024]
477 pxor m6, m6
478 add r3, r3
479 add r4, r4
480 add r5, r5
481 mov r6d, %1/4
482
483 .loop:
484 %rep 2
485 movu m0, [r0]
486 movu m2, [r1]
487 paddw m0, m2
488 pmulhrsw m0, m7
489 paddw m0, m4
490 pmaxsw m0, m6
491 pminsw m0, m5
492 movu [r2], m0
493
494 movh m0, [r0 + 16]
495 movh m1, [r0 + 16 + r3]
496 movh m2, [r1 + 16]
497 movh m3, [r1 + 16 + r4]
498
499 punpcklqdq m0, m1
500 punpcklqdq m2, m3
501
502 paddw m0, m2
503 pmulhrsw m0, m7
504 paddw m0, m4
505 pmaxsw m0, m6
506 pminsw m0, m5
507 movh [r2 + 16], m0
508 movhps [r2 + r5 + 16], m0
509
510 movu m1, [r0 + r3]
511 movu m3, [r1 + r4]
512 paddw m1, m3
513 pmulhrsw m1, m7
514 paddw m1, m4
515 pmaxsw m1, m6
516 pminsw m1, m5
517 movu [r2 + r5], m1
518
519 lea r2, [r2 + 2 * r5]
520 lea r0, [r0 + 2 * r3]
521 lea r1, [r1 + 2 * r4]
522 %endrep
523 dec r6d
524 jnz .loop
525 RET
526 %endmacro
527
528 ADDAVG_W12_H4 16
529
530 ADDAVG_W12_H4 32
531
532 ;-----------------------------------------------------------------------------
533 %macro ADDAVG_W16_H4 1
534 INIT_XMM sse4
535 cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
536 mova m4, [pw_512]
537 mova m5, [pw_1023]
538 mova m7, [pw_1024]
539 pxor m6, m6
540 add r3, r3
541 add r4, r4
542 add r5, r5
543 mov r6d, %1/4
544
545 .loop:
546 %rep 2
547 movu m0, [r0]
548 movu m2, [r1]
549 paddw m0, m2
550 pmulhrsw m0, m7
551 paddw m0, m4
552 pmaxsw m0, m6
553 pminsw m0, m5
554 movu [r2], m0
555
556 movu m1, [r0 + 16]
557 movu m2, [r1 + 16]
558 paddw m1, m2
559 pmulhrsw m1, m7
560 paddw m1, m4
561 pmaxsw m1, m6
562 pminsw m1, m5
563 movu [r2 + 16], m1
564
565 movu m1, [r0 + r3]
566 movu m3, [r1 + r4]
567 paddw m1, m3
568 pmulhrsw m1, m7
569 paddw m1, m4
570 pmaxsw m1, m6
571 pminsw m1, m5
572 movu [r2 + r5], m1
573
574 movu m2, [r0 + 16 + r3]
575 movu m3, [r1 + 16 + r4]
576 paddw m2, m3
577 pmulhrsw m2, m7
578 paddw m2, m4
579 pmaxsw m2, m6
580 pminsw m2, m5
581 movu [r2 + r5 + 16], m2
582
583 lea r2, [r2 + 2 * r5]
584 lea r0, [r0 + 2 * r3]
585 lea r1, [r1 + 2 * r4]
586 %endrep
587 dec r6d
588 jnz .loop
589 RET
590 %endmacro
591
592 ADDAVG_W16_H4 4
593 ADDAVG_W16_H4 8
594 ADDAVG_W16_H4 12
595 ADDAVG_W16_H4 16
596 ADDAVG_W16_H4 32
597 ADDAVG_W16_H4 64
598
599 ADDAVG_W16_H4 24
600
601 ;-----------------------------------------------------------------------------
602 %macro ADDAVG_W24_H2 2
603 INIT_XMM sse4
604 cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
605 mova m4, [pw_512]
606 mova m5, [pw_1023]
607 mova m7, [pw_1024]
608 pxor m6, m6
609 add r3, r3
610 add r4, r4
611 add r5, r5
612
613 mov r6d, %2/2
614
615 .loop:
616 movu m0, [r0]
617 movu m2, [r1]
618 paddw m0, m2
619 pmulhrsw m0, m7
620 paddw m0, m4
621 pmaxsw m0, m6
622 pminsw m0, m5
623 movu [r2], m0
624
625 movu m1, [r0 + 16]
626 movu m2, [r1 + 16]
627 paddw m1, m2
628 pmulhrsw m1, m7
629 paddw m1, m4
630 pmaxsw m1, m6
631 pminsw m1, m5
632 movu [r2 + 16], m1
633
634 movu m0, [r0 + 32]
635 movu m2, [r1 + 32]
636 paddw m0, m2
637 pmulhrsw m0, m7
638 paddw m0, m4
639 pmaxsw m0, m6
640 pminsw m0, m5
641 movu [r2 + 32], m0
642
643 movu m1, [r0 + r3]
644 movu m3, [r1 + r4]
645 paddw m1, m3
646 pmulhrsw m1, m7
647 paddw m1, m4
648 pmaxsw m1, m6
649 pminsw m1, m5
650 movu [r2 + r5], m1
651
652 movu m2, [r0 + r3 + 16]
653 movu m3, [r1 + r4 + 16]
654 paddw m2, m3
655 pmulhrsw m2, m7
656 paddw m2, m4
657 pmaxsw m2, m6
658 pminsw m2, m5
659 movu [r2 + r5 + 16], m2
660
661 movu m1, [r0 + r3 + 32]
662 movu m3, [r1 + r4 + 32]
663 paddw m1, m3
664 pmulhrsw m1, m7
665 paddw m1, m4
666 pmaxsw m1, m6
667 pminsw m1, m5
668 movu [r2 + r5 + 32], m1
669
670 lea r2, [r2 + 2 * r5]
671 lea r0, [r0 + 2 * r3]
672 lea r1, [r1 + 2 * r4]
673
674 dec r6d
675 jnz .loop
676 RET
677 %endmacro
678
679 ADDAVG_W24_H2 24, 32
680
681 ADDAVG_W24_H2 24, 64
682
683 ;-----------------------------------------------------------------------------
684 %macro ADDAVG_W32_H2 1
685 INIT_XMM sse4
686 cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
687 mova m4, [pw_512]
688 mova m5, [pw_1023]
689 mova m7, [pw_1024]
690 pxor m6, m6
691 add r3, r3
692 add r4, r4
693 add r5, r5
694
695 mov r6d, %1/2
696
697 .loop:
698 movu m0, [r0]
699 movu m2, [r1]
700 paddw m0, m2
701 pmulhrsw m0, m7
702 paddw m0, m4
703 pmaxsw m0, m6
704 pminsw m0, m5
705 movu [r2], m0
706
707 movu m1, [r0 + 16]
708 movu m2, [r1 + 16]
709 paddw m1, m2
710 pmulhrsw m1, m7
711 paddw m1, m4
712 pmaxsw m1, m6
713 pminsw m1, m5
714 movu [r2 + 16], m1
715
716 movu m0, [r0 + 32]
717 movu m2, [r1 + 32]
718 paddw m0, m2
719 pmulhrsw m0, m7
720 paddw m0, m4
721 pmaxsw m0, m6
722 pminsw m0, m5
723 movu [r2 + 32], m0
724
725 movu m1, [r0 + 48]
726 movu m2, [r1 + 48]
727 paddw m1, m2
728 pmulhrsw m1, m7
729 paddw m1, m4
730 pmaxsw m1, m6
731 pminsw m1, m5
732 movu [r2 + 48], m1
733
734 movu m1, [r0 + r3]
735 movu m3, [r1 + r4]
736 paddw m1, m3
737 pmulhrsw m1, m7
738 paddw m1, m4
739 pmaxsw m1, m6
740 pminsw m1, m5
741 movu [r2 + r5], m1
742
743 movu m2, [r0 + 16 + r3]
744 movu m3, [r1 + 16 + r4]
745 paddw m2, m3
746 pmulhrsw m2, m7
747 paddw m2, m4
748 pmaxsw m2, m6
749 pminsw m2, m5
750 movu [r2 + r5 + 16], m2
751
752 movu m1, [r0 + 32 + r3]
753 movu m3, [r1 + 32 + r4]
754 paddw m1, m3
755 pmulhrsw m1, m7
756 paddw m1, m4
757 pmaxsw m1, m6
758 pminsw m1, m5
759 movu [r2 + r5 + 32], m1
760
761 movu m2, [r0 + 48 + r3]
762 movu m3, [r1 + 48 + r4]
763 paddw m2, m3
764 pmulhrsw m2, m7
765 paddw m2, m4
766 pmaxsw m2, m6
767 pminsw m2, m5
768 movu [r2 + r5 + 48], m2
769
770 lea r2, [r2 + 2 * r5]
771 lea r0, [r0 + 2 * r3]
772 lea r1, [r1 + 2 * r4]
773
774 dec r6d
775 jnz .loop
776 RET
777 %endmacro
778
779 ADDAVG_W32_H2 8
780 ADDAVG_W32_H2 16
781 ADDAVG_W32_H2 24
782 ADDAVG_W32_H2 32
783 ADDAVG_W32_H2 64
784
785 ADDAVG_W32_H2 48
786
787 ;-----------------------------------------------------------------------------
788 %macro ADDAVG_W48_H2 1
789 INIT_XMM sse4
790 cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
791 mova m4, [pw_512]
792 mova m5, [pw_1023]
793 mova m7, [pw_1024]
794 pxor m6, m6
795 add r3, r3
796 add r4, r4
797 add r5, r5
798
799 mov r6d, %1/2
800
801 .loop:
802 movu m0, [r0]
803 movu m2, [r1]
804 paddw m0, m2
805 pmulhrsw m0, m7
806 paddw m0, m4
807 pmaxsw m0, m6
808 pminsw m0, m5
809 movu [r2], m0
810
811 movu m1, [r0 + 16]
812 movu m2, [r1 + 16]
813 paddw m1, m2
814 pmulhrsw m1, m7
815 paddw m1, m4
816 pmaxsw m1, m6
817 pminsw m1, m5
818 movu [r2 + 16], m1
819
820 movu m0, [r0 + 32]
821 movu m2, [r1 + 32]
822 paddw m0, m2
823 pmulhrsw m0, m7
824 paddw m0, m4
825 pmaxsw m0, m6
826 pminsw m0, m5
827 movu [r2 + 32], m0
828
829 movu m1, [r0 + 48]
830 movu m2, [r1 + 48]
831 paddw m1, m2
832 pmulhrsw m1, m7
833 paddw m1, m4
834 pmaxsw m1, m6
835 pminsw m1, m5
836 movu [r2 + 48], m1
837
838 movu m0, [r0 + 64]
839 movu m2, [r1 + 64]
840 paddw m0, m2
841 pmulhrsw m0, m7
842 paddw m0, m4
843 pmaxsw m0, m6
844 pminsw m0, m5
845 movu [r2 + 64], m0
846
847 movu m1, [r0 + 80]
848 movu m2, [r1 + 80]
849 paddw m1, m2
850 pmulhrsw m1, m7
851 paddw m1, m4
852 pmaxsw m1, m6
853 pminsw m1, m5
854 movu [r2 + 80], m1
855
856 movu m1, [r0 + r3]
857 movu m3, [r1 + r4]
858 paddw m1, m3
859 pmulhrsw m1, m7
860 paddw m1, m4
861 pmaxsw m1, m6
862 pminsw m1, m5
863 movu [r2 + r5], m1
864
865 movu m2, [r0 + 16 + r3]
866 movu m3, [r1 + 16 + r4]
867 paddw m2, m3
868 pmulhrsw m2, m7
869 paddw m2, m4
870 pmaxsw m2, m6
871 pminsw m2, m5
872 movu [r2 + 16 + r5], m2
873
874 movu m1, [r0 + 32 + r3]
875 movu m3, [r1 + 32 + r4]
876 paddw m1, m3
877 pmulhrsw m1, m7
878 paddw m1, m4
879 pmaxsw m1, m6
880 pminsw m1, m5
881 movu [r2 + 32 + r5], m1
882
883 movu m2, [r0 + 48 + r3]
884 movu m3, [r1 + 48 + r4]
885 paddw m2, m3
886 pmulhrsw m2, m7
887 paddw m2, m4
888 pmaxsw m2, m6
889 pminsw m2, m5
890 movu [r2 + 48 + r5], m2
891
892 movu m1, [r0 + 64 + r3]
893 movu m3, [r1 + 64 + r4]
894 paddw m1, m3
895 pmulhrsw m1, m7
896 paddw m1, m4
897 pmaxsw m1, m6
898 pminsw m1, m5
899 movu [r2 + 64 + r5], m1
900
901 movu m2, [r0 + 80 + r3]
902 movu m3, [r1 + 80 + r4]
903 paddw m2, m3
904 pmulhrsw m2, m7
905 paddw m2, m4
906 pmaxsw m2, m6
907 pminsw m2, m5
908 movu [r2 + 80 + r5], m2
909
910 lea r2, [r2 + 2 * r5]
911 lea r0, [r0 + 2 * r3]
912 lea r1, [r1 + 2 * r4]
913
914 dec r6d
915 jnz .loop
916 RET
917 %endmacro
918
919 ADDAVG_W48_H2 64
920
921 ;-----------------------------------------------------------------------------
922 %macro ADDAVG_W64_H1 1
923 INIT_XMM sse4
924 cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
925 mova m4, [pw_512]
926 mova m5, [pw_1023]
927 mova m7, [pw_1024]
928 pxor m6, m6
929 add r3, r3
930 add r4, r4
931 add r5, r5
932 mov r6d, %1
933
934 .loop:
935 movu m0, [r0]
936 movu m2, [r1]
937 paddw m0, m2
938 pmulhrsw m0, m7
939 paddw m0, m4
940 pmaxsw m0, m6
941 pminsw m0, m5
942 movu [r2], m0
943
944 movu m1, [r0 + 16]
945 movu m2, [r1 + 16]
946 paddw m1, m2
947 pmulhrsw m1, m7
948 paddw m1, m4
949 pmaxsw m1, m6
950 pminsw m1, m5
951 movu [r2 + 16], m1
952
953 movu m0, [r0 + 32]
954 movu m2, [r1 + 32]
955 paddw m0, m2
956 pmulhrsw m0, m7
957 paddw m0, m4
958 pmaxsw m0, m6
959 pminsw m0, m5
960 movu [r2 + 32], m0
961
962 movu m1, [r0 + 48]
963 movu m2, [r1 + 48]
964 paddw m1, m2
965 pmulhrsw m1, m7
966 paddw m1, m4
967 pmaxsw m1, m6
968 pminsw m1, m5
969 movu [r2 + 48], m1
970
971 movu m0, [r0 + 64]
972 movu m2, [r1 + 64]
973 paddw m0, m2
974 pmulhrsw m0, m7
975 paddw m0, m4
976 pmaxsw m0, m6
977 pminsw m0, m5
978 movu [r2 + 64], m0
979
980 movu m1, [r0 + 80]
981 movu m2, [r1 + 80]
982 paddw m1, m2
983 pmulhrsw m1, m7
984 paddw m1, m4
985 pmaxsw m1, m6
986 pminsw m1, m5
987 movu [r2 + 80], m1
988
989 movu m0, [r0 + 96]
990 movu m2, [r1 + 96]
991 paddw m0, m2
992 pmulhrsw m0, m7
993 paddw m0, m4
994 pmaxsw m0, m6
995 pminsw m0, m5
996 movu [r2 + 96], m0
997
998 movu m1, [r0 + 112]
999 movu m2, [r1 + 112]
1000 paddw m1, m2
1001 pmulhrsw m1, m7
1002 paddw m1, m4
1003 pmaxsw m1, m6
1004 pminsw m1, m5
1005 movu [r2 + 112], m1
1006
1007 add r2, r5
1008 add r0, r3
1009 add r1, r4
1010
1011 dec r6d
1012 jnz .loop
1013 RET
1014 %endmacro
1015
1016 ADDAVG_W64_H1 16
1017 ADDAVG_W64_H1 32
1018 ADDAVG_W64_H1 48
1019 ADDAVG_W64_H1 64
1020 ;-----------------------------------------------------------------------------
1021 %else ; !HIGH_BIT_DEPTH
1022 ;-----------------------------------------------------------------------------
1023 INIT_XMM sse4
1024 cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
1025
1026 mova m0, [pw_256]
1027 mova m7, [pw_128]
1028 add r3, r3
1029 add r4, r4
1030
1031 movd m1, [r0]
1032 movd m2, [r0 + r3]
1033 movd m3, [r1]
1034 movd m4, [r1 + r4]
1035
1036 punpckldq m1, m2
1037 punpckldq m3, m4
1038
1039 lea r0, [r0 + 2 * r3]
1040 lea r1, [r1 + 2 * r4]
1041
1042 movd m2, [r0]
1043 movd m4, [r0 + r3]
1044 movd m5, [r1]
1045 movd m6, [r1 + r4]
1046
1047 punpckldq m2, m4
1048 punpckldq m5, m6
1049 punpcklqdq m1, m2
1050 punpcklqdq m3, m5
1051
1052 paddw m1, m3
1053 pmulhrsw m1, m0
1054 paddw m1, m7
1055 packuswb m1, m1
1056
1057 pextrw [r2], m1, 0
1058 pextrw [r2 + r5], m1, 1
1059 lea r2, [r2 + 2 * r5]
1060 pextrw [r2], m1, 2
1061 pextrw [r2 + r5], m1, 3
1062
1063 RET
1064 ;-----------------------------------------------------------------------------
1065
1066 ;-----------------------------------------------------------------------------
1067 INIT_XMM sse4
1068 cglobal addAvg_2x8, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
1069
1070 mova m0, [pw_256]
1071 mova m7, [pw_128]
1072 add r3, r3
1073 add r4, r4
1074
1075 movd m1, [r0]
1076 movd m2, [r0 + r3]
1077 movd m3, [r1]
1078 movd m4, [r1 + r4]
1079
1080 punpckldq m1, m2
1081 punpckldq m3, m4
1082
1083 lea r0, [r0 + 2 * r3]
1084 lea r1, [r1 + 2 * r4]
1085
1086 movd m2, [r0]
1087 movd m4, [r0 + r3]
1088 movd m5, [r1]
1089 movd m6, [r1 + r4]
1090
1091 punpckldq m2, m4
1092 punpckldq m5, m6
1093 punpcklqdq m1, m2
1094 punpcklqdq m3, m5
1095
1096 paddw m1, m3
1097 pmulhrsw m1, m0
1098 paddw m1, m7
1099 packuswb m1, m1
1100
1101 pextrw [r2], m1, 0
1102 pextrw [r2 + r5], m1, 1
1103 lea r2, [r2 + 2 * r5]
1104 pextrw [r2], m1, 2
1105 pextrw [r2 + r5], m1, 3
1106
1107 lea r2, [r2 + 2 * r5]
1108 lea r0, [r0 + 2 * r3]
1109 lea r1, [r1 + 2 * r4]
1110
1111 movd m1, [r0]
1112 movd m2, [r0 + r3]
1113 movd m3, [r1]
1114 movd m4, [r1 + r4]
1115
1116 punpckldq m1, m2
1117 punpckldq m3, m4
1118
1119 lea r0, [r0 + 2 * r3]
1120 lea r1, [r1 + 2 * r4]
1121
1122 movd m2, [r0]
1123 movd m4, [r0 + r3]
1124 movd m5, [r1]
1125 movd m6, [r1 + r4]
1126
1127 punpckldq m2, m4
1128 punpckldq m5, m6
1129 punpcklqdq m1, m2
1130 punpcklqdq m3, m5
1131
1132 paddw m1, m3
1133 pmulhrsw m1, m0
1134 paddw m1, m7
1135 packuswb m1, m1
1136
1137 pextrw [r2], m1, 0
1138 pextrw [r2 + r5], m1, 1
1139 lea r2, [r2 + 2 * r5]
1140 pextrw [r2], m1, 2
1141 pextrw [r2 + r5], m1, 3
1142
1143 RET
1144 ;-----------------------------------------------------------------------------
1145
1146 ;-----------------------------------------------------------------------------
1147 INIT_XMM sse4
1148 cglobal addAvg_2x16, 6,7,8, src0, src1, dst, src0Stride, src1Stride, dstStride
1149 mova m0, [pw_256]
1150 mova m7, [pw_128]
1151 mov r6d, 16/4
1152 add r3, r3
1153 add r4, r4
1154 .loop:
1155 movd m1, [r0]
1156 movd m2, [r0 + r3]
1157 movd m3, [r1]
1158 movd m4, [r1 + r4]
1159 lea r0, [r0 + r3 * 2]
1160 lea r1, [r1 + r4 * 2]
1161 punpckldq m1, m2
1162 punpckldq m3, m4
1163 movd m2, [r0]
1164 movd m4, [r0 + r3]
1165 movd m5, [r1]
1166 movd m6, [r1 + r4]
1167 lea r0, [r0 + r3 * 2]
1168 lea r1, [r1 + r4 * 2]
1169 punpckldq m2, m4
1170 punpckldq m5, m6
1171 punpcklqdq m1, m2
1172 punpcklqdq m3, m5
1173 paddw m1, m3
1174 pmulhrsw m1, m0
1175 paddw m1, m7
1176 packuswb m1, m1
1177 pextrw [r2], m1, 0
1178 pextrw [r2 + r5], m1, 1
1179 lea r2, [r2 + r5 * 2]
1180 pextrw [r2], m1, 2
1181 pextrw [r2 + r5], m1, 3
1182 lea r2, [r2 + r5 * 2]
1183 dec r6d
1184 jnz .loop
1185 RET
1186 ;-----------------------------------------------------------------------------
1187
1188 ;-----------------------------------------------------------------------------
1189 INIT_XMM sse4
1190 cglobal addAvg_4x2, 6,6,4, src0, src1, dst, src0Stride, src1Stride, dstStride
1191
1192 mova m1, [pw_256]
1193 mova m3, [pw_128]
1194 add r3, r3
1195 add r4, r4
1196
1197 movh m0, [r0]
1198 movhps m0, [r0 + r3]
1199 movh m2, [r1]
1200 movhps m2, [r1 + r4]
1201
1202 paddw m0, m2
1203 pmulhrsw m0, m1
1204 paddw m0, m3
1205
1206 packuswb m0, m0
1207 movd [r2], m0
1208 pshufd m0, m0, 1
1209 movd [r2 + r5], m0
1210
1211 RET
1212 ;-----------------------------------------------------------------------------
1213
1214 ;-----------------------------------------------------------------------------
1215 %macro ADDAVG_W4_H4 1
1216 INIT_XMM sse4
1217 cglobal addAvg_4x%1, 6,7,4, src0, src1, dst, src0Stride, src1Stride, dstStride
1218 mova m1, [pw_256]
1219 mova m3, [pw_128]
1220 add r3, r3
1221 add r4, r4
1222
1223 mov r6d, %1/4
1224
1225 .loop:
1226 movh m0, [r0]
1227 movhps m0, [r0 + r3]
1228 movh m2, [r1]
1229 movhps m2, [r1 + r4]
1230
1231 paddw m0, m2
1232 pmulhrsw m0, m1
1233 paddw m0, m3
1234
1235 packuswb m0, m0
1236 movd [r2], m0
1237 pshufd m0, m0, 1
1238 movd [r2 + r5], m0
1239
1240 lea r2, [r2 + 2 * r5]
1241 lea r0, [r0 + 2 * r3]
1242 lea r1, [r1 + 2 * r4]
1243
1244 movh m0, [r0]
1245 movhps m0, [r0 + r3]
1246 movh m2, [r1]
1247 movhps m2, [r1 + r4]
1248
1249 paddw m0, m2
1250 pmulhrsw m0, m1
1251 paddw m0, m3
1252
1253 packuswb m0, m0
1254 movd [r2], m0
1255 pshufd m0, m0, 1
1256 movd [r2 + r5], m0
1257
1258 lea r2, [r2 + 2 * r5]
1259 lea r0, [r0 + 2 * r3]
1260 lea r1, [r1 + 2 * r4]
1261
1262 dec r6d
1263 jnz .loop
1264 RET
1265 %endmacro
1266
1267 ADDAVG_W4_H4 4
1268 ADDAVG_W4_H4 8
1269 ADDAVG_W4_H4 16
1270
1271 ADDAVG_W4_H4 32
1272
1273 ;-----------------------------------------------------------------------------
1274
1275 ;-----------------------------------------------------------------------------
1276 INIT_XMM sse4
1277 cglobal addAvg_6x8, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1278
1279 mova m4, [pw_256]
1280 mova m5, [pw_128]
1281 add r3, r3
1282 add r4, r4
1283
1284 movu m0, [r0]
1285 movu m2, [r1]
1286 paddw m0, m2
1287 pmulhrsw m0, m4
1288 paddw m0, m5
1289 packuswb m0, m0
1290 movd [r2], m0
1291 pextrw [r2 + 4], m0, 2
1292
1293 movu m1, [r0 + r3]
1294 movu m3, [r1 + r4]
1295 paddw m1, m3
1296 pmulhrsw m1, m4
1297 paddw m1, m5
1298 packuswb m1, m1
1299 movd [r2 + r5], m1
1300 pextrw [r2 + r5 + 4], m1, 2
1301
1302 lea r2, [r2 + 2 * r5]
1303 lea r0, [r0 + 2 * r3]
1304 lea r1, [r1 + 2 * r4]
1305
1306 movu m0, [r0]
1307 movu m2, [r1]
1308 paddw m0, m2
1309 pmulhrsw m0, m4
1310 paddw m0, m5
1311 packuswb m0, m0
1312 movd [r2], m0
1313 pextrw [r2 + 4], m0, 2
1314
1315 movu m1, [r0 + r3]
1316 movu m3, [r1 + r4]
1317 paddw m1, m3
1318 pmulhrsw m1, m4
1319 paddw m1, m5
1320 packuswb m1, m1
1321 movd [r2 + r5], m1
1322 pextrw [r2 + r5 + 4], m1, 2
1323
1324 lea r2, [r2 + 2 * r5]
1325 lea r0, [r0 + 2 * r3]
1326 lea r1, [r1 + 2 * r4]
1327
1328 movu m0, [r0]
1329 movu m2, [r1]
1330 paddw m0, m2
1331 pmulhrsw m0, m4
1332 paddw m0, m5
1333 packuswb m0, m0
1334 movd [r2], m0
1335 pextrw [r2 + 4], m0, 2
1336
1337 movu m1, [r0 + r3]
1338 movu m3, [r1 + r4]
1339 paddw m1, m3
1340 pmulhrsw m1, m4
1341 paddw m1, m5
1342 packuswb m1, m1
1343 movd [r2 + r5], m1
1344 pextrw [r2 + r5 + 4], m1, 2
1345
1346 lea r2, [r2 + 2 * r5]
1347 lea r0, [r0 + 2 * r3]
1348 lea r1, [r1 + 2 * r4]
1349
1350 movu m0, [r0]
1351 movu m2, [r1]
1352 paddw m0, m2
1353 pmulhrsw m0, m4
1354 paddw m0, m5
1355 packuswb m0, m0
1356 movd [r2], m0
1357 pextrw [r2 + 4], m0, 2
1358
1359 movu m1, [r0 + r3]
1360 movu m3, [r1 + r4]
1361 paddw m1, m3
1362 pmulhrsw m1, m4
1363 paddw m1, m5
1364 packuswb m1, m1
1365 movd [r2 + r5], m1
1366 pextrw [r2 + r5 + 4], m1, 2
1367
1368 RET
1369 ;-----------------------------------------------------------------------------
1370
1371 ;-----------------------------------------------------------------------------
1372 INIT_XMM sse4
1373 cglobal addAvg_6x16, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1374 mova m4, [pw_256]
1375 mova m5, [pw_128]
1376 mov r6d, 16/2
1377 add r3, r3
1378 add r4, r4
1379 .loop:
1380 movu m0, [r0]
1381 movu m2, [r1]
1382 movu m1, [r0 + r3]
1383 movu m3, [r1 + r4]
1384 dec r6d
1385 lea r0, [r0 + r3 * 2]
1386 lea r1, [r1 + r4 * 2]
1387 paddw m0, m2
1388 paddw m1, m3
1389 pmulhrsw m0, m4
1390 pmulhrsw m1, m4
1391 paddw m0, m5
1392 paddw m1, m5
1393 packuswb m0, m0
1394 packuswb m1, m1
1395 movd [r2], m0
1396 pextrw [r2 + 4], m0, 2
1397 movd [r2 + r5], m1
1398 pextrw [r2 + r5 + 4], m1, 2
1399 lea r2, [r2 + r5 * 2]
1400 jnz .loop
1401 RET
1402 ;-----------------------------------------------------------------------------
1403
1404 ;-----------------------------------------------------------------------------
1405 INIT_XMM sse4
1406 cglobal addAvg_8x2, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1407 mova m4, [pw_256]
1408 mova m5, [pw_128]
1409 add r3, r3
1410 add r4, r4
1411
1412 movu m0, [r0]
1413 movu m2, [r1]
1414 paddw m0, m2
1415 pmulhrsw m0, m4
1416 paddw m0, m5
1417 packuswb m0, m0
1418 movh [r2], m0
1419
1420 movu m1, [r0 + r3]
1421 movu m3, [r1 + r4]
1422 paddw m1, m3
1423 pmulhrsw m1, m4
1424 paddw m1, m5
1425 packuswb m1, m1
1426 movh [r2 + r5], m1
1427
1428 RET
1429 ;-----------------------------------------------------------------------------
1430
1431 ;-----------------------------------------------------------------------------
1432 INIT_XMM sse4
1433 cglobal addAvg_8x6, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1434
1435 mova m4, [pw_256]
1436 mova m5, [pw_128]
1437 add r3, r3
1438 add r4, r4
1439
1440 movu m0, [r0]
1441 movu m2, [r1]
1442 paddw m0, m2
1443 pmulhrsw m0, m4
1444 paddw m0, m5
1445 packuswb m0, m0
1446 movh [r2], m0
1447
1448 movu m1, [r0 + r3]
1449 movu m3, [r1 + r4]
1450 paddw m1, m3
1451 pmulhrsw m1, m4
1452 paddw m1, m5
1453 packuswb m1, m1
1454 movh [r2 + r5], m1
1455
1456 lea r2, [r2 + 2 * r5]
1457 lea r0, [r0 + 2 * r3]
1458 lea r1, [r1 + 2 * r4]
1459
1460 movu m0, [r0]
1461 movu m2, [r1]
1462 paddw m0, m2
1463 pmulhrsw m0, m4
1464 paddw m0, m5
1465 packuswb m0, m0
1466 movh [r2], m0
1467
1468 movu m1, [r0 + r3]
1469 movu m3, [r1 + r4]
1470 paddw m1, m3
1471 pmulhrsw m1, m4
1472 paddw m1, m5
1473 packuswb m1, m1
1474 movh [r2 + r5], m1
1475
1476 lea r2, [r2 + 2 * r5]
1477 lea r0, [r0 + 2 * r3]
1478 lea r1, [r1 + 2 * r4]
1479
1480 movu m0, [r0]
1481 movu m2, [r1]
1482 paddw m0, m2
1483 pmulhrsw m0, m4
1484 paddw m0, m5
1485 packuswb m0, m0
1486 movh [r2], m0
1487
1488 movu m1, [r0 + r3]
1489 movu m3, [r1 + r4]
1490 paddw m1, m3
1491 pmulhrsw m1, m4
1492 paddw m1, m5
1493 packuswb m1, m1
1494 movh [r2 + r5], m1
1495
1496 RET
1497 ;-----------------------------------------------------------------------------
1498
1499 ;-----------------------------------------------------------------------------
1500 %macro ADDAVG_W8_H4 1
1501 INIT_XMM sse4
1502 cglobal addAvg_8x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1503
1504 mova m4, [pw_256]
1505 mova m5, [pw_128]
1506 add r3, r3
1507 add r4, r4
1508
1509 mov r6d, %1/4
1510
1511 .loop:
1512 movu m0, [r0]
1513 movu m2, [r1]
1514 paddw m0, m2
1515 pmulhrsw m0, m4
1516 paddw m0, m5
1517
1518 packuswb m0, m0
1519 movh [r2], m0
1520
1521 movu m1, [r0 + r3]
1522 movu m3, [r1 + r4]
1523 paddw m1, m3
1524 pmulhrsw m1, m4
1525 paddw m1, m5
1526
1527 packuswb m1, m1
1528 movh [r2 + r5], m1
1529
1530 lea r2, [r2 + 2 * r5]
1531 lea r0, [r0 + 2 * r3]
1532 lea r1, [r1 + 2 * r4]
1533
1534 movu m0, [r0]
1535 movu m2, [r1]
1536 paddw m0, m2
1537 pmulhrsw m0, m4
1538 paddw m0, m5
1539
1540 packuswb m0, m0
1541 movh [r2], m0
1542
1543 movu m1, [r0 + r3]
1544 movu m3, [r1 + r4]
1545 paddw m1, m3
1546 pmulhrsw m1, m4
1547 paddw m1, m5
1548
1549 packuswb m1, m1
1550 movh [r2 + r5], m1
1551
1552 lea r2, [r2 + 2 * r5]
1553 lea r0, [r0 + 2 * r3]
1554 lea r1, [r1 + 2 * r4]
1555
1556 dec r6d
1557 jnz .loop
1558 RET
1559 %endmacro
1560
1561 ADDAVG_W8_H4 4
1562 ADDAVG_W8_H4 8
1563 ADDAVG_W8_H4 16
1564 ADDAVG_W8_H4 32
1565
1566 ADDAVG_W8_H4 12
1567 ADDAVG_W8_H4 64
1568
1569 ;-----------------------------------------------------------------------------
1570
1571
1572 ;-----------------------------------------------------------------------------
1573 %macro ADDAVG_W12_H4 1
1574 INIT_XMM sse4
1575 cglobal addAvg_12x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1576 mova m4, [pw_256]
1577 mova m5, [pw_128]
1578 add r3, r3
1579 add r4, r4
1580
1581 mov r6d, %1/4
1582
1583 .loop:
1584 movu m0, [r0]
1585 movu m2, [r1]
1586 paddw m0, m2
1587 pmulhrsw m0, m4
1588 paddw m0, m5
1589 packuswb m0, m0
1590 movh [r2], m0
1591
1592 movh m0, [r0 + 16]
1593 movhps m0, [r0 + 16 + r3]
1594 movh m2, [r1 + 16]
1595 movhps m2, [r1 + 16 + r4]
1596
1597 paddw m0, m2
1598 pmulhrsw m0, m4
1599 paddw m0, m5
1600
1601 packuswb m0, m0
1602 movd [r2 + 8], m0
1603 pshufd m0, m0, 1
1604 movd [r2 + 8 + r5], m0
1605
1606 movu m1, [r0 + r3]
1607 movu m3, [r1 + r4]
1608 paddw m1, m3
1609 pmulhrsw m1, m4
1610 paddw m1, m5
1611
1612 packuswb m1, m1
1613 movh [r2 + r5], m1
1614
1615 lea r2, [r2 + 2 * r5]
1616 lea r0, [r0 + 2 * r3]
1617 lea r1, [r1 + 2 * r4]
1618
1619 movu m0, [r0]
1620 movu m2, [r1]
1621 paddw m0, m2
1622 pmulhrsw m0, m4
1623 paddw m0, m5
1624
1625 packuswb m0, m0
1626 movh [r2], m0
1627
1628 movh m0, [r0 + 16]
1629 movhps m0, [r0 + 16 + r3]
1630 movh m2, [r1 + 16]
1631 movhps m2, [r1 + 16 + r4]
1632
1633 paddw m0, m2
1634 pmulhrsw m0, m4
1635 paddw m0, m5
1636
1637 packuswb m0, m0
1638 movd [r2 + 8], m0
1639 pshufd m0, m0, 1
1640 movd [r2 + 8 + r5], m0
1641
1642 movu m1, [r0 + r3]
1643 movu m3, [r1 + r4]
1644 paddw m1, m3
1645 pmulhrsw m1, m4
1646 paddw m1, m5
1647
1648 packuswb m1, m1
1649 movh [r2 + r5], m1
1650
1651 lea r2, [r2 + 2 * r5]
1652 lea r0, [r0 + 2 * r3]
1653 lea r1, [r1 + 2 * r4]
1654
1655 dec r6d
1656 jnz .loop
1657 RET
1658 %endmacro
1659
1660 ADDAVG_W12_H4 16
1661
1662 ADDAVG_W12_H4 32
1663
1664 ;-----------------------------------------------------------------------------
1665
1666
1667 ;-----------------------------------------------------------------------------
1668 %macro ADDAVG_W16_H4 1
1669 INIT_XMM sse4
1670 cglobal addAvg_16x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1671 mova m4, [pw_256]
1672 mova m5, [pw_128]
1673 add r3, r3
1674 add r4, r4
1675
1676 mov r6d, %1/4
1677
1678 .loop:
1679 movu m0, [r0]
1680 movu m2, [r1]
1681 paddw m0, m2
1682 pmulhrsw m0, m4
1683 paddw m0, m5
1684
1685 movu m1, [r0 + 16]
1686 movu m2, [r1 + 16]
1687 paddw m1, m2
1688 pmulhrsw m1, m4
1689 paddw m1, m5
1690
1691 packuswb m0, m1
1692 movu [r2], m0
1693
1694 movu m1, [r0 + r3]
1695 movu m3, [r1 + r4]
1696 paddw m1, m3
1697 pmulhrsw m1, m4
1698 paddw m1, m5
1699
1700 movu m2, [r0 + 16 + r3]
1701 movu m3, [r1 + 16 + r4]
1702 paddw m2, m3
1703 pmulhrsw m2, m4
1704 paddw m2, m5
1705
1706 packuswb m1, m2
1707 movu [r2 + r5], m1
1708
1709 lea r2, [r2 + 2 * r5]
1710 lea r0, [r0 + 2 * r3]
1711 lea r1, [r1 + 2 * r4]
1712
1713 movu m0, [r0]
1714 movu m2, [r1]
1715 paddw m0, m2
1716 pmulhrsw m0, m4
1717 paddw m0, m5
1718
1719 movu m1, [r0 + 16]
1720 movu m2, [r1 + 16]
1721 paddw m1, m2
1722 pmulhrsw m1, m4
1723 paddw m1, m5
1724
1725 packuswb m0, m1
1726 movu [r2], m0
1727
1728 movu m1, [r0 + r3]
1729 movu m3, [r1 + r4]
1730 paddw m1, m3
1731 pmulhrsw m1, m4
1732 paddw m1, m5
1733
1734 movu m2, [r0 + 16 + r3]
1735 movu m3, [r1 + 16 + r4]
1736 paddw m2, m3
1737 pmulhrsw m2, m4
1738 paddw m2, m5
1739
1740 packuswb m1, m2
1741 movu [r2 + r5], m1
1742
1743 lea r2, [r2 + 2 * r5]
1744 lea r0, [r0 + 2 * r3]
1745 lea r1, [r1 + 2 * r4]
1746
1747 dec r6d
1748 jnz .loop
1749 RET
1750 %endmacro
1751
1752 ADDAVG_W16_H4 4
1753 ADDAVG_W16_H4 8
1754 ADDAVG_W16_H4 12
1755 ADDAVG_W16_H4 16
1756 ADDAVG_W16_H4 32
1757 ADDAVG_W16_H4 64
1758
1759 ADDAVG_W16_H4 24
1760
1761 ;-----------------------------------------------------------------------------
1762
1763
1764 ;-----------------------------------------------------------------------------
1765 %macro ADDAVG_W24_H2 2
1766 INIT_XMM sse4
1767 cglobal addAvg_%1x%2, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1768 mova m4, [pw_256]
1769 mova m5, [pw_128]
1770 add r3, r3
1771 add r4, r4
1772
1773 mov r6d, %2/2
1774
1775 .loop:
1776 movu m0, [r0]
1777 movu m2, [r1]
1778 paddw m0, m2
1779 pmulhrsw m0, m4
1780 paddw m0, m5
1781
1782 movu m1, [r0 + 16]
1783 movu m2, [r1 + 16]
1784 paddw m1, m2
1785 pmulhrsw m1, m4
1786 paddw m1, m5
1787
1788 packuswb m0, m1
1789 movu [r2], m0
1790
1791 movu m0, [r0 + 32]
1792 movu m2, [r1 + 32]
1793 paddw m0, m2
1794 pmulhrsw m0, m4
1795 paddw m0, m5
1796
1797 packuswb m0, m0
1798 movh [r2 + 16], m0
1799
1800 movu m1, [r0 + r3]
1801 movu m3, [r1 + r4]
1802 paddw m1, m3
1803 pmulhrsw m1, m4
1804 paddw m1, m5
1805
1806 movu m2, [r0 + 16 + r3]
1807 movu m3, [r1 + 16 + r4]
1808 paddw m2, m3
1809 pmulhrsw m2, m4
1810 paddw m2, m5
1811
1812 packuswb m1, m2
1813 movu [r2 + r5], m1
1814
1815 movu m1, [r0 + 32 + r3]
1816 movu m3, [r1 + 32 + r4]
1817 paddw m1, m3
1818 pmulhrsw m1, m4
1819 paddw m1, m5
1820
1821 packuswb m1, m1
1822 movh [r2 + 16 + r5], m1
1823
1824 lea r2, [r2 + 2 * r5]
1825 lea r0, [r0 + 2 * r3]
1826 lea r1, [r1 + 2 * r4]
1827
1828 dec r6d
1829 jnz .loop
1830 RET
1831 %endmacro
1832
1833 ADDAVG_W24_H2 24, 32
1834
1835 ADDAVG_W24_H2 24, 64
1836
1837 ;-----------------------------------------------------------------------------
1838
1839 ;-----------------------------------------------------------------------------
1840 %macro ADDAVG_W32_H2 1
1841 INIT_XMM sse4
1842 cglobal addAvg_32x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1843 mova m4, [pw_256]
1844 mova m5, [pw_128]
1845 add r3, r3
1846 add r4, r4
1847
1848 mov r6d, %1/2
1849
1850 .loop:
1851 movu m0, [r0]
1852 movu m2, [r1]
1853 paddw m0, m2
1854 pmulhrsw m0, m4
1855 paddw m0, m5
1856
1857 movu m1, [r0 + 16]
1858 movu m2, [r1 + 16]
1859 paddw m1, m2
1860 pmulhrsw m1, m4
1861 paddw m1, m5
1862
1863 packuswb m0, m1
1864 movu [r2], m0
1865
1866 movu m0, [r0 + 32]
1867 movu m2, [r1 + 32]
1868 paddw m0, m2
1869 pmulhrsw m0, m4
1870 paddw m0, m5
1871
1872 movu m1, [r0 + 48]
1873 movu m2, [r1 + 48]
1874 paddw m1, m2
1875 pmulhrsw m1, m4
1876 paddw m1, m5
1877
1878 packuswb m0, m1
1879 movu [r2 + 16], m0
1880
1881 movu m1, [r0 + r3]
1882 movu m3, [r1 + r4]
1883 paddw m1, m3
1884 pmulhrsw m1, m4
1885 paddw m1, m5
1886
1887 movu m2, [r0 + 16 + r3]
1888 movu m3, [r1 + 16 + r4]
1889 paddw m2, m3
1890 pmulhrsw m2, m4
1891 paddw m2, m5
1892
1893 packuswb m1, m2
1894 movu [r2 + r5], m1
1895
1896 movu m1, [r0 + 32 + r3]
1897 movu m3, [r1 + 32 + r4]
1898 paddw m1, m3
1899 pmulhrsw m1, m4
1900 paddw m1, m5
1901
1902 movu m2, [r0 + 48 + r3]
1903 movu m3, [r1 + 48 + r4]
1904 paddw m2, m3
1905 pmulhrsw m2, m4
1906 paddw m2, m5
1907
1908 packuswb m1, m2
1909 movu [r2 + 16 + r5], m1
1910
1911 lea r2, [r2 + 2 * r5]
1912 lea r0, [r0 + 2 * r3]
1913 lea r1, [r1 + 2 * r4]
1914
1915 dec r6d
1916 jnz .loop
1917 RET
1918 %endmacro
1919
1920 ADDAVG_W32_H2 8
1921 ADDAVG_W32_H2 16
1922 ADDAVG_W32_H2 24
1923 ADDAVG_W32_H2 32
1924 ADDAVG_W32_H2 64
1925
1926 ADDAVG_W32_H2 48
1927
1928 ;-----------------------------------------------------------------------------
1929
1930
1931 ;-----------------------------------------------------------------------------
1932 %macro ADDAVG_W48_H2 1
1933 INIT_XMM sse4
1934 cglobal addAvg_48x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
1935
1936 mova m4, [pw_256]
1937 mova m5, [pw_128]
1938 add r3, r3
1939 add r4, r4
1940
1941 mov r6d, %1/2
1942
1943 .loop:
1944 movu m0, [r0]
1945 movu m2, [r1]
1946 paddw m0, m2
1947 pmulhrsw m0, m4
1948 paddw m0, m5
1949
1950 movu m1, [r0 + 16]
1951 movu m2, [r1 + 16]
1952 paddw m1, m2
1953 pmulhrsw m1, m4
1954 paddw m1, m5
1955
1956 packuswb m0, m1
1957 movu [r2], m0
1958
1959 movu m0, [r0 + 32]
1960 movu m2, [r1 + 32]
1961 paddw m0, m2
1962 pmulhrsw m0, m4
1963 paddw m0, m5
1964
1965 movu m1, [r0 + 48]
1966 movu m2, [r1 + 48]
1967 paddw m1, m2
1968 pmulhrsw m1, m4
1969 paddw m1, m5
1970
1971 packuswb m0, m1
1972 movu [r2 + 16], m0
1973
1974 movu m0, [r0 + 64]
1975 movu m2, [r1 + 64]
1976 paddw m0, m2
1977 pmulhrsw m0, m4
1978 paddw m0, m5
1979
1980 movu m1, [r0 + 80]
1981 movu m2, [r1 + 80]
1982 paddw m1, m2
1983 pmulhrsw m1, m4
1984 paddw m1, m5
1985
1986 packuswb m0, m1
1987 movu [r2 + 32], m0
1988
1989 movu m1, [r0 + r3]
1990 movu m3, [r1 + r4]
1991 paddw m1, m3
1992 pmulhrsw m1, m4
1993 paddw m1, m5
1994
1995 movu m2, [r0 + 16 + r3]
1996 movu m3, [r1 + 16 + r4]
1997 paddw m2, m3
1998 pmulhrsw m2, m4
1999 paddw m2, m5
2000
2001 packuswb m1, m2
2002 movu [r2 + r5], m1
2003
2004 movu m1, [r0 + 32 + r3]
2005 movu m3, [r1 + 32 + r4]
2006 paddw m1, m3
2007 pmulhrsw m1, m4
2008 paddw m1, m5
2009
2010 movu m2, [r0 + 48 + r3]
2011 movu m3, [r1 + 48 + r4]
2012 paddw m2, m3
2013 pmulhrsw m2, m4
2014 paddw m2, m5
2015
2016 packuswb m1, m2
2017 movu [r2 + 16 + r5], m1
2018
2019 movu m1, [r0 + 64 + r3]
2020 movu m3, [r1 + 64 + r4]
2021 paddw m1, m3
2022 pmulhrsw m1, m4
2023 paddw m1, m5
2024
2025 movu m2, [r0 + 80 + r3]
2026 movu m3, [r1 + 80 + r4]
2027 paddw m2, m3
2028 pmulhrsw m2, m4
2029 paddw m2, m5
2030
2031 packuswb m1, m2
2032 movu [r2 + 32 + r5], m1
2033
2034 lea r2, [r2 + 2 * r5]
2035 lea r0, [r0 + 2 * r3]
2036 lea r1, [r1 + 2 * r4]
2037
2038 dec r6d
2039 jnz .loop
2040 RET
2041 %endmacro
2042
2043 ADDAVG_W48_H2 64
2044
2045 ;-----------------------------------------------------------------------------
2046
2047 ;-----------------------------------------------------------------------------
2048 %macro ADDAVG_W64_H1 1
2049 INIT_XMM sse4
2050 cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
2051 mova m4, [pw_256]
2052 mova m5, [pw_128]
2053 add r3, r3
2054 add r4, r4
2055
2056 mov r6d, %1
2057
2058 .loop:
2059 movu m0, [r0]
2060 movu m2, [r1]
2061 paddw m0, m2
2062 pmulhrsw m0, m4
2063 paddw m0, m5
2064
2065 movu m1, [r0 + 16]
2066 movu m2, [r1 + 16]
2067 paddw m1, m2
2068 pmulhrsw m1, m4
2069 paddw m1, m5
2070
2071 packuswb m0, m1
2072 movu [r2], m0
2073
2074 movu m0, [r0 + 32]
2075 movu m2, [r1 + 32]
2076 paddw m0, m2
2077 pmulhrsw m0, m4
2078 paddw m0, m5
2079
2080 movu m1, [r0 + 48]
2081 movu m2, [r1 + 48]
2082 paddw m1, m2
2083 pmulhrsw m1, m4
2084 paddw m1, m5
2085
2086 packuswb m0, m1
2087 movu [r2 + 16], m0
2088
2089 movu m0, [r0 + 64]
2090 movu m2, [r1 + 64]
2091 paddw m0, m2
2092 pmulhrsw m0, m4
2093 paddw m0, m5
2094
2095 movu m1, [r0 + 80]
2096 movu m2, [r1 + 80]
2097 paddw m1, m2
2098 pmulhrsw m1, m4
2099 paddw m1, m5
2100
2101 packuswb m0, m1
2102 movu [r2 + 32], m0
2103
2104 movu m0, [r0 + 96]
2105 movu m2, [r1 + 96]
2106 paddw m0, m2
2107 pmulhrsw m0, m4
2108 paddw m0, m5
2109
2110 movu m1, [r0 + 112]
2111 movu m2, [r1 + 112]
2112 paddw m1, m2
2113 pmulhrsw m1, m4
2114 paddw m1, m5
2115
2116 packuswb m0, m1
2117 movu [r2 + 48], m0
2118
2119 add r2, r5
2120 add r0, r3
2121 add r1, r4
2122
2123 dec r6d
2124 jnz .loop
2125 RET
2126 %endmacro
2127
2128 ADDAVG_W64_H1 16
2129 ADDAVG_W64_H1 32
2130 ADDAVG_W64_H1 48
2131 ADDAVG_W64_H1 64
2132 ;-----------------------------------------------------------------------------
2133 %endif ; HIGH_BIT_DEPTH
2134
2135 ;=============================================================================
2136 ; implicit weighted biprediction
2137 ;=============================================================================
2138 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
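; Per pixel, with w taken from r6m (the weight applied to src1) and 64-w applied
; to src2, the kernels below evaluate
;   dst[x] = clip(((src1[x]*w + src2[x]*(64 - w) + 32) >> 6), 0, pixel_max)
; the two weights always sum to 64, so the weighted sum is rounded with +32
; (pw_32/pd_32) and shifted right by 6; w == 32 reduces to a plain average and
; is normally dispatched to the pavg kernels instead (see AVGH further down).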
2139 %if WIN64
2140 DECLARE_REG_TMP 0,1,2,3,4,5,4,5
2141 %macro AVG_START 0-1 0
2142 PROLOGUE 6,7,%1
2143 %endmacro
2144 %elif UNIX64
2145 DECLARE_REG_TMP 0,1,2,3,4,5,7,8
2146 %macro AVG_START 0-1 0
2147 PROLOGUE 6,9,%1
2148 %endmacro
2149 %else
2150 DECLARE_REG_TMP 1,2,3,4,5,6,1,2
2151 %macro AVG_START 0-1 0
2152 PROLOGUE 0,7,%1
2153 mov t0, r0m
2154 mov t1, r1m
2155 mov t2, r2m
2156 mov t3, r3m
2157 mov t4, r4m
2158 mov t5, r5m
2159 %endmacro
2160 %endif
2161
2162 %macro AVG_END 0
2163 lea t4, [t4+t5*2*SIZEOF_PIXEL]
2164 lea t2, [t2+t3*2*SIZEOF_PIXEL]
2165 lea t0, [t0+t1*2*SIZEOF_PIXEL]
2166 sub eax, 2
2167 jg .height_loop
2168 %ifidn movu,movq ; detect MMX
2169 EMMS
2170 %endif
2171 RET
2172 %endmacro
2173
2174 %if HIGH_BIT_DEPTH
2175
2176 %macro BIWEIGHT_MMX 2
2177 movh m0, %1
2178 movh m1, %2
2179 punpcklwd m0, m1
2180 pmaddwd m0, m3
2181 paddd m0, m4
2182 psrad m0, 6
2183 %endmacro
2184
2185 %macro BIWEIGHT_START_MMX 0
2186 movzx t6d, word r6m
2187 mov t7d, 64
2188 sub t7d, t6d
2189 shl t7d, 16
2190 add t6d, t7d
2191 movd m3, t6d
2192 SPLATD m3, m3
2193 mova m4, [pd_32]
2194 pxor m5, m5
2195 %endmacro
2196
2197 %else ;!HIGH_BIT_DEPTH
2198 %macro BIWEIGHT_MMX 2
2199 movh m0, %1
2200 movh m1, %2
2201 punpcklbw m0, m5
2202 punpcklbw m1, m5
2203 pmullw m0, m2
2204 pmullw m1, m3
2205 paddw m0, m1
2206 paddw m0, m4
2207 psraw m0, 6
2208 %endmacro
2209
2210 %macro BIWEIGHT_START_MMX 0
2211 movd m2, r6m
2212 SPLATW m2, m2 ; weight_dst
2213 mova m3, [pw_64]
2214 psubw m3, m2 ; weight_src
2215 mova m4, [pw_32] ; rounding
2216 pxor m5, m5
2217 %endmacro
2218 %endif ;HIGH_BIT_DEPTH
2219
2220 %macro BIWEIGHT_SSSE3 2
2221 movh m0, %1
2222 movh m1, %2
2223 punpcklbw m0, m1
2224 pmaddubsw m0, m3
2225 pmulhrsw m0, m4
2226 %endmacro
2227
2228 %macro BIWEIGHT_START_SSSE3 0
2229 movzx t6d, byte r6m ; FIXME x86_64
2230 mov t7d, 64
2231 sub t7d, t6d
2232 shl t7d, 8
2233 add t6d, t7d
2234 mova m4, [pw_512]
2235 movd xm3, t6d
2236 %if cpuflag(avx2)
2237 vpbroadcastw m3, xm3
2238 %else
2239 SPLATW m3, m3 ; weight_dst,src
2240 %endif
2241 %endmacro
2242
2243 %if HIGH_BIT_DEPTH
2244 %macro BIWEIGHT_ROW 4
2245 BIWEIGHT [%2], [%3]
2246 %if %4==mmsize/4
2247 packssdw m0, m0
2248 CLIPW m0, m5, m7
2249 movh [%1], m0
2250 %else
2251 SWAP 0, 6
2252 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
2253 packssdw m6, m0
2254 CLIPW m6, m5, m7
2255 mova [%1], m6
2256 %endif
2257 %endmacro
2258
2259 %else ;!HIGH_BIT_DEPTH
2260 %macro BIWEIGHT_ROW 4
2261 BIWEIGHT [%2], [%3]
2262 %if %4==mmsize/2
2263 packuswb m0, m0
2264 movh [%1], m0
2265 %else
2266 SWAP 0, 6
2267 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
2268 packuswb m6, m0
2269 %if %4 != 12
2270 mova [%1], m6
2271 %else ; !w12
2272 movh [%1], m6
2273 movhlps m6, m6
2274 movd [%1+mmsize/2], m6
2275 %endif ; w12
2276 %endif
2277 %endmacro
2278
2279 %endif ;HIGH_BIT_DEPTH
2280
2281 ;-----------------------------------------------------------------------------
2282 ; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
2283 ;-----------------------------------------------------------------------------
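; Implementation note: the MMX/SSE2 variants widen to 16-bit lanes and apply the
; two weights with pmullw plus an explicit +32 and arithmetic shift by 6, while
; the SSSE3/AVX2 variants interleave the source bytes and use pmaddubsw against
; a packed (w, 64-w) word followed by pmulhrsw with pw_512, which performs the
; same rounded shift: (v*512 + 0x4000) >> 15 == (v + 32) >> 6.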
2284 %macro AVG_WEIGHT 1-2 0
2285 cglobal pixel_avg_weight_w%1
2286 BIWEIGHT_START
2287 AVG_START %2
2288 %if HIGH_BIT_DEPTH
2289 mova m7, [pw_pixel_max]
2290 %endif
2291 .height_loop:
2292 %if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
2293 BIWEIGHT [t2], [t4]
2294 SWAP 0, 6
2295 BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
2296 %if HIGH_BIT_DEPTH
2297 packssdw m6, m0
2298 CLIPW m6, m5, m7
2299 %else ;!HIGH_BIT_DEPTH
2300 packuswb m6, m0
2301 %endif ;HIGH_BIT_DEPTH
2302 movlps [t0], m6
2303 movhps [t0+SIZEOF_PIXEL*t1], m6
2304 %else
2305 %assign x 0
2306 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
2307 %assign y mmsize
2308 %if (%1 == 12) && (%1*SIZEOF_PIXEL-x < mmsize)
2309 %assign y (%1*SIZEOF_PIXEL-x)
2310 %endif
2311 BIWEIGHT_ROW t0+x, t2+x, t4+x, y
2312 BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, y
2313 %assign x x+mmsize
2314 %endrep
2315 %endif
2316 AVG_END
2317 %endmacro
2318
2319 %define BIWEIGHT BIWEIGHT_MMX
2320 %define BIWEIGHT_START BIWEIGHT_START_MMX
2321 INIT_MMX mmx2
2322 AVG_WEIGHT 4
2323 AVG_WEIGHT 8
2324 AVG_WEIGHT 12
2325 AVG_WEIGHT 16
2326 AVG_WEIGHT 32
2327 AVG_WEIGHT 64
2328 AVG_WEIGHT 24
2329 AVG_WEIGHT 48
2330 %if HIGH_BIT_DEPTH
2331 INIT_XMM sse2
2332 AVG_WEIGHT 4, 8
2333 AVG_WEIGHT 8, 8
2334 AVG_WEIGHT 12, 8
2335 AVG_WEIGHT 16, 8
2336 AVG_WEIGHT 24, 8
2337 AVG_WEIGHT 32, 8
2338 AVG_WEIGHT 48, 8
2339 AVG_WEIGHT 64, 8
2340 %else ;!HIGH_BIT_DEPTH
2341 INIT_XMM sse2
2342 AVG_WEIGHT 8, 7
2343 AVG_WEIGHT 12, 7
2344 AVG_WEIGHT 16, 7
2345 AVG_WEIGHT 32, 7
2346 AVG_WEIGHT 64, 7
2347 AVG_WEIGHT 24, 7
2348 AVG_WEIGHT 48, 7
2349 %define BIWEIGHT BIWEIGHT_SSSE3
2350 %define BIWEIGHT_START BIWEIGHT_START_SSSE3
2351 INIT_MMX ssse3
2352 AVG_WEIGHT 4
2353 INIT_XMM ssse3
2354 AVG_WEIGHT 8, 7
2355 AVG_WEIGHT 12, 7
2356 AVG_WEIGHT 16, 7
2357 AVG_WEIGHT 32, 7
2358 AVG_WEIGHT 64, 7
2359 AVG_WEIGHT 24, 7
2360 AVG_WEIGHT 48, 7
2361
2362 INIT_YMM avx2
2363 cglobal pixel_avg_weight_w16
2364 BIWEIGHT_START
2365 AVG_START 5
2366 .height_loop:
2367 movu xm0, [t2]
2368 movu xm1, [t4]
2369 vinserti128 m0, m0, [t2+t3], 1
2370 vinserti128 m1, m1, [t4+t5], 1
2371 SBUTTERFLY bw, 0, 1, 2
2372 pmaddubsw m0, m3
2373 pmaddubsw m1, m3
2374 pmulhrsw m0, m4
2375 pmulhrsw m1, m4
2376 packuswb m0, m1
2377 mova [t0], xm0
2378 vextracti128 [t0+t1], m0, 1
2379 AVG_END
2380 %endif ;HIGH_BIT_DEPTH
2381
2382 ;=============================================================================
2383 ; P frame explicit weighted prediction
2384 ;=============================================================================
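; Conceptually each weighted row below evaluates the standard explicit-WP
; expression
;   dst[x] = clip(((src[x] * scale + (1 << (denom - 1))) >> denom) + offset)
; the weight_t block at r4 carries scale/round/offset/denom pre-packed by the
; CPU-specific setup code, so exactly how those terms are folded differs between
; the mmx2, sse2/ssse3 and high-bit-depth paths (the ssse3 path, for instance,
; folds the shift into a pmulhrsw scale; see the WEIGHTER .fast variant).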
2385
2386 %if HIGH_BIT_DEPTH
2387 ; width
2388 %macro WEIGHT_START 1
2389 mova m0, [r4+ 0] ; 1<<denom
2390 mova m3, [r4+16]
2391 movd m2, [r4+32] ; denom
2392 mova m4, [pw_pixel_max]
2393 paddw m2, [sq_1] ; denom+1
2394 %endmacro
2395
2396 ; src1, src2
2397 %macro WEIGHT 2
2398 movh m5, [%1]
2399 movh m6, [%2]
2400 punpcklwd m5, m0
2401 punpcklwd m6, m0
2402 pmaddwd m5, m3
2403 pmaddwd m6, m3
2404 psrad m5, m2
2405 psrad m6, m2
2406 packssdw m5, m6
2407 %endmacro
2408
2409 ; src, dst, width
2410 %macro WEIGHT_TWO_ROW 4
2411 %assign x 0
2412 %rep (%3+mmsize/2-1)/(mmsize/2)
2413 %if %3-x/2 <= 4 && mmsize == 16
2414 WEIGHT %1+x, %1+r3+x
2415 CLIPW m5, [pb_0], m4
2416 movh [%2+x], m5
2417 movhps [%2+r1+x], m5
2418 %else
2419 WEIGHT %1+x, %1+x+mmsize/2
2420 SWAP 5, 7
2421 WEIGHT %1+r3+x, %1+r3+x+mmsize/2
2422 CLIPW m5, [pb_0], m4
2423 CLIPW m7, [pb_0], m4
2424 mova [%2+x], m7
2425 mova [%2+r1+x], m5
2426 %endif
2427 %assign x x+mmsize
2428 %endrep
2429 %endmacro
2430
2431 %else ; !HIGH_BIT_DEPTH
2432
2433 %macro WEIGHT_START 1
2434 %if cpuflag(avx2)
2435 vbroadcasti128 m3, [r4]
2436 vbroadcasti128 m4, [r4+16]
2437 %else
2438 mova m3, [r4]
2439 mova m4, [r4+16]
2440 %if notcpuflag(ssse3)
2441 movd m5, [r4+32]
2442 %endif
2443 %endif
2444 pxor m2, m2
2445 %endmacro
2446
2447 ; src1, src2, dst1, dst2, fast
2448 %macro WEIGHT_ROWx2 5
2449 movh m0, [%1 ]
2450 movh m1, [%1+mmsize/2]
2451 movh m6, [%2 ]
2452 movh m7, [%2+mmsize/2]
2453 punpcklbw m0, m2
2454 punpcklbw m1, m2
2455 punpcklbw m6, m2
2456 punpcklbw m7, m2
2457 %if cpuflag(ssse3)
2458 %if %5==0
2459 psllw m0, 7
2460 psllw m1, 7
2461 psllw m6, 7
2462 psllw m7, 7
2463 %endif
2464 pmulhrsw m0, m3
2465 pmulhrsw m1, m3
2466 pmulhrsw m6, m3
2467 pmulhrsw m7, m3
2468 paddw m0, m4
2469 paddw m1, m4
2470 paddw m6, m4
2471 paddw m7, m4
2472 %else
2473 pmullw m0, m3
2474 pmullw m1, m3
2475 pmullw m6, m3
2476 pmullw m7, m3
2477 paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
2478 paddsw m1, m4
2479 paddsw m6, m4
2480 paddsw m7, m4
2481 psraw m0, m5
2482 psraw m1, m5
2483 psraw m6, m5
2484 psraw m7, m5
2485 %endif
2486 packuswb m0, m1
2487 packuswb m6, m7
2488 mova [%3], m0
2489 mova [%4], m6
2490 %endmacro
2491
2492 ; src1, src2, dst1, dst2, width, fast
2493 %macro WEIGHT_COL 6
2494 %if cpuflag(avx2)
2495 %if %5==16
2496 movu xm0, [%1]
2497 vinserti128 m0, m0, [%2], 1
2498 punpckhbw m1, m0, m2
2499 punpcklbw m0, m0, m2
2500 %if %6==0
2501 psllw m0, 7
2502 psllw m1, 7
2503 %endif
2504 pmulhrsw m0, m3
2505 pmulhrsw m1, m3
2506 paddw m0, m4
2507 paddw m1, m4
2508 packuswb m0, m1
2509 mova [%3], xm0
2510 vextracti128 [%4], m0, 1
2511 %else
2512 movq xm0, [%1]
2513 vinserti128 m0, m0, [%2], 1
2514 punpcklbw m0, m2
2515 %if %6==0
2516 psllw m0, 7
2517 %endif
2518 pmulhrsw m0, m3
2519 paddw m0, m4
2520 packuswb m0, m0
2521 vextracti128 xm1, m0, 1
2522 %if %5 == 8
2523 movq [%3], xm0
2524 movq [%4], xm1
2525 %else
2526 movd [%3], xm0
2527 movd [%4], xm1
2528 %endif
2529 %endif
2530 %else
2531 movh m0, [%1]
2532 movh m1, [%2]
2533 punpcklbw m0, m2
2534 punpcklbw m1, m2
2535 %if cpuflag(ssse3)
2536 %if %6==0
2537 psllw m0, 7
2538 psllw m1, 7
2539 %endif
2540 pmulhrsw m0, m3
2541 pmulhrsw m1, m3
2542 paddw m0, m4
2543 paddw m1, m4
2544 %else
2545 pmullw m0, m3
2546 pmullw m1, m3
2547 paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
2548 paddsw m1, m4
2549 psraw m0, m5
2550 psraw m1, m5
2551 %endif
2552 %if %5 == 8
2553 packuswb m0, m1
2554 movh [%3], m0
2555 movhps [%4], m0
2556 %else
2557 packuswb m0, m0
2558 packuswb m1, m1
2559 movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
2560 movd [%4], m1
2561 %endif
2562 %endif
2563 %endmacro
2564 ; src, dst, width
2565 %macro WEIGHT_TWO_ROW 4
2566 %assign x 0
2567 %rep %3
2568 %if (%3-x) >= mmsize
2569 WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
2570 %assign x (x+mmsize)
2571 %else
2572 %assign w %3-x
2573 %if w == 20
2574 %assign w 16
2575 %endif
2576 WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
2577 %assign x (x+w)
2578 %endif
2579 %if x >= %3
2580 %exitrep
2581 %endif
2582 %endrep
2583 %endmacro
2584
2585 %endif ; HIGH_BIT_DEPTH
2586
2587 ;-----------------------------------------------------------------------------
2588 ;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
2589 ;-----------------------------------------------------------------------------
2590
2591 %macro WEIGHTER 1
2592 cglobal mc_weight_w%1, 6,6,8
2593 FIX_STRIDES r1, r3
2594 WEIGHT_START %1
2595 %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
2596 ; we can merge the shift step into the scale factor
2597 ; if (m3<<7) doesn't overflow an int16_t
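; (Merging works because pmulhrsw computes (a*b + 0x4000) >> 15, so shifting the
; scale left by 7 once here gives the same rounded product as shifting every
; source pixel left by 7 per row; the fast==1 paths below therefore skip the
; per-pixel psllw.)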
2598 cmp byte [r4+1], 0
2599 jz .fast
2600 %endif
2601 .loop:
2602 WEIGHT_TWO_ROW r2, r0, %1, 0
2603 lea r0, [r0+r1*2]
2604 lea r2, [r2+r3*2]
2605 sub r5d, 2
2606 jg .loop
2607 RET
2608 %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
2609 .fast:
2610 psllw m3, 7
2611 .fastloop:
2612 WEIGHT_TWO_ROW r2, r0, %1, 1
2613 lea r0, [r0+r1*2]
2614 lea r2, [r2+r3*2]
2615 sub r5d, 2
2616 jg .fastloop
2617 RET
2618 %endif
2619 %endmacro
2620
2621 INIT_MMX mmx2
2622 WEIGHTER 4
2623 WEIGHTER 8
2624 WEIGHTER 12
2625 WEIGHTER 16
2626 WEIGHTER 20
2627 INIT_XMM sse2
2628 WEIGHTER 8
2629 WEIGHTER 16
2630 WEIGHTER 20
2631 %if HIGH_BIT_DEPTH
2632 WEIGHTER 12
2633 %else
2634 INIT_MMX ssse3
2635 WEIGHTER 4
2636 INIT_XMM ssse3
2637 WEIGHTER 8
2638 WEIGHTER 16
2639 WEIGHTER 20
2640 INIT_YMM avx2
2641 WEIGHTER 8
2642 WEIGHTER 16
2643 WEIGHTER 20
2644 %endif
2645
2646 %macro OFFSET_OP 7
2647 mov%6 m0, [%1]
2648 mov%6 m1, [%2]
2649 %if HIGH_BIT_DEPTH
2650 p%5usw m0, m2
2651 p%5usw m1, m2
2652 %ifidn %5,add
2653 pminsw m0, m3
2654 pminsw m1, m3
2655 %endif
2656 %else
2657 p%5usb m0, m2
2658 p%5usb m1, m2
2659 %endif
2660 mov%7 [%3], m0
2661 mov%7 [%4], m1
2662 %endmacro
2663
2664 %macro OFFSET_TWO_ROW 4
2665 %assign x 0
2666 %rep %3
2667 %if (%3*SIZEOF_PIXEL-x) >= mmsize
2668 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
2669 %assign x (x+mmsize)
2670 %else
2671 %if HIGH_BIT_DEPTH
2672 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
2673 %else
2674 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
2675 %endif
2676 %exitrep
2677 %endif
2678 %if x >= %3*SIZEOF_PIXEL
2679 %exitrep
2680 %endif
2681 %endrep
2682 %endmacro
2683
2684 ;-----------------------------------------------------------------------------
2685 ;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
2686 ;-----------------------------------------------------------------------------
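; Hedged scalar sketch of the offset kernels (8-bit case; names are illustrative, and the
; offset is taken as a plain int whereas the asm loads a pre-broadcast vector from the
; weight_t). The add side saturates at the pixel maximum like the paddusb/pminsw in
; OFFSET_OP; the sub side clamps at zero like psubusb.
;
;   #include <stdint.h>
;   static void mc_offset_add_ref(const uint8_t *src, intptr_t i_src, uint8_t *dst, intptr_t i_dst,
;                                 int offset, int width, int height)
;   {
;       for (int y = 0; y < height; y++, src += i_src, dst += i_dst)
;           for (int x = 0; x < width; x++) {
;               int v = src[x] + offset;
;               dst[x] = v > 255 ? 255 : (uint8_t)v;     /* paddusb-style saturation */
;           }
;   }
;   /* mc_offset_sub is identical with `src[x] - offset` clamped at 0 (psubusb-style). */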
2687 %macro OFFSET 2
2688 cglobal mc_offset%2_w%1, 6,6
2689 FIX_STRIDES r1, r3
2690 mova m2, [r4]
2691 %if HIGH_BIT_DEPTH
2692 %ifidn %2,add
2693 mova m3, [pw_pixel_max]
2694 %endif
2695 %endif
2696 .loop:
2697 OFFSET_TWO_ROW r2, r0, %1, %2
2698 lea r0, [r0+r1*2]
2699 lea r2, [r2+r3*2]
2700 sub r5d, 2
2701 jg .loop
2702 RET
2703 %endmacro
2704
2705 %macro OFFSETPN 1
2706 OFFSET %1, add
2707 OFFSET %1, sub
2708 %endmacro
2709 INIT_MMX mmx2
2710 OFFSETPN 4
2711 OFFSETPN 8
2712 OFFSETPN 12
2713 OFFSETPN 16
2714 OFFSETPN 20
2715 INIT_XMM sse2
2716 OFFSETPN 12
2717 OFFSETPN 16
2718 OFFSETPN 20
2719 %if HIGH_BIT_DEPTH
2720 INIT_XMM sse2
2721 OFFSETPN 8
2722 %endif
2723
2724
2725 ;=============================================================================
2726 ; pixel avg
2727 ;=============================================================================
2728
2729 ;-----------------------------------------------------------------------------
2730 ; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
2731 ; pixel *src2, intptr_t src2_stride, int weight );
2732 ;-----------------------------------------------------------------------------
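; The weight argument selects the path in AVGH below: weight == 32 means an unweighted,
; rounded average (pavgb/pavgw), anything else jumps to pixel_avg_weight_wN. A hedged
; scalar sketch of the unweighted case (illustrative names, 8-bit shown):
;
;   #include <stdint.h>
;   static void pixel_avg_ref(uint8_t *dst, intptr_t dst_stride,
;                             const uint8_t *src1, intptr_t src1_stride,
;                             const uint8_t *src2, intptr_t src2_stride,
;                             int width, int height)
;   {
;       for (int y = 0; y < height; y++, dst += dst_stride, src1 += src1_stride, src2 += src2_stride)
;           for (int x = 0; x < width; x++)
;               dst[x] = (uint8_t)((src1[x] + src2[x] + 1) >> 1);   /* pavgb rounding */
;   }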
2733 %macro AVGH 2
2734 cglobal pixel_avg_%1x%2
2735 mov eax, %2
2736 cmp dword r6m, 32
2737 jne pixel_avg_weight_w%1 %+ SUFFIX
2738 %if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
2739 jmp pixel_avg_w%1_avx2
2740 %else
2741 %if mmsize == 16 && (%1 % 16 == 0)
2742 test dword r4m, 15
2743 jz pixel_avg_w%1_sse2
2744 %endif
2745 jmp pixel_avg_w%1_mmx2
2746 %endif
2747 %endmacro
2748
2749 ;-----------------------------------------------------------------------------
2750 ; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
2751 ; pixel *src2, intptr_t src2_stride, int height, int weight );
2752 ;-----------------------------------------------------------------------------
2753
2754 %macro AVG_FUNC 3-4
2755 cglobal pixel_avg_w%1
2756 AVG_START
2757 .height_loop:
2758 %assign x 0
2759 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
2760 %2 m0, [t2+x]
2761 %2 m1, [t2+x+SIZEOF_PIXEL*t3]
2762 %if HIGH_BIT_DEPTH
2763 pavgw m0, [t4+x]
2764 pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
2765 %else ;!HIGH_BIT_DEPTH
2766 pavgb m0, [t4+x]
2767 pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
2768 %endif
2769 %if (%1 == 12) && (%1-x/SIZEOF_PIXEL < mmsize)
2770 %4 [t0+x], m0
2771 %4 [t0+x+SIZEOF_PIXEL*t1], m1
2772 %else
2773 %3 [t0+x], m0
2774 %3 [t0+x+SIZEOF_PIXEL*t1], m1
2775 %endif
2776 %assign x x+mmsize
2777 %endrep
2778 AVG_END
2779 %endmacro
2780
2781 %if HIGH_BIT_DEPTH
2782
2783 INIT_MMX mmx2
2784 AVG_FUNC 4, movq, movq
2785 AVGH 4, 16
2786 AVGH 4, 8
2787 AVGH 4, 4
2788 AVGH 4, 2
2789
2790 AVG_FUNC 8, movq, movq
2791 AVGH 8, 32
2792 AVGH 8, 16
2793 AVGH 8, 8
2794 AVGH 8, 4
2795
2796 AVG_FUNC 16, movq, movq
2797 AVGH 16, 64
2798 AVGH 16, 32
2799 AVGH 16, 16
2800 AVGH 16, 12
2801 AVGH 16, 8
2802 AVGH 16, 4
2803
2804 AVG_FUNC 24, movq, movq
2805 AVGH 24, 32
2806
2807 AVG_FUNC 32, movq, movq
2808 AVGH 32, 32
2809 AVGH 32, 24
2810 AVGH 32, 16
2811 AVGH 32, 8
2812
2813 AVG_FUNC 48, movq, movq
2814 AVGH 48, 64
2815
2816 AVG_FUNC 64, movq, movq
2817 AVGH 64, 64
2818 AVGH 64, 48
2819 AVGH 64, 32
2820 AVGH 64, 16
2821
2822 AVG_FUNC 12, movq, movq, movq
2823 AVGH 12, 16
2824
2825 INIT_XMM sse2
2826 AVG_FUNC 4, movq, movq
2827 AVGH 4, 16
2828 AVGH 4, 8
2829 AVGH 4, 4
2830 AVGH 4, 2
2831
2832 AVG_FUNC 8, movdqu, movdqa
2833 AVGH 8, 32
2834 AVGH 8, 16
2835 AVGH 8, 8
2836 AVGH 8, 4
2837
2838 AVG_FUNC 16, movdqu, movdqa
2839 AVGH 16, 64
2840 AVGH 16, 32
2841 AVGH 16, 16
2842 AVGH 16, 12
2843 AVGH 16, 8
2844 AVGH 16, 4
2845
2846 AVG_FUNC 24, movdqu, movdqa
2847 AVGH 24, 32
2848
2849 AVG_FUNC 32, movdqu, movdqa
2850 AVGH 32, 64
2851 AVGH 32, 32
2852 AVGH 32, 24
2853 AVGH 32, 16
2854 AVGH 32, 8
2855
2856 AVG_FUNC 48, movdqu, movdqa
2857 AVGH 48, 64
2858
2859 AVG_FUNC 64, movdqu, movdqa
2860 AVGH 64, 64
2861 AVGH 64, 48
2862 AVGH 64, 32
2863 AVGH 64, 16
2864
2865 AVG_FUNC 12, movdqu, movdqa, movq
2866 AVGH 12, 16
2867
2868 %else ;!HIGH_BIT_DEPTH
2869
2870 INIT_MMX mmx2
2871 AVG_FUNC 4, movd, movd
2872 AVGH 4, 16
2873 AVGH 4, 8
2874 AVGH 4, 4
2875 AVGH 4, 2
2876
2877 AVG_FUNC 8, movq, movq
2878 AVGH 8, 32
2879 AVGH 8, 16
2880 AVGH 8, 8
2881 AVGH 8, 4
2882
2883 AVG_FUNC 12, movq, movq, movd
2884 AVGH 12, 16
2885
2886 AVG_FUNC 16, movq, movq
2887 AVGH 16, 64
2888 AVGH 16, 32
2889 AVGH 16, 16
2890 AVGH 16, 12
2891 AVGH 16, 8
2892 AVGH 16, 4
2893
2894 AVG_FUNC 32, movq, movq
2895 AVGH 32, 32
2896 AVGH 32, 24
2897 AVGH 32, 16
2898 AVGH 32, 8
2899
2900 AVG_FUNC 64, movq, movq
2901 AVGH 64, 64
2902 AVGH 64, 48
2903 AVGH 64, 16
2904
2905 AVG_FUNC 24, movq, movq
2906 AVGH 24, 32
2907
2908 AVG_FUNC 48, movq, movq
2909 AVGH 48, 64
2910
2911 INIT_XMM sse2
2912 AVG_FUNC 64, movdqu, movdqa
2913 AVGH 64, 64
2914 AVGH 64, 48
2915 AVGH 64, 32
2916 AVGH 64, 16
2917
2918 AVG_FUNC 32, movdqu, movdqa
2919 AVGH 32, 64
2920 AVGH 32, 32
2921 AVGH 32, 24
2922 AVGH 32, 16
2923 AVGH 32, 8
2924
2925 AVG_FUNC 24, movdqu, movdqa
2926 AVGH 24, 32
2927
2928 AVG_FUNC 16, movdqu, movdqa
2929 AVGH 16, 64
2930 AVGH 16, 32
2931 AVGH 16, 16
2932 AVGH 16, 12
2933 AVGH 16, 8
2934 AVGH 16, 4
2935
2936 AVG_FUNC 48, movdqu, movdqa
2937 AVGH 48, 64
2938
2939 AVG_FUNC 12, movdqu, movdqa, movq
2940 AVGH 12, 16
2941
2942 AVGH 8, 32
2943 AVGH 8, 16
2944 AVGH 8, 8
2945 AVGH 8, 4
2946 INIT_XMM ssse3
2947 AVGH 24, 32
2948
2949 AVGH 64, 64
2950 AVGH 64, 48
2951 AVGH 64, 32
2952 AVGH 64, 16
2953
2954 AVGH 32, 64
2955 AVGH 32, 32
2956 AVGH 32, 24
2957 AVGH 32, 16
2958 AVGH 32, 8
2959
2960 AVGH 16, 64
2961 AVGH 16, 32
2962 AVGH 16, 16
2963 AVGH 16, 12
2964 AVGH 16, 8
2965 AVGH 16, 4
2966
2967 AVGH 48, 64
2968
2969 AVGH 12, 16
2970
2971 AVGH 8, 32
2972 AVGH 8, 16
2973 AVGH 8, 8
2974 AVGH 8, 4
2975 INIT_MMX ssse3
2976 AVGH 4, 16
2977 AVGH 4, 8
2978 AVGH 4, 4
2979 AVGH 4, 2
2980 INIT_XMM avx2
2981 ; TODO: enable the AVX2 versions below once they have been debugged
2982 ;AVG_FUNC 24, movdqu, movdqa
2983 ;AVGH 24, 32
2984
2985 ;AVG_FUNC 64, movdqu, movdqa
2986 ;AVGH 64, 64
2987 ;AVGH 64, 48
2988 ;AVGH 64, 16
2989
2990 ;AVG_FUNC 32, movdqu, movdqa
2991 ;AVGH 32, 64
2992 ;AVGH 32, 32
2993 ;AVGH 32, 24
2994 ;AVGH 32, 16
2995 ;AVGH 32, 8
2996 AVG_FUNC 16, movdqu, movdqa
2997 AVGH 16, 16
2998 AVGH 16, 8
2999
3000 %endif ;HIGH_BIT_DEPTH
3001
3002
3003
3004 ;=============================================================================
3005 ; pixel avg2
3006 ;=============================================================================
3007
3008 %if HIGH_BIT_DEPTH
3009 ;-----------------------------------------------------------------------------
3010 ; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
3011 ; uint16_t *src1, intptr_t src_stride,
3012 ; uint16_t *src2, int height );
3013 ;-----------------------------------------------------------------------------
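; Each avg2 kernel averages co-located rows of src1 and src2 with pavgw rounding,
; i.e. dst[x] = (src1[x] + src2[x] + 1) >> 1, two rows per loop iteration. Note the
; addressing trick used throughout this section: `sub r4, r2` turns the src2 pointer into
; an offset from src1, so only r2 is advanced per row and src2 is read as [r2+r4].
; A minimal single-row C sketch (illustrative only; strides and unrolling omitted):
;
;   #include <stdint.h>
;   static void avg2_row_ref(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, int width)
;   {
;       for (int x = 0; x < width; x++)
;           dst[x] = (uint16_t)((src1[x] + src2[x] + 1) >> 1);   /* pavgw rounding */
;   }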
3014 %macro AVG2_W_ONE 1
3015 cglobal pixel_avg2_w%1, 6,7,4
3016 sub r4, r2
3017 lea r6, [r4+r3*2]
3018 .height_loop:
3019 movu m0, [r2]
3020 movu m1, [r2+r3*2]
3021 %if cpuflag(avx) || mmsize == 8
3022 pavgw m0, [r2+r4]
3023 pavgw m1, [r2+r6]
3024 %else
3025 movu m2, [r2+r4]
3026 movu m3, [r2+r6]
3027 pavgw m0, m2
3028 pavgw m1, m3
3029 %endif
3030 mova [r0], m0
3031 mova [r0+r1*2], m1
3032 lea r2, [r2+r3*4]
3033 lea r0, [r0+r1*4]
3034 sub r5d, 2
3035 jg .height_loop
3036 RET
3037 %endmacro
3038
3039 %macro AVG2_W_TWO 3
3040 cglobal pixel_avg2_w%1, 6,7,8
3041 sub r4, r2
3042 lea r6, [r4+r3*2]
3043 .height_loop:
3044 movu m0, [r2]
3045 %2 m1, [r2+mmsize]
3046 movu m2, [r2+r3*2]
3047 %2 m3, [r2+r3*2+mmsize]
3048 %if mmsize == 8
3049 pavgw m0, [r2+r4]
3050 pavgw m1, [r2+r4+mmsize]
3051 pavgw m2, [r2+r6]
3052 pavgw m3, [r2+r6+mmsize]
3053 %else
3054 movu m4, [r2+r4]
3055 %2 m5, [r2+r4+mmsize]
3056 movu m6, [r2+r6]
3057 %2 m7, [r2+r6+mmsize]
3058 pavgw m0, m4
3059 pavgw m1, m5
3060 pavgw m2, m6
3061 pavgw m3, m7
3062 %endif
3063 mova [r0], m0
3064 %3 [r0+mmsize], m1
3065 mova [r0+r1*2], m2
3066 %3 [r0+r1*2+mmsize], m3
3067 lea r2, [r2+r3*4]
3068 lea r0, [r0+r1*4]
3069 sub r5d, 2
3070 jg .height_loop
3071 RET
3072 %endmacro
3073
3074 INIT_MMX mmx2
3075 AVG2_W_ONE 4
3076 AVG2_W_TWO 8, movu, mova
3077 INIT_XMM sse2
3078 AVG2_W_ONE 8
3079 AVG2_W_TWO 10, movd, movd
3080 AVG2_W_TWO 16, movu, mova
3081 INIT_YMM avx2
3082 AVG2_W_ONE 16
3083
3084 INIT_MMX
3085 cglobal pixel_avg2_w10_mmx2, 6,7
3086 sub r4, r2
3087 lea r6, [r4+r3*2]
3088 .height_loop:
3089 movu m0, [r2+ 0]
3090 movu m1, [r2+ 8]
3091 movh m2, [r2+16]
3092 movu m3, [r2+r3*2+ 0]
3093 movu m4, [r2+r3*2+ 8]
3094 movh m5, [r2+r3*2+16]
3095 pavgw m0, [r2+r4+ 0]
3096 pavgw m1, [r2+r4+ 8]
3097 pavgw m2, [r2+r4+16]
3098 pavgw m3, [r2+r6+ 0]
3099 pavgw m4, [r2+r6+ 8]
3100 pavgw m5, [r2+r6+16]
3101 mova [r0+ 0], m0
3102 mova [r0+ 8], m1
3103 movh [r0+16], m2
3104 mova [r0+r1*2+ 0], m3
3105 mova [r0+r1*2+ 8], m4
3106 movh [r0+r1*2+16], m5
3107 lea r2, [r2+r3*2*2]
3108 lea r0, [r0+r1*2*2]
3109 sub r5d, 2
3110 jg .height_loop
3111 RET
3112
3113 cglobal pixel_avg2_w16_mmx2, 6,7
3114 sub r4, r2
3115 lea r6, [r4+r3*2]
3116 .height_loop:
3117 movu m0, [r2+ 0]
3118 movu m1, [r2+ 8]
3119 movu m2, [r2+16]
3120 movu m3, [r2+24]
3121 movu m4, [r2+r3*2+ 0]
3122 movu m5, [r2+r3*2+ 8]
3123 movu m6, [r2+r3*2+16]
3124 movu m7, [r2+r3*2+24]
3125 pavgw m0, [r2+r4+ 0]
3126 pavgw m1, [r2+r4+ 8]
3127 pavgw m2, [r2+r4+16]
3128 pavgw m3, [r2+r4+24]
3129 pavgw m4, [r2+r6+ 0]
3130 pavgw m5, [r2+r6+ 8]
3131 pavgw m6, [r2+r6+16]
3132 pavgw m7, [r2+r6+24]
3133 mova [r0+ 0], m0
3134 mova [r0+ 8], m1
3135 mova [r0+16], m2
3136 mova [r0+24], m3
3137 mova [r0+r1*2+ 0], m4
3138 mova [r0+r1*2+ 8], m5
3139 mova [r0+r1*2+16], m6
3140 mova [r0+r1*2+24], m7
3141 lea r2, [r2+r3*2*2]
3142 lea r0, [r0+r1*2*2]
3143 sub r5d, 2
3144 jg .height_loop
3145 RET
3146
3147 cglobal pixel_avg2_w18_mmx2, 6,7
3148 sub r4, r2
3149 .height_loop:
3150 movu m0, [r2+ 0]
3151 movu m1, [r2+ 8]
3152 movu m2, [r2+16]
3153 movu m3, [r2+24]
3154 movh m4, [r2+32]
3155 pavgw m0, [r2+r4+ 0]
3156 pavgw m1, [r2+r4+ 8]
3157 pavgw m2, [r2+r4+16]
3158 pavgw m3, [r2+r4+24]
3159 pavgw m4, [r2+r4+32]
3160 mova [r0+ 0], m0
3161 mova [r0+ 8], m1
3162 mova [r0+16], m2
3163 mova [r0+24], m3
3164 movh [r0+32], m4
3165 lea r2, [r2+r3*2]
3166 lea r0, [r0+r1*2]
3167 dec r5d
3168 jg .height_loop
3169 RET
3170
3171 %macro PIXEL_AVG_W18 0
3172 cglobal pixel_avg2_w18, 6,7
3173 sub r4, r2
3174 .height_loop:
3175 movu m0, [r2+ 0]
3176 movd xm2, [r2+32]
3177 %if mmsize == 32
3178 pavgw m0, [r2+r4+ 0]
3179 movd xm1, [r2+r4+32]
3180 pavgw xm2, xm1
3181 %else
3182 movu m1, [r2+16]
3183 movu m3, [r2+r4+ 0]
3184 movu m4, [r2+r4+16]
3185 movd m5, [r2+r4+32]
3186 pavgw m0, m3
3187 pavgw m1, m4
3188 pavgw m2, m5
3189 mova [r0+16], m1
3190 %endif
3191 mova [r0+ 0], m0
3192 movd [r0+32], xm2
3193 lea r2, [r2+r3*2]
3194 lea r0, [r0+r1*2]
3195 dec r5d
3196 jg .height_loop
3197 RET
3198 %endmacro
3199
3200 INIT_XMM sse2
3201 PIXEL_AVG_W18
3202 INIT_YMM avx2
3203 PIXEL_AVG_W18
3204
3205 %endif ; HIGH_BIT_DEPTH
3206
3207 %if HIGH_BIT_DEPTH == 0
3208 ;-----------------------------------------------------------------------------
3209 ; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
3210 ; uint8_t *src1, intptr_t src_stride,
3211 ; uint8_t *src2, int height );
3212 ;-----------------------------------------------------------------------------
3213 %macro AVG2_W8 2
3214 cglobal pixel_avg2_w%1_mmx2, 6,7
3215 sub r4, r2
3216 lea r6, [r4+r3]
3217 .height_loop:
3218 %2 mm0, [r2]
3219 %2 mm1, [r2+r3]
3220 pavgb mm0, [r2+r4]
3221 pavgb mm1, [r2+r6]
3222 lea r2, [r2+r3*2]
3223 %2 [r0], mm0
3224 %2 [r0+r1], mm1
3225 lea r0, [r0+r1*2]
3226 sub r5d, 2
3227 jg .height_loop
3228 RET
3229 %endmacro
3230
3231 INIT_MMX
3232 AVG2_W8 4, movd
3233 AVG2_W8 8, movq
3234
3235 %macro AVG2_W16 2
3236 cglobal pixel_avg2_w%1_mmx2, 6,7
3237 sub r2, r4
3238 lea r6, [r2+r3]
3239 .height_loop:
3240 movq mm0, [r4]
3241 %2 mm1, [r4+8]
3242 movq mm2, [r4+r3]
3243 %2 mm3, [r4+r3+8]
3244 pavgb mm0, [r4+r2]
3245 pavgb mm1, [r4+r2+8]
3246 pavgb mm2, [r4+r6]
3247 pavgb mm3, [r4+r6+8]
3248 lea r4, [r4+r3*2]
3249 movq [r0], mm0
3250 %2 [r0+8], mm1
3251 movq [r0+r1], mm2
3252 %2 [r0+r1+8], mm3
3253 lea r0, [r0+r1*2]
3254 sub r5d, 2
3255 jg .height_loop
3256 RET
3257 %endmacro
3258
3259 AVG2_W16 12, movd
3260 AVG2_W16 16, movq
3261
3262 cglobal pixel_avg2_w20_mmx2, 6,7
3263 sub r2, r4
3264 lea r6, [r2+r3]
3265 .height_loop:
3266 movq mm0, [r4]
3267 movq mm1, [r4+8]
3268 movd mm2, [r4+16]
3269 movq mm3, [r4+r3]
3270 movq mm4, [r4+r3+8]
3271 movd mm5, [r4+r3+16]
3272 pavgb mm0, [r4+r2]
3273 pavgb mm1, [r4+r2+8]
3274 pavgb mm2, [r4+r2+16]
3275 pavgb mm3, [r4+r6]
3276 pavgb mm4, [r4+r6+8]
3277 pavgb mm5, [r4+r6+16]
3278 lea r4, [r4+r3*2]
3279 movq [r0], mm0
3280 movq [r0+8], mm1
3281 movd [r0+16], mm2
3282 movq [r0+r1], mm3
3283 movq [r0+r1+8], mm4
3284 movd [r0+r1+16], mm5
3285 lea r0, [r0+r1*2]
3286 sub r5d, 2
3287 jg .height_loop
3288 RET
3289
3290 INIT_XMM
3291 cglobal pixel_avg2_w16_sse2, 6,7
3292 sub r4, r2
3293 lea r6, [r4+r3]
3294 .height_loop:
3295 movu m0, [r2]
3296 movu m2, [r2+r3]
3297 movu m1, [r2+r4]
3298 movu m3, [r2+r6]
3299 lea r2, [r2+r3*2]
3300 pavgb m0, m1
3301 pavgb m2, m3
3302 mova [r0], m0
3303 mova [r0+r1], m2
3304 lea r0, [r0+r1*2]
3305 sub r5d, 2
3306 jg .height_loop
3307 RET
3308
3309 cglobal pixel_avg2_w20_sse2, 6,7
3310 sub r2, r4
3311 lea r6, [r2+r3]
3312 .height_loop:
3313 movu m0, [r4]
3314 movu m2, [r4+r3]
3315 movu m1, [r4+r2]
3316 movu m3, [r4+r6]
3317 movd mm4, [r4+16]
3318 movd mm5, [r4+r3+16]
3319 pavgb m0, m1
3320 pavgb m2, m3
3321 pavgb mm4, [r4+r2+16]
3322 pavgb mm5, [r4+r6+16]
3323 lea r4, [r4+r3*2]
3324 mova [r0], m0
3325 mova [r0+r1], m2
3326 movd [r0+16], mm4
3327 movd [r0+r1+16], mm5
3328 lea r0, [r0+r1*2]
3329 sub r5d, 2
3330 jg .height_loop
3331 RET
3332
3333 INIT_YMM avx2
3334 cglobal pixel_avg2_w20, 6,7
3335 sub r2, r4
3336 lea r6, [r2+r3]
3337 .height_loop:
3338 movu m0, [r4]
3339 movu m1, [r4+r3]
3340 pavgb m0, [r4+r2]
3341 pavgb m1, [r4+r6]
3342 lea r4, [r4+r3*2]
3343 mova [r0], m0
3344 mova [r0+r1], m1
3345 lea r0, [r0+r1*2]
3346 sub r5d, 2
3347 jg .height_loop
3348 RET
3349
3350 ; Cacheline split code for processors with high latencies for loads
3351 ; split over cache lines. See sad-a.asm for a more detailed explanation.
3352 ; This particular instance is complicated by the fact that src1 and src2
3353 ; can have different alignments. For simplicity and code size, only the
3354 ; MMX cacheline workaround is used. As a result, in the case of SSE2
3355 ; pixel_avg, the cacheline check function calls the SSE2 version if there
3356 ; is no cacheline split, and the MMX workaround if there is.
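; A hedged C model of what AVG_CACHELINE_LOOP below does: rather than issuing one
; unaligned 8-byte load that may straddle a cacheline, it makes two aligned loads and
; recombines them with the per-source shift amounts that INIT_SHIFT derives
; (shift = 8 * (addr & 7) bits). Illustrative only; assumes little-endian x86:
;
;   #include <stdint.h>
;   static uint64_t load8_split_ref(const uint64_t *aligned_base, unsigned misalign_bytes)
;   {
;       unsigned s = 8 * misalign_bytes;                  /* 0..56, as computed by INIT_SHIFT */
;       uint64_t lo = aligned_base[0], hi = aligned_base[1];
;       return s ? (lo >> s) | (hi << (64 - s)) : lo;     /* psrlq / psllq / por              */
;   }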
3357
3358 %macro INIT_SHIFT 2
3359 and eax, 7
3360 shl eax, 3
3361 movd %1, [sw_64]
3362 movd %2, eax
3363 psubw %1, %2
3364 %endmacro
3365
3366 %macro AVG_CACHELINE_START 0
3367 %assign stack_offset 0
3368 INIT_SHIFT mm6, mm7
3369 mov eax, r4m
3370 INIT_SHIFT mm4, mm5
3371 PROLOGUE 6,6
3372 and r2, ~7
3373 and r4, ~7
3374 sub r4, r2
3375 .height_loop:
3376 %endmacro
3377
3378 %macro AVG_CACHELINE_LOOP 2
3379 movq mm1, [r2+%1]
3380 movq mm0, [r2+8+%1]
3381 movq mm3, [r2+r4+%1]
3382 movq mm2, [r2+r4+8+%1]
3383 psrlq mm1, mm7
3384 psllq mm0, mm6
3385 psrlq mm3, mm5
3386 psllq mm2, mm4
3387 por mm0, mm1
3388 por mm2, mm3
3389 pavgb mm2, mm0
3390 %2 [r0+%1], mm2
3391 %endmacro
3392
3393 %macro AVG_CACHELINE_FUNC 2
3394 pixel_avg2_w%1_cache_mmx2:
3395 AVG_CACHELINE_START
3396 AVG_CACHELINE_LOOP 0, movq
3397 %if %1>8
3398 AVG_CACHELINE_LOOP 8, movq
3399 %if %1>16
3400 AVG_CACHELINE_LOOP 16, movd
3401 %endif
3402 %endif
3403 add r2, r3
3404 add r0, r1
3405 dec r5d
3406 jg .height_loop
3407 RET
3408 %endmacro
3409
3410 %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
3411 %if %1 == 12
3412 ;w12 isn't needed because w16 is just as fast if there's no cacheline split
3413 %define cachesplit pixel_avg2_w16_cache_mmx2
3414 %else
3415 %define cachesplit pixel_avg2_w%1_cache_mmx2
3416 %endif
3417 cglobal pixel_avg2_w%1_cache%2_%3
3418 mov eax, r2m
3419 and eax, %2-1
3420 cmp eax, (%2-%1-(%1 % 8))
3421 %if %1==12||%1==20
3422 jbe pixel_avg2_w%1_%3
3423 %else
3424 jb pixel_avg2_w%1_%3
3425 %endif
3426 %if 0 ; or %1==8 - but the extra branch seems too expensive
3427 ja cachesplit
3428 %if ARCH_X86_64
3429 test r4b, 1
3430 %else
3431 test byte r4m, 1
3432 %endif
3433 jz pixel_avg2_w%1_%3
3434 %else
3435 or eax, r4m
3436 and eax, 7
3437 jz pixel_avg2_w%1_%3
3438 mov eax, r2m
3439 %endif
3440 %if mmsize==16 || (%1==8 && %2==64)
3441 AVG_CACHELINE_FUNC %1, %2
3442 %else
3443 jmp cachesplit
3444 %endif
3445 %endmacro
3446
3447 INIT_MMX
3448 AVG_CACHELINE_CHECK 8, 64, mmx2
3449 AVG_CACHELINE_CHECK 12, 64, mmx2
3450 %if ARCH_X86_64 == 0
3451 AVG_CACHELINE_CHECK 16, 64, mmx2
3452 AVG_CACHELINE_CHECK 20, 64, mmx2
3453 AVG_CACHELINE_CHECK 8, 32, mmx2
3454 AVG_CACHELINE_CHECK 12, 32, mmx2
3455 AVG_CACHELINE_CHECK 16, 32, mmx2
3456 AVG_CACHELINE_CHECK 20, 32, mmx2
3457 %endif
3458 INIT_XMM
3459 AVG_CACHELINE_CHECK 16, 64, sse2
3460 AVG_CACHELINE_CHECK 20, 64, sse2
3461
3462 ; the computed jump assumes each instance of this loop is exactly 48 bytes
3463 %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; src1 alignment, src2 alignment
3464 ALIGN 16
3465 avg_w16_align%1_%2_ssse3:
3466 %if %1==0 && %2==0
3467 movdqa xmm1, [r2]
3468 pavgb xmm1, [r2+r4]
3469 add r2, r3
3470 %elif %1==0
3471 movdqa xmm1, [r2+r4+16]
3472 palignr xmm1, [r2+r4], %2
3473 pavgb xmm1, [r2]
3474 add r2, r3
3475 %elif %2&15==0
3476 movdqa xmm1, [r2+16]
3477 palignr xmm1, [r2], %1
3478 pavgb xmm1, [r2+r4]
3479 add r2, r3
3480 %else
3481 movdqa xmm1, [r2+16]
3482 movdqa xmm2, [r2+r4+16]
3483 palignr xmm1, [r2], %1
3484 palignr xmm2, [r2+r4], %2&15
3485 add r2, r3
3486 pavgb xmm1, xmm2
3487 %endif
3488 movdqa [r0], xmm1
3489 add r0, r1
3490 dec r5d
3491 jg avg_w16_align%1_%2_ssse3
3492 ret
3493 %if %1==0
3494 ; make sure the first ones don't end up short
3495 ALIGN 16
3496 times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
3497 %endif
3498 %endmacro
3499
3500 cglobal pixel_avg2_w16_cache64_ssse3
3501 %if 0 ; it seems neither test is worth it if the src1%16==0 case is optimized
3502 mov eax, r2m
3503 and eax, 0x3f
3504 cmp eax, 0x30
3505 jb x265_pixel_avg2_w16_sse2
3506 or eax, r4m
3507 and eax, 7
3508 jz x265_pixel_avg2_w16_sse2
3509 %endif
3510 PROLOGUE 6, 8
3511 lea r6, [r4+r2]
3512 and r4, ~0xf
3513 and r6, 0x1f
3514 and r2, ~0xf
3515 lea r6, [r6*3] ;(offset + align*2)*3
3516 sub r4, r2
3517 shl r6, 4 ;jump = (offset + align*2)*48
3518 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
3519 %ifdef PIC
3520 lea r7, [avg_w16_addr]
3521 add r6, r7
3522 %else
3523 lea r6, [avg_w16_addr + r6]
3524 %endif
3525 TAIL_CALL r6, 1
3526
3527 %assign j 0
3528 %assign k 1
3529 %rep 16
3530 AVG16_CACHELINE_LOOP_SSSE3 j, j
3531 AVG16_CACHELINE_LOOP_SSSE3 j, k
3532 %assign j j+1
3533 %assign k k+1
3534 %endrep
3535 %endif ; !HIGH_BIT_DEPTH
3536
3537 ;=============================================================================
3538 ; pixel copy
3539 ;=============================================================================
3540
3541 %macro COPY1 2
3542 movu m0, [r2]
3543 movu m1, [r2+r3]
3544 movu m2, [r2+r3*2]
3545 movu m3, [r2+%2]
3546 mova [r0], m0
3547 mova [r0+r1], m1
3548 mova [r0+r1*2], m2
3549 mova [r0+%1], m3
3550 %endmacro
3551
3552 %macro COPY2 2-4 0, 1
3553 movu m0, [r2+%3*mmsize]
3554 movu m1, [r2+%4*mmsize]
3555 movu m2, [r2+r3+%3*mmsize]
3556 movu m3, [r2+r3+%4*mmsize]
3557 mova [r0+%3*mmsize], m0
3558 mova [r0+%4*mmsize], m1
3559 mova [r0+r1+%3*mmsize], m2
3560 mova [r0+r1+%4*mmsize], m3
3561 movu m0, [r2+r3*2+%3*mmsize]
3562 movu m1, [r2+r3*2+%4*mmsize]
3563 movu m2, [r2+%2+%3*mmsize]
3564 movu m3, [r2+%2+%4*mmsize]
3565 mova [r0+r1*2+%3*mmsize], m0
3566 mova [r0+r1*2+%4*mmsize], m1
3567 mova [r0+%1+%3*mmsize], m2
3568 mova [r0+%1+%4*mmsize], m3
3569 %endmacro
3570
3571 %macro COPY4 2
3572 COPY2 %1, %2, 0, 1
3573 COPY2 %1, %2, 2, 3
3574 %endmacro
3575
3576 ;-----------------------------------------------------------------------------
3577 ; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
3578 ; uint8_t *src, intptr_t i_src_stride, int i_height )
3579 ;-----------------------------------------------------------------------------
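; Functionally these copy kernels are a per-row memcpy; the asm only adds the stride
; fixups and aligned/unaligned store variants. A hedged scalar sketch (width and strides
; given in bytes here for simplicity; the asm works in pixels and FIX_STRIDES doubles the
; strides for HIGH_BIT_DEPTH):
;
;   #include <stdint.h>
;   #include <string.h>
;   static void mc_copy_ref(uint8_t *dst, intptr_t i_dst, const uint8_t *src, intptr_t i_src,
;                           int width_bytes, int height)
;   {
;       for (int y = 0; y < height; y++, dst += i_dst, src += i_src)
;           memcpy(dst, src, (size_t)width_bytes);
;   }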
3580 INIT_MMX
3581 cglobal mc_copy_w4_mmx, 4,6
3582 FIX_STRIDES r1, r3
3583 cmp dword r4m, 4
3584 lea r5, [r3*3]
3585 lea r4, [r1*3]
3586 je .end
3587 %if HIGH_BIT_DEPTH == 0
3588 %define mova movd
3589 %define movu movd
3590 %endif
3591 COPY1 r4, r5
3592 lea r2, [r2+r3*4]
3593 lea r0, [r0+r1*4]
3594 .end:
3595 COPY1 r4, r5
3596 RET
3597
3598 %macro MC_COPY 1
3599 %assign %%w %1*SIZEOF_PIXEL/mmsize
3600 %if %%w > 0
3601 cglobal mc_copy_w%1, 5,7
3602 FIX_STRIDES r1, r3
3603 lea r6, [r3*3]
3604 lea r5, [r1*3]
3605 .height_loop:
3606 COPY %+ %%w r5, r6
3607 lea r2, [r2+r3*4]
3608 lea r0, [r0+r1*4]
3609 sub r4d, 4
3610 jg .height_loop
3611 RET
3612 %endif
3613 %endmacro
3614
3615 INIT_MMX mmx
3616 MC_COPY 8
3617 MC_COPY 16
3618 INIT_XMM sse
3619 MC_COPY 8
3620 MC_COPY 16
3621 INIT_XMM aligned, sse
3622 MC_COPY 16
3623 %if HIGH_BIT_DEPTH
3624 INIT_YMM avx
3625 MC_COPY 16
3626 INIT_YMM aligned, avx
3627 MC_COPY 16
3628 %endif
3629
3630 ;=============================================================================
3631 ; prefetch
3632 ;=============================================================================
3633 ; assumes 64-byte cachelines
3634 ; FIXME: doesn't cover all pixels in high bit depth and/or 4:4:4
3635
3636 ;-----------------------------------------------------------------------------
3637 ; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
3638 ; pixel *pix_uv, intptr_t stride_uv, int mb_x )
3639 ;-----------------------------------------------------------------------------
3640
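; A hedged C sketch of the same access pattern using compiler prefetch builtins
; (__builtin_prefetch on GCC/Clang; _mm_prefetch with _MM_HINT_T0 is the intrinsic
; equivalent of prefetcht0). 8-bit 4:2:0 case shown; the offsets mirror the lea
; arithmetic in the asm (64 becomes 64*SIZEOF_PIXEL in high bit depth):
;
;   #include <stdint.h>
;   static void prefetch_fenc_ref(const uint8_t *pix_y, intptr_t stride_y,
;                                 const uint8_t *pix_uv, intptr_t stride_uv, int mb_x)
;   {
;       const uint8_t *y  = pix_y  + (mb_x & 3) * 4 * stride_y  + 64;
;       const uint8_t *uv = pix_uv + (mb_x & 3) * 2 * stride_uv + 64;
;       for (int i = 0; i < 4; i++) __builtin_prefetch(y  + i * stride_y);
;       for (int i = 0; i < 2; i++) __builtin_prefetch(uv + i * stride_uv);  /* 4 rows for 422 */
;   }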
3641 %macro PREFETCH_FENC 1
3642 %if ARCH_X86_64
3643 cglobal prefetch_fenc_%1, 5,5
3644 FIX_STRIDES r1, r3
3645 and r4d, 3
3646 mov eax, r4d
3647 imul r4d, r1d
3648 lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
3649 prefetcht0 [r0]
3650 prefetcht0 [r0+r1]
3651 lea r0, [r0+r1*2]
3652 prefetcht0 [r0]
3653 prefetcht0 [r0+r1]
3654
3655 imul eax, r3d
3656 lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
3657 prefetcht0 [r2]
3658 prefetcht0 [r2+r3]
3659 %ifidn %1, 422
3660 lea r2, [r2+r3*2]
3661 prefetcht0 [r2]
3662 prefetcht0 [r2+r3]
3663 %endif
3664 RET
3665
3666 %else
3667 cglobal prefetch_fenc_%1, 0,3
3668 mov r2, r4m
3669 mov r1, r1m
3670 mov r0, r0m
3671 FIX_STRIDES r1
3672 and r2, 3
3673 imul r2, r1
3674 lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
3675 prefetcht0 [r0]
3676 prefetcht0 [r0+r1]
3677 lea r0, [r0+r1*2]
3678 prefetcht0 [r0]
3679 prefetcht0 [r0+r1]
3680
3681 mov r2, r4m
3682 mov r1, r3m
3683 mov r0, r2m
3684 FIX_STRIDES r1
3685 and r2, 3
3686 imul r2, r1
3687 lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
3688 prefetcht0 [r0]
3689 prefetcht0 [r0+r1]
3690 %ifidn %1, 422
3691 lea r0, [r0+r1*2]
3692 prefetcht0 [r0]
3693 prefetcht0 [r0+r1]
3694 %endif
3695 ret
3696 %endif ; ARCH_X86_64
3697 %endmacro
3698
3699 INIT_MMX mmx2
3700 PREFETCH_FENC 420
3701 PREFETCH_FENC 422
3702
3703 ;-----------------------------------------------------------------------------
3704 ; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
3705 ;-----------------------------------------------------------------------------
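; Hedged C sketch (8-bit case): the dec/and pair below selects an extra 8*stride offset
; when parity == 0, then eight consecutive rows starting 64 bytes in are prefetched:
;
;   #include <stdint.h>
;   static void prefetch_ref_ref(const uint8_t *pix, intptr_t stride, int parity)
;   {
;       const uint8_t *p = pix + (parity ? 0 : 8 * stride) + 64;
;       for (int i = 0; i < 8; i++)
;           __builtin_prefetch(p + i * stride);
;   }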
3706 INIT_MMX mmx2
3707 cglobal prefetch_ref, 3,3
3708 FIX_STRIDES r1
3709 dec r2d
3710 and r2d, r1d
3711 lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
3712 lea r2, [r1*3]
3713 prefetcht0 [r0]
3714 prefetcht0 [r0+r1]
3715 prefetcht0 [r0+r1*2]
3716 prefetcht0 [r0+r2]
3717 lea r0, [r0+r1*4]
3718 prefetcht0 [r0]
3719 prefetcht0 [r0+r1]
3720 prefetcht0 [r0+r1*2]
3721 prefetcht0 [r0+r2]
3722 RET