Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / x86 / ipfilter8.asm
CommitLineData
72b9787e
JB
1;*****************************************************************************
2;* Copyright (C) 2013 x265 project
3;*
4;* Authors: Min Chen <chenm003@163.com>
5;* Nabajit Deka <nabajit@multicorewareinc.com>
6;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7;*
8;* This program is free software; you can redistribute it and/or modify
9;* it under the terms of the GNU General Public License as published by
10;* the Free Software Foundation; either version 2 of the License, or
11;* (at your option) any later version.
12;*
13;* This program is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16;* GNU General Public License for more details.
17;*
18;* You should have received a copy of the GNU General Public License
19;* along with this program; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21;*
22;* This program is also available under a commercial proprietary license.
23;* For more information, contact us at license @ x265.com.
24;*****************************************************************************/
25
26%include "x86inc.asm"
27%include "x86util.asm"
28
29SECTION_RODATA 32
; tab_Tm: three 16-byte rows of byte indices. Each row holds four sliding
; windows of four consecutive offsets; row r, window k starts at 4*r + k.
; The kernels in this file load these rows as pshufb control masks.
tab_Tm:         db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
33
b53f7c52
JB
; interp4_vpp_shuf: the same 16-byte permutation emitted twice (32 bytes);
; within each 8-byte half it interleaves bytes i and i+4.
ALIGN 32
const interp4_vpp_shuf,     times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15

; interp4_vpp_shuf1: two rows of eight dword indices forming overlapping
; (n, n+1) pairs; the second row starts two positions further on.
ALIGN 32
const interp4_vpp_shuf1,    dd 0, 1, 1, 2, 2, 3, 3, 4
                            dd 2, 3, 3, 4, 4, 5, 5, 6
40
72b9787e
JB
; tab_Lm: four 16-byte rows; each row holds two sliding windows of eight
; consecutive byte offsets (windows start at 0..7 across the table).
ALIGN 32
tab_Lm:         db  0,  1,  2,  3,  4,  5,  6,  7,  1,  2,  3,  4,  5,  6,  7,  8
                db  2,  3,  4,  5,  6,  7,  8,  9,  3,  4,  5,  6,  7,  8,  9, 10
                db  4,  5,  6,  7,  8,  9, 10, 11,  5,  6,  7,  8,  9, 10, 11, 12
                db  6,  7,  8,  9, 10, 11, 12, 13,  7,  8,  9, 10, 11, 12, 13, 14

; tab_Vm: row 0 repeats byte pair (0, 1), row 1 repeats byte pair (2, 3).
tab_Vm:         db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
                db  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3

; tab_Cm: the byte pattern (0, 2, 1, 3) repeated four times.
tab_Cm:         db  0,  2,  1,  3,  0,  2,  1,  3,  0,  2,  1,  3,  0,  2,  1,  3
51
72b9787e
JB
; Four dwords of 8192*64 + 2048 (= 526336, as the label says).
tab_c_526336:   times 4 dd 8192*64+2048
53
; tab_ChromaCoeff: eight 4-tap filters, one signed byte per tap (4 bytes per
; row). The horizontal kernels fetch a row with [tab_ChromaCoeff + coeffIdx*4].
; The taps of every row sum to 64.
tab_ChromaCoeff:    db  0, 64,  0,  0
                    db -2, 58, 10, -2
                    db -4, 54, 16, -2
                    db -6, 46, 28, -4
                    db -4, 36, 36, -4
                    db -4, 28, 46, -6
                    db -2, 16, 54, -4
                    db -2, 10, 58, -2
62
; tab_ChromaCoeffV: the 4-tap filters of tab_ChromaCoeff widened to words.
; Each filter takes two 16-byte rows: taps (0, 1) repeated four times, then
; taps (2, 3) repeated four times.
tab_ChromaCoeffV:   times 4 dw  0, 64
                    times 4 dw  0,  0

                    times 4 dw -2, 58
                    times 4 dw 10, -2

                    times 4 dw -4, 54
                    times 4 dw 16, -2

                    times 4 dw -6, 46
                    times 4 dw 28, -4

                    times 4 dw -4, 36
                    times 4 dw 36, -4

                    times 4 dw -4, 28
                    times 4 dw 46, -6

                    times 4 dw -2, 16
                    times 4 dw 54, -4

                    times 4 dw -2, 10
                    times 4 dw 58, -2
86
; tab_LumaCoeff: four 8-tap filters, one signed byte per tap (8 bytes per
; row). The horizontal kernels fetch a row with [tab_LumaCoeff + coeffIdx*8].
; The taps of every row sum to 64.
tab_LumaCoeff:  db  0,  0,   0, 64,  0,   0,  0,  0
                db -1,  4, -10, 58, 17,  -5,  1,  0
                db -1,  4, -11, 40, 40, -11,  4, -1
                db  0,  1,  -5, 17, 58, -10,  4, -1
91
; tab_LumaCoeffV: the 8-tap filters of tab_LumaCoeff widened to words.
; Each filter takes four 16-byte rows, one per tap pair, with the pair
; repeated four times in the row.
tab_LumaCoeffV: times 4 dw   0,   0
                times 4 dw   0,  64
                times 4 dw   0,   0
                times 4 dw   0,   0

                times 4 dw  -1,   4
                times 4 dw -10,  58
                times 4 dw  17,  -5
                times 4 dw   1,   0

                times 4 dw  -1,   4
                times 4 dw -11,  40
                times 4 dw  40, -11
                times 4 dw   4,  -1

                times 4 dw   0,   1
                times 4 dw  -5,  17
                times 4 dw  58, -10
                times 4 dw   4,  -1
111
; tab_LumaCoeffVer: the 8-tap filters kept as signed bytes. Each filter takes
; four 16-byte rows, one per tap pair, with the pair repeated eight times.
tab_LumaCoeffVer:   times 8 db   0,   0
                    times 8 db   0,  64
                    times 8 db   0,   0
                    times 8 db   0,   0

                    times 8 db  -1,   4
                    times 8 db -10,  58
                    times 8 db  17,  -5
                    times 8 db   1,   0

                    times 8 db  -1,   4
                    times 8 db -11,  40
                    times 8 db  40, -11
                    times 8 db   4,  -1

                    times 8 db   0,   1
                    times 8 db  -5,  17
                    times 8 db  58, -10
                    times 8 db   4,  -1
131
b53f7c52
JB
; tab_LumaCoeffVer_32: 32-byte-row variant of the byte tap-pair table.
; Each filter takes four 32-byte rows, with the tap pair repeated 16 times.
ALIGN 32
tab_LumaCoeffVer_32:    times 16 db   0,   0
                        times 16 db   0,  64
                        times 16 db   0,   0
                        times 16 db   0,   0

                        times 16 db  -1,   4
                        times 16 db -10,  58
                        times 16 db  17,  -5
                        times 16 db   1,   0

                        times 16 db  -1,   4
                        times 16 db -11,  40
                        times 16 db  40, -11
                        times 16 db   4,  -1

                        times 16 db   0,   1
                        times 16 db  -5,  17
                        times 16 db  58, -10
                        times 16 db   4,  -1
152
; tab_ChromaCoeffVer_32: the 4-tap chroma filters as signed-byte tap pairs.
; Each filter takes two 32-byte rows, with the pair repeated 16 times:
; taps (0, 1) first, then taps (2, 3).
ALIGN 32
tab_ChromaCoeffVer_32:  times 16 db  0, 64
                        times 16 db  0,  0

                        times 16 db -2, 58
                        times 16 db 10, -2

                        times 16 db -4, 54
                        times 16 db 16, -2

                        times 16 db -6, 46
                        times 16 db 28, -4

                        times 16 db -4, 36
                        times 16 db 36, -4

                        times 16 db -4, 28
                        times 16 db 46, -6

                        times 16 db -2, 16
                        times 16 db 54, -4

                        times 16 db -2, 10
                        times 16 db 58, -2
177
72b9787e
JB
; Sixteen bytes alternating +64 and -64.
tab_c_64_n64:   times 8 db 64, -64
179
b53f7c52
JB
; interp4_shuf: a 16-byte permutation emitted twice (32 bytes). The AVX2 4x4
; luma kernel uses its first 16 bytes to regroup packed output bytes per row.
const interp4_shuf,         times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15

; interp4_horiz_shuf1: two 16-byte rows of sliding 4-byte windows, starting
; at offsets 0..3 (first row) and 8..11 (second row).
ALIGN 32
interp4_horiz_shuf1:        db 0, 1,  2,  3, 1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                            db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
72b9787e
JB
185
186SECTION .text
187
b53f7c52 188cextern pb_128
72b9787e
JB
189cextern pw_1
190cextern pw_512
191cextern pw_2000
192
;-----------------------------------------------------------------------------
; FILTER_H4_w2_2 tmpB, tmpA, round
; Filters two rows of two pixels each: the row at srcq and the row at
; srcq + srcstrideq; writes 2 bytes to dstq and 2 bytes to dstq + dststrideq.
;   %1, %2 - scratch vector registers (both overwritten)
;   %3     - multiplier for pmulhrsw (callers in this file pass a copy of pw_512)
; Expects the caller to have defined Tm0 (shuffle mask) and coef2 (taps).
; Clobbers r4.
;-----------------------------------------------------------------------------
%macro FILTER_H4_w2_2 3
    movh        %2, [srcq - 1]                  ; first row, starting one byte left
    pshufb      %2, %2, Tm0
    movh        %1, [srcq + srcstrideq - 1]     ; second row
    pshufb      %1, %1, Tm0
    punpcklqdq  %2, %1                          ; low qword: first row, high: second
    pmaddubsw   %2, coef2
    phaddw      %2, %2                          ; add tap-pair products per pixel
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movd        r4, %2
    mov         [dstq], r4w                     ; two pixels of the first row
    shr         r4, 16
    mov         [dstq + dststrideq], r4w        ; two pixels of the second row
%endmacro
208
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 2-wide, 4-high block, fully unrolled as two
; passes of FILTER_H4_w2_2 (two rows per pass).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

%rep 2
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET
240
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 2-wide, 8-high block, fully unrolled as four
; passes of FILTER_H4_w2_2 (two rows per pass).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

%rep 4
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET
272
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 2-wide, 16-high block. Runs FILTER_H4_w2_2
; in a counted loop, two rows per iteration.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

    mov         r5d, 16/2                       ; r5 = row pairs remaining

.loop:
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .loop

    RET
307
;-----------------------------------------------------------------------------
; FILTER_H4_w4_2 tmpB, tmpA, round
; Filters two rows of four pixels each: the row at srcq and the row at
; srcq + srcstrideq; writes 4 bytes to dstq and 4 bytes to dstq + dststrideq.
;   %1, %2 - scratch vector registers (both overwritten)
;   %3     - multiplier for pmulhrsw (callers in this file pass a copy of pw_512)
; Expects the caller to have defined Tm0 (shuffle mask) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w4_2 3
    movh        %2, [srcq - 1]                  ; first row, starting one byte left
    pshufb      %2, %2, Tm0
    pmaddubsw   %2, coef2
    movh        %1, [srcq + srcstrideq - 1]     ; second row
    pshufb      %1, %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %2, %1                          ; low 4 words: first row, high 4: second
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movd        [dstq], %2
    palignr     %2, %2, 4                       ; bring the second row's bytes down
    movd        [dstq + dststrideq], %2
%endmacro
322
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 4-wide, 2-high block: a single
; FILTER_H4_w4_2 pass covers both rows.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

    FILTER_H4_w4_2 t0, t1, t2

    RET
350
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 4-wide, 4-high block, fully unrolled as two
; passes of FILTER_H4_w4_2 (two rows per pass).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

%rep 2
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET
382
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 4-wide, 8-high block, fully unrolled as four
; passes of FILTER_H4_w4_2 (two rows per pass).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

%rep 4
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET
414
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 4-wide, 16-high block, fully unrolled as
; eight passes of FILTER_H4_w4_2 (two rows per pass).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

%rep 8
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET
446
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Horizontal 4-tap filter over a 4-wide, 32-high block. Runs FILTER_H4_w4_2
; in a counted loop, two rows per iteration.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm0         m3
%define coef2       m4

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        Tm0, [tab_Tm]
    mova        t2, [pw_512]

    mov         r5d, 32/2                       ; r5 = row pairs remaining

.loop:
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .loop

    RET
481
b53f7c52
JB
; Eight dword indices that interleave positions n and n+4 (0,4,1,5,2,6,3,7).
ALIGN 32
const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
484
72b9787e
JB
485
;-----------------------------------------------------------------------------
; FILTER_H4_w6 tmp, acc, round
; Filters one row and writes 6 output bytes at dstq (4 via movd, 2 via pextrw).
;   %1, %2 - scratch vector registers (both overwritten)
;   %3     - multiplier for pmulhrsw
; Expects caller-defined Tm0, Tm1 (shuffle masks) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w6 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movd        [dstq], %2                      ; bytes 0..3
    pextrw      [dstq + 4], %2, 2               ; bytes 4..5
%endmacro
498
;-----------------------------------------------------------------------------
; FILTER_H4_w8 tmp, acc, round
; Filters one row and writes 8 output bytes at dstq.
;   %1, %2 - scratch vector registers (both overwritten)
;   %3     - multiplier for pmulhrsw
; Expects caller-defined Tm0, Tm1 (shuffle masks) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w8 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movh        [dstq], %2
%endmacro
510
;-----------------------------------------------------------------------------
; FILTER_H4_w12 tmp, acc, round
; Filters one row and writes 12 output bytes at dstq (8 via movh, 4 via pextrd).
;   %1, %2 - scratch vector registers (both overwritten)
;   %3     - multiplier for pmulhrsw
; Expects caller-defined Tm0, Tm1 (shuffle masks) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w12 3
    ; pixels 0..7
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    ; pixels 8..11 (only the Tm0 windows are needed)
    movu        %1, [srcq - 1 + 8]
    pshufb      %1, %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %1, %1
    pmulhrsw    %1, %3
    packuswb    %2, %1
    movh        [dstq], %2                      ; bytes 0..7
    pextrd      [dstq + 8], %2, 2               ; bytes 8..11
%endmacro
528
;-----------------------------------------------------------------------------
; FILTER_H4_w16 tmp, accLo, round, accHi
; Filters one row and writes 16 output bytes at dstq.
;   %1, %2, %4 - scratch vector registers (all overwritten)
;   %3         - multiplier for pmulhrsw
; Expects caller-defined Tm0, Tm1 (shuffle masks) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w16 4
    ; pixels 0..7 -> %2
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    ; pixels 8..15 -> %4
    movu        %1, [srcq - 1 + 8]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq], %2
%endmacro
547
;-----------------------------------------------------------------------------
; FILTER_H4_w24 tmp, accLo, round, accHi
; Filters one row and writes 24 output bytes at dstq (16 + 8).
;   %1, %2, %4 - scratch vector registers (all overwritten)
;   %3         - multiplier for pmulhrsw
; Expects caller-defined Tm0, Tm1 (shuffle masks) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w24 4
    ; pixels 0..7 -> %2
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    ; pixels 8..15 -> %4
    movu        %1, [srcq - 1 + 8]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq], %2
    ; pixels 16..23
    movu        %1, [srcq - 1 + 16]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movh        [dstq + 16], %2
%endmacro
575
;-----------------------------------------------------------------------------
; FILTER_H4_w32 tmp, accLo, round, accHi
; Filters one row and writes 32 output bytes at dstq (two 16-byte stores).
;   %1, %2, %4 - scratch vector registers (all overwritten)
;   %3         - multiplier for pmulhrsw
; Expects caller-defined Tm0, Tm1 (shuffle masks) and coef2 (taps).
;-----------------------------------------------------------------------------
%macro FILTER_H4_w32 4
    ; pixels 0..7 -> %2
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    ; pixels 8..15 -> %4
    movu        %1, [srcq - 1 + 8]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq], %2
    ; pixels 16..23 -> %2
    movu        %1, [srcq - 1 + 16]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    ; pixels 24..31 -> %4
    movu        %1, [srcq - 1 + 24]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq + 16], %2
%endmacro
610
;-----------------------------------------------------------------------------
; FILTER_H4_w16o tmp, accLo, round, accHi, offset
; Same computation as FILTER_H4_w16, but source and destination are both
; displaced by the byte offset %5, so wider rows can be built from 16-pixel
; pieces.
;   %1, %2, %4 - scratch vector registers (all overwritten)
;   %3         - multiplier for pmulhrsw
;   %5         - column offset in bytes
;-----------------------------------------------------------------------------
%macro FILTER_H4_w16o 5
    ; pixels %5+0 .. %5+7 -> %2
    movu        %1, [srcq + %5 - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    ; pixels %5+8 .. %5+15 -> %4
    movu        %1, [srcq + %5 - 1 + 8]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq + %5], %2
%endmacro
629
; FILTER_H4_w48 tmp, accLo, round, accHi
; One 48-pixel row, built from three 16-pixel pieces at offsets 0, 16 and 32.
%macro FILTER_H4_w48 4
    FILTER_H4_w16o  %1, %2, %3, %4, 0
    FILTER_H4_w16o  %1, %2, %3, %4, 16
    FILTER_H4_w16o  %1, %2, %3, %4, 32
%endmacro
635
; FILTER_H4_w64 tmp, accLo, round, accHi
; One 64-pixel row, built from four 16-pixel pieces at offsets 0, 16, 32, 48.
%macro FILTER_H4_w64 4
    FILTER_H4_w16o  %1, %2, %3, %4, 0
    FILTER_H4_w16o  %1, %2, %3, %4, 16
    FILTER_H4_w16o  %1, %2, %3, %4, 32
    FILTER_H4_w16o  %1, %2, %3, %4, 48
%endmacro
642
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; IPFILTER_CHROMA width, height
; Emits a horizontal 4-tap kernel that calls the three-operand row macro
; FILTER_H4_w<width> once per row, for <height> rows.
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define Tm1         m3
%define Tm0         m4
%define coef2       m5

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    mov         r5d, %2                         ; r5 = rows remaining

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

.loop:
    FILTER_H4_w%1 t0, t1, t2
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r5d
    jnz         .loop

    RET
%endmacro


IPFILTER_CHROMA  6,  8
IPFILTER_CHROMA  8,  2
IPFILTER_CHROMA  8,  4
IPFILTER_CHROMA  8,  6
IPFILTER_CHROMA  8,  8
IPFILTER_CHROMA  8, 16
IPFILTER_CHROMA  8, 32
IPFILTER_CHROMA 12, 16

IPFILTER_CHROMA  6, 16
IPFILTER_CHROMA  8, 12
IPFILTER_CHROMA  8, 64
IPFILTER_CHROMA 12, 32
697
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; IPFILTER_CHROMA_W width, height
; Like IPFILTER_CHROMA, but for the row macros that take a fourth scratch
; register (FILTER_H4_w16 and wider); one row per loop iteration.
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA_W 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2
%define t3          m3
%define Tm1         m4
%define Tm0         m5
%define coef2       m6

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    mov         r5d, %2                         ; r5 = rows remaining

    pshufd      coef2, coef2, 0                 ; replicate the 4 taps to all dwords
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

.loop:
    FILTER_H4_w%1 t0, t1, t2, t3
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r5d
    jnz         .loop

    RET
%endmacro

IPFILTER_CHROMA_W 16,  4
IPFILTER_CHROMA_W 16,  8
IPFILTER_CHROMA_W 16, 12
IPFILTER_CHROMA_W 16, 16
IPFILTER_CHROMA_W 16, 32
IPFILTER_CHROMA_W 32,  8
IPFILTER_CHROMA_W 32, 16
IPFILTER_CHROMA_W 32, 24
IPFILTER_CHROMA_W 24, 32
IPFILTER_CHROMA_W 32, 32

IPFILTER_CHROMA_W 16, 24
IPFILTER_CHROMA_W 16, 64
IPFILTER_CHROMA_W 32, 48
IPFILTER_CHROMA_W 24, 64
IPFILTER_CHROMA_W 32, 64

IPFILTER_CHROMA_W 64, 64
IPFILTER_CHROMA_W 64, 32
IPFILTER_CHROMA_W 64, 48
IPFILTER_CHROMA_W 48, 64
IPFILTER_CHROMA_W 64, 16
761
762
;-----------------------------------------------------------------------------
; FILTER_H8_W8 t0, t1, t2, t3, coef, c512, src [, dst]
; 8-tap horizontal filter for eight pixels; the word sums end up in %2.
;   %1..%4 - scratch vector registers (all overwritten)
;   %5     - filter taps
;   %6     - multiplier for pmulhrsw (only read when %8 is given)
;   %7     - source memory operand
;   %8     - optional destination: when present the sums are scaled by
;            pmulhrsw, packed to unsigned bytes and 8 bytes are stored
;-----------------------------------------------------------------------------
%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
    movu        %1, %7
    pshufb      %2, %1, [tab_Lm +  0]
    pmaddubsw   %2, %5
    pshufb      %3, %1, [tab_Lm + 16]
    pmaddubsw   %3, %5
    phaddw      %2, %3
    pshufb      %4, %1, [tab_Lm + 32]
    pmaddubsw   %4, %5
    pshufb      %1, %1, [tab_Lm + 48]
    pmaddubsw   %1, %5
    phaddw      %4, %1
    phaddw      %2, %4
  %if %0 == 8
    pmulhrsw    %2, %6
    packuswb    %2, %2
    movh        %8, %2
  %endif
%endmacro
782
;-----------------------------------------------------------------------------
; FILTER_H8_W4 tmp, acc
; 8-tap horizontal filter for four pixels read from [r0 - 3 + r5]; the word
; sums end up in %2 (duplicated in both halves by the final phaddw).
; Uses m3 as the taps and overwrites m7 as scratch.
;-----------------------------------------------------------------------------
%macro FILTER_H8_W4 2
    movu        %1, [r0 - 3 + r5]
    pshufb      %2, %1, [tab_Lm]
    pmaddubsw   %2, m3
    pshufb      m7, %1, [tab_Lm + 16]
    pmaddubsw   m7, m3
    phaddw      %2, m7
    phaddw      %2, %2
%endmacro
792
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; IPFILTER_LUMA width, height, pp|ps
;   pp: sums are scaled with pmulhrsw by pw_512 and stored as bytes.
;   ps: pw_2000 is subtracted from the sums, which are stored as words (the
;       destination stride is doubled). When isRowExt is non-zero, filtering
;       starts three rows above src and covers seven extra rows.
; Register use: r0 src, r1 srcStride, r2 dst, r3 dstStride, r4 row counter,
; r5 column offset within the row, m3 taps, m2 constant.
;----------------------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8

    mov         r4d, r4m                        ; r4 = coeffIdx

%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        m3, [r6 + r4 * 8]
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  m3, m3                          ; 8 taps in both halves

%ifidn %3, pp
    mova        m2, [pw_512]
%else
    mova        m2, [pw_2000]
%endif

    mov         r4d, %2                         ; r4 = rows to produce
%ifidn %3, ps
    add         r3, r3                          ; 16-bit output: stride in bytes
    cmp         r5m, byte 0                     ; isRowExt?
    je          .loopH
    lea         r6, [r1 + 2 * r1]
    sub         r0, r6                          ; back up three source rows
    add         r4d, 7                          ; and produce seven more rows
%endif

.loopH:
    xor         r5, r5                          ; column offset = 0
%rep %1 / 8
  %ifidn %3, pp
    FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
  %else
    FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
    psubw       m1, m2
    movu        [r2 + 2 * r5], m1
  %endif
    add         r5, 8
%endrep

%rep (%1 % 8) / 4
    FILTER_H8_W4 m0, m1
  %ifidn %3, pp
    pmulhrsw    m1, m2
    packuswb    m1, m1
    movd        [r2 + r5], m1
  %else
    psubw       m1, m2
    movh        [r2 + 2 * r5], m1
  %endif
%endrep

    add         r0, r1
    add         r2, r3

    dec         r4d
    jnz         .loopH
    RET
%endmacro
858
859
;-----------------------------------------------------------------------------
; interp_8tap_horiz_pp_4x4 (AVX2)
; 8-tap horizontal filter over a 4x4 block: rows are processed in pairs, then
; all 16 results are packed to bytes and scattered to the four output rows.
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_4x4, 4,6,6
    mov             r4d, r4m

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova            m1, [tab_Lm]
    vpbroadcastd    m2, [pw_1]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    sub             r0, 3                       ; first tap sits 3 pixels to the left

    ; Row 0-1
    vbroadcasti128  m3, [r0]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m3, m1
    pmaddubsw       m3, m0
    pmaddwd         m3, m2
    vbroadcasti128  m4, [r0 + r1]               ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddwd         m4, m2
    phaddd          m3, m4                      ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]

    ; Row 2-3
    lea             r0, [r0 + r1 * 2]
    vbroadcasti128  m4, [r0]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddwd         m4, m2
    vbroadcasti128  m5, [r0 + r1]               ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m1
    pmaddubsw       m5, m0
    pmaddwd         m5, m2
    phaddd          m4, m5                      ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]

    packssdw        m3, m4                      ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pmulhrsw        m3, [pw_512]
    vextracti128    xm4, m3, 1
    packuswb        xm3, xm4                    ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pshufb          xm3, [interp4_shuf]         ; [row3 row1 row2 row0]

    lea             r0, [r3 * 3]                ; r0 is free now: 3 * dstStride
    movd            [r2], xm3
    pextrd          [r2+r3], xm3, 2
    pextrd          [r2+r3*2], xm3, 1
    pextrd          [r2+r0], xm3, 3
    RET
915
b53f7c52
JB
;-----------------------------------------------------------------------------
; interp_8tap_horiz_pp_8x4 (AVX2)
; 8-tap horizontal filter over an 8x4 block: each row is broadcast to both
; lanes, rows are combined in pairs, and the packed bytes are re-interleaved
; before the four 8-byte stores.
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
    mov             r4d, r4m

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova            m1, [tab_Lm]
    mova            m2, [tab_Lm + 32]

    ; register map
    ; m0 - interpolate coeff
    ; m1, m2 - shuffle order table

    sub             r0, 3                       ; first tap sits 3 pixels to the left
    lea             r5, [r1 * 3]                ; r5 = 3 * srcStride
    lea             r4, [r3 * 3]                ; r4 = 3 * dstStride

    ; Row 0
    vbroadcasti128  m3, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m3, m2
    pshufb          m3, m1
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    phaddw          m3, m4
    ; Row 1
    vbroadcasti128  m4, [r0 + r1]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5

    phaddw          m3, m4                      ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
    pmulhrsw        m3, [pw_512]

    ; Row 2
    vbroadcasti128  m4, [r0 + r1 * 2]           ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5
    ; Row 3
    vbroadcasti128  m5, [r0 + r5]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m6, m5, m2
    pshufb          m5, m1
    pmaddubsw       m5, m0
    pmaddubsw       m6, m0
    phaddw          m5, m6

    phaddw          m4, m5                      ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
    pmulhrsw        m4, [pw_512]

    packuswb        m3, m4
    vextracti128    xm4, m3, 1
    punpcklwd       xm5, xm3, xm4

    movq            [r2], xm5
    movhps          [r2 + r3], xm5

    punpckhwd       xm5, xm3, xm4
    movq            [r2 + r3 * 2], xm5
    movhps          [r2 + r4], xm5
    RET
985
;-----------------------------------------------------------------------------
; IPFILTER_LUMA_AVX2_8xN width, height
; Emits interp_8tap_horiz_pp_<width>x<height> (AVX2). Each loop iteration
; filters four rows of eight pixels; the loop runs height / 4 times.
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
;-----------------------------------------------------------------------------
%macro IPFILTER_LUMA_AVX2_8xN 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
    mov             r4d, r4m

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova            m1, [tab_Lm]
    mova            m2, [tab_Lm + 32]

    ; register map
    ; m0 - interpolate coeff
    ; m1, m2 - shuffle order table

    sub             r0, 3                       ; first tap sits 3 pixels to the left
    lea             r5, [r1 * 3]                ; r5 = 3 * srcStride
    lea             r6, [r3 * 3]                ; r6 = 3 * dstStride
    mov             r4d, %2 / 4                 ; r4 = groups of four rows remaining
.loop:
    ; Row 0
    vbroadcasti128  m3, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m3, m2
    pshufb          m3, m1
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    phaddw          m3, m4
    ; Row 1
    vbroadcasti128  m4, [r0 + r1]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5

    phaddw          m3, m4                      ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
    pmulhrsw        m3, [pw_512]

    ; Row 2
    vbroadcasti128  m4, [r0 + r1 * 2]           ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5
    ; Row 3
    vbroadcasti128  m5, [r0 + r5]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m6, m5, m2
    pshufb          m5, m1
    pmaddubsw       m5, m0
    pmaddubsw       m6, m0
    phaddw          m5, m6

    phaddw          m4, m5                      ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
    pmulhrsw        m4, [pw_512]

    packuswb        m3, m4
    vextracti128    xm4, m3, 1
    punpcklwd       xm5, xm3, xm4

    movq            [r2], xm5
    movhps          [r2 + r3], xm5

    punpckhwd       xm5, xm3, xm4
    movq            [r2 + r3 * 2], xm5
    movhps          [r2 + r6], xm5

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]
    dec             r4d
    jnz             .loop
    RET
%endmacro

IPFILTER_LUMA_AVX2_8xN 8, 8
IPFILTER_LUMA_AVX2_8xN 8, 16
IPFILTER_LUMA_AVX2_8xN 8, 32
1067
;-----------------------------------------------------------------------------
; IPFILTER_LUMA_AVX2 width, height
; Emits interp_8tap_horiz_pp_<width>x<height> (AVX2). Each loop iteration
; filters two rows and stores 16 bytes per row; the loop runs height / 2 times.
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
;-----------------------------------------------------------------------------
%macro IPFILTER_LUMA_AVX2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
    sub             r0, 3                       ; first tap sits 3 pixels to the left
    mov             r4d, r4m
%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 8]
    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
%else
    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd    m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu            m3, [tab_Tm + 16]
    vpbroadcastd    m7, [pw_1]

    ; register map
    ; m0, m1     - interpolate coeff (taps 0-3 and taps 4-7)
    ; m3         - shuffle order table (tab_Tm + 16); tab_Tm itself is read from memory
    ; m2, m4-m6  - temporaries
    ; m7         - pw_1

    mov             r4d, %2/2                   ; r4 = row pairs remaining
.loop:
    ; Row 0
    vbroadcasti128  m4, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m3
    pshufb          m4, [tab_Tm]
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    paddw           m4, m5
    pmaddwd         m4, m7
    vbroadcasti128  m5, [r0 + 8]                ; second 8 elements in Row0
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m4, m5                      ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw        m4, [pw_512]
    ; Row 1
    vbroadcasti128  m2, [r0 + r1]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m2, m3
    pshufb          m2, [tab_Tm]
    pmaddubsw       m2, m0
    pmaddubsw       m5, m1
    paddw           m2, m5
    pmaddwd         m2, m7
    vbroadcasti128  m5, [r0 + r1 + 8]           ; second 8 elements in Row1
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m2, m5                      ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw        m2, [pw_512]
    packuswb        m4, m2
    vpermq          m4, m4, 11011000b
    vextracti128    xm5, m4, 1
    pshufd          xm4, xm4, 11011000b
    pshufd          xm5, xm5, 11011000b
    movu            [r2], xm4
    movu            [r2+r3], xm5
    lea             r0, [r0 + r1 * 2]
    lea             r2, [r2 + r3 * 2]
    dec             r4d
    jnz             .loop
    RET
%endmacro
1137
;-----------------------------------------------------------------------------
; IPFILTER_LUMA_32x_avx2 width, height
; Emits interp_8tap_horiz_pp_<width>x<height> (AVX2). Each loop iteration
; filters one row and stores 32 bytes (two 16-byte halves); runs height times.
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
;-----------------------------------------------------------------------------
%macro IPFILTER_LUMA_32x_avx2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
    sub             r0, 3                       ; first tap sits 3 pixels to the left
    mov             r4d, r4m
%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 8]
    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
%else
    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd    m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu            m3, [tab_Tm + 16]
    vpbroadcastd    m7, [pw_1]

    ; register map
    ; m0, m1     - interpolate coeff (taps 0-3 and taps 4-7)
    ; m3         - shuffle order table (tab_Tm + 16); tab_Tm itself is read from memory
    ; m2, m4-m6  - temporaries
    ; m7         - pw_1

    mov             r4d, %2                     ; r4 = rows remaining
.loop:
    ; pixels 0..15
    vbroadcasti128  m4, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m3
    pshufb          m4, [tab_Tm]
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    paddw           m4, m5
    pmaddwd         m4, m7
    vbroadcasti128  m5, [r0 + 8]
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m4, m5                      ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw        m4, [pw_512]
    ; pixels 16..31
    vbroadcasti128  m2, [r0 + 16]
    pshufb          m5, m2, m3
    pshufb          m2, [tab_Tm]
    pmaddubsw       m2, m0
    pmaddubsw       m5, m1
    paddw           m2, m5
    pmaddwd         m2, m7
    vbroadcasti128  m5, [r0 + 24]
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m2, m5
    pmulhrsw        m2, [pw_512]
    packuswb        m4, m2
    vpermq          m4, m4, 11011000b
    vextracti128    xm5, m4, 1
    pshufd          xm4, xm4, 11011000b
    pshufd          xm5, xm5, 11011000b
    movu            [r2], xm4
    movu            [r2 + 16], xm5
    add             r0, r1                      ; next source row
    add             r2, r3                      ; next destination row
    dec             r4d
    jnz             .loop
    RET
%endmacro
1207
;-----------------------------------------------------------------------------
; IPFILTER_LUMA_64x_avx2 width, height
; Emits interp_8tap_horiz_pp_<width>x<height> (AVX2). Each loop iteration
; filters one row and stores 64 bytes (four 16-byte pieces); runs height times.
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
;-----------------------------------------------------------------------------
%macro IPFILTER_LUMA_64x_avx2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
    sub             r0, 3                       ; first tap sits 3 pixels to the left
    mov             r4d, r4m
%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 8]
    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
%else
    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd    m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu            m3, [tab_Tm + 16]
    vpbroadcastd    m7, [pw_1]

    ; register map
    ; m0, m1     - interpolate coeff (taps 0-3 and taps 4-7)
    ; m3         - shuffle order table (tab_Tm + 16); tab_Tm itself is read from memory
    ; m2, m4-m6  - temporaries
    ; m7         - pw_1

    mov             r4d, %2                     ; r4 = rows remaining
.loop:
    ; pixels 0..15
    vbroadcasti128  m4, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m3
    pshufb          m4, [tab_Tm]
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    paddw           m4, m5
    pmaddwd         m4, m7
    vbroadcasti128  m5, [r0 + 8]
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m4, m5                      ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw        m4, [pw_512]
    ; pixels 16..31
    vbroadcasti128  m2, [r0 + 16]
    pshufb          m5, m2, m3
    pshufb          m2, [tab_Tm]
    pmaddubsw       m2, m0
    pmaddubsw       m5, m1
    paddw           m2, m5
    pmaddwd         m2, m7
    vbroadcasti128  m5, [r0 + 24]
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m2, m5
    pmulhrsw        m2, [pw_512]
    packuswb        m4, m2
    vpermq          m4, m4, 11011000b
    vextracti128    xm5, m4, 1
    pshufd          xm4, xm4, 11011000b
    pshufd          xm5, xm5, 11011000b
    movu            [r2], xm4
    movu            [r2 + 16], xm5

    ; pixels 32..47
    vbroadcasti128  m4, [r0 + 32]
    pshufb          m5, m4, m3
    pshufb          m4, [tab_Tm]
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    paddw           m4, m5
    pmaddwd         m4, m7
    vbroadcasti128  m5, [r0 + 40]
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m4, m5
    pmulhrsw        m4, [pw_512]
    ; pixels 48..63
    vbroadcasti128  m2, [r0 + 48]
    pshufb          m5, m2, m3
    pshufb          m2, [tab_Tm]
    pmaddubsw       m2, m0
    pmaddubsw       m5, m1
    paddw           m2, m5
    pmaddwd         m2, m7
    vbroadcasti128  m5, [r0 + 56]
    pshufb          m6, m5, m3
    pshufb          m5, [tab_Tm]
    pmaddubsw       m5, m0
    pmaddubsw       m6, m1
    paddw           m5, m6
    pmaddwd         m5, m7
    packssdw        m2, m5
    pmulhrsw        m2, [pw_512]
    packuswb        m4, m2
    vpermq          m4, m4, 11011000b
    vextracti128    xm5, m4, 1
    pshufd          xm4, xm4, 11011000b
    pshufd          xm5, xm5, 11011000b
    movu            [r2 +32], xm4
    movu            [r2 + 48], xm5

    add             r0, r1                      ; next source row
    add             r2, r3                      ; next destination row
    dec             r4d
    jnz             .loop
    RET
%endmacro
1318
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_48x64(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; AVX2 version. Every .loop iteration reads one source row and stores 48 result bytes (16 + 16 + 16),
; then advances r0 by r1 and r2 by r3; the loop runs 64 times.
;--------------------------------------------------------------------------------------------------------------
1319INIT_YMM avx2
1320cglobal interp_8tap_horiz_pp_48x64, 4,6,8
1321 sub r0, 3 ; reads start 3 bytes before src
1322 mov r4d, r4m ; r4 = coeffIdx (5th argument)
1323%ifdef PIC
1324 lea r5, [tab_LumaCoeff]
1325 vpbroadcastd m0, [r5 + r4 * 8]
1326 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1327%else
1328 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1329 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1330%endif
1331 movu m3, [tab_Tm + 16]
1332 vpbroadcastd m7, [pw_1]
1333
1334 ; register map
1335 ; m0 , m1 - interpolate coeff (bytes 0-3 and bytes 4-7 of the selected 8-byte tab_LumaCoeff entry)
1336 ; m3 - shuffle order table (tab_Tm + 16); the second shuffle is applied straight from [tab_Tm]
1337 ; m7 - pw_1
; m2 , m4 , m5 , m6 - scratch
1338
1339 mov r4d, 64 ; r4 = remaining rows
1340.loop:
1341 ; Row 0
; result bytes 0-31: four 16-byte loads at r0 + 0 / 8 / 16 / 24
1342 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1343 pshufb m5, m4, m3
1344 pshufb m4, [tab_Tm]
1345 pmaddubsw m4, m0
1346 pmaddubsw m5, m1
1347 paddw m4, m5
1348 pmaddwd m4, m7 ; pw_1 multiply-add sums each pair of adjacent words into a dword
1349 vbroadcasti128 m5, [r0 + 8]
1350 pshufb m6, m5, m3
1351 pshufb m5, [tab_Tm]
1352 pmaddubsw m5, m0
1353 pmaddubsw m6, m1
1354 paddw m5, m6
1355 pmaddwd m5, m7
1356 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1357 pmulhrsw m4, [pw_512]
1358
1359 vbroadcasti128 m2, [r0 + 16]
1360 pshufb m5, m2, m3
1361 pshufb m2, [tab_Tm]
1362 pmaddubsw m2, m0
1363 pmaddubsw m5, m1
1364 paddw m2, m5
1365 pmaddwd m2, m7
1366 vbroadcasti128 m5, [r0 + 24]
1367 pshufb m6, m5, m3
1368 pshufb m5, [tab_Tm]
1369 pmaddubsw m5, m0
1370 pmaddubsw m6, m1
1371 paddw m5, m6
1372 pmaddwd m5, m7
1373 packssdw m2, m5
1374 pmulhrsw m2, [pw_512]
1375 packuswb m4, m2
; the in-lane packs leave the bytes interleaved; vpermq + pshufd restore linear order
1376 vpermq m4, m4, 11011000b
1377 vextracti128 xm5, m4, 1
1378 pshufd xm4, xm4, 11011000b
1379 pshufd xm5, xm5, 11011000b
1380 movu [r2], xm4
1381 movu [r2 + 16], xm5
1382
; result bytes 32-47: only two loads (r0 + 32 / 40), so a single 16-byte store is enough
1383 vbroadcasti128 m4, [r0 + 32]
1384 pshufb m5, m4, m3
1385 pshufb m4, [tab_Tm]
1386 pmaddubsw m4, m0
1387 pmaddubsw m5, m1
1388 paddw m4, m5
1389 pmaddwd m4, m7
1390 vbroadcasti128 m5, [r0 + 40]
1391 pshufb m6, m5, m3
1392 pshufb m5, [tab_Tm]
1393 pmaddubsw m5, m0
1394 pmaddubsw m6, m1
1395 paddw m5, m6
1396 pmaddwd m5, m7
1397 packssdw m4, m5
1398 pmulhrsw m4, [pw_512]
1399 packuswb m4, m4
1400 vpermq m4, m4, 11011000b
1401 pshufd xm4, xm4, 11011000b
1402 movu [r2 + 32], xm4
1403
1404 lea r0, [r0 + r1] ; next source row
1405 lea r2, [r2 + r3] ; next destination row
1406 dec r4d
1407 jnz .loop
1408 RET
1409
;-----------------------------------------------------------------------------
; interp_4tap_horiz_pp_4x4 (AVX2)
; In:  r0 = source pointer, r1 = source stride,
;      r2 = destination pointer, r3 = destination stride,
;      5th argument = index into tab_ChromaCoeff (4 bytes per entry)
; Out: four rows of 4 bytes written at r2, r2 + r3, r2 + 2*r3, r2 + 3*r3
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_4x4, 4,6,6
    mov             r4d, r4m                        ; r4 = coefficient index
    sub             r0, 1                           ; reads start one byte before the source pointer

%ifdef PIC
    lea             r5, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 4]               ; m0 = 4 coefficient bytes in every dword
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 4]  ; m0 = 4 coefficient bytes in every dword
%endif
    vbroadcasti128  m1, [tab_Tm]                    ; m1 = shuffle order table
    vpbroadcastd    m2, [pw_1]                      ; m2 = constant word 1

    ; the coefficient index and table pointer are dead from here on
    lea             r4, [r0 + r1 * 2]               ; r4 -> row 2
    lea             r5, [r3 + r3 * 2]               ; r5 = 3 * destination stride

    ; low lane | high lane:  m3 = row 0 | row 1,  m4 = row 2 | row 3
    vbroadcasti128  m3, [r0]
    vbroadcasti128  m4, [r4]
    vinserti128     m3, m3, [r0 + r1], 1
    vinserti128     m4, m4, [r4 + r1], 1

    pshufb          m3, m1
    pshufb          m4, m1
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    pmaddwd         m3, m2                          ; sum adjacent word pairs -> one dword per pixel
    pmaddwd         m4, m2

    packssdw        m3, m4
    pmulhrsw        m3, [pw_512]
    vextracti128    xm4, m3, 1
    packuswb        xm3, xm4                        ; dwords of xm3 = row 0, row 2, row 1, row 3

    movd            [r2], xm3
    pextrd          [r2 + r3], xm3, 2
    pextrd          [r2 + r3 * 2], xm3, 1
    pextrd          [r2 + r5], xm3, 3
    RET
1457
;-----------------------------------------------------------------------------
; interp_4tap_horiz_pp_32x32 (AVX2)
; In:  r0 = source pointer, r1 = source stride,
;      r2 = destination pointer, r3 = destination stride,
;      5th argument = index into tab_ChromaCoeff (4 bytes per entry)
; One 32-byte destination row is produced per loop iteration, 32 iterations.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_32x32, 4,6,7
    mov             r4d, r4m
    sub             r0, 1                           ; reads start one byte before the source pointer

%ifdef PIC
    lea             r5, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 4]
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 4]
%endif
    mova            m1, [interp4_horiz_shuf1]
    vpbroadcastd    m2, [pw_1]
    mova            m6, [pw_512]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1
    ; m6 - pw_512 (rounding multiplier for pmulhrsw)
    ; m3 , m4 , m5 - scratch

    mov             r4d, 32                         ; r4 = remaining rows
.loop:
    ; first half of the row: loads at r0 and r0 + 4
    vbroadcasti128  m3, [r0]
    vbroadcasti128  m4, [r0 + 4]
    pshufb          m3, m1
    pshufb          m4, m1
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    pmulhrsw        m3, m6

    ; second half of the row: loads at r0 + 16 and r0 + 20
    vbroadcasti128  m4, [r0 + 16]
    vbroadcasti128  m5, [r0 + 20]
    pshufb          m4, m1
    pshufb          m5, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    pmaddwd         m4, m2
    pmaddwd         m5, m2
    packssdw        m4, m5
    pmulhrsw        m4, m6

    packuswb        m3, m4
    vpermq          m3, m3, 11011000b               ; undo the per-lane interleave of the packs
    movu            [r2], m3

    add             r0, r1
    add             r2, r3
    dec             r4d
    jnz             .loop
    RET
1513
1514
;-----------------------------------------------------------------------------
; interp_4tap_horiz_pp_16x16 (AVX2)
; In:  r0 = source pointer, r1 = source stride,
;      r2 = destination pointer, r3 = destination stride,
;      5th argument = index into tab_ChromaCoeff (4 bytes per entry)
; Two 16-byte destination rows are produced per loop iteration, 8 iterations.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
    mov             r4d, r4m
    sub             r0, 1                           ; reads start one byte before the source pointer

%ifdef PIC
    lea             r5, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 4]
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 4]
%endif
    mova            m1, [interp4_horiz_shuf1]
    vpbroadcastd    m2, [pw_1]
    mova            m6, [pw_512]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1
    ; m6 - pw_512 (rounding multiplier for pmulhrsw)
    ; m3 , m4 , m5 - scratch

    mov             r4d, 8                          ; r4 = remaining row pairs
.loop:
    ; first row of the pair -> m3
    vbroadcasti128  m3, [r0]
    vbroadcasti128  m4, [r0 + 4]
    pshufb          m3, m1
    pshufb          m4, m1
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    pmulhrsw        m3, m6

    ; second row of the pair -> m4
    vbroadcasti128  m4, [r0 + r1]
    vbroadcasti128  m5, [r0 + r1 + 4]
    pshufb          m4, m1
    pshufb          m5, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    pmaddwd         m4, m2
    pmaddwd         m5, m2
    packssdw        m4, m5
    pmulhrsw        m4, m6

    packuswb        m3, m4
    vpermq          m3, m3, 11011000b               ; low 128 bits = first row, high 128 bits = second row

    movu            [r2], xm3
    vextracti128    [r2 + r3], m3, 1

    lea             r0, [r0 + r1 * 2]
    lea             r2, [r2 + r3 * 2]
    dec             r4d
    jnz             .loop
    RET
72b9787e
JB
1574;--------------------------------------------------------------------------------------------------------------
1575; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1576;--------------------------------------------------------------------------------------------------------------
1577 IPFILTER_LUMA 4, 4, pp
1578 IPFILTER_LUMA 4, 8, pp
1579 IPFILTER_LUMA 12, 16, pp
1580 IPFILTER_LUMA 4, 16, pp
1581
b53f7c52
JB
;-----------------------------------------------------------------------------
; interp_4tap_horiz_pp_8x8 (AVX2)
; In:  r0 = source pointer, r1 = source stride,
;      r2 = destination pointer, r3 = destination stride,
;      5th argument = index into tab_ChromaCoeff (4 bytes per entry)
; Four 8-byte destination rows are produced per loop iteration, 2 iterations.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_8x8, 4,6,6
    mov             r4d, r4m
    dec             r0                              ; reads start one byte before the source pointer

%ifdef PIC
    lea             r5, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 4]
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 4]
%endif
    movu            m1, [tab_Tm]                    ; 32 bytes: a different shuffle row in each lane
    vpbroadcastd    m2, [pw_1]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1
    ; m3 , m4 , m5 - scratch

    mov             r4d, 2                          ; r4 = remaining groups of four rows
.loop:
    ; rows 0 and 1 of the group -> m3
    vbroadcasti128  m3, [r0]
    vbroadcasti128  m4, [r0 + r1]
    lea             r0, [r0 + r1 * 2]
    pshufb          m3, m1
    pshufb          m4, m1
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    pmulhrsw        m3, [pw_512]

    ; rows 2 and 3 of the group -> m4
    vbroadcasti128  m4, [r0]
    vbroadcasti128  m5, [r0 + r1]
    lea             r0, [r0 + r1 * 2]
    pshufb          m4, m1
    pshufb          m5, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    pmaddwd         m4, m2
    pmaddwd         m5, m2
    packssdw        m4, m5
    pmulhrsw        m4, [pw_512]

    packuswb        m3, m4
    mova            m5, [interp_4tap_8x8_horiz_shuf]
    vpermd          m3, m5, m3                      ; reorder dwords so each qword is one output row
    vextracti128    xm4, m3, 1

    movq            [r2], xm3
    movhps          [r2 + r3], xm3
    lea             r2, [r2 + r3 * 2]
    movq            [r2], xm4
    movhps          [r2 + r3], xm4
    lea             r2, [r2 + r3 * 2]

    dec             r4d
    jnz             .loop
    RET
1648
1649 IPFILTER_LUMA_AVX2 16, 4
1650 IPFILTER_LUMA_AVX2 16, 8
1651 IPFILTER_LUMA_AVX2 16, 12
1652 IPFILTER_LUMA_AVX2 16, 16
1653 IPFILTER_LUMA_AVX2 16, 32
1654 IPFILTER_LUMA_AVX2 16, 64
1655
1656 IPFILTER_LUMA_32x_avx2 32 , 8
1657 IPFILTER_LUMA_32x_avx2 32 , 16
1658 IPFILTER_LUMA_32x_avx2 32 , 24
1659 IPFILTER_LUMA_32x_avx2 32 , 32
1660 IPFILTER_LUMA_32x_avx2 32 , 64
1661
1662 IPFILTER_LUMA_64x_avx2 64 , 64
1663 IPFILTER_LUMA_64x_avx2 64 , 48
1664 IPFILTER_LUMA_64x_avx2 64 , 32
1665 IPFILTER_LUMA_64x_avx2 64 , 16
1666
72b9787e
JB
1667;--------------------------------------------------------------------------------------------------------------
1668; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1669;--------------------------------------------------------------------------------------------------------------
; Emits the SSE4 function interp_8tap_horiz_pp_<width>x<height>.
; Macro arguments: width (processed in 8-pixel columns, so a multiple of 8) and height.
; In:  r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, 5th argument = coeffIdx
%macro IPFILTER_LUMA_PP_W8 2
INIT_XMM sse4
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
    mov         r4d, r4m
    sub         r0, 3                   ; reads start 3 bytes before src

%ifdef PIC
    lea         r5, [tab_LumaCoeff]
    movh        m3, [r5 + r4 * 8]
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    pshufd      m1, m3, 01010101b       ; m1 = coeff-H (second dword of the entry, repeated)
    pshufd      m0, m3, 00000000b       ; m0 = coeff-L (first dword of the entry, repeated)
    mova        m2, [pw_512]            ; m2 = 512
    lea         r5, [tab_Tm]            ; r5 = shuffle

    mov         r4d, %2                 ; r4 = remaining rows
.loopH:
%assign x 0
%rep %1 / 8
    movu        m3, [r0 + x]            ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb      m5, m3, [r5 + 1*16]     ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
    pshufb      m4, m3, [r5 + 0*16]     ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
    pshufb      m3, [r5 + 2*16]         ; m3 = [E D C B D C B A C B A 9 B A 9 8]
    pmaddubsw   m6, m5, m1              ; coeff-H part for the first four pixels
    pmaddubsw   m4, m0                  ; coeff-L part for the first four pixels
    pmaddubsw   m3, m1                  ; coeff-H part for the last four pixels
    pmaddubsw   m5, m0                  ; coeff-L part for the last four pixels
    paddw       m4, m6
    paddw       m5, m3
    phaddw      m4, m5                  ; one word per pixel, 8 pixels
    pmulhrsw    m4, m2
    packuswb    m4, m4
    movh        [r2 + x], m4
%assign x x+8
%endrep

    add         r0, r1
    add         r2, r3

    dec         r4d
    jnz         .loopH
    RET
%endmacro
1714
1715IPFILTER_LUMA_PP_W8 8, 4
1716IPFILTER_LUMA_PP_W8 8, 8
1717IPFILTER_LUMA_PP_W8 8, 16
1718IPFILTER_LUMA_PP_W8 8, 32
1719IPFILTER_LUMA_PP_W8 16, 4
1720IPFILTER_LUMA_PP_W8 16, 8
1721IPFILTER_LUMA_PP_W8 16, 12
1722IPFILTER_LUMA_PP_W8 16, 16
1723IPFILTER_LUMA_PP_W8 16, 32
1724IPFILTER_LUMA_PP_W8 16, 64
1725IPFILTER_LUMA_PP_W8 24, 32
1726IPFILTER_LUMA_PP_W8 32, 8
1727IPFILTER_LUMA_PP_W8 32, 16
1728IPFILTER_LUMA_PP_W8 32, 24
1729IPFILTER_LUMA_PP_W8 32, 32
1730IPFILTER_LUMA_PP_W8 32, 64
1731IPFILTER_LUMA_PP_W8 48, 64
1732IPFILTER_LUMA_PP_W8 64, 16
1733IPFILTER_LUMA_PP_W8 64, 32
1734IPFILTER_LUMA_PP_W8 64, 48
1735IPFILTER_LUMA_PP_W8 64, 64
1736
1737;----------------------------------------------------------------------------------------------------------------------------
1738; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1739;----------------------------------------------------------------------------------------------------------------------------
1740 IPFILTER_LUMA 4, 4, ps
1741 IPFILTER_LUMA 8, 8, ps
1742 IPFILTER_LUMA 8, 4, ps
1743 IPFILTER_LUMA 4, 8, ps
1744 IPFILTER_LUMA 16, 16, ps
1745 IPFILTER_LUMA 16, 8, ps
1746 IPFILTER_LUMA 8, 16, ps
1747 IPFILTER_LUMA 16, 12, ps
1748 IPFILTER_LUMA 12, 16, ps
1749 IPFILTER_LUMA 16, 4, ps
1750 IPFILTER_LUMA 4, 16, ps
1751 IPFILTER_LUMA 32, 32, ps
1752 IPFILTER_LUMA 32, 16, ps
1753 IPFILTER_LUMA 16, 32, ps
1754 IPFILTER_LUMA 32, 24, ps
1755 IPFILTER_LUMA 24, 32, ps
1756 IPFILTER_LUMA 32, 8, ps
1757 IPFILTER_LUMA 8, 32, ps
1758 IPFILTER_LUMA 64, 64, ps
1759 IPFILTER_LUMA 64, 32, ps
1760 IPFILTER_LUMA 32, 64, ps
1761 IPFILTER_LUMA 64, 48, ps
1762 IPFILTER_LUMA 48, 64, ps
1763 IPFILTER_LUMA 64, 16, ps
1764 IPFILTER_LUMA 16, 64, ps
1765
1766;-----------------------------------------------------------------------------
1767; Interpolate HV
1768;-----------------------------------------------------------------------------
; FILTER_HV8_START t0, t1, t2, t3, t4, off_src, off_coeff
; Loads three consecutive 16-byte rows of words starting at [r0 + off_src * 16]
; and multiplies the interleaved row pairs by the 16 bytes at [r5 + off_coeff * 16].
; On exit:
;   t3 / t4 = low / high dword products of rows (off_src + 0, off_src + 1)
;   t0      = high dword products of rows (off_src + 1, off_src + 2)
;   argument 4 = low dword products of rows (off_src + 1, off_src + 2)
;   t1      = row off_src + 2, unmodified
; (argument numbering: t0 = %1, t1 = %2, t2 = %3, argument 4 = %4, t4 = %5; here "t3" is %3)
%macro FILTER_HV8_START 7
    mova        %5, [r0 + (%6 + 0) * 16]
    mova        %1, [r0 + (%6 + 1) * 16]
    mova        %2, [r0 + (%6 + 2) * 16]

    ; build all four interleaves before the originals are overwritten
    punpcklwd   %3, %5, %1
    punpcklwd   %4, %1, %2
    punpckhwd   %5, %1
    punpckhwd   %1, %2

    pmaddwd     %3, [r5 + (%7) * 16]        ; R3 = L[0+1] -- Row 0
    pmaddwd     %4, [r5 + (%7) * 16]        ; R4 = L[1+2] -- Row 1
    pmaddwd     %5, [r5 + (%7) * 16]        ; R0 = H[0+1]
    pmaddwd     %1, [r5 + (%7) * 16]        ; R1 = H[1+2]
%endmacro ; FILTER_HV8_START
1782
; FILTER_HV8_MID Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff
; Loads the rows at [r0 + off_src * 16] (into t7) and [r0 + (off_src + 1) * 16] (into Row3),
; multiplies the pairs (prevRow, t7) and (t7, Row3) by [r5 + off_coeff * 16] and
; accumulates them into sum0L/sum0H and sum1L/sum1H respectively.
; Row3 is left holding the last loaded row; prevRow, t6 and t7 are clobbered.
%macro FILTER_HV8_MID 10
    mova        %8, [r0 + (%9 + 0) * 16]
    mova        %1, [r0 + (%9 + 1) * 16]

    ; pair (prevRow, t7) -> output row 0
    punpcklwd   %7, %2, %8
    punpckhwd   %2, %8
    pmaddwd     %2, [r5 + %10 * 16]
    paddd       %5, %2                      ; R0 = H[0+1+2+3]
    pmaddwd     %7, [r5 + %10 * 16]
    paddd       %3, %7                      ; R3 = L[0+1+2+3] -- Row 0

    ; pair (t7, Row3) -> output row 1
    punpcklwd   %7, %8, %1
    punpckhwd   %8, %1
    pmaddwd     %8, [r5 + %10 * 16]
    paddd       %6, %8                      ; R1 = H[1+2+3+4]
    pmaddwd     %7, [r5 + %10 * 16]
    paddd       %4, %7                      ; R4 = L[1+2+3+4] -- Row 1
%endmacro ; FILTER_HV8_MID
1799
; Round and Saturate
; FILTER_HV8_END a, b, c, d
; Adds tab_c_526336 to each of the four dword vectors, shifts right arithmetically by 12,
; packs (a, b) and (c, d) to signed words and finally both to unsigned bytes.
; Result: a = 16 bytes (low 8 from a/b, high 8 from c/d); c holds the packed words of c/d.
%macro FILTER_HV8_END 4
    paddd       %1, [tab_c_526336]
    psrad       %1, 12
    paddd       %2, [tab_c_526336]
    psrad       %2, 12
    packssdw    %1, %2

    paddd       %3, [tab_c_526336]
    psrad       %3, 12
    paddd       %4, [tab_c_526336]
    psrad       %4, 12
    packssdw    %3, %4

    ; TODO (kept from the original): would merging be better? Separate packs keep the dependency chain short
    packuswb    %1, %3
%endmacro ; FILTER_HV8_END
1816
1817;-----------------------------------------------------------------------------
1818; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
1819;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
%define coef        m7
%define stk_buf     rsp

    ; Two passes:
    ;   .loopH - filters 8 + 7 source rows horizontally into a 15 x 16-byte buffer on the stack
    ;   .loopV - filters that buffer vertically, two output rows per iteration
    mov         r4d, r4m                    ; r4 = idxX
    mov         r5d, r5m                    ; r5 = idxY

%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        coef, [r6 + r4 * 8]
%else
    movh        coef, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  coef, coef                  ; same 8 coefficient bytes in both halves

    ; move to row -3
    lea         r6, [r1 + r1 * 2]
    sub         r0, r6

    mov         r4, stk_buf                 ; r4 = write cursor into the intermediate buffer
    xor         r6d, r6d                    ; r6 = row counter

.loopH:
    FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
    psubw       m1, [pw_2000]
    mova        [r4], m1

    add         r4, 16
    add         r0, r1
    inc         r6
    cmp         r6, 8+7
    jnz         .loopH

    ; ready to phase V
    ; Here all of mN is free

    ; r5 = tab_LumaCoeffV + idxY * 64
    shl         r5, 6
    lea         r6, [tab_LumaCoeffV]
    add         r5, r6

    ; register mapping
    ; r0 - src (intermediate buffer)
    ; r5 - coeff
    ; r6 - remaining output row pairs
    mov         r0, stk_buf
    mov         r6d, 8/2

    ; TODO (kept from the original): this loop has more than 70 instructions, possibly more than the loop decode cache holds
.loopV:
    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END      m3, m0, m4, m1

    movh        [r2], m3
    movhps      [r2 + r3], m3

    add         r0, 16 * 2                  ; two buffer rows consumed
    lea         r2, [r2 + r3 * 2]

    dec         r6d
    jnz         .loopV

    RET
1893
1894;-----------------------------------------------------------------------------
1895;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1896;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
    ; m0 = shuffled coefficients, m1 = pw_512, m2..m7 = rows / scratch
    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m0, [tab_Cm]
    mova        m1, [pw_512]

    lea         r5, [r1 + r1 * 2]           ; r5 = 3 * srcStride
    lea         r4, [r0 + r1 * 4]           ; r4 -> row 4

    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r0 + r1 * 2]           ; row 2
    movd        m5, [r0 + r5]               ; row 3
    movd        m6, [r4]                    ; row 4

    ; output row 0 from rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m7, m4, m5
    punpcklbw   m2, m7
    pmaddubsw   m2, m0

    ; output row 1 from rows 1..4
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3
    pmulhrsw    m2, m1

    ; output row 2 from rows 2..5
    movd        m7, [r4 + r1]               ; row 5
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 from rows 3..6
    movd        m3, [r4 + r1 * 2]           ; row 6
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5
    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; 4 bytes per output row; only the first 2 of each are stored

    pextrw      [r2], m2, 0
    pextrw      [r2 + r3], m2, 2
    lea         r2, [r2 + r3 * 2]
    pextrw      [r2], m2, 4
    pextrw      [r2 + r3], m2, 6

    RET
1965
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V4_W2_H4 width, height
; Emits the SSE4 function for a 2-pixel-wide block; height must be a multiple
; of 4 because every .loop iteration writes four output rows.
;-----------------------------------------------------------------------------
%macro FILTER_V4_W2_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
    ; the symbol name now uses both macro arguments, like FILTER_V4_W4_H4 does

    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    mova        m1, [pw_512]

    mov         r4d, %2 / 4                 ; r4 = remaining groups of four rows (32-bit counter throughout)
    lea         r5, [3 * r1]

.loop:
    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r0 + 2 * r1]           ; row 2
    movd        m5, [r0 + r5]               ; row 3

    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]           ; r0 -> row 4, which is row 0 of the next iteration
    movd        m6, [r0]

    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3

    pmulhrsw    m2, m1

    movd        m7, [r0 + r1]               ; row 5

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]           ; row 6

    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7

    pmaddubsw   m5, m0

    phaddw      m4, m5

    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; 4 bytes per output row; only the first 2 of each are stored

    pextrw      [r2], m2, 0
    pextrw      [r2 + r3], m2, 2
    lea         r2, [r2 + 2 * r3]
    pextrw      [r2], m2, 4
    pextrw      [r2 + r3], m2, 6

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro
2048
2049FILTER_V4_W2_H4 2, 8
2050
2051FILTER_V4_W2_H4 2, 16
2052
2053;-----------------------------------------------------------------------------
2054; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2055;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m0, [tab_Cm]

    lea         r5, [r0 + r1 * 2]           ; r5 -> row 2

    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r5]                    ; row 2
    movd        m5, [r5 + r1]               ; row 3
    movd        m1, [r0 + r1 * 4]           ; row 4

    ; interleave neighbouring rows in place; each source is consumed before it is overwritten
    punpcklbw   m2, m3                      ; rows 0/1
    punpcklbw   m3, m4                      ; rows 1/2
    punpcklbw   m4, m5                      ; rows 2/3
    punpcklbw   m5, m1                      ; rows 3/4

    punpcklbw   m2, m4                      ; rows 0..3 -> output row 0
    punpcklbw   m3, m5                      ; rows 1..4 -> output row 1

    pmaddubsw   m2, m0
    pmaddubsw   m3, m0

    phaddw      m2, m3

    pmulhrsw    m2, [pw_512]
    packuswb    m2, m2
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1

    RET
2099
2100;-----------------------------------------------------------------------------
2101; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2102;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
    ; m0 = shuffled coefficients, m1 = pw_512, m2..m7 = rows / scratch
    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m0, [tab_Cm]
    mova        m1, [pw_512]

    lea         r5, [r1 + r1 * 2]           ; r5 = 3 * srcStride
    lea         r4, [r0 + r1 * 4]           ; r4 -> row 4

    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r0 + r1 * 2]           ; row 2
    movd        m5, [r0 + r5]               ; row 3
    movd        m6, [r4]                    ; row 4

    ; output row 0 from rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m7, m4, m5
    punpcklbw   m2, m7
    pmaddubsw   m2, m0

    ; output row 1 from rows 1..4
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3
    pmulhrsw    m2, m1

    ; output row 2 from rows 2..5
    movd        m7, [r4 + r1]               ; row 5
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 from rows 3..6
    movd        m3, [r4 + r1 * 2]           ; row 6
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5
    pmulhrsw    m4, m1

    packuswb    m2, m4                      ; one dword per output row
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + r3 * 2]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3

    RET
2172
b53f7c52
JB
INIT_YMM avx2
cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
    mov             r4d, r4m
    sub             r0, r1                          ; start one row above src
    shl             r4d, 6                          ; 64 bytes of tab_ChromaCoeffVer_32 per coeffIdx

%ifdef PIC
    lea             r5, [tab_ChromaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_ChromaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]

    ; gather the seven 4-byte source rows, one per dword
    movd            xm1, [r0]
    pinsrd          xm1, [r0 + r1], 1
    pinsrd          xm1, [r0 + r1 * 2], 2
    pinsrd          xm1, [r0 + r4], 3               ; m1 = row[3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    lea             r4, [r3 * 3]                    ; r4 = 3 * dstStride (3 * srcStride is no longer needed)
    movd            xm2, [r0]
    pinsrd          xm2, [r0 + r1], 1
    pinsrd          xm2, [r0 + r1 * 2], 2           ; m2 = row[x 6 5 4]
    vinserti128     m1, m1, xm2, 1                  ; m1 = row[x 6 5 4 3 2 1 0]

    ; first coefficient half: row pairs starting at rows 0..3
    mova            m2, [interp4_vpp_shuf1]
    vpermd          m0, m2, m1                      ; m0 = row[4 3 3 2 2 1 1 0]
    ; second coefficient half: row pairs starting at rows 2..5
    mova            m2, [interp4_vpp_shuf1 + mmsize]
    vpermd          m1, m2, m1                      ; m1 = row[6 5 5 4 4 3 3 2]

    mova            m2, [interp4_vpp_shuf]          ; byte-interleaves each pair of row dwords
    pshufb          m0, m2
    pmaddubsw       m0, [r5]
    pshufb          m1, m2
    pmaddubsw       m1, [r5 + mmsize]
    paddw           m0, m1                          ; m0 = WORD ROW[3 2 1 0]
    pmulhrsw        m0, [pw_512]

    vextracti128    xm1, m0, 1
    packuswb        xm0, xm1                        ; one dword per output row

    movd            [r2], xm0
    pextrd          [r2 + r3], xm0, 1
    pextrd          [r2 + r3 * 2], xm0, 2
    pextrd          [r2 + r4], xm0, 3
    RET
2217
72b9787e
JB
2218;-----------------------------------------------------------------------------
2219; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2220;-----------------------------------------------------------------------------
; FILTER_V4_W4_H4 width, height
; Emits the SSE4 function interp_4tap_vert_pp_<width>x<height> for 4-pixel-wide
; blocks; height must be a multiple of 4 because every .loop iteration writes
; four output rows.
%macro FILTER_V4_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    mova        m1, [pw_512]

    mov         r4d, %2 / 4                 ; r4 = remaining groups of four rows (32-bit counter throughout)

    lea         r5, [3 * r1]

.loop:
    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r0 + 2 * r1]           ; row 2
    movd        m5, [r0 + r5]               ; row 3

    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]           ; r0 -> row 4, which is row 0 of the next iteration
    movd        m6, [r0]

    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3

    pmulhrsw    m2, m1

    movd        m7, [r0 + r1]               ; row 5

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]           ; row 6

    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7

    pmaddubsw   m5, m0

    phaddw      m4, m5

    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; one dword per output row
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro
2300
2301FILTER_V4_W4_H4 4, 8
2302FILTER_V4_W4_H4 4, 16
2303
2304FILTER_V4_W4_H4 4, 32
2305
; m1 = pack( (interleave(m1, m2) * m6 + interleave(m3, m0) * m5) rounded with m4 )
; Result is in the low 8 bytes of m1; m7 is clobbered, m0 / m2 / m3 are preserved.
%macro FILTER_V4_W8_H2 0
    punpcklbw   m7, m3, m0
    punpcklbw   m1, m2

    pmaddubsw   m7, m5
    pmaddubsw   m1, m6

    paddw       m1, m7

    pmulhrsw    m1, m4
    packuswb    m1, m1
%endmacro
2318
; m2 = pack( (interleave(m2, m3) * m6 + interleave(m0, m1) * m5) rounded with m4 )
; Result is in the low 8 bytes of m2; m7 is clobbered, m0 / m1 / m3 are preserved.
%macro FILTER_V4_W8_H3 0
    punpcklbw   m7, m0, m1
    punpcklbw   m2, m3

    pmaddubsw   m7, m5
    pmaddubsw   m2, m6

    paddw       m2, m7

    pmulhrsw    m2, m4
    packuswb    m2, m2
%endmacro
2331
; m3 = pack( (interleave(m3, m0) * m6 + interleave(m1, m2) * m5) rounded with m4 )
; Result is in the low 8 bytes of m3; m7 is clobbered, m0 / m1 / m2 are preserved.
%macro FILTER_V4_W8_H4 0
    punpcklbw   m7, m1, m2
    punpcklbw   m3, m0

    pmaddubsw   m7, m5
    pmaddubsw   m3, m6

    paddw       m3, m7

    pmulhrsw    m3, m4
    packuswb    m3, m3
%endmacro
2344
; m0 = pack( (interleave(m0, m1) * m6 + interleave(m2, m3) * m5) rounded with m4 )
; Result is in the low 8 bytes of m0; m7 is clobbered, m1 / m2 / m3 are preserved.
%macro FILTER_V4_W8_H5 0
    punpcklbw   m7, m2, m3
    punpcklbw   m0, m1

    pmaddubsw   m7, m5
    pmaddubsw   m0, m6

    paddw       m0, m7

    pmulhrsw    m0, m4
    packuswb    m0, m0
%endmacro
2357
; Opens the function via FILTER_V4_W8 (which stores output row 0) and adds output row 1.
; No RET is emitted here; the instantiation site supplies it.
2358%macro FILTER_V4_W8_8x2 2
2359FILTER_V4_W8 %1, %2
; r0 was moved one row up by FILTER_V4_W8, so this is the fifth row it has read from
2360movq m0, [r0 + 4 * r1]
2361
2362FILTER_V4_W8_H2
2363
2364movh [r2 + r3], m1
2365%endmacro
2366
; Extends FILTER_V4_W8_8x2 with output rows 2 and 3.
; Each newly loaded row reuses the register whose previous row is no longer needed
; (m1, then m2), which is why a different FILTER_V4_W8_Hn variant is used per row.
; Leaves r6 = r0 + 4 * r1 for FILTER_V4_W8_8x6.
2367%macro FILTER_V4_W8_8x4 2
2368FILTER_V4_W8_8x2 %1, %2
2369;8x3 - third output row
2370lea r6, [r0 + 4 * r1]
2371movq m1, [r6 + r1]
2372
2373FILTER_V4_W8_H3
2374
2375movh [r2 + 2 * r3], m2
2376
2377;8x4 - fourth output row
2378movq m2, [r6 + 2 * r1]
2379
2380FILTER_V4_W8_H4
2381
; dst + 3 * dstStride is formed in two steps through r5
2382lea r5, [r2 + 2 * r3]
2383movh [r5 + r3], m3
2384%endmacro
2385
; Extends FILTER_V4_W8_8x4 with output rows 4 and 5.
; Relies on r6 = r0 + 4 * r1 as left by FILTER_V4_W8_8x4.
2386%macro FILTER_V4_W8_8x6 2
2387FILTER_V4_W8_8x4 %1, %2
2388;8x5 - fifth output row
2389lea r6, [r6 + 2 * r1]
2390movq m3, [r6 + r1]
2391
2392FILTER_V4_W8_H5
2393
2394movh [r2 + 4 * r3], m0
2395
2396;8x6 - sixth output row
2397movq m0, [r0 + 8 * r1]
2398
2399FILTER_V4_W8_H2
2400
; dst + 5 * dstStride is formed in two steps through r5
2401lea r5, [r2 + 4 * r3]
2402movh [r5 + r3], m1
2403%endmacro
2404
2405;-----------------------------------------------------------------------------
2406; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2407;-----------------------------------------------------------------------------
; FILTER_V4_W8 width, height
; Opens the SSE4 function interp_4tap_vert_pp_<width>x<height> and stores output row 0.
; No RET is emitted: the FILTER_V4_W8_8xN wrappers append further rows and the
; instantiation site supplies the RET.
; State left for the code that follows:
;   r0 = src - srcStride, r5 = r0 + 2 * srcStride
;   m1 / m2 / m3 = source rows 1 / 2 / 3 (counted from r0)
;   m6 / m5 = coefficients shuffled with tab_Vm / tab_Vm + 16
;   m4 = pw_512
%macro FILTER_V4_W8 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m
    sub         r0, r1

%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        m5, [r6 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]

    lea         r5, [r0 + 2 * r1]
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r5]
    movq        m3, [r5 + r1]

    punpcklbw   m0, m1
    punpcklbw   m4, m2, m3

    pmaddubsw   m0, m6
    pmaddubsw   m4, m5
    paddw       m0, m4

    mova        m4, [pw_512]

    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0
%endmacro
2445
2446;-----------------------------------------------------------------------------
2447; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2448;-----------------------------------------------------------------------------
2449FILTER_V4_W8_8x2 8, 2
2450
2451RET
2452
2453;-----------------------------------------------------------------------------
2454; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2455;-----------------------------------------------------------------------------
2456FILTER_V4_W8_8x4 8, 4
2457
2458RET
2459
2460;-----------------------------------------------------------------------------
2461; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2462;-----------------------------------------------------------------------------
2463FILTER_V4_W8_8x6 8, 6
2464
2465RET
2466
2467;-------------------------------------------------------------------------------------------------------------
2468; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2469;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
    ; Writes two rows of four 16-bit results: filter sum minus pw_2000, no rounding or clipping.

    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src
    add         r3, r3                      ; dstStride counts int16_t elements -> bytes; full-width add
                                            ; keeps every bit of the intptr_t stride

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    lea         r5, [r0 + 2 * r1]
    movd        m4, [r5]                    ; row 2
    movd        m5, [r5 + r1]               ; row 3

    punpcklbw   m2, m3
    punpcklbw   m1, m4, m5
    punpcklbw   m2, m1

    pmaddubsw   m2, m0

    movd        m1, [r0 + 4 * r1]           ; row 4

    punpcklbw   m3, m4
    punpcklbw   m5, m1
    punpcklbw   m3, m5

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low 4 words = output row 0, high 4 words = output row 1

    psubw       m2, [pw_2000]
    movh        [r2], m2
    movhps      [r2 + r3], m2

    RET
2513
2514;-------------------------------------------------------------------------------------------------------------
2515; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2516;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
    ; Writes four rows of four 16-bit results: filter sum minus pw_2000, no rounding or clipping.

    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src
    add         r3, r3                      ; dstStride counts int16_t elements -> bytes; full-width add
                                            ; keeps every bit of the intptr_t stride

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    lea         r4, [r1 * 3]
    lea         r5, [r0 + 4 * r1]           ; r5 -> row 4

    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r0 + 2 * r1]           ; row 2
    movd        m5, [r0 + r4]               ; row 3

    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    movd        m6, [r5]                    ; row 4

    punpcklbw   m3, m4
    punpcklbw   m1, m5, m6
    punpcklbw   m3, m1

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; output rows 0 and 1

    mova        m1, [pw_2000]

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    movd        m2, [r5 + r1]               ; row 5

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r5 + 2 * r1]           ; row 6

    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; output rows 2 and 3

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    RET
2587
2588;---------------------------------------------------------------------------------------------------------------
2589; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2590;---------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W4_H4 width, height
; Emits the SSE4 function interp_4tap_vert_ps_<width>x<height> for 4-pixel-wide blocks.
; Every .loop iteration writes four rows of four 16-bit results (filter sum minus
; pw_2000), so height must be a multiple of 4.
%macro FILTER_V_PS_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m
    sub         r0, r1                      ; start one row above src
    add         r3, r3                      ; dstStride counts int16_t elements -> bytes; full-width add
                                            ; keeps every bit of the intptr_t stride

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    mova        m1, [pw_2000]

    mov         r4d, %2/4                   ; r4 = remaining groups of four rows
    lea         r5, [3 * r1]

.loop:
    movd        m2, [r0]                    ; row 0
    movd        m3, [r0 + r1]               ; row 1
    movd        m4, [r0 + 2 * r1]           ; row 2
    movd        m5, [r0 + r5]               ; row 3

    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]           ; r0 -> row 4, which is row 0 of the next iteration
    movd        m6, [r0]

    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; output rows 0 and 1

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    movd        m2, [r0 + r1]               ; row 5

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]           ; row 6

    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; output rows 2 and 3

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro
2669
2670FILTER_V_PS_W4_H4 4, 8
2671FILTER_V_PS_W4_H4 4, 16
2672
2673FILTER_V_PS_W4_H4 4, 32
2674
2675;--------------------------------------------------------------------------------------------------------------
2676; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 8 pixels wide, 16-bit output, 2 output rows per loop iteration
; (instantiated below for heights 2, 4, 6, 12 and 64).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only), then 3 * srcStride.
;            m6 multiplies the (row n, row n+1) byte pairs and m5 the (row n+2, row n+3) pairs,
;            m4 = [pw_2000], subtracted from every result.
2677;--------------------------------------------------------------------------------------------------------------
2678%macro FILTER_V_PS_W8_H8_H16_H2 2
2679INIT_XMM sse4
2680cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
2681
2682 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
2683 sub r0, r1                       ; start one row above the block
2684 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
2685
2686%ifdef PIC
2687 lea r5, [tab_ChromaCoeff]
2688 movd m5, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
2689%else
2690 movd m5, [tab_ChromaCoeff + r4 * 4]
2691%endif
2692
; tab_Vm is defined outside this chunk; from how m6/m5 are used it presumably
; broadcasts the first and the second tap pair respectively (TODO confirm).
2693 pshufb m6, m5, [tab_Vm]
2694 pshufb m5, [tab_Vm + 16]
2695 mova m4, [pw_2000]
2696
2697 mov r4d, %2/2                    ; 2 output rows per iteration
2698 lea r5, [3 * r1]                 ; r5 = 3 * srcStride
2699
2700.loopH:
2701 movq m0, [r0]                    ; row 0 (8 pixels)
2702 movq m1, [r0 + r1]               ; row 1
2703 movq m2, [r0 + 2 * r1]           ; row 2
2704 movq m3, [r0 + r5]               ; row 3
2705
2706 punpcklbw m0, m1                 ; row0/row1 byte pairs
2707 punpcklbw m1, m2                 ; row1/row2
2708 punpcklbw m2, m3                 ; row2/row3
2709
2710 pmaddubsw m0, m6
2711 pmaddubsw m2, m5
2712
2713 paddw m0, m2                     ; 4-tap sum for output row 0
2714
2715 psubw m0, m4
2716 movu [r2], m0
2717
2718 movq m0, [r0 + 4 * r1]           ; row 4
2719
2720 punpcklbw m3, m0                 ; row3/row4
2721
2722 pmaddubsw m1, m6
2723 pmaddubsw m3, m5
2724
2725 paddw m1, m3                     ; 4-tap sum for output row 1
2726 psubw m1, m4
2727
2728 movu [r2 + r3], m1
2729
2730 lea r0, [r0 + 2 * r1]            ; advance src and dst by 2 rows
2731 lea r2, [r2 + 2 * r3]
2732
2733 dec r4d
2734 jnz .loopH
2735
2736 RET
2737%endmacro
2738
2739FILTER_V_PS_W8_H8_H16_H2 8, 2
2740FILTER_V_PS_W8_H8_H16_H2 8, 4
2741FILTER_V_PS_W8_H8_H16_H2 8, 6
2742
2743FILTER_V_PS_W8_H8_H16_H2 8, 12
2744FILTER_V_PS_W8_H8_H16_H2 8, 64
2745
2746;--------------------------------------------------------------------------------------------------------------
2747; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 8 pixels wide, 16-bit output, 4 output rows per loop iteration
; (instantiated below for heights 8, 16 and 32).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only), then 3 * srcStride.
;            m6 multiplies the (row n, row n+1) byte pairs and m5 the (row n+2, row n+3) pairs,
;            m4 = [pw_2000], m7 = scratch.
2748;--------------------------------------------------------------------------------------------------------------
2749%macro FILTER_V_PS_W8_H8_H16_H32 2
2750INIT_XMM sse4
2751cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
2752
2753 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
2754 sub r0, r1                       ; start one row above the block
2755 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
2756
2757%ifdef PIC
2758 lea r5, [tab_ChromaCoeff]
2759 movd m5, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
2760%else
2761 movd m5, [tab_ChromaCoeff + r4 * 4]
2762%endif
2763
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
2764 pshufb m6, m5, [tab_Vm]
2765 pshufb m5, [tab_Vm + 16]
2766 mova m4, [pw_2000]
2767
2768 mov r4d, %2/4                    ; 4 output rows per iteration
2769 lea r5, [3 * r1]                 ; r5 = 3 * srcStride
2770
2771.loop:
2772 movq m0, [r0]                    ; row 0
2773 movq m1, [r0 + r1]               ; row 1
2774 movq m2, [r0 + 2 * r1]           ; row 2
2775 movq m3, [r0 + r5]               ; row 3
2776
2777 punpcklbw m0, m1                 ; row0/row1
2778 punpcklbw m1, m2                 ; row1/row2
2779 punpcklbw m2, m3                 ; row2/row3 (kept intact for output row 2)
2780
2781 pmaddubsw m0, m6
2782 pmaddubsw m7, m2, m5
2783
2784 paddw m0, m7                     ; output row 0
2785
2786 psubw m0, m4
2787 movu [r2], m0
2788
2789 lea r0, [r0 + 4 * r1]            ; advance src by 4 rows
2790 movq m0, [r0]                    ; row 4
2791
2792 punpcklbw m3, m0                 ; row3/row4 (kept intact for output row 3)
2793
2794 pmaddubsw m1, m6
2795 pmaddubsw m7, m3, m5
2796
2797 paddw m1, m7                     ; output row 1
2798
2799 psubw m1, m4
2800 movu [r2 + r3], m1
2801
2802 movq m1, [r0 + r1]               ; row 5
2803
2804 punpcklbw m0, m1                 ; row4/row5
2805
2806 pmaddubsw m2, m6
2807 pmaddubsw m0, m5
2808
2809 paddw m2, m0                     ; output row 2
2810
2811 psubw m2, m4
2812 lea r2, [r2 + 2 * r3]
2813 movu [r2], m2
2814
2815 movq m2, [r0 + 2 * r1]           ; row 6
2816
2817 punpcklbw m1, m2                 ; row5/row6
2818
2819 pmaddubsw m3, m6
2820 pmaddubsw m1, m5
2821
2822 paddw m3, m1                     ; output row 3
2823 psubw m3, m4
2824
2825 movu [r2 + r3], m3
2826
2827 lea r2, [r2 + 2 * r3]            ; dst advanced by 4 rows in total
2828
2829 dec r4d
2830 jnz .loop
2831 RET
2832%endmacro
2833
2834FILTER_V_PS_W8_H8_H16_H32 8, 8
2835FILTER_V_PS_W8_H8_H16_H32 8, 16
2836FILTER_V_PS_W8_H8_H16_H32 8, 32
2837
2838;------------------------------------------------------------------------------------------------------------
2839;void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 6 pixels wide, 16-bit output, 4 output rows per loop iteration
; (instantiated below for 6x8 and 6x16; only the height parameter %2 is used).
; Eight columns are computed per row, but only 6 words (12 bytes) are stored:
; movh writes words 0..3, then pshufd/movd writes words 4..5.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only), then 3 * srcStride.
;            m6/m5 = shuffled tap pairs, m4 = [pw_2000], m7 = scratch.
2840;------------------------------------------------------------------------------------------------------------
2841%macro FILTER_V_PS_W6 2
2842INIT_XMM sse4
2843cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
2844
2845 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
2846 sub r0, r1                       ; start one row above the block
2847 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
2848
2849%ifdef PIC
2850 lea r5, [tab_ChromaCoeff]
2851 movd m5, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
2852%else
2853 movd m5, [tab_ChromaCoeff + r4 * 4]
2854%endif
2855
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
2856 pshufb m6, m5, [tab_Vm]
2857 pshufb m5, [tab_Vm + 16]
2858 mova m4, [pw_2000]
2859 lea r5, [3 * r1]                 ; r5 = 3 * srcStride
2860 mov r4d, %2/4                    ; 4 output rows per iteration
2861
2862.loop:
2863 movq m0, [r0]                    ; row 0 (8 bytes are read although only 6 columns are stored)
2864 movq m1, [r0 + r1]               ; row 1
2865 movq m2, [r0 + 2 * r1]           ; row 2
2866 movq m3, [r0 + r5]               ; row 3
2867
2868 punpcklbw m0, m1                 ; row0/row1
2869 punpcklbw m1, m2                 ; row1/row2
2870 punpcklbw m2, m3                 ; row2/row3
2871
2872 pmaddubsw m0, m6
2873 pmaddubsw m7, m2, m5
2874
2875 paddw m0, m7                     ; output row 0
2876 psubw m0, m4
2877
2878 movh [r2], m0                    ; words 0..3
2879 pshufd m0, m0, 2                 ; move dword 2 (words 4..5) to the bottom
2880 movd [r2 + 8], m0                ; words 4..5
2881
2882 lea r0, [r0 + 4 * r1]            ; advance src by 4 rows
2883 movq m0, [r0]                    ; row 4
2884 punpcklbw m3, m0                 ; row3/row4
2885
2886 pmaddubsw m1, m6
2887 pmaddubsw m7, m3, m5
2888
2889 paddw m1, m7                     ; output row 1
2890 psubw m1, m4
2891
2892 movh [r2 + r3], m1
2893 pshufd m1, m1, 2
2894 movd [r2 + r3 + 8], m1
2895
2896 movq m1, [r0 + r1]               ; row 5
2897 punpcklbw m0, m1                 ; row4/row5
2898
2899 pmaddubsw m2, m6
2900 pmaddubsw m0, m5
2901
2902 paddw m2, m0                     ; output row 2
2903 psubw m2, m4
2904
2905 lea r2,[r2 + 2 * r3]
2906 movh [r2], m2
2907 pshufd m2, m2, 2
2908 movd [r2 + 8], m2
2909
2910 movq m2,[r0 + 2 * r1]            ; row 6
2911 punpcklbw m1, m2                 ; row5/row6
2912
2913 pmaddubsw m3, m6
2914 pmaddubsw m1, m5
2915
2916 paddw m3, m1                     ; output row 3
2917 psubw m3, m4
2918
2919 movh [r2 + r3], m3
2920 pshufd m3, m3, 2
2921 movd [r2 + r3 + 8], m3
2922
2923 lea r2, [r2 + 2 * r3]            ; dst advanced by 4 rows in total
2924
2925 dec r4d
2926 jnz .loop
2927 RET
2928%endmacro
2929
2930FILTER_V_PS_W6 6, 8
2931FILTER_V_PS_W6 6, 16
2932
2933;---------------------------------------------------------------------------------------------------------------
2934; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 12 pixels wide, 16-bit output, 2 output rows per loop iteration
; (instantiated below for 12x16 and 12x32; only the height parameter %2 is used).
; Each source row is loaded as 16 bytes; 12 words are stored per output row
; (movu writes columns 0..7, movh writes columns 8..11).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only).
;            m1/m0 = shuffled tap pairs; m6 is scratch and is reloaded with [pw_2000] every iteration.
2935;---------------------------------------------------------------------------------------------------------------
2936%macro FILTER_V_PS_W12 2
2937INIT_XMM sse4
2938cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
2939
2940 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
2941 sub r0, r1                       ; start one row above the block
2942 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
2943
2944%ifdef PIC
2945 lea r5, [tab_ChromaCoeff]
2946 movd m0, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
2947%else
2948 movd m0, [tab_ChromaCoeff + r4 * 4]
2949%endif
2950
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
2951 pshufb m1, m0, [tab_Vm]
2952 pshufb m0, [tab_Vm + 16]
2953
2954 mov r4d, %2/2                    ; 2 output rows per iteration
2955
2956.loop:
2957 movu m2, [r0]                    ; row 0 (16 bytes loaded)
2958 movu m3, [r0 + r1]               ; row 1
2959
2960 punpcklbw m4, m2, m3             ; row0/row1, columns 0..7
2961 punpckhbw m2, m3                 ; row0/row1, columns 8..15
2962
2963 pmaddubsw m4, m1
2964 pmaddubsw m2, m1
2965
2966 lea r0, [r0 + 2 * r1]            ; advance src by 2 rows
2967 movu m5, [r0]                    ; row 2
2968 movu m7, [r0 + r1]               ; row 3
2969
2970 punpcklbw m6, m5, m7             ; row2/row3, columns 0..7
2971 pmaddubsw m6, m0
2972 paddw m4, m6
2973
2974 punpckhbw m6, m5, m7             ; row2/row3, columns 8..15
2975 pmaddubsw m6, m0
2976 paddw m2, m6
2977
2978 mova m6, [pw_2000]               ; m6 was scratch above, so the constant is loaded here
2979
2980 psubw m4, m6
2981 psubw m2, m6
2982
2983 movu [r2], m4                    ; output row 0, columns 0..7
2984 movh [r2 + 16], m2               ; output row 0, columns 8..11
2985
2986 punpcklbw m4, m3, m5             ; row1/row2, columns 0..7
2987 punpckhbw m3, m5                 ; row1/row2, columns 8..15
2988
2989 pmaddubsw m4, m1
2990 pmaddubsw m3, m1
2991
2992 movu m2, [r0 + 2 * r1]           ; row 4 (r0 already advanced by 2 rows)
2993
2994 punpcklbw m5, m7, m2             ; row3/row4, columns 0..7
2995 punpckhbw m7, m2                 ; row3/row4, columns 8..15
2996
2997 pmaddubsw m5, m0
2998 pmaddubsw m7, m0
2999
3000 paddw m4, m5
3001 paddw m3, m7
3002
3003 psubw m4, m6
3004 psubw m3, m6
3005
3006 movu [r2 + r3], m4               ; output row 1, columns 0..7
3007 movh [r2 + r3 + 16], m3          ; output row 1, columns 8..11
3008
3009 lea r2, [r2 + 2 * r3]
3010
3011 dec r4d
3012 jnz .loop
3013 RET
3014%endmacro
3015
3016FILTER_V_PS_W12 12, 16
3017FILTER_V_PS_W12 12, 32
3018
3019;---------------------------------------------------------------------------------------------------------------
3020; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 16 pixels wide, 16-bit output, 2 output rows per loop iteration
; (instantiated below for heights 4, 8, 12, 16, 24, 32 and 64).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only).
;            m1/m0 = shuffled tap pairs; m6 is scratch and is reloaded with [pw_2000] every iteration.
3021;---------------------------------------------------------------------------------------------------------------
3022%macro FILTER_V_PS_W16 2
3023INIT_XMM sse4
3024cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
3025
3026 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
3027 sub r0, r1                       ; start one row above the block
3028 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
3029
3030%ifdef PIC
3031 lea r5, [tab_ChromaCoeff]
3032 movd m0, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
3033%else
3034 movd m0, [tab_ChromaCoeff + r4 * 4]
3035%endif
3036
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3037 pshufb m1, m0, [tab_Vm]
3038 pshufb m0, [tab_Vm + 16]
3039 mov r4d, %2/2                    ; 2 output rows per iteration
3040
3041.loop:
3042 movu m2, [r0]                    ; row 0
3043 movu m3, [r0 + r1]               ; row 1
3044
3045 punpcklbw m4, m2, m3             ; row0/row1, columns 0..7
3046 punpckhbw m2, m3                 ; row0/row1, columns 8..15
3047
3048 pmaddubsw m4, m1
3049 pmaddubsw m2, m1
3050
3051 lea r0, [r0 + 2 * r1]            ; advance src by 2 rows
3052 movu m5, [r0]                    ; row 2
3053 movu m7, [r0 + r1]               ; row 3
3054
3055 punpcklbw m6, m5, m7             ; row2/row3, columns 0..7
3056 pmaddubsw m6, m0
3057 paddw m4, m6
3058
3059 punpckhbw m6, m5, m7             ; row2/row3, columns 8..15
3060 pmaddubsw m6, m0
3061 paddw m2, m6
3062
3063 mova m6, [pw_2000]               ; m6 was scratch above, so the constant is loaded here
3064
3065 psubw m4, m6
3066 psubw m2, m6
3067
3068 movu [r2], m4                    ; output row 0, columns 0..7
3069 movu [r2 + 16], m2               ; output row 0, columns 8..15
3070
3071 punpcklbw m4, m3, m5             ; row1/row2, columns 0..7
3072 punpckhbw m3, m5                 ; row1/row2, columns 8..15
3073
3074 pmaddubsw m4, m1
3075 pmaddubsw m3, m1
3076
3077 movu m5, [r0 + 2 * r1]           ; row 4 (r0 already advanced by 2 rows)
3078
3079 punpcklbw m2, m7, m5             ; row3/row4, columns 0..7
3080 punpckhbw m7, m5                 ; row3/row4, columns 8..15
3081
3082 pmaddubsw m2, m0
3083 pmaddubsw m7, m0
3084
3085 paddw m4, m2
3086 paddw m3, m7
3087
3088 psubw m4, m6
3089 psubw m3, m6
3090
3091 movu [r2 + r3], m4               ; output row 1, columns 0..7
3092 movu [r2 + r3 + 16], m3          ; output row 1, columns 8..15
3093
3094 lea r2, [r2 + 2 * r3]
3095
3096 dec r4d
3097 jnz .loop
3098 RET
3099%endmacro
3100
3101FILTER_V_PS_W16 16, 4
3102FILTER_V_PS_W16 16, 8
3103FILTER_V_PS_W16 16, 12
3104FILTER_V_PS_W16 16, 16
3105FILTER_V_PS_W16 16, 32
3106
3107FILTER_V_PS_W16 16, 24
3108FILTER_V_PS_W16 16, 64
3109
3110;--------------------------------------------------------------------------------------------------------------
3111;void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 24 pixels wide, 16-bit output, 2 output rows per loop iteration
; (instantiated below for 24x32 and 24x64; only the height parameter %2 is used).
; Columns 0..15 are processed with 16-byte loads, columns 16..23 with 8-byte loads.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only), then src + 2 rows.
;            m1/m0 = shuffled tap pairs; m6 is scratch, then holds [pw_2000] for the rest of the iteration.
3112;--------------------------------------------------------------------------------------------------------------
3113%macro FILTER_V4_PS_W24 2
3114INIT_XMM sse4
3115cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
3116
3117 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
3118 sub r0, r1                       ; start one row above the block
3119 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
3120
3121%ifdef PIC
3122 lea r5, [tab_ChromaCoeff]
3123 movd m0, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
3124%else
3125 movd m0, [tab_ChromaCoeff + r4 * 4]
3126%endif
3127
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3128 pshufb m1, m0, [tab_Vm]
3129 pshufb m0, [tab_Vm + 16]
3130
3131 mov r4d, %2/2                    ; 2 output rows per iteration
3132
3133.loop:
; ---- columns 0..15 ----
3134 movu m2, [r0]                    ; row 0
3135 movu m3, [r0 + r1]               ; row 1
3136
3137 punpcklbw m4, m2, m3             ; row0/row1, columns 0..7
3138 punpckhbw m2, m3                 ; row0/row1, columns 8..15
3139
3140 pmaddubsw m4, m1
3141 pmaddubsw m2, m1
3142
3143 lea r5, [r0 + 2 * r1]            ; r5 = address of row 2; r0 is left unchanged until the end
3144
3145 movu m5, [r5]                    ; row 2
3146 movu m7, [r5 + r1]               ; row 3
3147
3148 punpcklbw m6, m5, m7             ; row2/row3, columns 0..7
3149 pmaddubsw m6, m0
3150 paddw m4, m6
3151
3152 punpckhbw m6, m5, m7             ; row2/row3, columns 8..15
3153 pmaddubsw m6, m0
3154 paddw m2, m6
3155
3156 mova m6, [pw_2000]               ; m6 was scratch above, so the constant is loaded here
3157
3158 psubw m4, m6
3159 psubw m2, m6
3160
3161 movu [r2], m4                    ; output row 0, columns 0..7
3162 movu [r2 + 16], m2               ; output row 0, columns 8..15
3163
3164 punpcklbw m4, m3, m5             ; row1/row2, columns 0..7
3165 punpckhbw m3, m5                 ; row1/row2, columns 8..15
3166
3167 pmaddubsw m4, m1
3168 pmaddubsw m3, m1
3169
3170 movu m2, [r5 + 2 * r1]           ; row 4
3171
3172 punpcklbw m5, m7, m2             ; row3/row4, columns 0..7
3173 punpckhbw m7, m2                 ; row3/row4, columns 8..15
3174
3175 pmaddubsw m5, m0
3176 pmaddubsw m7, m0
3177
3178 paddw m4, m5
3179 paddw m3, m7
3180
3181 psubw m4, m6
3182 psubw m3, m6
3183
3184 movu [r2 + r3], m4               ; output row 1, columns 0..7
3185 movu [r2 + r3 + 16], m3          ; output row 1, columns 8..15
3186
; ---- columns 16..23 ----
3187 movq m2, [r0 + 16]               ; row 0
3188 movq m3, [r0 + r1 + 16]          ; row 1
3189 movq m4, [r5 + 16]               ; row 2
3190 movq m5, [r5 + r1 + 16]          ; row 3
3191
3192 punpcklbw m2, m3                 ; row0/row1
3193 punpcklbw m7, m4, m5             ; row2/row3 in scratch m7, so m3..m5 stay usable for output row 1
3194
3195 pmaddubsw m2, m1
3196 pmaddubsw m7, m0
3197
3198 paddw m2, m7
3199 psubw m2, m6
3200
3201 movu [r2 + 32], m2               ; output row 0, columns 16..23
3202
3203 movq m2, [r5 + 2 * r1 + 16]      ; row 4
3204
3205 punpcklbw m3, m4                 ; row1/row2
3206 punpcklbw m5, m2                 ; row3/row4
3207
3208 pmaddubsw m3, m1
3209 pmaddubsw m5, m0
3210
3211 paddw m3, m5
3212 psubw m3, m6
3213
3214 movu [r2 + r3 + 32], m3          ; output row 1, columns 16..23
3215
3216 mov r0, r5                       ; advance src by 2 rows
3217 lea r2, [r2 + 2 * r3]
3218
3219 dec r4d
3220 jnz .loop
3221 RET
3222%endmacro
3223
3224FILTER_V4_PS_W24 24, 32
3225
3226FILTER_V4_PS_W24 24, 64
3227
3228;---------------------------------------------------------------------------------------------------------------
3229; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 32 pixels wide, 16-bit output, 1 output row per loop iteration
; (instantiated below for heights 8, 16, 24, 32, 48 and 64).
; Every iteration reloads all four source rows for both 16-column halves.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to a byte stride),
;            r4 = coeffIdx, then the row counter; r5 = table base (PIC only), then src + 2 rows.
;            m1/m0 = shuffled tap pairs, m7 = [pw_2000], subtracted from every result.
3230;---------------------------------------------------------------------------------------------------------------
3231%macro FILTER_V_PS_W32 2
3232INIT_XMM sse4
3233cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
3234
3235 mov r4d, r4m                     ; r4 = coeffIdx (5th argument)
3236 sub r0, r1                       ; start one row above the block
3237 add r3d, r3d                     ; dstStride *= 2: dst holds int16_t samples
3238
3239%ifdef PIC
3240 lea r5, [tab_ChromaCoeff]
3241 movd m0, [r5 + r4 * 4]           ; 4 bytes per coeffIdx entry
3242%else
3243 movd m0, [tab_ChromaCoeff + r4 * 4]
3244%endif
3245
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3246 pshufb m1, m0, [tab_Vm]
3247 pshufb m0, [tab_Vm + 16]
3248
3249 mova m7, [pw_2000]
3250
3251 mov r4d, %2                      ; one output row per iteration
3252
3253.loop:
; ---- columns 0..15 ----
3254 movu m2, [r0]                    ; row 0
3255 movu m3, [r0 + r1]               ; row 1
3256
3257 punpcklbw m4, m2, m3             ; row0/row1, columns 0..7
3258 punpckhbw m2, m3                 ; row0/row1, columns 8..15
3259
3260 pmaddubsw m4, m1
3261 pmaddubsw m2, m1
3262
3263 lea r5, [r0 + 2 * r1]            ; r5 = address of row 2
3264 movu m3, [r5]                    ; row 2
3265 movu m5, [r5 + r1]               ; row 3
3266
3267 punpcklbw m6, m3, m5             ; row2/row3, columns 0..7
3268 punpckhbw m3, m5                 ; row2/row3, columns 8..15
3269
3270 pmaddubsw m6, m0
3271 pmaddubsw m3, m0
3272
3273 paddw m4, m6
3274 paddw m2, m3
3275
3276 psubw m4, m7
3277 psubw m2, m7
3278
3279 movu [r2], m4                    ; columns 0..7
3280 movu [r2 + 16], m2               ; columns 8..15
3281
; ---- columns 16..31 ----
3282 movu m2, [r0 + 16]
3283 movu m3, [r0 + r1 + 16]
3284
3285 punpcklbw m4, m2, m3
3286 punpckhbw m2, m3
3287
3288 pmaddubsw m4, m1
3289 pmaddubsw m2, m1
3290
3291 movu m3, [r5 + 16]
3292 movu m5, [r5 + r1 + 16]
3293
3294 punpcklbw m6, m3, m5
3295 punpckhbw m3, m5
3296
3297 pmaddubsw m6, m0
3298 pmaddubsw m3, m0
3299
3300 paddw m4, m6
3301 paddw m2, m3
3302
3303 psubw m4, m7
3304 psubw m2, m7
3305
3306 movu [r2 + 32], m4               ; columns 16..23
3307 movu [r2 + 48], m2               ; columns 24..31
3308
3309 lea r0, [r0 + r1]                ; next source row
3310 lea r2, [r2 + r3]                ; next output row
3311
3312 dec r4d
3313 jnz .loop
3314 RET
3315%endmacro
3316
3317FILTER_V_PS_W32 32, 8
3318FILTER_V_PS_W32 32, 16
3319FILTER_V_PS_W32 32, 24
3320FILTER_V_PS_W32 32, 32
3321
3322FILTER_V_PS_W32 32, 48
3323FILTER_V_PS_W32 32, 64
3324
3325;-----------------------------------------------------------------------------
3326; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 8 pixels wide, pixel (byte) output, 4 output rows per loop iteration
; (instantiated below for heights 8, 12, 16, 32 and 64).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
;            r4 = coeffIdx, then the remaining-row counter; r5 = table base (PIC only), then 3 * srcStride.
;            m6/m5 = shuffled tap pairs, m4 = [pw_512], m7 = scratch.
; pmulhrsw computes (x * y + 0x4000) >> 15 per word; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6. packuswb then saturates to bytes.
3327;-----------------------------------------------------------------------------
3328%macro FILTER_V4_W8_H8_H16_H32 2
3329INIT_XMM sse4
3330cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
3331
3332mov r4d, r4m                      ; r4 = coeffIdx (5th argument)
3333sub r0, r1                        ; start one row above the block
3334
3335%ifdef PIC
3336lea r5, [tab_ChromaCoeff]
3337movd m5, [r5 + r4 * 4]            ; 4 bytes per coeffIdx entry
3338%else
3339movd m5, [tab_ChromaCoeff + r4 * 4]
3340%endif
3341
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3342pshufb m6, m5, [tab_Vm]
3343pshufb m5, [tab_Vm + 16]
b53f7c52 3344mova m4, [pw_512]
72b9787e
JB
3345lea r5, [r1 * 3]                  ; r5 = 3 * srcStride
3346
3347mov r4d, %2                       ; rows remaining; decremented by 4 per iteration
3348
3349.loop:
3350movq m0, [r0]                     ; row 0
3351movq m1, [r0 + r1]                ; row 1
3352movq m2, [r0 + 2 * r1]            ; row 2
3353movq m3, [r0 + r5]                ; row 3
3354
3355punpcklbw m0, m1                  ; row0/row1
3356punpcklbw m1, m2                  ; row1/row2
3357punpcklbw m2, m3                  ; row2/row3 (kept intact for output row 2)
3358
3359pmaddubsw m0, m6
3360pmaddubsw m7, m2, m5
3361
3362paddw m0, m7                      ; output row 0 (words)
3363
3364pmulhrsw m0, m4
3365packuswb m0, m0
3366movh [r2], m0
3367
3368lea r0, [r0 + 4 * r1]             ; advance src by 4 rows
3369movq m0, [r0]                     ; row 4
3370
3371punpcklbw m3, m0                  ; row3/row4 (kept intact for output row 3)
3372
3373pmaddubsw m1, m6
3374pmaddubsw m7, m3, m5
3375
3376paddw m1, m7                      ; output row 1 (words)
3377
3378pmulhrsw m1, m4
3379packuswb m1, m1
3380movh [r2 + r3], m1
3381
3382movq m1, [r0 + r1]                ; row 5
3383
3384punpcklbw m0, m1                  ; row4/row5
3385
3386pmaddubsw m2, m6
3387pmaddubsw m0, m5
3388
3389paddw m2, m0                      ; output row 2 (words)
3390
3391pmulhrsw m2, m4
3392
3393movq m7, [r0 + 2 * r1]            ; row 6
3394punpcklbw m1, m7                  ; row5/row6
3395
3396pmaddubsw m3, m6
3397pmaddubsw m1, m5
3398
3399paddw m3, m1                      ; output row 3 (words)
3400
3401pmulhrsw m3, m4
3402packuswb m2, m3                   ; low 8 bytes = output row 2, high 8 bytes = output row 3
3403
3404lea r2, [r2 + 2 * r3]
3405movh [r2], m2
3406movhps [r2 + r3], m2
3407
3408lea r2, [r2 + 2 * r3]             ; dst advanced by 4 rows in total
3409
3410sub r4, 4
3411jnz .loop
3412RET
3413%endmacro
3414
3415FILTER_V4_W8_H8_H16_H32 8, 8
3416FILTER_V4_W8_H8_H16_H32 8, 16
3417FILTER_V4_W8_H8_H16_H32 8, 32
3418
3419FILTER_V4_W8_H8_H16_H32 8, 12
3420FILTER_V4_W8_H8_H16_H32 8, 64
3421
b53f7c52
JB
; PROCESS_CHROMA_AVX2_W8_8R: unrolled 4-tap vertical filter core for an 8-wide, 8-row block (AVX2).
; Inputs : r0 = address of the first source row to read, r1 = srcStride, r4 = 3 * srcStride,
;          r5 = address of two consecutive mmsize-byte coefficient vectors ([r5] and [r5 + mmsize]).
; Outputs: 16-bit sums, low 128-bit lane = even output row, high lane = odd output row:
;          m5 = rows 0/1, m2 = rows 2/3, m1 = rows 4/5, m4 = rows 6/7.
; Side effects: advances r0 by 8 rows, reads source rows 0..10, overwrites m0, m3 and m6.
3422%macro PROCESS_CHROMA_AVX2_W8_8R 0
3423 movq xm1, [r0] ; m1 = row 0
3424 movq xm2, [r0 + r1] ; m2 = row 1
3425 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
3426 movq xm3, [r0 + r1 * 2] ; m3 = row 2
3427 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
3428 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
3429 pmaddubsw m5, [r5]
3430 movq xm4, [r0 + r4] ; m4 = row 3
3431 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
3432 lea r0, [r0 + r1 * 4]
3433 movq xm1, [r0] ; m1 = row 4
3434 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
3435 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
3436 pmaddubsw m0, m2, [r5 + 1 * mmsize]
3437 paddw m5, m0 ; m5 = output rows 0/1 complete
3438 pmaddubsw m2, [r5]
3439 movq xm3, [r0 + r1] ; m3 = row 5
3440 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
3441 movq xm4, [r0 + r1 * 2] ; m4 = row 6
3442 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
3443 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
3444 pmaddubsw m0, m1, [r5 + 1 * mmsize]
3445 paddw m2, m0 ; m2 = output rows 2/3 complete
3446 pmaddubsw m1, [r5]
3447 movq xm3, [r0 + r4] ; m3 = row 7
3448 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
3449 lea r0, [r0 + r1 * 4]
3450 movq xm0, [r0] ; m0 = row 8
3451 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
3452 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
3453 pmaddubsw m3, m4, [r5 + 1 * mmsize]
3454 paddw m1, m3 ; m1 = output rows 4/5 complete
3455 pmaddubsw m4, [r5]
3456 movq xm3, [r0 + r1] ; m3 = row 9
3457 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
3458 movq xm6, [r0 + r1 * 2] ; m6 = row 10
3459 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
3460 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
3461 pmaddubsw m0, [r5 + 1 * mmsize]
3462 paddw m4, m0 ; m4 = output rows 6/7 complete
3463%endmacro
3464
; AVX2 interp_4tap_vert_pp_8x8: 4-tap vertical filter for one 8x8 block with byte output.
; Arguments follow the other interp_4tap_vert_pp_* functions in this file:
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, 5th argument = coeffIdx.
; The filtering itself is done by PROCESS_CHROMA_AVX2_W8_8R; this wrapper selects the
; coefficients, rounds, packs to bytes and stores the 8 rows.
3465INIT_YMM avx2
3466cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
3467 mov r4d, r4m                     ; r4 = coeffIdx
3468 shl r4d, 6                       ; 64 bytes of coefficients per coeffIdx
3469
3470%ifdef PIC
3471 lea r5, [tab_ChromaCoeffVer_32]
3472 add r5, r4
3473%else
3474 lea r5, [tab_ChromaCoeffVer_32 + r4]
3475%endif
3476
3477 lea r4, [r1 * 3]                 ; r4 = 3 * srcStride, as the macro expects
3478 sub r0, r1                       ; start one row above the block
3479 PROCESS_CHROMA_AVX2_W8_8R
3480 lea r4, [r3 * 3]                 ; r4 = 3 * dstStride for the stores below
; pmulhrsw computes (x * y + 0x4000) >> 15; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6.
3481 mova m3, [pw_512]
3482 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
3483 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
3484 pmulhrsw m1, m3 ; m1 = word: row 4, row 5
3485 pmulhrsw m4, m3 ; m4 = word: row 6, row 7
; packuswb works per 128-bit lane: low lane = [row 0 | row 2], high lane = [row 1 | row 3]
3486 packuswb m5, m2
3487 packuswb m1, m4                  ; low lane = [row 4 | row 6], high lane = [row 5 | row 7]
3488 vextracti128 xm2, m5, 1          ; xm2 = [row 1 | row 3]
3489 vextracti128 xm4, m1, 1          ; xm4 = [row 5 | row 7]
3490 movq [r2], xm5                   ; row 0
3491 movq [r2 + r3], xm2              ; row 1
3492 movhps [r2 + r3 * 2], xm5        ; row 2
3493 movhps [r2 + r4], xm2            ; row 3
3494 lea r2, [r2 + r3 * 4]
3495 movq [r2], xm1                   ; row 4
3496 movq [r2 + r3], xm4              ; row 5
3497 movhps [r2 + r3 * 2], xm1        ; row 6
3498 movhps [r2 + r4], xm4            ; row 7
3499 RET
72b9787e
JB
3500
3501;-----------------------------------------------------------------------------
3502;void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 6 pixels wide, pixel (byte) output, 4 output rows per loop iteration
; (instantiated below for 6x8 and 6x16; only the height parameter %2 is used).
; Eight columns are computed per row, but only 6 bytes are stored: movd writes bytes 0..3
; and pextrw (word index 2) writes bytes 4..5.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
;            r4 = coeffIdx, then the remaining-row counter; r5 = table base (PIC only), then 3 * srcStride.
;            m6/m5 = shuffled tap pairs, m4 = [pw_512], m7 = scratch.
; pmulhrsw computes (x * y + 0x4000) >> 15 per word; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6.
3503;-----------------------------------------------------------------------------
3504%macro FILTER_V4_W6_H4 2
3505INIT_XMM sse4
3506cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
3507
3508mov r4d, r4m                      ; r4 = coeffIdx (5th argument)
3509sub r0, r1                        ; start one row above the block
3510
3511%ifdef PIC
3512lea r5, [tab_ChromaCoeff]
3513movd m5, [r5 + r4 * 4]            ; 4 bytes per coeffIdx entry
3514%else
3515movd m5, [tab_ChromaCoeff + r4 * 4]
3516%endif
3517
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3518pshufb m6, m5, [tab_Vm]
3519pshufb m5, [tab_Vm + 16]
b53f7c52 3520mova m4, [pw_512]
72b9787e
JB
3521
3522mov r4d, %2                       ; rows remaining; decremented by 4 per iteration
3523lea r5, [3 * r1]                  ; r5 = 3 * srcStride
3524
3525.loop:
3526movq m0, [r0]                     ; row 0 (8 bytes are read although only 6 columns are stored)
3527movq m1, [r0 + r1]                ; row 1
3528movq m2, [r0 + 2 * r1]            ; row 2
3529movq m3, [r0 + r5]                ; row 3
3530
3531punpcklbw m0, m1                  ; row0/row1
3532punpcklbw m1, m2                  ; row1/row2
3533punpcklbw m2, m3                  ; row2/row3
3534
3535pmaddubsw m0, m6
3536pmaddubsw m7, m2, m5
3537
3538paddw m0, m7                      ; output row 0 (words)
3539
3540pmulhrsw m0, m4
3541packuswb m0, m0
3542movd [r2], m0                     ; bytes 0..3
3543pextrw [r2 + 4], m0, 2            ; bytes 4..5
3544
3545lea r0, [r0 + 4 * r1]             ; advance src by 4 rows
3546
3547movq m0, [r0]                     ; row 4
3548punpcklbw m3, m0                  ; row3/row4
3549
3550pmaddubsw m1, m6
3551pmaddubsw m7, m3, m5
3552
3553paddw m1, m7                      ; output row 1 (words)
3554
3555pmulhrsw m1, m4
3556packuswb m1, m1
3557movd [r2 + r3], m1
3558pextrw [r2 + r3 + 4], m1, 2
3559
3560movq m1, [r0 + r1]                ; row 5
3561punpcklbw m7, m0, m1              ; row4/row5
3562
3563pmaddubsw m2, m6
3564pmaddubsw m7, m5
3565
3566paddw m2, m7                      ; output row 2 (words)
3567
3568pmulhrsw m2, m4
3569packuswb m2, m2
3570lea r2, [r2 + 2 * r3]
3571movd [r2], m2
3572pextrw [r2 + 4], m2, 2
3573
3574movq m2, [r0 + 2 * r1]            ; row 6
3575punpcklbw m1, m2                  ; row5/row6
3576
3577pmaddubsw m3, m6
3578pmaddubsw m1, m5
3579
3580paddw m3, m1                      ; output row 3 (words)
3581
3582pmulhrsw m3, m4
3583packuswb m3, m3
3584
3585movd [r2 + r3], m3
3586pextrw [r2 + r3 + 4], m3, 2
3587
3588lea r2, [r2 + 2 * r3]             ; dst advanced by 4 rows in total
3589
3590sub r4, 4
3591jnz .loop
3592RET
3593%endmacro
3594
3595FILTER_V4_W6_H4 6, 8
3596
3597FILTER_V4_W6_H4 6, 16
3598
3599;-----------------------------------------------------------------------------
3600; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 12 pixels wide, pixel (byte) output, 2 output rows per loop iteration
; (instantiated below for 12x16 and 12x32; only the height parameter %2 is used).
; Each source row is loaded as 16 bytes; 12 bytes are stored per output row
; (movh writes bytes 0..7, pextrd with index 2 writes bytes 8..11).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
;            r4 = coeffIdx, then the remaining-row counter; r5 = table base (PIC only).
;            m1/m0 = shuffled tap pairs; m6 is scratch and is reloaded with [pw_512] every iteration.
; pmulhrsw computes (x * y + 0x4000) >> 15 per word; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6.
3601;-----------------------------------------------------------------------------
3602%macro FILTER_V4_W12_H2 2
3603INIT_XMM sse4
3604cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
3605
3606mov r4d, r4m                      ; r4 = coeffIdx (5th argument)
3607sub r0, r1                        ; start one row above the block
3608
3609%ifdef PIC
3610lea r5, [tab_ChromaCoeff]
3611movd m0, [r5 + r4 * 4]            ; 4 bytes per coeffIdx entry
3612%else
3613movd m0, [tab_ChromaCoeff + r4 * 4]
3614%endif
3615
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3616pshufb m1, m0, [tab_Vm]
3617pshufb m0, [tab_Vm + 16]
3618
3619mov r4d, %2                       ; rows remaining; decremented by 2 per iteration
3620
3621.loop:
3622movu m2, [r0]                     ; row 0 (16 bytes loaded)
3623movu m3, [r0 + r1]                ; row 1
3624
3625punpcklbw m4, m2, m3              ; row0/row1, columns 0..7
3626punpckhbw m2, m3                  ; row0/row1, columns 8..15
3627
3628pmaddubsw m4, m1
3629pmaddubsw m2, m1
3630
3631lea r0, [r0 + 2 * r1]             ; advance src by 2 rows
3632movu m5, [r0]                     ; row 2
3633movu m7, [r0 + r1]                ; row 3
3634
3635punpcklbw m6, m5, m7              ; row2/row3, columns 0..7
3636pmaddubsw m6, m0
3637paddw m4, m6
3638
3639punpckhbw m6, m5, m7              ; row2/row3, columns 8..15
3640pmaddubsw m6, m0
3641paddw m2, m6
3642
b53f7c52 3643mova m6, [pw_512]
72b9787e
JB
3644
3645pmulhrsw m4, m6
3646pmulhrsw m2, m6
3647
3648packuswb m4, m2                   ; 16 bytes of output row 0
3649
3650movh [r2], m4                     ; bytes 0..7
3651pextrd [r2 + 8], m4, 2            ; bytes 8..11
3652
3653punpcklbw m4, m3, m5              ; row1/row2, columns 0..7
3654punpckhbw m3, m5                  ; row1/row2, columns 8..15
3655
3656pmaddubsw m4, m1
3657pmaddubsw m3, m1
3658
3659movu m5, [r0 + 2 * r1]            ; row 4 (r0 already advanced by 2 rows)
3660
3661punpcklbw m2, m7, m5              ; row3/row4, columns 0..7
3662punpckhbw m7, m5                  ; row3/row4, columns 8..15
3663
3664pmaddubsw m2, m0
3665pmaddubsw m7, m0
3666
3667paddw m4, m2
3668paddw m3, m7
3669
3670pmulhrsw m4, m6
3671pmulhrsw m3, m6
3672
3673packuswb m4, m3                   ; 16 bytes of output row 1
3674
3675movh [r2 + r3], m4
3676pextrd [r2 + r3 + 8], m4, 2
3677
3678lea r2, [r2 + 2 * r3]
3679
3680sub r4, 2
3681jnz .loop
3682RET
3683%endmacro
3684
3685FILTER_V4_W12_H2 12, 16
3686
3687FILTER_V4_W12_H2 12, 32
3688
3689;-----------------------------------------------------------------------------
3690; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 16 pixels wide, pixel (byte) output, 2 output rows per loop iteration
; (instantiated below for heights 4, 8, 12, 16, 24, 32 and 64; only %2 is used).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
;            r4 = coeffIdx, then the loop counter; r5 = table base (PIC only).
;            m1/m0 = shuffled tap pairs; m7 is scratch and is reloaded with [pw_512] every iteration.
; pmulhrsw computes (x * y + 0x4000) >> 15 per word; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6.
3691;-----------------------------------------------------------------------------
3692%macro FILTER_V4_W16_H2 2
3693INIT_XMM sse4
3694cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
3695
3696mov r4d, r4m                      ; r4 = coeffIdx (5th argument)
3697sub r0, r1                        ; start one row above the block
3698
3699%ifdef PIC
3700lea r5, [tab_ChromaCoeff]
3701movd m0, [r5 + r4 * 4]            ; 4 bytes per coeffIdx entry
3702%else
3703movd m0, [tab_ChromaCoeff + r4 * 4]
3704%endif
3705
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
3706pshufb m1, m0, [tab_Vm]
3707pshufb m0, [tab_Vm + 16]
3708
3709mov r4d, %2/2                     ; 2 output rows per iteration
3710
3711.loop:
3712movu m2, [r0]                     ; row 0
3713movu m3, [r0 + r1]                ; row 1
3714
3715punpcklbw m4, m2, m3              ; row0/row1, columns 0..7
3716punpckhbw m2, m3                  ; row0/row1, columns 8..15
3717
3718pmaddubsw m4, m1
3719pmaddubsw m2, m1
3720
3721lea r0, [r0 + 2 * r1]             ; advance src by 2 rows
3722movu m5, [r0]                     ; row 2
3723movu m6, [r0 + r1]                ; row 3
3724
3725punpckhbw m7, m5, m6              ; row2/row3, columns 8..15
3726pmaddubsw m7, m0
3727paddw m2, m7
3728
3729punpcklbw m7, m5, m6              ; row2/row3, columns 0..7
3730pmaddubsw m7, m0
3731paddw m4, m7
3732
b53f7c52 3733mova m7, [pw_512]
72b9787e
JB
3734
3735pmulhrsw m4, m7
3736pmulhrsw m2, m7
3737
3738packuswb m4, m2                   ; 16 bytes of output row 0
3739
3740movu [r2], m4
3741
3742punpcklbw m4, m3, m5              ; row1/row2, columns 0..7
3743punpckhbw m3, m5                  ; row1/row2, columns 8..15
3744
3745pmaddubsw m4, m1
3746pmaddubsw m3, m1
3747
3748movu m5, [r0 + 2 * r1]            ; row 4 (r0 already advanced by 2 rows)
3749
3750punpcklbw m2, m6, m5              ; row3/row4, columns 0..7
3751punpckhbw m6, m5                  ; row3/row4, columns 8..15
3752
3753pmaddubsw m2, m0
3754pmaddubsw m6, m0
3755
3756paddw m4, m2
3757paddw m3, m6
3758
3759pmulhrsw m4, m7
3760pmulhrsw m3, m7
3761
3762packuswb m4, m3                   ; 16 bytes of output row 1
3763
3764movu [r2 + r3], m4
3765
3766lea r2, [r2 + 2 * r3]
3767
3768dec r4d
3769jnz .loop
3770RET
3771%endmacro
3772
3773FILTER_V4_W16_H2 16, 4
3774FILTER_V4_W16_H2 16, 8
3775FILTER_V4_W16_H2 16, 12
3776FILTER_V4_W16_H2 16, 16
3777FILTER_V4_W16_H2 16, 32
3778
3779FILTER_V4_W16_H2 16, 24
3780FILTER_V4_W16_H2 16, 64
3781
b53f7c52
JB
; AVX2 interp_4tap_vert_pp_16x16: fully unrolled 4-tap vertical filter for one 16x16 block, byte output.
; Arguments follow the other interp_4tap_vert_pp_* functions in this file:
; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, 5th argument = coeffIdx.
; Built only when ARCH_X86_64 == 1: the function requests 15 vector registers (m0..m14).
; Register roles: r4 = 3 * srcStride, r5 = 3 * dstStride, m12 = [r5] and m13 = [r5 + mmsize]
; (the two coefficient vectors for this coeffIdx), m14 = [pw_512].
; Each row pair (n, n+1) is byte-interleaved once, with columns 0..7 in the low lane and
; columns 8..15 in the high lane. It is multiplied by m13 to finish output row n-2 and by
; m12 to start output row n. Source rows 0..18 (relative to src - srcStride) are read.
3782INIT_YMM avx2
3783%if ARCH_X86_64 == 1
3784cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
3785 mov r4d, r4m                     ; r4 = coeffIdx
3786 shl r4d, 6                       ; 64 bytes of coefficients per coeffIdx
3787
3788%ifdef PIC
3789 lea r5, [tab_ChromaCoeffVer_32]
3790 add r5, r4
3791%else
3792 lea r5, [tab_ChromaCoeffVer_32 + r4]
3793%endif
3794
3795 mova m12, [r5]
3796 mova m13, [r5 + mmsize]
3797 lea r4, [r1 * 3]                 ; r4 = 3 * srcStride
3798 sub r0, r1                       ; start one row above the block
3799 lea r5, [r3 * 3]                 ; r5 = 3 * dstStride
3800 mova m14, [pw_512]
3801
3802 movu xm0, [r0] ; m0 = row 0
3803 movu xm1, [r0 + r1] ; m1 = row 1
3804 punpckhbw xm2, xm0, xm1
3805 punpcklbw xm0, xm1
3806 vinserti128 m0, m0, xm2, 1
3807 pmaddubsw m0, m12
3808 movu xm2, [r0 + r1 * 2] ; m2 = row 2
3809 punpckhbw xm3, xm1, xm2
3810 punpcklbw xm1, xm2
3811 vinserti128 m1, m1, xm3, 1
3812 pmaddubsw m1, m12
3813 movu xm3, [r0 + r4] ; m3 = row 3
3814 punpckhbw xm4, xm2, xm3
3815 punpcklbw xm2, xm3
3816 vinserti128 m2, m2, xm4, 1
3817 pmaddubsw m4, m2, m13
3818 paddw m0, m4
3819 pmaddubsw m2, m12
3820 lea r0, [r0 + r1 * 4]
3821 movu xm4, [r0] ; m4 = row 4
3822 punpckhbw xm5, xm3, xm4
3823 punpcklbw xm3, xm4
3824 vinserti128 m3, m3, xm5, 1
3825 pmaddubsw m5, m3, m13
3826 paddw m1, m5
3827 pmaddubsw m3, m12
3828 movu xm5, [r0 + r1] ; m5 = row 5
3829 punpckhbw xm6, xm4, xm5
3830 punpcklbw xm4, xm5
3831 vinserti128 m4, m4, xm6, 1
3832 pmaddubsw m6, m4, m13
3833 paddw m2, m6
3834 pmaddubsw m4, m12
3835 movu xm6, [r0 + r1 * 2] ; m6 = row 6
3836 punpckhbw xm7, xm5, xm6
3837 punpcklbw xm5, xm6
3838 vinserti128 m5, m5, xm7, 1
3839 pmaddubsw m7, m5, m13
3840 paddw m3, m7
3841 pmaddubsw m5, m12
3842 movu xm7, [r0 + r4] ; m7 = row 7
3843 punpckhbw xm8, xm6, xm7
3844 punpcklbw xm6, xm7
3845 vinserti128 m6, m6, xm8, 1
3846 pmaddubsw m8, m6, m13
3847 paddw m4, m8
3848 pmaddubsw m6, m12
3849 lea r0, [r0 + r1 * 4]
3850 movu xm8, [r0] ; m8 = row 8
3851 punpckhbw xm9, xm7, xm8
3852 punpcklbw xm7, xm8
3853 vinserti128 m7, m7, xm9, 1
3854 pmaddubsw m9, m7, m13
3855 paddw m5, m9
3856 pmaddubsw m7, m12
3857 movu xm9, [r0 + r1] ; m9 = row 9
3858 punpckhbw xm10, xm8, xm9
3859 punpcklbw xm8, xm9
3860 vinserti128 m8, m8, xm10, 1
3861 pmaddubsw m10, m8, m13
3862 paddw m6, m10
3863 pmaddubsw m8, m12
3864 movu xm10, [r0 + r1 * 2] ; m10 = row 10
3865 punpckhbw xm11, xm9, xm10
3866 punpcklbw xm9, xm10
3867 vinserti128 m9, m9, xm11, 1
3868 pmaddubsw m11, m9, m13
3869 paddw m7, m11
3870 pmaddubsw m9, m12
3871
; Output rows 0..7 are complete: round, pack to bytes and store.
; pmulhrsw computes (x * y + 0x4000) >> 15; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6.
3872 pmulhrsw m0, m14 ; m0 = word: row 0
3873 pmulhrsw m1, m14 ; m1 = word: row 1
3874 pmulhrsw m2, m14 ; m2 = word: row 2
3875 pmulhrsw m3, m14 ; m3 = word: row 3
3876 pmulhrsw m4, m14 ; m4 = word: row 4
3877 pmulhrsw m5, m14 ; m5 = word: row 5
3878 pmulhrsw m6, m14 ; m6 = word: row 6
3879 pmulhrsw m7, m14 ; m7 = word: row 7
; packuswb packs per 128-bit lane, giving qwords [rowA c0-7 | rowB c0-7 | rowA c8-15 | rowB c8-15];
; vpermq 11011000b (qword order 0,2,1,3) regroups them so low lane = rowA, high lane = rowB.
3880 packuswb m0, m1
3881 packuswb m2, m3
3882 packuswb m4, m5
3883 packuswb m6, m7
3884 vpermq m0, m0, 11011000b
3885 vpermq m2, m2, 11011000b
3886 vpermq m4, m4, 11011000b
3887 vpermq m6, m6, 11011000b
3888 vextracti128 xm1, m0, 1
3889 vextracti128 xm3, m2, 1
3890 vextracti128 xm5, m4, 1
3891 vextracti128 xm7, m6, 1
3892 movu [r2], xm0
3893 movu [r2 + r3], xm1
3894 movu [r2 + r3 * 2], xm2
3895 movu [r2 + r5], xm3
3896 lea r2, [r2 + r3 * 4]
3897 movu [r2], xm4
3898 movu [r2 + r3], xm5
3899 movu [r2 + r3 * 2], xm6
3900 movu [r2 + r5], xm7
3901 lea r2, [r2 + r3 * 4]
3902
; Second half: m8/m9 already hold the first-pair products for output rows 8/9;
; m0..m7 are free again and are reused for rows 11..18.
3903 movu xm11, [r0 + r4] ; m11 = row 11
3904 punpckhbw xm6, xm10, xm11
3905 punpcklbw xm10, xm11
3906 vinserti128 m10, m10, xm6, 1
3907 pmaddubsw m6, m10, m13
3908 paddw m8, m6
3909 pmaddubsw m10, m12
3910 lea r0, [r0 + r1 * 4]
3911 movu xm6, [r0] ; m6 = row 12
3912 punpckhbw xm7, xm11, xm6
3913 punpcklbw xm11, xm6
3914 vinserti128 m11, m11, xm7, 1
3915 pmaddubsw m7, m11, m13
3916 paddw m9, m7
3917 pmaddubsw m11, m12
3918
3919 movu xm7, [r0 + r1] ; m7 = row 13
3920 punpckhbw xm0, xm6, xm7
3921 punpcklbw xm6, xm7
3922 vinserti128 m6, m6, xm0, 1
3923 pmaddubsw m0, m6, m13
3924 paddw m10, m0
3925 pmaddubsw m6, m12
3926 movu xm0, [r0 + r1 * 2] ; m0 = row 14
3927 punpckhbw xm1, xm7, xm0
3928 punpcklbw xm7, xm0
3929 vinserti128 m7, m7, xm1, 1
3930 pmaddubsw m1, m7, m13
3931 paddw m11, m1
3932 pmaddubsw m7, m12
3933 movu xm1, [r0 + r4] ; m1 = row 15
3934 punpckhbw xm2, xm0, xm1
3935 punpcklbw xm0, xm1
3936 vinserti128 m0, m0, xm2, 1
3937 pmaddubsw m2, m0, m13
3938 paddw m6, m2
3939 pmaddubsw m0, m12
3940 lea r0, [r0 + r1 * 4]
3941 movu xm2, [r0] ; m2 = row 16
3942 punpckhbw xm3, xm1, xm2
3943 punpcklbw xm1, xm2
3944 vinserti128 m1, m1, xm3, 1
3945 pmaddubsw m3, m1, m13
3946 paddw m7, m3
3947 pmaddubsw m1, m12
3948 movu xm3, [r0 + r1] ; m3 = row 17
3949 punpckhbw xm4, xm2, xm3
3950 punpcklbw xm2, xm3
3951 vinserti128 m2, m2, xm4, 1
3952 pmaddubsw m2, m13
3953 paddw m0, m2
3954 movu xm4, [r0 + r1 * 2] ; m4 = row 18
3955 punpckhbw xm5, xm3, xm4
3956 punpcklbw xm3, xm4
3957 vinserti128 m3, m3, xm5, 1
3958 pmaddubsw m3, m13
3959 paddw m1, m3
3960
; Output rows 8..15: same round / pack / permute / store sequence as for rows 0..7.
3961 pmulhrsw m8, m14 ; m8 = word: row 8
3962 pmulhrsw m9, m14 ; m9 = word: row 9
3963 pmulhrsw m10, m14 ; m10 = word: row 10
3964 pmulhrsw m11, m14 ; m11 = word: row 11
3965 pmulhrsw m6, m14 ; m6 = word: row 12
3966 pmulhrsw m7, m14 ; m7 = word: row 13
3967 pmulhrsw m0, m14 ; m0 = word: row 14
3968 pmulhrsw m1, m14 ; m1 = word: row 15
3969 packuswb m8, m9
3970 packuswb m10, m11
3971 packuswb m6, m7
3972 packuswb m0, m1
3973 vpermq m8, m8, 11011000b
3974 vpermq m10, m10, 11011000b
3975 vpermq m6, m6, 11011000b
3976 vpermq m0, m0, 11011000b
3977 vextracti128 xm9, m8, 1
3978 vextracti128 xm11, m10, 1
3979 vextracti128 xm7, m6, 1
3980 vextracti128 xm1, m0, 1
3981 movu [r2], xm8
3982 movu [r2 + r3], xm9
3983 movu [r2 + r3 * 2], xm10
3984 movu [r2 + r5], xm11
3985 lea r2, [r2 + r3 * 4]
3986 movu [r2], xm6
3987 movu [r2 + r3], xm7
3988 movu [r2 + r3 * 2], xm0
3989 movu [r2 + r5], xm1
3990 RET
3991%endif
3992
72b9787e
JB
3993;-----------------------------------------------------------------------------
3994;void interp_4tap_vert_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 24 pixels wide, pixel (byte) output, 2 output rows per loop iteration
; (instantiated below for 24x32 and 24x64; only the height parameter %2 is used).
; Columns 0..15 are processed with 16-byte loads, columns 16..23 with 8-byte loads.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
;            r4 = coeffIdx, then the remaining-row counter; r5 = table base (PIC only), then src + 2 rows.
;            m1/m0 = shuffled tap pairs; m6 is scratch, then holds [pw_512] for the rest of the iteration.
; pmulhrsw computes (x * y + 0x4000) >> 15 per word; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6.
3995;-----------------------------------------------------------------------------
3996%macro FILTER_V4_W24 2
3997INIT_XMM sse4
3998cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
3999
4000mov r4d, r4m                      ; r4 = coeffIdx (5th argument)
4001sub r0, r1                        ; start one row above the block
4002
4003%ifdef PIC
4004lea r5, [tab_ChromaCoeff]
4005movd m0, [r5 + r4 * 4]            ; 4 bytes per coeffIdx entry
4006%else
4007movd m0, [tab_ChromaCoeff + r4 * 4]
4008%endif
4009
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
4010pshufb m1, m0, [tab_Vm]
4011pshufb m0, [tab_Vm + 16]
4012
4013mov r4d, %2                       ; rows remaining; decremented by 2 per iteration
4014
4015.loop:
; ---- columns 0..15 ----
4016movu m2, [r0]                     ; row 0
4017movu m3, [r0 + r1]                ; row 1
4018
4019punpcklbw m4, m2, m3              ; row0/row1, columns 0..7
4020punpckhbw m2, m3                  ; row0/row1, columns 8..15
4021
4022pmaddubsw m4, m1
4023pmaddubsw m2, m1
4024
4025lea r5, [r0 + 2 * r1]             ; r5 = address of row 2; r0 is left unchanged until the end
4026movu m5, [r5]                     ; row 2
4027movu m7, [r5 + r1]                ; row 3
4028
4029punpcklbw m6, m5, m7              ; row2/row3, columns 0..7
4030pmaddubsw m6, m0
4031paddw m4, m6
4032
4033punpckhbw m6, m5, m7              ; row2/row3, columns 8..15
4034pmaddubsw m6, m0
4035paddw m2, m6
4036
b53f7c52 4037mova m6, [pw_512]
72b9787e
JB
4038
4039pmulhrsw m4, m6
4040pmulhrsw m2, m6
4041
4042packuswb m4, m2                   ; 16 bytes of output row 0
4043
4044movu [r2], m4
4045
4046punpcklbw m4, m3, m5              ; row1/row2, columns 0..7
4047punpckhbw m3, m5                  ; row1/row2, columns 8..15
4048
4049pmaddubsw m4, m1
4050pmaddubsw m3, m1
4051
4052movu m2, [r5 + 2 * r1]            ; row 4
4053
4054punpcklbw m5, m7, m2              ; row3/row4, columns 0..7
4055punpckhbw m7, m2                  ; row3/row4, columns 8..15
4056
4057pmaddubsw m5, m0
4058pmaddubsw m7, m0
4059
4060paddw m4, m5
4061paddw m3, m7
4062
4063pmulhrsw m4, m6
4064pmulhrsw m3, m6
4065
4066packuswb m4, m3                   ; 16 bytes of output row 1
4067
4068movu [r2 + r3], m4
4069
; ---- columns 16..23 ----
4070movq m2, [r0 + 16]                ; row 0
4071movq m3, [r0 + r1 + 16]           ; row 1
4072movq m4, [r5 + 16]                ; row 2
4073movq m5, [r5 + r1 + 16]           ; row 3
4074
4075punpcklbw m2, m3                  ; row0/row1
; Build the row2/row3 pairs in scratch m7 (free here) so that m3, m4 and m5 keep
; rows 1..3 for output row 1 and do not have to be loaded a second time.
4076punpcklbw m7, m4, m5
4077
4078pmaddubsw m2, m1
4079pmaddubsw m7, m0
4080
4081paddw m2, m7                      ; output row 0 (words)
4082
4083pmulhrsw m2, m6
4084
4088movq m7, [r5 + 2 * r1 + 16]       ; row 4
4089
4090punpcklbw m3, m4                  ; row1/row2
4091punpcklbw m5, m7                  ; row3/row4
4092
4093pmaddubsw m3, m1
4094pmaddubsw m5, m0
4095
4096paddw m3, m5                      ; output row 1 (words)
4097
4098pmulhrsw m3, m6
4099packuswb m2, m3                   ; low 8 bytes = output row 0, high 8 bytes = output row 1
4100
4101movh [r2 + 16], m2
4102movhps [r2 + r3 + 16], m2
4103
4104mov r0, r5                        ; advance src by 2 rows
4105lea r2, [r2 + 2 * r3]
4106
4107sub r4, 2
4108jnz .loop
4109RET
4110%endmacro
4111
4112FILTER_V4_W24 24, 32
4113
4114FILTER_V4_W24 24, 64
4115
4116;-----------------------------------------------------------------------------
4117; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 32 pixels wide, pixel (byte) output, 1 output row per loop iteration
; (instantiated below for heights 8, 16, 24, 32, 48 and 64).
; Every iteration reloads all four source rows for both 16-column halves.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
;            r4 = coeffIdx, then the row counter; r5 = table base (PIC only), then src + 2 rows.
;            m1/m0 = shuffled tap pairs, m7 = [pw_512].
; pmulhrsw computes (x * y + 0x4000) >> 15 per word; assuming pw_512 holds words of 512
; (defined outside this chunk) that is a rounded shift right by 6. packuswb then saturates to bytes.
4118;-----------------------------------------------------------------------------
4119%macro FILTER_V4_W32 2
4120INIT_XMM sse4
4121cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
4122
4123mov r4d, r4m                      ; r4 = coeffIdx (5th argument)
4124sub r0, r1                        ; start one row above the block
4125
4126%ifdef PIC
4127lea r5, [tab_ChromaCoeff]
4128movd m0, [r5 + r4 * 4]            ; 4 bytes per coeffIdx entry
4129%else
4130movd m0, [tab_ChromaCoeff + r4 * 4]
4131%endif
4132
; tab_Vm is defined outside this chunk; presumably it broadcasts one tap pair per 16-byte half (TODO confirm).
4133pshufb m1, m0, [tab_Vm]
4134pshufb m0, [tab_Vm + 16]
4135
b53f7c52 4136mova m7, [pw_512]
72b9787e
JB
4137
4138mov r4d, %2                       ; one output row per iteration
4139
4140.loop:
; ---- columns 0..15 ----
4141movu m2, [r0]                     ; row 0
4142movu m3, [r0 + r1]                ; row 1
4143
4144punpcklbw m4, m2, m3              ; row0/row1, columns 0..7
4145punpckhbw m2, m3                  ; row0/row1, columns 8..15
4146
4147pmaddubsw m4, m1
4148pmaddubsw m2, m1
4149
4150lea r5, [r0 + 2 * r1]             ; r5 = address of row 2
4151movu m3, [r5]                     ; row 2
4152movu m5, [r5 + r1]                ; row 3
4153
4154punpcklbw m6, m3, m5              ; row2/row3, columns 0..7
4155punpckhbw m3, m5                  ; row2/row3, columns 8..15
4156
4157pmaddubsw m6, m0
4158pmaddubsw m3, m0
4159
4160paddw m4, m6
4161paddw m2, m3
4162
4163pmulhrsw m4, m7
4164pmulhrsw m2, m7
4165
4166packuswb m4, m2                   ; 16 output bytes, columns 0..15
4167
4168movu [r2], m4
4169
; ---- columns 16..31 ----
4170movu m2, [r0 + 16]
4171movu m3, [r0 + r1 + 16]
4172
4173punpcklbw m4, m2, m3
4174punpckhbw m2, m3
4175
4176pmaddubsw m4, m1
4177pmaddubsw m2, m1
4178
4179movu m3, [r5 + 16]
4180movu m5, [r5 + r1 + 16]
4181
4182punpcklbw m6, m3, m5
4183punpckhbw m3, m5
4184
4185pmaddubsw m6, m0
4186pmaddubsw m3, m0
4187
4188paddw m4, m6
4189paddw m2, m3
4190
4191pmulhrsw m4, m7
4192pmulhrsw m2, m7
4193
4194packuswb m4, m2                   ; 16 output bytes, columns 16..31
4195
4196movu [r2 + 16], m4
4197
4198lea r0, [r0 + r1]                 ; next source row
4199lea r2, [r2 + r3]                 ; next output row
4200
4201dec r4
4202jnz .loop
4203RET
4204%endmacro
4205
4206FILTER_V4_W32 32, 8
4207FILTER_V4_W32 32, 16
4208FILTER_V4_W32 32, 24
4209FILTER_V4_W32 32, 32
4210
4211FILTER_V4_W32 32, 48
4212FILTER_V4_W32 32, 64
4213
b53f7c52
JB
INIT_YMM avx2
%if ARCH_X86_64 == 1
;-----------------------------------------------------------------------------
; interp_4tap_vert_pp_32x32 (AVX2, x86-64 only: uses m8..m12)
; 4-tap vertical filter, pixel in -> pixel out, 32x32 block.
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
;   m10 = coefficient vector applied to the first row pair of each output row
;   m11 = coefficient vector applied to the second row pair
;   m12 = pw_512 (pmulhrsw rounding factor)
;   r4  = 3 * srcStride, r5 = 3 * dstStride, r6 = passes left
; Each pass emits 4 output rows; "row n" below is relative to r0 at pass entry.
;-----------------------------------------------------------------------------
cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
    mov         r4d, r4m
    shl         r4d, 6                      ; 64 bytes of table data per coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeffVer_32]
    add         r5, r4
%else
    lea         r5, [tab_ChromaCoeffVer_32 + r4]
%endif

    mova        m10, [r5]
    mova        m11, [r5 + mmsize]
    lea         r4, [r1 * 3]
    sub         r0, r1                      ; start one row above the block
    lea         r5, [r3 * 3]
    mova        m12, [pw_512]
    mov         r6d, 8                      ; 8 passes * 4 rows = 32 rows

.loopW:
    ; rows 0|1 -> first half of output row 0
    movu        m0, [r0]                    ; row 0
    movu        m1, [r0 + r1]               ; row 1
    punpcklbw   m2, m0, m1
    punpckhbw   m3, m0, m1
    pmaddubsw   m2, m10
    pmaddubsw   m3, m10

    ; rows 1|2 -> first half of output row 1
    movu        m0, [r0 + r1 * 2]           ; row 2
    punpcklbw   m4, m1, m0
    punpckhbw   m5, m1, m0
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10

    ; rows 2|3 -> second half of output row 0, first half of output row 2
    movu        m1, [r0 + r4]               ; row 3
    punpcklbw   m6, m0, m1
    punpckhbw   m7, m0, m1
    pmaddubsw   m8, m6, m11
    pmaddubsw   m9, m7, m11
    pmaddubsw   m6, m10
    pmaddubsw   m7, m10
    paddw       m2, m8
    paddw       m3, m9
    pmulhrsw    m2, m12
    pmulhrsw    m3, m12
    packuswb    m2, m3
    movu        [r2], m2                    ; output row 0

    ; rows 3|4 -> second half of output row 1, first half of output row 3
    lea         r0, [r0 + r1 * 4]
    movu        m0, [r0]                    ; row 4
    punpcklbw   m2, m1, m0
    punpckhbw   m3, m1, m0
    pmaddubsw   m8, m2, m11
    pmaddubsw   m9, m3, m11
    pmaddubsw   m2, m10
    pmaddubsw   m3, m10
    paddw       m4, m8
    paddw       m5, m9
    pmulhrsw    m4, m12
    pmulhrsw    m5, m12
    packuswb    m4, m5
    movu        [r2 + r3], m4               ; output row 1

    ; rows 4|5 -> second half of output row 2
    movu        m1, [r0 + r1]               ; row 5
    punpcklbw   m4, m0, m1
    punpckhbw   m5, m0, m1
    pmaddubsw   m4, m11
    pmaddubsw   m5, m11
    paddw       m6, m4
    paddw       m7, m5
    pmulhrsw    m6, m12
    pmulhrsw    m7, m12
    packuswb    m6, m7
    movu        [r2 + r3 * 2], m6           ; output row 2

    ; rows 5|6 -> second half of output row 3
    movu        m0, [r0 + r1 * 2]           ; row 6
    punpcklbw   m6, m1, m0
    punpckhbw   m7, m1, m0
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m2, m6
    paddw       m3, m7
    pmulhrsw    m2, m12
    pmulhrsw    m3, m12
    packuswb    m2, m3
    movu        [r2 + r5], m2               ; output row 3

    lea         r2, [r2 + r3 * 4]
    dec         r6d
    jnz         .loopW
    RET
%endif
72b9787e
JB
4304
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in -> pixel out, for widths that are a
; multiple of 16 (SSE4). Two output rows are produced per 16-column step.
;   %1 = block width, %2 = block height
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;   r4 = coeffIdx on entry, then row-pair counter (%2 / 2)
;   r5 = scratch / pointer two rows below r0
;   r6 = 16-column steps left in the current row pair (%1 / 16)
;   m1 = multiplier for the first interleaved row pair of an output row
;   m0 = multiplier for the second interleaved row pair
;   m7 = temporary, then pw_512 (reloaded each step because it doubles as a temp)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16n_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m
    sub         r0, r1                      ; first tap sits one row above the output row

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2

.loop:
    mov         r6d, %1/16

.loopW:
    ; ---- output row 0: rows 0|1 and rows 2|3 ----
    movu        m2, [r0]                    ; row 0
    movu        m3, [r0 + r1]               ; row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]
    movu        m5, [r5]                    ; row 2
    movu        m6, [r5 + r1]               ; row 3

    punpckhbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m2, m7

    punpcklbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m4, m7

    mova        m7, [pw_512]

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7

    packuswb    m4, m2

    movu        [r2], m4

    ; ---- output row 1: rows 1|2 and rows 3|4 ----
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r5 + 2 * r1]           ; row 4

    punpcklbw   m2, m6, m5
    punpckhbw   m6, m5

    pmaddubsw   m2, m0
    pmaddubsw   m6, m0

    paddw       m4, m2
    paddw       m3, m6

    pmulhrsw    m4, m7
    pmulhrsw    m3, m7

    packuswb    m4, m3

    movu        [r2 + r3], m4

    ; next 16 columns
    add         r0, 16
    add         r2, 16
    dec         r6d
    jnz         .loopW

    ; rewind the %1 columns walked above and step down two rows
    lea         r0, [r0 + r1 * 2 - %1]
    lea         r2, [r2 + r3 * 2 - %1]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V4_W16n_H2 64, 64
FILTER_V4_W16n_H2 64, 32
FILTER_V4_W16n_H2 64, 48
FILTER_V4_W16n_H2 48, 64
FILTER_V4_W16n_H2 64, 16
4405
4406
;-----------------------------------------------------------------------------
; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
;
; Converts 8-bit pixels to 16-bit words, four source rows per outer pass.
;   r0 = src, r1 = srcStride, r2 = dst
;   r3 = width, r4 = height (both loaded from the stack arguments)
;   r5 = column offset in pixels, r6 = address of the current 8-pixel column
;   m4 = pb_128, m5 = tab_c_64_n64; every pixel is interleaved with m4 and
;        fed through pmaddubsw with m5
; Destination rows are FENC_STRIDE * 2 bytes apart.
; The height loop subtracts 4 and exits on zero, so it presumably expects
; height to be a multiple of 4 - confirm against callers.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal luma_p2s, 3, 7, 6

    ; load width and height
    mov         r3d, r3m
    mov         r4d, r4m

    ; load constants
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loopH:
    xor         r5d, r5d                    ; restart at column 0

.loopW:
    lea         r6, [r0 + r5]

    movh        m0, [r6]                    ; row 0
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r6 + r1]               ; row 1
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r6 + r1 * 2]           ; row 2
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    lea         r6, [r6 + r1 * 2]
    movh        m3, [r6 + r1]               ; row 3
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    add         r5, 8
    cmp         r5, r3
    jg          .width4                     ; fewer than 8 columns left: store 4 words only
    ; r5 has already been advanced by 8 pixels, hence the -16 byte bias
    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
    je          .nextH                      ; flags still come from the cmp above
    jmp         .loopW

.width4:
    movh        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movh        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movh        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movh        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3

.nextH:
    lea         r0, [r0 + r1 * 4]           ; four source rows down
    add         r2, FENC_STRIDE * 8         ; four destination rows down

    sub         r4d, 4
    jnz         .loopH

    RET
4468
;-----------------------------------------------------------------------------
; PROCESS_LUMA_W4_4R
; 8-tap vertical filter core: 4 columns x 4 output rows.
;   in : r0 = top source row (row 0), r1 = srcStride,
;        r6 = four 16-byte coefficient vectors (one per tap pair)
;   out: m4 = 16-bit sums for output rows 0 and 1 (low / high qword)
;        m5 = 16-bit sums for output rows 2 and 3
;   r0 is advanced by 8 rows; m0, m1, m2 are scratch.
; Bracketed numbers in the comments are source row indices.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_W4_4R 0
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [0 1]

    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [1 2]
    punpcklqdq  m2, m1                      ; m2 = [0 1 | 1 2]
    pmaddubsw   m4, m2, [r6 + 0 * 16]       ; m4 = tap pair 0 for rows 0-1

    movd        m1, [r0 + r1]
    punpcklbw   m5, m0, m1                  ; m5 = [2 3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [3 4]
    punpcklqdq  m5, m1                      ; m5 = [2 3 | 3 4]
    pmaddubsw   m2, m5, [r6 + 1 * 16]
    paddw       m4, m2                      ; rows 0-1: + tap pair 1
    pmaddubsw   m5, [r6 + 0 * 16]           ; m5 = tap pair 0 for rows 2-3

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [4 5]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [5 6]
    punpcklqdq  m2, m1                      ; m2 = [4 5 | 5 6]
    pmaddubsw   m1, m2, [r6 + 2 * 16]
    paddw       m4, m1                      ; rows 0-1: + tap pair 2
    pmaddubsw   m2, [r6 + 1 * 16]
    paddw       m5, m2                      ; rows 2-3: + tap pair 1

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [6 7]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [7 8]
    punpcklqdq  m2, m1                      ; m2 = [6 7 | 7 8]
    pmaddubsw   m1, m2, [r6 + 3 * 16]
    paddw       m4, m1                      ; rows 0-1 complete
    pmaddubsw   m2, [r6 + 2 * 16]
    paddw       m5, m2                      ; rows 2-3: + tap pair 2

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [8 9]
    movd        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0                      ; m1 = [9 10]
    punpcklqdq  m2, m1                      ; m2 = [8 9 | 9 10]
    pmaddubsw   m2, [r6 + 3 * 16]
    paddw       m5, m2                      ; rows 2-3 complete
%endmacro
4520
;-----------------------------------------------------------------------------
; PROCESS_LUMA_W8_4R
; 8-tap vertical filter core: 8 columns x 4 output rows.
;   in : r0 = top source row (row 0), r1 = srcStride,
;        r6 = four 16-byte coefficient vectors (one per tap pair)
;   out: m7 / m6 / m5 / m4 = 16-bit sums for output rows 0 / 1 / 2 / 3
;   r0 is advanced by 8 rows; m0, m1, m2 are scratch.
; Bracketed numbers in the comments are source row indices.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_W8_4R 0
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; [0 1]
    pmaddubsw   m7, m0, [r6 + 0 *16]        ; row 0: tap pair 0

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0                      ; [1 2]
    pmaddubsw   m6, m1, [r6 + 0 *16]        ; row 1: tap pair 0

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; [2 3]
    pmaddubsw   m5, m0, [r6 + 0 *16]        ; row 2: tap pair 0
    pmaddubsw   m0, [r6 + 1 * 16]
    paddw       m7, m0                      ; row 0: + tap pair 1

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0                      ; [3 4]
    pmaddubsw   m4, m1, [r6 + 0 *16]        ; row 3: tap pair 0
    pmaddubsw   m1, [r6 + 1 * 16]
    paddw       m6, m1                      ; row 1: + tap pair 1

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; [4 5]
    pmaddubsw   m2, m0, [r6 + 1 * 16]
    pmaddubsw   m0, [r6 + 2 * 16]
    paddw       m7, m0                      ; row 0: + tap pair 2
    paddw       m5, m2                      ; row 2: + tap pair 1

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0                      ; [5 6]
    pmaddubsw   m2, m1, [r6 + 1 * 16]
    pmaddubsw   m1, [r6 + 2 * 16]
    paddw       m6, m1                      ; row 1: + tap pair 2
    paddw       m4, m2                      ; row 3: + tap pair 1

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; [6 7]
    pmaddubsw   m2, m0, [r6 + 2 * 16]
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m7, m0                      ; row 0 complete
    paddw       m5, m2                      ; row 2: + tap pair 2

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0                      ; [7 8]
    pmaddubsw   m2, m1, [r6 + 2 * 16]
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m6, m1                      ; row 1 complete
    paddw       m4, m2                      ; row 3: + tap pair 2

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; [8 9]
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m5, m0                      ; row 2 complete

    movq        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0                      ; [9 10]
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m4, m1                      ; row 3 complete
%endmacro
4585
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter, 4 columns wide (SSE4).
;   %1 = width (4), %2 = height, %3 = pp (8-bit output) or ps (16-bit output)
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled for ps)
;   r4 = coeffIdx on entry, then 4-row group counter
;   r5 = scratch, then 4 * srcStride
;   r6 = coefficient vectors for coeffIdx (64 bytes per index)
;   m3 = pw_512 (pp: pmulhrsw rounding factor) or pw_2000 (ps: subtracted offset)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_4xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
    lea         r5, [3 * r1]
    sub         r0, r5                      ; filter window starts 3 rows above
    shl         r4d, 6
%ifidn %3,ps
    add         r3d, r3d                    ; 16-bit output: stride in bytes
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [pw_512]
%else
    mova        m3, [pw_2000]
%endif

    mov         r4d, %2/4
    lea         r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W4_4R                      ; m4 = rows 0-1, m5 = rows 2-3; r0 += 8 rows

%ifidn %3,pp
    pmulhrsw    m4, m3
    pmulhrsw    m5, m3

    packuswb    m4, m5                      ; one output row per dword

    movd        [r2], m4
    pextrd      [r2 + r3], m4, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m4, 2
    pextrd      [r2 + r3], m4, 3
%else
    psubw       m4, m3
    psubw       m5, m3

    movlps      [r2], m4
    movhps      [r2 + r3], m4
    lea         r2, [r2 + 2 * r3]
    movlps      [r2], m5
    movhps      [r2 + r3], m5
%endif

    sub         r0, r5                      ; net advance: 8 - 4 = 4 source rows
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro
4648
b53f7c52
JB
4649
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_4x4 (AVX2)
; 8-tap vertical luma filter, 4x4 block, pixel in -> pixel out.
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
;   r5 = 3 * srcStride, later the coefficient table base (PIC builds)
; Eleven source rows (0..A, starting 3 rows above the block) are transposed so
; that every dword holds 4 vertically adjacent pixels of one column:
;   m1 = rows 0-3 (low lane) | rows 1-4 (high lane)   taps 0-3, output rows 0 | 1
;   m3 = rows 2-5            | rows 3-6               taps 0-3, output rows 2 | 3
;   m5 = rows 4-7            | rows 5-8               taps 4-7, output rows 0 | 1
;   m6 = rows 6-9            | rows 7-A               taps 4-7, output rows 2 | 3
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_vert_pp_4x4, 4,6,8
    mov         r4d, r4m
    lea         r5, [r1 * 3]
    sub         r0, r5

    ; TODO: VPGATHERDD
    movd        xm1, [r0]                   ; row 0
    movd        xm2, [r0 + r1]              ; row 1
    punpcklbw   xm1, xm2                    ; rows 0|1 interleaved

    movd        xm3, [r0 + r1 * 2]          ; row 2
    punpcklbw   xm2, xm3                    ; rows 1|2
    movd        xm4, [r0 + r5]              ; row 3
    punpcklbw   xm3, xm4                    ; rows 2|3
    punpcklwd   xm1, xm3                    ; rows 0-3 per column

    lea         r0, [r0 + r1 * 4]
    movd        xm5, [r0]                   ; row 4
    punpcklbw   xm4, xm5                    ; rows 3|4
    punpcklwd   xm2, xm4                    ; rows 1-4 per column
    vinserti128 m1, m1, xm2, 1              ; m1 = rows 1-4 | rows 0-3
    movd        xm2, [r0 + r1]              ; row 5
    punpcklbw   xm5, xm2                    ; rows 4|5
    punpcklwd   xm3, xm5                    ; rows 2-5 per column
    movd        xm6, [r0 + r1 * 2]          ; row 6
    punpcklbw   xm2, xm6                    ; rows 5|6
    punpcklwd   xm4, xm2                    ; rows 3-6 per column
    vinserti128 m3, m3, xm4, 1              ; m3 = rows 3-6 | rows 2-5
    movd        xm4, [r0 + r5]              ; row 7
    punpcklbw   xm6, xm4                    ; rows 6|7
    punpcklwd   xm5, xm6                    ; rows 4-7 per column

    lea         r0, [r0 + r1 * 4]
    movd        xm7, [r0]                   ; row 8
    punpcklbw   xm4, xm7                    ; rows 7|8
    punpcklwd   xm2, xm4                    ; rows 5-8 per column
    vinserti128 m5, m5, xm2, 1              ; m5 = rows 5-8 | rows 4-7
    movd        xm2, [r0 + r1]              ; row 9
    punpcklbw   xm7, xm2                    ; rows 8|9
    punpcklwd   xm6, xm7                    ; rows 6-9 per column
    movd        xm7, [r0 + r1 * 2]          ; row A
    punpcklbw   xm2, xm7                    ; rows 9|A
    punpcklwd   xm4, xm2                    ; rows 7-A per column
    vinserti128 m6, m6, xm4, 1              ; m6 = rows 7-A | rows 6-9

    ; load filter coeff: m0 = taps 0-3, m2 = taps 4-7 (8 bytes per coeffIdx)
%ifdef PIC
    lea         r5, [tab_LumaCoeff]
    vpbroadcastd m0, [r5 + r4 * 8 + 0]
    vpbroadcastd m2, [r5 + r4 * 8 + 4]
%else
    vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
    vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
%endif

    pmaddubsw   m1, m0
    pmaddubsw   m3, m0
    pmaddubsw   m5, m2
    pmaddubsw   m6, m2
    vbroadcasti128 m0, [pw_1]
    pmaddwd     m1, m0                      ; pmaddwd by pw_1: add adjacent word pairs
    pmaddwd     m3, m0
    pmaddwd     m5, m0
    pmaddwd     m6, m0
    paddd       m1, m5                      ; m1 = dword sums, output rows 1 | 0
    paddd       m3, m6                      ; m3 = dword sums, output rows 3 | 2
    packssdw    m1, m3                      ; low lane = rows 0,2; high lane = rows 1,3

    ; TODO: does it overflow?
    pmulhrsw    m1, [pw_512]
    vextracti128 xm2, m1, 1
    packuswb    xm1, xm2                    ; dwords 0..3 = output rows 0, 2, 1, 3
    movd        [r2], xm1
    pextrd      [r2 + r3], xm1, 2
    pextrd      [r2 + r3 * 2], xm1, 1
    lea         r4, [r3 * 3]
    pextrd      [r2 + r4], xm1, 3
    RET
4729
;-----------------------------------------------------------------------------
; interp_8tap_vert_ps_4x4 (AVX2)
; 8-tap vertical luma filter, 4x4 block, pixel in -> 16-bit out.
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled to bytes)
;   r4 = coeffIdx * 128 on entry, then 3 * srcStride
;   r5 = coefficient vectors for coeffIdx, later 3 * dstStride
; Each source row is one dword; vpermd builds overlapping row pairs and pshufb
; interleaves them byte-wise so pmaddubsw can apply one tap pair per vector.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
    mov         r4d, r4m
    shl         r4d, 7                      ; 4 * mmsize bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer_32]
    add         r5, r4
%else
    lea         r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea         r4, [r1 * 3]
    sub         r0, r4                      ; filter window starts 3 rows above

    add         r3d, r3d

    ; gather rows 0..10, one dword per row
    movd        xm1, [r0]
    pinsrd      xm1, [r0 + r1], 1
    pinsrd      xm1, [r0 + r1 * 2], 2
    pinsrd      xm1, [r0 + r4], 3           ; xm1 = rows [3 2 1 0]
    lea         r0, [r0 + r1 * 4]
    movd        xm2, [r0]
    pinsrd      xm2, [r0 + r1], 1
    pinsrd      xm2, [r0 + r1 * 2], 2
    pinsrd      xm2, [r0 + r4], 3           ; xm2 = rows [7 6 5 4]
    vinserti128 m1, m1, xm2, 1              ; m1 = rows [7 6 5 4 3 2 1 0]
    lea         r0, [r0 + r1 * 4]
    movd        xm3, [r0]
    pinsrd      xm3, [r0 + r1], 1
    pinsrd      xm3, [r0 + r1 * 2], 2       ; xm3 = rows [x 10 9 8]
    vinserti128 m2, m2, xm3, 1              ; m2 = rows [x 10 9 8 7 6 5 4]

    ; build the overlapping row pairs needed by each tap pair
    mova        m3, [interp4_vpp_shuf1]
    vpermd      m0, m3, m1                  ; m0 = rows [4 3 3 2 2 1 1 0]
    vpermd      m4, m3, m2                  ; m4 = rows [8 7 7 6 6 5 5 4]
    mova        m3, [interp4_vpp_shuf1 + mmsize]
    vpermd      m1, m3, m1                  ; m1 = rows [6 5 5 4 4 3 3 2]
    vpermd      m2, m3, m2                  ; m2 = rows [10 9 9 8 8 7 7 6]

    ; interleave the bytes of each row pair, then multiply-accumulate
    mova        m3, [interp4_vpp_shuf]
    pshufb      m0, m0, m3
    pshufb      m1, m1, m3
    pshufb      m4, m4, m3
    pshufb      m2, m2, m3
    pmaddubsw   m0, [r5]
    pmaddubsw   m1, [r5 + mmsize]
    pmaddubsw   m4, [r5 + 2 * mmsize]
    pmaddubsw   m2, [r5 + 3 * mmsize]
    paddw       m0, m1
    paddw       m0, m4
    paddw       m0, m2                      ; m0 = word sums, output rows [3 2 1 0]

    vbroadcasti128 m3, [pw_2000]
    psubw       m0, m3
    vextracti128 xm2, m0, 1
    lea         r5, [r3 * 3]
    movq        [r2], xm0
    movhps      [r2 + r3], xm0
    movq        [r2 + r3 * 2], xm2
    movhps      [r2 + r5], xm2
    RET
4791
72b9787e
JB
;-------------------------------------------------------------------------------------------------------------
; SSE4 instantiations of
;   void interp_8tap_vert_{pp,ps}_4x{4,8,16}(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------

; pixel -> pixel
FILTER_VER_LUMA_4xN 4, 4, pp
FILTER_VER_LUMA_4xN 4, 8, pp
FILTER_VER_LUMA_4xN 4, 16, pp

; pixel -> short
FILTER_VER_LUMA_4xN 4, 4, ps
FILTER_VER_LUMA_4xN 4, 8, ps
FILTER_VER_LUMA_4xN 4, 16, ps
4821
b53f7c52
JB
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W8_8R
; 8-tap vertical filter core: 8 columns x 8 output rows (AVX2).
;   in : r0 = top source row (row 0), r1 = srcStride, r4 = 3 * srcStride,
;        r5 = four mmsize-byte coefficient vectors (one per tap pair)
;   out: m5 = word sums for output rows 0|1 (low|high lane)
;        m2 = rows 2|3, m1 = rows 4|5, m4 = rows 6|7
;   r0 is advanced by 12 rows; m0, m3, m6 are scratch.
; Each ymm "pair vector" holds rows n|n+1 interleaved in the low lane and
; rows n+1|n+2 in the high lane.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_AVX2_W8_8R 0
    ; pair vector 0 (rows 0|1, 1|2)
    movq        xm1, [r0]                   ; row 0
    movq        xm2, [r0 + r1]              ; row 1
    punpcklbw   xm1, xm2
    movq        xm3, [r0 + r1 * 2]          ; row 2
    punpcklbw   xm2, xm3
    vinserti128 m5, m1, xm2, 1
    pmaddubsw   m5, [r5]                    ; out 0|1: tap pair 0

    ; pair vector 2 (rows 2|3, 3|4)
    movq        xm4, [r0 + r4]              ; row 3
    punpcklbw   xm3, xm4
    lea         r0, [r0 + r1 * 4]
    movq        xm1, [r0]                   ; row 4
    punpcklbw   xm4, xm1
    vinserti128 m2, m3, xm4, 1
    pmaddubsw   m0, m2, [r5 + 1 * mmsize]
    paddw       m5, m0                      ; out 0|1: + tap pair 1
    pmaddubsw   m2, [r5]                    ; out 2|3: tap pair 0

    ; pair vector 4 (rows 4|5, 5|6)
    movq        xm3, [r0 + r1]              ; row 5
    punpcklbw   xm1, xm3
    movq        xm4, [r0 + r1 * 2]          ; row 6
    punpcklbw   xm3, xm4
    vinserti128 m1, m1, xm3, 1
    pmaddubsw   m3, m1, [r5 + 2 * mmsize]
    paddw       m5, m3                      ; out 0|1: + tap pair 2
    pmaddubsw   m0, m1, [r5 + 1 * mmsize]
    paddw       m2, m0                      ; out 2|3: + tap pair 1
    pmaddubsw   m1, [r5]                    ; out 4|5: tap pair 0

    ; pair vector 6 (rows 6|7, 7|8)
    movq        xm3, [r0 + r4]              ; row 7
    punpcklbw   xm4, xm3
    lea         r0, [r0 + r1 * 4]
    movq        xm0, [r0]                   ; row 8
    punpcklbw   xm3, xm0
    vinserti128 m4, m4, xm3, 1
    pmaddubsw   m3, m4, [r5 + 3 * mmsize]
    paddw       m5, m3                      ; out 0|1 complete
    pmaddubsw   m3, m4, [r5 + 2 * mmsize]
    paddw       m2, m3                      ; out 2|3: + tap pair 2
    pmaddubsw   m3, m4, [r5 + 1 * mmsize]
    paddw       m1, m3                      ; out 4|5: + tap pair 1
    pmaddubsw   m4, [r5]                    ; out 6|7: tap pair 0

    ; pair vector 8 (rows 8|9, 9|10)
    movq        xm3, [r0 + r1]              ; row 9
    punpcklbw   xm0, xm3
    movq        xm6, [r0 + r1 * 2]          ; row 10
    punpcklbw   xm3, xm6
    vinserti128 m0, m0, xm3, 1
    pmaddubsw   m3, m0, [r5 + 3 * mmsize]
    paddw       m2, m3                      ; out 2|3 complete
    pmaddubsw   m3, m0, [r5 + 2 * mmsize]
    paddw       m1, m3                      ; out 4|5: + tap pair 2
    pmaddubsw   m0, [r5 + 1 * mmsize]
    paddw       m4, m0                      ; out 6|7: + tap pair 1

    ; pair vector 10 (rows 10|11, 11|12)
    movq        xm3, [r0 + r4]              ; row 11
    punpcklbw   xm6, xm3
    lea         r0, [r0 + r1 * 4]
    movq        xm0, [r0]                   ; row 12
    punpcklbw   xm3, xm0
    vinserti128 m6, m6, xm3, 1
    pmaddubsw   m3, m6, [r5 + 3 * mmsize]
    paddw       m1, m3                      ; out 4|5 complete
    pmaddubsw   m6, [r5 + 2 * mmsize]
    paddw       m4, m6                      ; out 6|7: + tap pair 2

    ; pair vector 12 (rows 12|13, 13|14)
    movq        xm3, [r0 + r1]              ; row 13
    punpcklbw   xm0, xm3
    movq        xm6, [r0 + r1 * 2]          ; row 14
    punpcklbw   xm3, xm6
    vinserti128 m0, m0, xm3, 1
    pmaddubsw   m0, [r5 + 3 * mmsize]
    paddw       m4, m0                      ; out 6|7 complete
%endmacro
4892
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W8_4R
; 8-tap vertical filter core: 8 columns x 4 output rows (AVX2).
;   in : r0 = top source row (row 0), r1 = srcStride, r4 = 3 * srcStride,
;        r5 = four mmsize-byte coefficient vectors (one per tap pair)
;   out: m5 = word sums for output rows 0|1 (low|high lane), m2 = rows 2|3
;   r0 is advanced by 8 rows; m0, m1, m3, m4, m6 are scratch.
; Each ymm "pair vector" holds rows n|n+1 interleaved in the low lane and
; rows n+1|n+2 in the high lane.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_AVX2_W8_4R 0
    ; pair vector 0 (rows 0|1, 1|2)
    movq        xm1, [r0]                   ; row 0
    movq        xm2, [r0 + r1]              ; row 1
    punpcklbw   xm1, xm2
    movq        xm3, [r0 + r1 * 2]          ; row 2
    punpcklbw   xm2, xm3
    vinserti128 m5, m1, xm2, 1
    pmaddubsw   m5, [r5]                    ; out 0|1: tap pair 0

    ; pair vector 2 (rows 2|3, 3|4)
    movq        xm4, [r0 + r4]              ; row 3
    punpcklbw   xm3, xm4
    lea         r0, [r0 + r1 * 4]
    movq        xm1, [r0]                   ; row 4
    punpcklbw   xm4, xm1
    vinserti128 m2, m3, xm4, 1
    pmaddubsw   m0, m2, [r5 + 1 * mmsize]
    paddw       m5, m0                      ; out 0|1: + tap pair 1
    pmaddubsw   m2, [r5]                    ; out 2|3: tap pair 0

    ; pair vector 4 (rows 4|5, 5|6)
    movq        xm3, [r0 + r1]              ; row 5
    punpcklbw   xm1, xm3
    movq        xm4, [r0 + r1 * 2]          ; row 6
    punpcklbw   xm3, xm4
    vinserti128 m1, m1, xm3, 1
    pmaddubsw   m3, m1, [r5 + 2 * mmsize]
    paddw       m5, m3                      ; out 0|1: + tap pair 2
    pmaddubsw   m0, m1, [r5 + 1 * mmsize]
    paddw       m2, m0                      ; out 2|3: + tap pair 1

    ; pair vector 6 (rows 6|7, 7|8)
    movq        xm3, [r0 + r4]              ; row 7
    punpcklbw   xm4, xm3
    lea         r0, [r0 + r1 * 4]
    movq        xm0, [r0]                   ; row 8
    punpcklbw   xm3, xm0
    vinserti128 m4, m4, xm3, 1
    pmaddubsw   m3, m4, [r5 + 3 * mmsize]
    paddw       m5, m3                      ; out 0|1 complete
    pmaddubsw   m3, m4, [r5 + 2 * mmsize]
    paddw       m2, m3                      ; out 2|3: + tap pair 2

    ; pair vector 8 (rows 8|9, 9|10)
    movq        xm3, [r0 + r1]              ; row 9
    punpcklbw   xm0, xm3
    movq        xm6, [r0 + r1 * 2]          ; row 10
    punpcklbw   xm3, xm6
    vinserti128 m0, m0, xm3, 1
    pmaddubsw   m3, m0, [r5 + 3 * mmsize]
    paddw       m2, m3                      ; out 2|3 complete
%endmacro
4937
72b9787e
JB
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter, 8 columns wide (SSE4).
;   %1 = width (8), %2 = height, %3 = pp (8-bit output) or ps (16-bit output)
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled for ps)
;   r4 = coeffIdx on entry, then 4-row group counter
;   r5 = scratch, then 4 * srcStride
;   r6 = coefficient vectors for coeffIdx (64 bytes per index)
;   m3 = pw_512 (pp: pmulhrsw rounding factor) or pw_2000 (ps: subtracted offset)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_8xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea         r5, [3 * r1]
    sub         r0, r5                      ; filter window starts 3 rows above
    shl         r4d, 6

%ifidn %3,ps
    add         r3d, r3d                    ; 16-bit output: stride in bytes
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [pw_512]
%else
    mova        m3, [pw_2000]
%endif

    mov         r4d, %2/4
    lea         r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W8_4R                      ; m7/m6/m5/m4 = rows 0/1/2/3; r0 += 8 rows

%ifidn %3,pp
    pmulhrsw    m7, m3
    pmulhrsw    m6, m3
    pmulhrsw    m5, m3
    pmulhrsw    m4, m3

    packuswb    m7, m6                      ; rows 0 | 1
    packuswb    m5, m4                      ; rows 2 | 3

    movlps      [r2], m7
    movhps      [r2 + r3], m7
    lea         r2, [r2 + 2 * r3]
    movlps      [r2], m5
    movhps      [r2 + r3], m5
%else
    psubw       m7, m3
    psubw       m6, m3
    psubw       m5, m3
    psubw       m4, m3

    movu        [r2], m7
    movu        [r2 + r3], m6
    lea         r2, [r2 + 2 * r3]
    movu        [r2], m5
    movu        [r2 + r3], m4
%endif

    sub         r0, r5                      ; net advance: 8 - 4 = 4 source rows
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro
5006
b53f7c52
JB
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_8xN width, height
; Emits interp_8tap_vert_pp_%1x%2 (AVX2): 8-tap vertical luma filter,
; 8 columns wide, pixel in -> pixel out, 8 output rows per loop pass.
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
;   r4 = 3 * srcStride, r5 = coefficient vectors, r6 = 4 * srcStride
;   m7 = pw_512 (pmulhrsw rounding factor)
;   [rsp] = pass counter (%2 / 8). All seven GPRs are in use, so the counter
;           lives in the gprsize-byte stack slot requested from cglobal.
; The counter is accessed as a dword: the slot is at least 4 bytes, and this
; avoids the 16-bit operand-size prefix (with imm16) of the word-sized form.
;-----------------------------------------------------------------------------
%macro FILTER_VER_LUMA_AVX2_8xN 2
INIT_YMM avx2
cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
    mov         r4d, r4m
    shl         r4d, 7                      ; 4 * mmsize bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer_32]
    add         r5, r4
%else
    lea         r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea         r4, [r1 * 3]
    sub         r0, r4                      ; filter window starts 3 rows above
    lea         r6, [r1 * 4]
    mov         dword [rsp], %2 / 8
    mova        m7, [pw_512]

.loop:
    PROCESS_LUMA_AVX2_W8_8R                 ; r0 += 12 rows
    pmulhrsw    m5, m7                      ; m5 = word: row 0, row 1
    pmulhrsw    m2, m7                      ; m2 = word: row 2, row 3
    pmulhrsw    m1, m7                      ; m1 = word: row 4, row 5
    pmulhrsw    m4, m7                      ; m4 = word: row 6, row 7
    packuswb    m5, m2                      ; low lane: rows 0,2  high lane: rows 1,3
    packuswb    m1, m4                      ; low lane: rows 4,6  high lane: rows 5,7
    vextracti128 xm2, m5, 1
    vextracti128 xm4, m1, 1
    movq        [r2], xm5                   ; row 0
    movq        [r2 + r3], xm2              ; row 1
    lea         r2, [r2 + r3 * 2]
    movhps      [r2], xm5                   ; row 2
    movhps      [r2 + r3], xm2              ; row 3
    lea         r2, [r2 + r3 * 2]
    movq        [r2], xm1                   ; row 4
    movq        [r2 + r3], xm4              ; row 5
    lea         r2, [r2 + r3 * 2]
    movhps      [r2], xm1                   ; row 6
    movhps      [r2 + r3], xm4              ; row 7
    lea         r2, [r2 + r3 * 2]
    sub         r0, r6                      ; net advance: 12 - 4 = 8 source rows
    dec         dword [rsp]
    jnz         .loop
    RET
%endmacro
5052
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_8x8 (AVX2)
; 8-tap vertical luma filter, 8x8 block, pixel in -> pixel out, single pass.
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
;   r4 = 3 * srcStride while filtering, then 3 * dstStride for the stores
;   r5 = coefficient vectors for coeffIdx
;   m3 = pw_512 (pmulhrsw rounding factor)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
    mov         r4d, r4m
    shl         r4d, 7                      ; 4 * mmsize bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer_32]
    add         r5, r4
%else
    lea         r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea         r4, [r1 * 3]
    sub         r0, r4                      ; filter window starts 3 rows above
    PROCESS_LUMA_AVX2_W8_8R
    lea         r4, [r3 * 3]
    mova        m3, [pw_512]
    pmulhrsw    m5, m3                      ; m5 = word: row 0, row 1
    pmulhrsw    m2, m3                      ; m2 = word: row 2, row 3
    pmulhrsw    m1, m3                      ; m1 = word: row 4, row 5
    pmulhrsw    m4, m3                      ; m4 = word: row 6, row 7
    packuswb    m5, m2                      ; low lane: rows 0,2  high lane: rows 1,3
    packuswb    m1, m4                      ; low lane: rows 4,6  high lane: rows 5,7
    vextracti128 xm2, m5, 1
    vextracti128 xm4, m1, 1
    movq        [r2], xm5                   ; row 0
    movq        [r2 + r3], xm2              ; row 1
    movhps      [r2 + r3 * 2], xm5          ; row 2
    movhps      [r2 + r4], xm2              ; row 3
    lea         r2, [r2 + r3 * 4]
    movq        [r2], xm1                   ; row 4
    movq        [r2 + r3], xm4              ; row 5
    movhps      [r2 + r3 * 2], xm1          ; row 6
    movhps      [r2 + r4], xm4              ; row 7
    RET
5088
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_8x4 (AVX2)
; 8-tap vertical luma filter, 8x4 block, pixel in -> pixel out, single pass.
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
;   r4 = 3 * srcStride while filtering, then 3 * dstStride for the stores
;   r5 = coefficient vectors for coeffIdx
;   m3 = pw_512 (pmulhrsw rounding factor)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
    mov         r4d, r4m
    shl         r4d, 7                      ; 4 * mmsize bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer_32]
    add         r5, r4
%else
    lea         r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea         r4, [r1 * 3]
    sub         r0, r4                      ; filter window starts 3 rows above
    PROCESS_LUMA_AVX2_W8_4R
    lea         r4, [r3 * 3]
    mova        m3, [pw_512]
    pmulhrsw    m5, m3                      ; m5 = word: row 0, row 1
    pmulhrsw    m2, m3                      ; m2 = word: row 2, row 3
    packuswb    m5, m2                      ; low lane: rows 0,2  high lane: rows 1,3
    vextracti128 xm2, m5, 1
    movq        [r2], xm5                   ; row 0
    movq        [r2 + r3], xm2              ; row 1
    movhps      [r2 + r3 * 2], xm5          ; row 2
    movhps      [r2 + r4], xm2              ; row 3
    RET
5115
72b9787e
JB
;-------------------------------------------------------------------------------------------------------------
; Instantiations of
;   void interp_8tap_vert_{pp,ps}_8x{4,8,16,32}(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; FILTER_VER_LUMA_8xN emits the SSE4 versions; FILTER_VER_LUMA_AVX2_8xN adds
; AVX2 pp versions for the 8x16 and 8x32 sizes.
;-------------------------------------------------------------------------------------------------------------

; pixel -> pixel
FILTER_VER_LUMA_8xN 8, 4, pp
FILTER_VER_LUMA_8xN 8, 8, pp

FILTER_VER_LUMA_8xN 8, 16, pp
FILTER_VER_LUMA_AVX2_8xN 8, 16

FILTER_VER_LUMA_8xN 8, 32, pp
FILTER_VER_LUMA_AVX2_8xN 8, 32

; pixel -> short
FILTER_VER_LUMA_8xN 8, 4, ps
FILTER_VER_LUMA_8xN 8, 8, ps
FILTER_VER_LUMA_8xN 8, 16, ps
FILTER_VER_LUMA_8xN 8, 32, ps
5157
5158;-------------------------------------------------------------------------------------------------------------
5159; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5160;-------------------------------------------------------------------------------------------------------------
; Macro arguments:
;   %1 = block width as it appears in the emitted symbol name
;   %2 = block height; the row loop below runs %2/4 times
;   %3 = output kind, "pp" or "ps"
;
; Registers (x86inc cglobal numbering, 5 arguments loaded):
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
;   r5 = scratch, r6 = address of the selected coefficient set
;   m3 = rounding/offset constant (pw_512 for pp, pw_2000 for ps)
;
; Each .loopH iteration produces 4 output rows: first an 8-column part via
; PROCESS_LUMA_W8_4R, then a 4-column part via PROCESS_LUMA_W4_4R. Both helper
; macros are defined outside this excerpt.
; NOTE(review): the source-pointer rewinds after each helper presumably undo
; the row advance the helper performs - confirm against the helper bodies.
5161%macro FILTER_VER_LUMA_12xN 3
5162INIT_XMM sse4
5163cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
; Move src up by 3 rows (r0 -= 3 * srcStride).
5164 lea r5, [3 * r1]
5165 sub r0, r5
; coeffIdx * 64 = byte offset of the coefficient set inside tab_LumaCoeffVer.
5166 shl r4d, 6
5167%ifidn %3,ps
; ps only: double dstStride before it is used as a byte offset
; (the ps path stores 16-bit words, see the movu/movlps stores below).
5168 add r3d, r3d
5169%endif
5170
; r6 = &tab_LumaCoeffVer[coeffIdx * 64]; the PIC build forms the table
; address in a register first.
5171%ifdef PIC
5172 lea r5, [tab_LumaCoeffVer]
5173 lea r6, [r5 + r4]
5174%else
5175 lea r6, [tab_LumaCoeffVer + r4]
5176%endif
5177
; m3 = pw_512 (pp, multiplier for pmulhrsw) or pw_2000 (ps, subtracted).
; Both constants are defined outside this excerpt.
5178 %ifidn %3,pp
b53f7c52 5179 mova m3, [pw_512]
72b9787e
JB
5180%else
5181 mova m3, [pw_2000]
5182%endif
5183
; r4d = number of 4-row groups left to process.
5184 mov r4d, %2/4
5185
5186.loopH:
; 8-column part. As consumed below, the helper leaves the four row results
; as words in m7, m6, m5, m4 (rows 0, 1, 2, 3 of this group).
5187 PROCESS_LUMA_W8_4R
5188
5189%ifidn %3,pp
; pp: scale with rounding (pmulhrsw by m3), saturate to unsigned bytes and
; store 8 bytes per row.
5190 pmulhrsw m7, m3
5191 pmulhrsw m6, m3
5192 pmulhrsw m5, m3
5193 pmulhrsw m4, m3
5194
5195 packuswb m7, m6
5196 packuswb m5, m4
5197
5198 movlps [r2], m7
5199 movhps [r2 + r3], m7
5200 lea r5, [r2 + 2 * r3]
5201 movlps [r5], m5
5202 movhps [r5 + r3], m5
5203%else
; ps: subtract the offset in m3 and store 8 words (16 bytes) per row.
5204 psubw m7, m3
5205 psubw m6, m3
5206 psubw m5, m3
5207 psubw m4, m3
5208
5209 movu [r2], m7
5210 movu [r2 + r3], m6
5211 lea r5, [r2 + 2 * r3]
5212 movu [r5], m5
5213 movu [r5 + r3], m4
5214%endif
5215
; r0 = r0 - 8 * srcStride + 8: go back 8 rows and step 8 pixels to the right.
5216 lea r5, [8 * r1 - 8]
5217 sub r0, r5
; Advance dst by 8 output elements: 8 bytes for pp, 16 bytes for ps.
5218%ifidn %3,pp
5219 add r2, 8
5220%else
5221 add r2, 16
5222%endif
5223
; 4-column part. As consumed below, the helper leaves rows 0/1 in the
; low/high halves of m4 and rows 2/3 in the low/high halves of m5 (words).
5224 PROCESS_LUMA_W4_4R
5225
5226%ifidn %3,pp
; pp: round, pack all four rows into m4 and store one dword (4 pixels) per row.
5227 pmulhrsw m4, m3
5228 pmulhrsw m5, m3
5229
5230 packuswb m4, m5
5231
5232 movd [r2], m4
5233 pextrd [r2 + r3], m4, 1
5234 lea r5, [r2 + 2 * r3]
5235 pextrd [r5], m4, 2
5236 pextrd [r5 + r3], m4, 3
5237%else
; ps: subtract the offset and store 4 words (8 bytes) per row.
5238 psubw m4, m3
5239 psubw m5, m3
5240
5241 movlps [r2], m4
5242 movhps [r2 + r3], m4
5243 lea r5, [r2 + 2 * r3]
5244 movlps [r5], m5
5245 movhps [r5 + r3], m5
5246%endif
5247
; r0 = r0 - 4 * srcStride - 8: go back 4 rows and return to the left column.
5248 lea r5, [4 * r1 + 8]
5249 sub r0, r5
; dst: move down 4 rows and back to the left column
; (undo the +8 / +16 applied after the 8-column part).
5250%ifidn %3,pp
5251 lea r2, [r2 + 4 * r3 - 8]
5252%else
5253 lea r2, [r2 + 4 * r3 - 16]
5254%endif
5255
5256 dec r4d
5257 jnz .loopH
5258
5259 RET
5260%endmacro
5261
; SSE4 instantiations of FILTER_VER_LUMA_12xN for the 12x16 block size.
; The macro arguments are (width, height, pp|ps); the emitted symbol is
; interp_8tap_vert_<pp|ps>_12x16.
5262;-------------------------------------------------------------------------------------------------------------
5263; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5264;-------------------------------------------------------------------------------------------------------------
5265FILTER_VER_LUMA_12xN 12, 16, pp
5266
5267;-------------------------------------------------------------------------------------------------------------
5268; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5269;-------------------------------------------------------------------------------------------------------------
5270FILTER_VER_LUMA_12xN 12, 16, ps
5271
b53f7c52
JB
;-------------------------------------------------------------------------------------------------------------
; AVX2 interp_8tap_vert_pp_12x16, assembled only when ARCH_X86_64 == 1.
; Arguments follow the file's interp_8tap_vert_pp_* convention:
;   (src, srcStride, dst, dstStride, coeffIdx)
;
; Register roles as used below:
;   r0 = src (moved up 3 rows at entry, then advanced 4 rows at a time)
;   r1 = srcStride, r4 = 3 * srcStride (after the coefficient lookup)
;   r2 = dst, r3 = dstStride, r6 = 3 * dstStride
;   r5 = tab_LumaCoeffVer_32 + coeffIdx * 128; for output row n the 32-byte
;        entry at [r5 + k * mmsize] (k = 0..3) multiplies the interleaved
;        source row pair (n + 2k, n + 2k + 1)
;   m14 = pw_512 (defined outside this excerpt)
;
; Fully unrolled. Every source row is loaded as 16 bytes although only 12
; output pixels per row are stored (movq = 8 bytes, pextrd = 4 bytes).
; Two adjacent rows are byte-interleaved (low half in lane 0, high half in
; lane 1) so one pmaddubsw applies two taps at once; 16-bit sums for the
; output rows accumulate in m0..m13, registers being recycled once a row
; has been stored.
;-------------------------------------------------------------------------------------------------------------
5272INIT_YMM avx2
5273%if ARCH_X86_64 == 1
5274cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
; Only 4 arguments are auto-loaded; coeffIdx is fetched from its argument slot.
5275 mov r4d, r4m
; coeffIdx * 128 = byte offset of the coefficient set (4 entries * mmsize).
5276 shl r4d, 7
5277
5278%ifdef PIC
5279 lea r5, [tab_LumaCoeffVer_32]
5280 add r5, r4
5281%else
5282 lea r5, [tab_LumaCoeffVer_32 + r4]
5283%endif
5284
; r4 is reused as 3 * srcStride; src is moved up by 3 rows.
5285 lea r4, [r1 * 3]
5286 sub r0, r4
5287 lea r6, [r3 * 3]
5288 mova m14, [pw_512]
5289
; Source rows 0..12: build the interleaved row pairs and accumulate the
; sums for output rows 0..11 (rows 0..5 become complete here).
5290 movu xm0, [r0] ; m0 = row 0
5291 movu xm1, [r0 + r1] ; m1 = row 1
5292 punpckhbw xm2, xm0, xm1
5293 punpcklbw xm0, xm1
5294 vinserti128 m0, m0, xm2, 1
5295 pmaddubsw m0, [r5]
5296 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5297 punpckhbw xm3, xm1, xm2
5298 punpcklbw xm1, xm2
5299 vinserti128 m1, m1, xm3, 1
5300 pmaddubsw m1, [r5]
5301 movu xm3, [r0 + r4] ; m3 = row 3
5302 punpckhbw xm4, xm2, xm3
5303 punpcklbw xm2, xm3
5304 vinserti128 m2, m2, xm4, 1
5305 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5306 paddw m0, m4
5307 pmaddubsw m2, [r5]
5308 lea r0, [r0 + r1 * 4]
5309 movu xm4, [r0] ; m4 = row 4
5310 punpckhbw xm5, xm3, xm4
5311 punpcklbw xm3, xm4
5312 vinserti128 m3, m3, xm5, 1
5313 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5314 paddw m1, m5
5315 pmaddubsw m3, [r5]
5316 movu xm5, [r0 + r1] ; m5 = row 5
5317 punpckhbw xm6, xm4, xm5
5318 punpcklbw xm4, xm5
5319 vinserti128 m4, m4, xm6, 1
5320 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5321 paddw m0, m6
5322 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5323 paddw m2, m6
5324 pmaddubsw m4, [r5]
5325 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5326 punpckhbw xm7, xm5, xm6
5327 punpcklbw xm5, xm6
5328 vinserti128 m5, m5, xm7, 1
5329 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5330 paddw m1, m7
5331 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5332 paddw m3, m7
5333 pmaddubsw m5, [r5]
5334 movu xm7, [r0 + r4] ; m7 = row 7
5335 punpckhbw xm8, xm6, xm7
5336 punpcklbw xm6, xm7
5337 vinserti128 m6, m6, xm8, 1
5338 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5339 paddw m0, m8
5340 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5341 paddw m2, m8
5342 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5343 paddw m4, m8
5344 pmaddubsw m6, [r5]
5345 lea r0, [r0 + r1 * 4]
5346 movu xm8, [r0] ; m8 = row 8
5347 punpckhbw xm9, xm7, xm8
5348 punpcklbw xm7, xm8
5349 vinserti128 m7, m7, xm9, 1
5350 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5351 paddw m1, m9
5352 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5353 paddw m3, m9
5354 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5355 paddw m5, m9
5356 pmaddubsw m7, [r5]
5357 movu xm9, [r0 + r1] ; m9 = row 9
5358 punpckhbw xm10, xm8, xm9
5359 punpcklbw xm8, xm9
5360 vinserti128 m8, m8, xm10, 1
5361 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5362 paddw m2, m10
5363 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5364 paddw m4, m10
5365 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5366 paddw m6, m10
5367 pmaddubsw m8, [r5]
5368 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5369 punpckhbw xm11, xm9, xm10
5370 punpcklbw xm9, xm10
5371 vinserti128 m9, m9, xm11, 1
5372 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5373 paddw m3, m11
5374 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5375 paddw m5, m11
5376 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5377 paddw m7, m11
5378 pmaddubsw m9, [r5]
5379 movu xm11, [r0 + r4] ; m11 = row 11
5380 punpckhbw xm12, xm10, xm11
5381 punpcklbw xm10, xm11
5382 vinserti128 m10, m10, xm12, 1
5383 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5384 paddw m4, m12
5385 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5386 paddw m6, m12
5387 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5388 paddw m8, m12
5389 pmaddubsw m10, [r5]
5390 lea r0, [r0 + r1 * 4]
5391 movu xm12, [r0] ; m12 = row 12
5392 punpckhbw xm13, xm11, xm12
5393 punpcklbw xm11, xm12
5394 vinserti128 m11, m11, xm13, 1
5395 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5396 paddw m5, m13
5397 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5398 paddw m7, m13
5399 pmaddubsw m13, m11, [r5 + 1 * mmsize]
5400 paddw m9, m13
5401 pmaddubsw m11, [r5]
5402
; Output rows 0..5: round and scale (pmulhrsw by m14; assuming pw_512 holds
; 512 in every word this is (x + 32) >> 6), then pack two rows per register.
; vpermq 11011000b reorders the qwords so the low 128 bits hold the even row
; and the high 128 bits the odd row. Each row stores 8 + 4 bytes.
5403 pmulhrsw m0, m14 ; m0 = word: row 0
5404 pmulhrsw m1, m14 ; m1 = word: row 1
5405 pmulhrsw m2, m14 ; m2 = word: row 2
5406 pmulhrsw m3, m14 ; m3 = word: row 3
5407 pmulhrsw m4, m14 ; m4 = word: row 4
5408 pmulhrsw m5, m14 ; m5 = word: row 5
5409 packuswb m0, m1
5410 packuswb m2, m3
5411 packuswb m4, m5
5412 vpermq m0, m0, 11011000b
5413 vpermq m2, m2, 11011000b
5414 vpermq m4, m4, 11011000b
5415 vextracti128 xm1, m0, 1
5416 vextracti128 xm3, m2, 1
5417 vextracti128 xm5, m4, 1
5418 movq [r2], xm0
5419 pextrd [r2 + 8], xm0, 2
5420 movq [r2 + r3], xm1
5421 pextrd [r2 + r3 + 8], xm1, 2
5422 movq [r2 + r3 * 2], xm2
5423 pextrd [r2 + r3 * 2 + 8], xm2, 2
5424 movq [r2 + r6], xm3
5425 pextrd [r2 + r6 + 8], xm3, 2
5426 lea r2, [r2 + r3 * 4]
5427 movq [r2], xm4
5428 pextrd [r2 + 8], xm4, 2
5429 movq [r2 + r3], xm5
5430 pextrd [r2 + r3 + 8], xm5, 2
5431
; Source rows 13..14 complete the sums for output rows 6 and 7.
; m0 and m1 are free again and serve as scratch.
5432 movu xm13, [r0 + r1] ; m13 = row 13
5433 punpckhbw xm0, xm12, xm13
5434 punpcklbw xm12, xm13
5435 vinserti128 m12, m12, xm0, 1
5436 pmaddubsw m0, m12, [r5 + 3 * mmsize]
5437 paddw m6, m0
5438 pmaddubsw m0, m12, [r5 + 2 * mmsize]
5439 paddw m8, m0
5440 pmaddubsw m0, m12, [r5 + 1 * mmsize]
5441 paddw m10, m0
5442 pmaddubsw m12, [r5]
5443 movu xm0, [r0 + r1 * 2] ; m0 = row 14
5444 punpckhbw xm1, xm13, xm0
5445 punpcklbw xm13, xm0
5446 vinserti128 m13, m13, xm1, 1
5447 pmaddubsw m1, m13, [r5 + 3 * mmsize]
5448 paddw m7, m1
5449 pmaddubsw m1, m13, [r5 + 2 * mmsize]
5450 paddw m9, m1
5451 pmaddubsw m1, m13, [r5 + 1 * mmsize]
5452 paddw m11, m1
5453 pmaddubsw m13, [r5]
5454
; Output rows 6..7: round, pack and store, then move dst down 4 rows.
5455 pmulhrsw m6, m14 ; m6 = word: row 6
5456 pmulhrsw m7, m14 ; m7 = word: row 7
5457 packuswb m6, m7
5458 vpermq m6, m6, 11011000b
5459 vextracti128 xm7, m6, 1
5460 movq [r2 + r3 * 2], xm6
5461 pextrd [r2 + r3 * 2 + 8], xm6, 2
5462 movq [r2 + r6], xm7
5463 pextrd [r2 + r6 + 8], xm7, 2
5464 lea r2, [r2 + r3 * 4]
5465
; Source rows 15..22 complete the sums for output rows 8..15.
; m0 and m1 become the accumulators for output rows 14 and 15.
5466 movu xm1, [r0 + r4] ; m1 = row 15
5467 punpckhbw xm2, xm0, xm1
5468 punpcklbw xm0, xm1
5469 vinserti128 m0, m0, xm2, 1
5470 pmaddubsw m2, m0, [r5 + 3 * mmsize]
5471 paddw m8, m2
5472 pmaddubsw m2, m0, [r5 + 2 * mmsize]
5473 paddw m10, m2
5474 pmaddubsw m2, m0, [r5 + 1 * mmsize]
5475 paddw m12, m2
5476 pmaddubsw m0, [r5]
5477 lea r0, [r0 + r1 * 4]
5478 movu xm2, [r0] ; m2 = row 16
5479 punpckhbw xm3, xm1, xm2
5480 punpcklbw xm1, xm2
5481 vinserti128 m1, m1, xm3, 1
5482 pmaddubsw m3, m1, [r5 + 3 * mmsize]
5483 paddw m9, m3
5484 pmaddubsw m3, m1, [r5 + 2 * mmsize]
5485 paddw m11, m3
5486 pmaddubsw m3, m1, [r5 + 1 * mmsize]
5487 paddw m13, m3
5488 pmaddubsw m1, [r5]
5489 movu xm3, [r0 + r1] ; m3 = row 17
5490 punpckhbw xm4, xm2, xm3
5491 punpcklbw xm2, xm3
5492 vinserti128 m2, m2, xm4, 1
5493 pmaddubsw m4, m2, [r5 + 3 * mmsize]
5494 paddw m10, m4
5495 pmaddubsw m4, m2, [r5 + 2 * mmsize]
5496 paddw m12, m4
5497 pmaddubsw m2, [r5 + 1 * mmsize]
5498 paddw m0, m2
5499 movu xm4, [r0 + r1 * 2] ; m4 = row 18
5500 punpckhbw xm5, xm3, xm4
5501 punpcklbw xm3, xm4
5502 vinserti128 m3, m3, xm5, 1
5503 pmaddubsw m5, m3, [r5 + 3 * mmsize]
5504 paddw m11, m5
5505 pmaddubsw m5, m3, [r5 + 2 * mmsize]
5506 paddw m13, m5
5507 pmaddubsw m3, [r5 + 1 * mmsize]
5508 paddw m1, m3
5509 movu xm5, [r0 + r4] ; m5 = row 19
5510 punpckhbw xm6, xm4, xm5
5511 punpcklbw xm4, xm5
5512 vinserti128 m4, m4, xm6, 1
5513 pmaddubsw m6, m4, [r5 + 3 * mmsize]
5514 paddw m12, m6
5515 pmaddubsw m4, [r5 + 2 * mmsize]
5516 paddw m0, m4
5517 lea r0, [r0 + r1 * 4]
5518 movu xm6, [r0] ; m6 = row 20
5519 punpckhbw xm7, xm5, xm6
5520 punpcklbw xm5, xm6
5521 vinserti128 m5, m5, xm7, 1
5522 pmaddubsw m7, m5, [r5 + 3 * mmsize]
5523 paddw m13, m7
5524 pmaddubsw m5, [r5 + 2 * mmsize]
5525 paddw m1, m5
5526 movu xm7, [r0 + r1] ; m7 = row 21
5527 punpckhbw xm2, xm6, xm7
5528 punpcklbw xm6, xm7
5529 vinserti128 m6, m6, xm2, 1
5530 pmaddubsw m6, [r5 + 3 * mmsize]
5531 paddw m0, m6
5532 movu xm2, [r0 + r1 * 2] ; m2 = row 22
5533 punpckhbw xm3, xm7, xm2
5534 punpcklbw xm7, xm2
5535 vinserti128 m7, m7, xm3, 1
5536 pmaddubsw m7, [r5 + 3 * mmsize]
5537 paddw m1, m7
5538
; Output rows 8..15: round, pack and store (12 bytes per row).
5539 pmulhrsw m8, m14 ; m8 = word: row 8
5540 pmulhrsw m9, m14 ; m9 = word: row 9
5541 pmulhrsw m10, m14 ; m10 = word: row 10
5542 pmulhrsw m11, m14 ; m11 = word: row 11
5543 pmulhrsw m12, m14 ; m12 = word: row 12
5544 pmulhrsw m13, m14 ; m13 = word: row 13
5545 pmulhrsw m0, m14 ; m0 = word: row 14
5546 pmulhrsw m1, m14 ; m1 = word: row 15
5547 packuswb m8, m9
5548 packuswb m10, m11
5549 packuswb m12, m13
5550 packuswb m0, m1
5551 vpermq m8, m8, 11011000b
5552 vpermq m10, m10, 11011000b
5553 vpermq m12, m12, 11011000b
5554 vpermq m0, m0, 11011000b
5555 vextracti128 xm9, m8, 1
5556 vextracti128 xm11, m10, 1
5557 vextracti128 xm13, m12, 1
5558 vextracti128 xm1, m0, 1
5559 movq [r2], xm8
5560 pextrd [r2 + 8], xm8, 2
5561 movq [r2 + r3], xm9
5562 pextrd [r2 + r3 + 8], xm9, 2
5563 movq [r2 + r3 * 2], xm10
5564 pextrd [r2 + r3 * 2 + 8], xm10, 2
5565 movq [r2 + r6], xm11
5566 pextrd [r2 + r6 + 8], xm11, 2
5567 lea r2, [r2 + r3 * 4]
5568 movq [r2], xm12
5569 pextrd [r2 + 8], xm12, 2
5570 movq [r2 + r3], xm13
5571 pextrd [r2 + r3 + 8], xm13, 2
5572 movq [r2 + r3 * 2], xm0
5573 pextrd [r2 + r3 * 2 + 8], xm0, 2
5574 movq [r2 + r6], xm1
5575 pextrd [r2 + r6 + 8], xm1, 2
5576 RET
5577%endif
5578
;-------------------------------------------------------------------------------------------------------------
; AVX2 interp_8tap_vert_pp_16x16, assembled only when ARCH_X86_64 == 1.
; Arguments follow the file's interp_8tap_vert_pp_* convention:
;   (src, srcStride, dst, dstStride, coeffIdx)
;
; Register roles as used below:
;   r0 = src (moved up 3 rows at entry, then advanced 4 rows at a time)
;   r1 = srcStride, r4 = 3 * srcStride (after the coefficient lookup)
;   r2 = dst, r3 = dstStride, r6 = 3 * dstStride
;   r5 = tab_LumaCoeffVer_32 + coeffIdx * 128; for output row n the 32-byte
;        entry at [r5 + k * mmsize] (k = 0..3) multiplies the interleaved
;        source row pair (n + 2k, n + 2k + 1)
;   m14 = pw_512 (defined outside this excerpt)
;
; Fully unrolled over source rows 0..22. Two adjacent 16-byte rows are
; byte-interleaved (low half in lane 0, high half in lane 1) so that one
; pmaddubsw applies two taps; 16-bit sums for the 16 output rows accumulate
; in m0..m13, with m0..m7 recycled after rows 0..7 have been stored.
;-------------------------------------------------------------------------------------------------------------
5579INIT_YMM avx2
5580%if ARCH_X86_64 == 1
5581cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
; Only 4 arguments are auto-loaded; coeffIdx is fetched from its argument slot.
5582 mov r4d, r4m
; coeffIdx * 128 = byte offset of the coefficient set (4 entries * mmsize).
5583 shl r4d, 7
5584
5585%ifdef PIC
5586 lea r5, [tab_LumaCoeffVer_32]
5587 add r5, r4
5588%else
5589 lea r5, [tab_LumaCoeffVer_32 + r4]
5590%endif
5591
; r4 is reused as 3 * srcStride; src is moved up by 3 rows.
5592 lea r4, [r1 * 3]
5593 sub r0, r4
5594 lea r6, [r3 * 3]
5595 mova m14, [pw_512]
5596
; Source rows 0..12: build the interleaved row pairs and accumulate the
; sums for output rows 0..11 (rows 0..5 become complete here).
5597 movu xm0, [r0] ; m0 = row 0
5598 movu xm1, [r0 + r1] ; m1 = row 1
5599 punpckhbw xm2, xm0, xm1
5600 punpcklbw xm0, xm1
5601 vinserti128 m0, m0, xm2, 1
5602 pmaddubsw m0, [r5]
5603 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5604 punpckhbw xm3, xm1, xm2
5605 punpcklbw xm1, xm2
5606 vinserti128 m1, m1, xm3, 1
5607 pmaddubsw m1, [r5]
5608 movu xm3, [r0 + r4] ; m3 = row 3
5609 punpckhbw xm4, xm2, xm3
5610 punpcklbw xm2, xm3
5611 vinserti128 m2, m2, xm4, 1
5612 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5613 paddw m0, m4
5614 pmaddubsw m2, [r5]
5615 lea r0, [r0 + r1 * 4]
5616 movu xm4, [r0] ; m4 = row 4
5617 punpckhbw xm5, xm3, xm4
5618 punpcklbw xm3, xm4
5619 vinserti128 m3, m3, xm5, 1
5620 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5621 paddw m1, m5
5622 pmaddubsw m3, [r5]
5623 movu xm5, [r0 + r1] ; m5 = row 5
5624 punpckhbw xm6, xm4, xm5
5625 punpcklbw xm4, xm5
5626 vinserti128 m4, m4, xm6, 1
5627 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5628 paddw m0, m6
5629 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5630 paddw m2, m6
5631 pmaddubsw m4, [r5]
5632 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5633 punpckhbw xm7, xm5, xm6
5634 punpcklbw xm5, xm6
5635 vinserti128 m5, m5, xm7, 1
5636 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5637 paddw m1, m7
5638 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5639 paddw m3, m7
5640 pmaddubsw m5, [r5]
5641 movu xm7, [r0 + r4] ; m7 = row 7
5642 punpckhbw xm8, xm6, xm7
5643 punpcklbw xm6, xm7
5644 vinserti128 m6, m6, xm8, 1
5645 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5646 paddw m0, m8
5647 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5648 paddw m2, m8
5649 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5650 paddw m4, m8
5651 pmaddubsw m6, [r5]
5652 lea r0, [r0 + r1 * 4]
5653 movu xm8, [r0] ; m8 = row 8
5654 punpckhbw xm9, xm7, xm8
5655 punpcklbw xm7, xm8
5656 vinserti128 m7, m7, xm9, 1
5657 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5658 paddw m1, m9
5659 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5660 paddw m3, m9
5661 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5662 paddw m5, m9
5663 pmaddubsw m7, [r5]
5664 movu xm9, [r0 + r1] ; m9 = row 9
5665 punpckhbw xm10, xm8, xm9
5666 punpcklbw xm8, xm9
5667 vinserti128 m8, m8, xm10, 1
5668 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5669 paddw m2, m10
5670 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5671 paddw m4, m10
5672 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5673 paddw m6, m10
5674 pmaddubsw m8, [r5]
5675 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5676 punpckhbw xm11, xm9, xm10
5677 punpcklbw xm9, xm10
5678 vinserti128 m9, m9, xm11, 1
5679 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5680 paddw m3, m11
5681 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5682 paddw m5, m11
5683 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5684 paddw m7, m11
5685 pmaddubsw m9, [r5]
5686 movu xm11, [r0 + r4] ; m11 = row 11
5687 punpckhbw xm12, xm10, xm11
5688 punpcklbw xm10, xm11
5689 vinserti128 m10, m10, xm12, 1
5690 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5691 paddw m4, m12
5692 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5693 paddw m6, m12
5694 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5695 paddw m8, m12
5696 pmaddubsw m10, [r5]
5697 lea r0, [r0 + r1 * 4]
5698 movu xm12, [r0] ; m12 = row 12
5699 punpckhbw xm13, xm11, xm12
5700 punpcklbw xm11, xm12
5701 vinserti128 m11, m11, xm13, 1
5702 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5703 paddw m5, m13
5704 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5705 paddw m7, m13
5706 pmaddubsw m13, m11, [r5 + 1 * mmsize]
5707 paddw m9, m13
5708 pmaddubsw m11, [r5]
5709
; Output rows 0..5: round and scale (pmulhrsw by m14; assuming pw_512 holds
; 512 in every word this is (x + 32) >> 6), then pack two rows per register.
; vpermq 11011000b reorders the qwords so the low 128 bits hold the even row
; and the high 128 bits the odd row. Each row stores 16 bytes.
5710 pmulhrsw m0, m14 ; m0 = word: row 0
5711 pmulhrsw m1, m14 ; m1 = word: row 1
5712 pmulhrsw m2, m14 ; m2 = word: row 2
5713 pmulhrsw m3, m14 ; m3 = word: row 3
5714 pmulhrsw m4, m14 ; m4 = word: row 4
5715 pmulhrsw m5, m14 ; m5 = word: row 5
5716 packuswb m0, m1
5717 packuswb m2, m3
5718 packuswb m4, m5
5719 vpermq m0, m0, 11011000b
5720 vpermq m2, m2, 11011000b
5721 vpermq m4, m4, 11011000b
5722 vextracti128 xm1, m0, 1
5723 vextracti128 xm3, m2, 1
5724 vextracti128 xm5, m4, 1
5725 movu [r2], xm0
5726 movu [r2 + r3], xm1
5727 movu [r2 + r3 * 2], xm2
5728 movu [r2 + r6], xm3
5729 lea r2, [r2 + r3 * 4]
5730 movu [r2], xm4
5731 movu [r2 + r3], xm5
5732
; Source rows 13..14 complete the sums for output rows 6 and 7.
; m0 and m1 are free again and serve as scratch.
5733 movu xm13, [r0 + r1] ; m13 = row 13
5734 punpckhbw xm0, xm12, xm13
5735 punpcklbw xm12, xm13
5736 vinserti128 m12, m12, xm0, 1
5737 pmaddubsw m0, m12, [r5 + 3 * mmsize]
5738 paddw m6, m0
5739 pmaddubsw m0, m12, [r5 + 2 * mmsize]
5740 paddw m8, m0
5741 pmaddubsw m0, m12, [r5 + 1 * mmsize]
5742 paddw m10, m0
5743 pmaddubsw m12, [r5]
5744 movu xm0, [r0 + r1 * 2] ; m0 = row 14
5745 punpckhbw xm1, xm13, xm0
5746 punpcklbw xm13, xm0
5747 vinserti128 m13, m13, xm1, 1
5748 pmaddubsw m1, m13, [r5 + 3 * mmsize]
5749 paddw m7, m1
5750 pmaddubsw m1, m13, [r5 + 2 * mmsize]
5751 paddw m9, m1
5752 pmaddubsw m1, m13, [r5 + 1 * mmsize]
5753 paddw m11, m1
5754 pmaddubsw m13, [r5]
5755
; Output rows 6..7: round, pack and store, then move dst down 4 rows.
5756 pmulhrsw m6, m14 ; m6 = word: row 6
5757 pmulhrsw m7, m14 ; m7 = word: row 7
5758 packuswb m6, m7
5759 vpermq m6, m6, 11011000b
5760 vextracti128 xm7, m6, 1
5761 movu [r2 + r3 * 2], xm6
5762 movu [r2 + r6], xm7
5763 lea r2, [r2 + r3 * 4]
5764
; Source rows 15..22 complete the sums for output rows 8..15.
; m0 and m1 become the accumulators for output rows 14 and 15.
5765 movu xm1, [r0 + r4] ; m1 = row 15
5766 punpckhbw xm2, xm0, xm1
5767 punpcklbw xm0, xm1
5768 vinserti128 m0, m0, xm2, 1
5769 pmaddubsw m2, m0, [r5 + 3 * mmsize]
5770 paddw m8, m2
5771 pmaddubsw m2, m0, [r5 + 2 * mmsize]
5772 paddw m10, m2
5773 pmaddubsw m2, m0, [r5 + 1 * mmsize]
5774 paddw m12, m2
5775 pmaddubsw m0, [r5]
5776 lea r0, [r0 + r1 * 4]
5777 movu xm2, [r0] ; m2 = row 16
5778 punpckhbw xm3, xm1, xm2
5779 punpcklbw xm1, xm2
5780 vinserti128 m1, m1, xm3, 1
5781 pmaddubsw m3, m1, [r5 + 3 * mmsize]
5782 paddw m9, m3
5783 pmaddubsw m3, m1, [r5 + 2 * mmsize]
5784 paddw m11, m3
5785 pmaddubsw m3, m1, [r5 + 1 * mmsize]
5786 paddw m13, m3
5787 pmaddubsw m1, [r5]
5788 movu xm3, [r0 + r1] ; m3 = row 17
5789 punpckhbw xm4, xm2, xm3
5790 punpcklbw xm2, xm3
5791 vinserti128 m2, m2, xm4, 1
5792 pmaddubsw m4, m2, [r5 + 3 * mmsize]
5793 paddw m10, m4
5794 pmaddubsw m4, m2, [r5 + 2 * mmsize]
5795 paddw m12, m4
5796 pmaddubsw m2, [r5 + 1 * mmsize]
5797 paddw m0, m2
5798 movu xm4, [r0 + r1 * 2] ; m4 = row 18
5799 punpckhbw xm5, xm3, xm4
5800 punpcklbw xm3, xm4
5801 vinserti128 m3, m3, xm5, 1
5802 pmaddubsw m5, m3, [r5 + 3 * mmsize]
5803 paddw m11, m5
5804 pmaddubsw m5, m3, [r5 + 2 * mmsize]
5805 paddw m13, m5
5806 pmaddubsw m3, [r5 + 1 * mmsize]
5807 paddw m1, m3
5808 movu xm5, [r0 + r4] ; m5 = row 19
5809 punpckhbw xm6, xm4, xm5
5810 punpcklbw xm4, xm5
5811 vinserti128 m4, m4, xm6, 1
5812 pmaddubsw m6, m4, [r5 + 3 * mmsize]
5813 paddw m12, m6
5814 pmaddubsw m4, [r5 + 2 * mmsize]
5815 paddw m0, m4
5816 lea r0, [r0 + r1 * 4]
5817 movu xm6, [r0] ; m6 = row 20
5818 punpckhbw xm7, xm5, xm6
5819 punpcklbw xm5, xm6
5820 vinserti128 m5, m5, xm7, 1
5821 pmaddubsw m7, m5, [r5 + 3 * mmsize]
5822 paddw m13, m7
5823 pmaddubsw m5, [r5 + 2 * mmsize]
5824 paddw m1, m5
5825 movu xm7, [r0 + r1] ; m7 = row 21
5826 punpckhbw xm2, xm6, xm7
5827 punpcklbw xm6, xm7
5828 vinserti128 m6, m6, xm2, 1
5829 pmaddubsw m6, [r5 + 3 * mmsize]
5830 paddw m0, m6
5831 movu xm2, [r0 + r1 * 2] ; m2 = row 22
5832 punpckhbw xm3, xm7, xm2
5833 punpcklbw xm7, xm2
5834 vinserti128 m7, m7, xm3, 1
5835 pmaddubsw m7, [r5 + 3 * mmsize]
5836 paddw m1, m7
5837
; Output rows 8..15: round, pack and store (16 bytes per row).
5838 pmulhrsw m8, m14 ; m8 = word: row 8
5839 pmulhrsw m9, m14 ; m9 = word: row 9
5840 pmulhrsw m10, m14 ; m10 = word: row 10
5841 pmulhrsw m11, m14 ; m11 = word: row 11
5842 pmulhrsw m12, m14 ; m12 = word: row 12
5843 pmulhrsw m13, m14 ; m13 = word: row 13
5844 pmulhrsw m0, m14 ; m0 = word: row 14
5845 pmulhrsw m1, m14 ; m1 = word: row 15
5846 packuswb m8, m9
5847 packuswb m10, m11
5848 packuswb m12, m13
5849 packuswb m0, m1
5850 vpermq m8, m8, 11011000b
5851 vpermq m10, m10, 11011000b
5852 vpermq m12, m12, 11011000b
5853 vpermq m0, m0, 11011000b
5854 vextracti128 xm9, m8, 1
5855 vextracti128 xm11, m10, 1
5856 vextracti128 xm13, m12, 1
5857 vextracti128 xm1, m0, 1
5858 movu [r2], xm8
5859 movu [r2 + r3], xm9
5860 movu [r2 + r3 * 2], xm10
5861 movu [r2 + r6], xm11
5862 lea r2, [r2 + r3 * 4]
5863 movu [r2], xm12
5864 movu [r2 + r3], xm13
5865 movu [r2 + r3 * 2], xm0
5866 movu [r2 + r6], xm1
5867 RET
5868%endif
5869
;-------------------------------------------------------------------------------------------------------------
; AVX2 interp_8tap_vert_pp_16x12, assembled only when ARCH_X86_64 == 1.
; Arguments follow the file's interp_8tap_vert_pp_* convention:
;   (src, srcStride, dst, dstStride, coeffIdx)
;
; Register roles as used below:
;   r0 = src (moved up 3 rows at entry, then advanced 4 rows at a time)
;   r1 = srcStride, r4 = 3 * srcStride (after the coefficient lookup)
;   r2 = dst, r3 = dstStride, r6 = 3 * dstStride
;   r5 = tab_LumaCoeffVer_32 + coeffIdx * 128; for output row n the 32-byte
;        entry at [r5 + k * mmsize] (k = 0..3) multiplies the interleaved
;        source row pair (n + 2k, n + 2k + 1)
;   m14 = pw_512 (defined outside this excerpt)
;
; Fully unrolled over source rows 0..18. Only 12 output rows are needed, so
; no new accumulators are started after output row 11 and the trailing row
; pairs only add their remaining taps to rows 6..11.
;-------------------------------------------------------------------------------------------------------------
5870INIT_YMM avx2
5871%if ARCH_X86_64 == 1
5872cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
; Only 4 arguments are auto-loaded; coeffIdx is fetched from its argument slot.
5873 mov r4d, r4m
; coeffIdx * 128 = byte offset of the coefficient set (4 entries * mmsize).
5874 shl r4d, 7
5875
5876%ifdef PIC
5877 lea r5, [tab_LumaCoeffVer_32]
5878 add r5, r4
5879%else
5880 lea r5, [tab_LumaCoeffVer_32 + r4]
5881%endif
5882
; r4 is reused as 3 * srcStride; src is moved up by 3 rows.
5883 lea r4, [r1 * 3]
5884 sub r0, r4
5885 lea r6, [r3 * 3]
5886 mova m14, [pw_512]
5887
; Source rows 0..12: build the interleaved row pairs and accumulate the
; sums for output rows 0..11 (rows 0..5 become complete here).
5888 movu xm0, [r0] ; m0 = row 0
5889 movu xm1, [r0 + r1] ; m1 = row 1
5890 punpckhbw xm2, xm0, xm1
5891 punpcklbw xm0, xm1
5892 vinserti128 m0, m0, xm2, 1
5893 pmaddubsw m0, [r5]
5894 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5895 punpckhbw xm3, xm1, xm2
5896 punpcklbw xm1, xm2
5897 vinserti128 m1, m1, xm3, 1
5898 pmaddubsw m1, [r5]
5899 movu xm3, [r0 + r4] ; m3 = row 3
5900 punpckhbw xm4, xm2, xm3
5901 punpcklbw xm2, xm3
5902 vinserti128 m2, m2, xm4, 1
5903 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5904 paddw m0, m4
5905 pmaddubsw m2, [r5]
5906 lea r0, [r0 + r1 * 4]
5907 movu xm4, [r0] ; m4 = row 4
5908 punpckhbw xm5, xm3, xm4
5909 punpcklbw xm3, xm4
5910 vinserti128 m3, m3, xm5, 1
5911 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5912 paddw m1, m5
5913 pmaddubsw m3, [r5]
5914 movu xm5, [r0 + r1] ; m5 = row 5
5915 punpckhbw xm6, xm4, xm5
5916 punpcklbw xm4, xm5
5917 vinserti128 m4, m4, xm6, 1
5918 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5919 paddw m0, m6
5920 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5921 paddw m2, m6
5922 pmaddubsw m4, [r5]
5923 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5924 punpckhbw xm7, xm5, xm6
5925 punpcklbw xm5, xm6
5926 vinserti128 m5, m5, xm7, 1
5927 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5928 paddw m1, m7
5929 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5930 paddw m3, m7
5931 pmaddubsw m5, [r5]
5932 movu xm7, [r0 + r4] ; m7 = row 7
5933 punpckhbw xm8, xm6, xm7
5934 punpcklbw xm6, xm7
5935 vinserti128 m6, m6, xm8, 1
5936 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5937 paddw m0, m8
5938 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5939 paddw m2, m8
5940 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5941 paddw m4, m8
5942 pmaddubsw m6, [r5]
5943 lea r0, [r0 + r1 * 4]
5944 movu xm8, [r0] ; m8 = row 8
5945 punpckhbw xm9, xm7, xm8
5946 punpcklbw xm7, xm8
5947 vinserti128 m7, m7, xm9, 1
5948 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5949 paddw m1, m9
5950 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5951 paddw m3, m9
5952 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5953 paddw m5, m9
5954 pmaddubsw m7, [r5]
5955 movu xm9, [r0 + r1] ; m9 = row 9
5956 punpckhbw xm10, xm8, xm9
5957 punpcklbw xm8, xm9
5958 vinserti128 m8, m8, xm10, 1
5959 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5960 paddw m2, m10
5961 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5962 paddw m4, m10
5963 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5964 paddw m6, m10
5965 pmaddubsw m8, [r5]
5966 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5967 punpckhbw xm11, xm9, xm10
5968 punpcklbw xm9, xm10
5969 vinserti128 m9, m9, xm11, 1
5970 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5971 paddw m3, m11
5972 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5973 paddw m5, m11
5974 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5975 paddw m7, m11
5976 pmaddubsw m9, [r5]
5977 movu xm11, [r0 + r4] ; m11 = row 11
5978 punpckhbw xm12, xm10, xm11
5979 punpcklbw xm10, xm11
5980 vinserti128 m10, m10, xm12, 1
5981 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5982 paddw m4, m12
5983 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5984 paddw m6, m12
5985 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5986 paddw m8, m12
5987 pmaddubsw m10, [r5]
5988 lea r0, [r0 + r1 * 4]
5989 movu xm12, [r0] ; m12 = row 12
5990 punpckhbw xm13, xm11, xm12
5991 punpcklbw xm11, xm12
5992 vinserti128 m11, m11, xm13, 1
5993 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5994 paddw m5, m13
5995 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5996 paddw m7, m13
5997 pmaddubsw m13, m11, [r5 + 1 * mmsize]
5998 paddw m9, m13
5999 pmaddubsw m11, [r5]
6000
; Output rows 0..5: round and scale (pmulhrsw by m14; assuming pw_512 holds
; 512 in every word this is (x + 32) >> 6), then pack two rows per register.
; vpermq 11011000b reorders the qwords so the low 128 bits hold the even row
; and the high 128 bits the odd row. Each row stores 16 bytes.
6001 pmulhrsw m0, m14 ; m0 = word: row 0
6002 pmulhrsw m1, m14 ; m1 = word: row 1
6003 pmulhrsw m2, m14 ; m2 = word: row 2
6004 pmulhrsw m3, m14 ; m3 = word: row 3
6005 pmulhrsw m4, m14 ; m4 = word: row 4
6006 pmulhrsw m5, m14 ; m5 = word: row 5
6007 packuswb m0, m1
6008 packuswb m2, m3
6009 packuswb m4, m5
6010 vpermq m0, m0, 11011000b
6011 vpermq m2, m2, 11011000b
6012 vpermq m4, m4, 11011000b
6013 vextracti128 xm1, m0, 1
6014 vextracti128 xm3, m2, 1
6015 vextracti128 xm5, m4, 1
6016 movu [r2], xm0
6017 movu [r2 + r3], xm1
6018 movu [r2 + r3 * 2], xm2
6019 movu [r2 + r6], xm3
6020 lea r2, [r2 + r3 * 4]
6021 movu [r2], xm4
6022 movu [r2 + r3], xm5
6023
; Source rows 13..14 complete the sums for output rows 6 and 7. No [r5]
; product is formed for these pairs because output rows 12 and 13 do not exist.
6024 movu xm13, [r0 + r1] ; m13 = row 13
6025 punpckhbw xm0, xm12, xm13
6026 punpcklbw xm12, xm13
6027 vinserti128 m12, m12, xm0, 1
6028 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6029 paddw m6, m0
6030 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6031 paddw m8, m0
6032 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6033 paddw m10, m0
6034 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6035 punpckhbw xm1, xm13, xm0
6036 punpcklbw xm13, xm0
6037 vinserti128 m13, m13, xm1, 1
6038 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6039 paddw m7, m1
6040 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6041 paddw m9, m1
6042 pmaddubsw m1, m13, [r5 + 1 * mmsize]
6043 paddw m11, m1
6044
; Output rows 6..7: round, pack and store, then move dst down 4 rows.
6045 pmulhrsw m6, m14 ; m6 = word: row 6
6046 pmulhrsw m7, m14 ; m7 = word: row 7
6047 packuswb m6, m7
6048 vpermq m6, m6, 11011000b
6049 vextracti128 xm7, m6, 1
6050 movu [r2 + r3 * 2], xm6
6051 movu [r2 + r6], xm7
6052 lea r2, [r2 + r3 * 4]
6053
; Source rows 15..18 supply the remaining taps of output rows 8..11.
6054 movu xm1, [r0 + r4] ; m1 = row 15
6055 punpckhbw xm2, xm0, xm1
6056 punpcklbw xm0, xm1
6057 vinserti128 m0, m0, xm2, 1
6058 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6059 paddw m8, m2
6060 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6061 paddw m10, m2
6062 lea r0, [r0 + r1 * 4]
6063 movu xm2, [r0] ; m2 = row 16
6064 punpckhbw xm3, xm1, xm2
6065 punpcklbw xm1, xm2
6066 vinserti128 m1, m1, xm3, 1
6067 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6068 paddw m9, m3
6069 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6070 paddw m11, m3
6071 movu xm3, [r0 + r1] ; m3 = row 17
6072 punpckhbw xm4, xm2, xm3
6073 punpcklbw xm2, xm3
6074 vinserti128 m2, m2, xm4, 1
6075 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6076 paddw m10, m4
6077 movu xm4, [r0 + r1 * 2] ; m4 = row 18
6078 punpckhbw xm5, xm3, xm4
6079 punpcklbw xm3, xm4
6080 vinserti128 m3, m3, xm5, 1
6081 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6082 paddw m11, m5
6083
; Output rows 8..11: round, pack and store (16 bytes per row).
6084 pmulhrsw m8, m14 ; m8 = word: row 8
6085 pmulhrsw m9, m14 ; m9 = word: row 9
6086 pmulhrsw m10, m14 ; m10 = word: row 10
6087 pmulhrsw m11, m14 ; m11 = word: row 11
6088 packuswb m8, m9
6089 packuswb m10, m11
6090 vpermq m8, m8, 11011000b
6091 vpermq m10, m10, 11011000b
6092 vextracti128 xm9, m8, 1
6093 vextracti128 xm11, m10, 1
6094 movu [r2], xm8
6095 movu [r2 + r3], xm9
6096 movu [r2 + r3 * 2], xm10
6097 movu [r2 + r6], xm11
6098 RET
6099%endif
6100
;-------------------------------------------------------------------------------------------------------------
; AVX2 interp_8tap_vert_pp_16x8, assembled only when ARCH_X86_64 == 1.
; Arguments follow the file's interp_8tap_vert_pp_* convention:
;   (src, srcStride, dst, dstStride, coeffIdx)
;
; Register roles as used below:
;   r0 = src (moved up 3 rows at entry, then advanced 4 rows at a time)
;   r1 = srcStride, r4 = 3 * srcStride (after the coefficient lookup)
;   r2 = dst, r3 = dstStride, r6 = 3 * dstStride
;   r5 = tab_LumaCoeffVer_32 + coeffIdx * 128; for output row n the 32-byte
;        entry at [r5 + k * mmsize] (k = 0..3) multiplies the interleaved
;        source row pair (n + 2k, n + 2k + 1)
;   m14 = pw_512 (defined outside this excerpt)
;
; Fully unrolled over source rows 0..14. Only 8 output rows are needed, so
; the last accumulator started is m7 (output row 7); later row pairs only
; add their remaining taps to rows 2..7.
;-------------------------------------------------------------------------------------------------------------
6101INIT_YMM avx2
6102%if ARCH_X86_64 == 1
6103cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
; Only 4 arguments are auto-loaded; coeffIdx is fetched from its argument slot.
6104 mov r4d, r4m
; coeffIdx * 128 = byte offset of the coefficient set (4 entries * mmsize).
6105 shl r4d, 7
6106
6107%ifdef PIC
6108 lea r5, [tab_LumaCoeffVer_32]
6109 add r5, r4
6110%else
6111 lea r5, [tab_LumaCoeffVer_32 + r4]
6112%endif
6113
; r4 is reused as 3 * srcStride; src is moved up by 3 rows.
6114 lea r4, [r1 * 3]
6115 sub r0, r4
6116 lea r6, [r3 * 3]
6117 mova m14, [pw_512]
6118
; Source rows 0..12: build the interleaved row pairs and accumulate the
; sums for output rows 0..7 (rows 0..5 become complete here).
6119 movu xm0, [r0] ; m0 = row 0
6120 movu xm1, [r0 + r1] ; m1 = row 1
6121 punpckhbw xm2, xm0, xm1
6122 punpcklbw xm0, xm1
6123 vinserti128 m0, m0, xm2, 1
6124 pmaddubsw m0, [r5]
6125 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6126 punpckhbw xm3, xm1, xm2
6127 punpcklbw xm1, xm2
6128 vinserti128 m1, m1, xm3, 1
6129 pmaddubsw m1, [r5]
6130 movu xm3, [r0 + r4] ; m3 = row 3
6131 punpckhbw xm4, xm2, xm3
6132 punpcklbw xm2, xm3
6133 vinserti128 m2, m2, xm4, 1
6134 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6135 paddw m0, m4
6136 pmaddubsw m2, [r5]
6137 lea r0, [r0 + r1 * 4]
6138 movu xm4, [r0] ; m4 = row 4
6139 punpckhbw xm5, xm3, xm4
6140 punpcklbw xm3, xm4
6141 vinserti128 m3, m3, xm5, 1
6142 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6143 paddw m1, m5
6144 pmaddubsw m3, [r5]
6145 movu xm5, [r0 + r1] ; m5 = row 5
6146 punpckhbw xm6, xm4, xm5
6147 punpcklbw xm4, xm5
6148 vinserti128 m4, m4, xm6, 1
6149 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6150 paddw m0, m6
6151 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6152 paddw m2, m6
6153 pmaddubsw m4, [r5]
6154 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6155 punpckhbw xm7, xm5, xm6
6156 punpcklbw xm5, xm6
6157 vinserti128 m5, m5, xm7, 1
6158 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6159 paddw m1, m7
6160 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6161 paddw m3, m7
6162 pmaddubsw m5, [r5]
6163 movu xm7, [r0 + r4] ; m7 = row 7
6164 punpckhbw xm8, xm6, xm7
6165 punpcklbw xm6, xm7
6166 vinserti128 m6, m6, xm8, 1
6167 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6168 paddw m0, m8
6169 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6170 paddw m2, m8
6171 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6172 paddw m4, m8
6173 pmaddubsw m6, [r5]
6174 lea r0, [r0 + r1 * 4]
6175 movu xm8, [r0] ; m8 = row 8
6176 punpckhbw xm9, xm7, xm8
6177 punpcklbw xm7, xm8
6178 vinserti128 m7, m7, xm9, 1
6179 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6180 paddw m1, m9
6181 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6182 paddw m3, m9
6183 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6184 paddw m5, m9
6185 pmaddubsw m7, [r5]
; From here on no [r5] product is formed: output rows 8 and above do not exist.
6186 movu xm9, [r0 + r1] ; m9 = row 9
6187 punpckhbw xm10, xm8, xm9
6188 punpcklbw xm8, xm9
6189 vinserti128 m8, m8, xm10, 1
6190 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6191 paddw m2, m10
6192 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6193 paddw m4, m10
6194 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6195 paddw m6, m10
6196 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6197 punpckhbw xm11, xm9, xm10
6198 punpcklbw xm9, xm10
6199 vinserti128 m9, m9, xm11, 1
6200 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6201 paddw m3, m11
6202 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6203 paddw m5, m11
6204 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6205 paddw m7, m11
6206 movu xm11, [r0 + r4] ; m11 = row 11
6207 punpckhbw xm12, xm10, xm11
6208 punpcklbw xm10, xm11
6209 vinserti128 m10, m10, xm12, 1
6210 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6211 paddw m4, m12
6212 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6213 paddw m6, m12
6214 lea r0, [r0 + r1 * 4]
6215 movu xm12, [r0] ; m12 = row 12
6216 punpckhbw xm13, xm11, xm12
6217 punpcklbw xm11, xm12
6218 vinserti128 m11, m11, xm13, 1
6219 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6220 paddw m5, m13
6221 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6222 paddw m7, m13
6223
; Output rows 0..5: round and scale (pmulhrsw by m14; assuming pw_512 holds
; 512 in every word this is (x + 32) >> 6), then pack two rows per register.
; vpermq 11011000b reorders the qwords so the low 128 bits hold the even row
; and the high 128 bits the odd row. Each row stores 16 bytes.
6224 pmulhrsw m0, m14 ; m0 = word: row 0
6225 pmulhrsw m1, m14 ; m1 = word: row 1
6226 pmulhrsw m2, m14 ; m2 = word: row 2
6227 pmulhrsw m3, m14 ; m3 = word: row 3
6228 pmulhrsw m4, m14 ; m4 = word: row 4
6229 pmulhrsw m5, m14 ; m5 = word: row 5
6230 packuswb m0, m1
6231 packuswb m2, m3
6232 packuswb m4, m5
6233 vpermq m0, m0, 11011000b
6234 vpermq m2, m2, 11011000b
6235 vpermq m4, m4, 11011000b
6236 vextracti128 xm1, m0, 1
6237 vextracti128 xm3, m2, 1
6238 vextracti128 xm5, m4, 1
6239 movu [r2], xm0
6240 movu [r2 + r3], xm1
6241 movu [r2 + r3 * 2], xm2
6242 movu [r2 + r6], xm3
6243 lea r2, [r2 + r3 * 4]
6244 movu [r2], xm4
6245 movu [r2 + r3], xm5
6246
; Source rows 13..14 supply the last tap pair of output rows 6 and 7.
6247 movu xm13, [r0 + r1] ; m13 = row 13
6248 punpckhbw xm0, xm12, xm13
6249 punpcklbw xm12, xm13
6250 vinserti128 m12, m12, xm0, 1
6251 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6252 paddw m6, m0
6253 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6254 punpckhbw xm1, xm13, xm0
6255 punpcklbw xm13, xm0
6256 vinserti128 m13, m13, xm1, 1
6257 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6258 paddw m7, m1
6259
; Output rows 6..7: round, pack and store.
6260 pmulhrsw m6, m14 ; m6 = word: row 6
6261 pmulhrsw m7, m14 ; m7 = word: row 7
6262 packuswb m6, m7
6263 vpermq m6, m6, 11011000b
6264 vextracti128 xm7, m6, 1
6265 movu [r2 + r3 * 2], xm6
6266 movu [r2 + r6], xm7
6267 RET
6268%endif
6269
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_16x4 (AVX2, built only when ARCH_X86_64 == 1)
; Vertical filter producing a 16-column x 4-row block of bytes.
;   r0  = input pointer; rows are read at multiples of r1
;   r1  = input row step in bytes
;   r2  = output pointer; rows are written at multiples of r3
;   r3  = output row step in bytes
;   r4m = fifth argument (x86inc naming); multiplied by 128 to pick a coefficient
;         set in tab_LumaCoeffVer_32 (presumably the filter index - confirm
;         against the C prototype)
; cglobal declares 4 register arguments, 7 GPRs and 13 vector registers.
; Output row k accumulates interleaved input row pairs (k,k+1), (k+2,k+3),
; (k+4,k+5), (k+6,k+7) multiplied by [r5 + 0..3 * mmsize] respectively, where
; "row 0" is three rows above the incoming r0.  Rows 0..10 are read.
;-----------------------------------------------------------------------------
6270INIT_YMM avx2
6271%if ARCH_X86_64 == 1
6272cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
6273 mov r4d, r4m
6274 shl r4d, 7
6275
; r5 = address of the selected coefficient set; the PIC path adds the offset
; in a separate instruction instead of folding it into the address expression
6276%ifdef PIC
6277 lea r5, [tab_LumaCoeffVer_32]
6278 add r5, r4
6279%else
6280 lea r5, [tab_LumaCoeffVer_32 + r4]
6281%endif
6282
; r4 = 3 * r1; move r0 up three rows so that "row 0" is the first row read.
; r6 = 3 * r3 addresses the fourth output row.
; m12 = pw_512 (defined elsewhere): multiplier for the final pmulhrsw; if it
; holds words of 512 the result is (x + 32) >> 6 - TODO confirm.
6283 lea r4, [r1 * 3]
6284 sub r0, r4
6285 lea r6, [r3 * 3]
6286 mova m12, [pw_512]
6287
; Each step below interleaves two consecutive rows byte-wise (columns 0-7 in
; the low lane, columns 8-15 in the high lane) so a single pmaddubsw applies
; two vertical taps per column.
6288 movu xm0, [r0] ; m0 = row 0
6289 movu xm1, [r0 + r1] ; m1 = row 1
6290 punpckhbw xm2, xm0, xm1
6291 punpcklbw xm0, xm1
6292 vinserti128 m0, m0, xm2, 1
6293 pmaddubsw m0, [r5]
6294 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6295 punpckhbw xm3, xm1, xm2
6296 punpcklbw xm1, xm2
6297 vinserti128 m1, m1, xm3, 1
6298 pmaddubsw m1, [r5]
6299 movu xm3, [r0 + r4] ; m3 = row 3
6300 punpckhbw xm4, xm2, xm3
6301 punpcklbw xm2, xm3
6302 vinserti128 m2, m2, xm4, 1
6303 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6304 paddw m0, m4
6305 pmaddubsw m2, [r5]
6306 lea r0, [r0 + r1 * 4]
6307 movu xm4, [r0] ; m4 = row 4
6308 punpckhbw xm5, xm3, xm4
6309 punpcklbw xm3, xm4
6310 vinserti128 m3, m3, xm5, 1
6311 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6312 paddw m1, m5
6313 pmaddubsw m3, [r5]
6314 movu xm5, [r0 + r1] ; m5 = row 5
6315 punpckhbw xm6, xm4, xm5
6316 punpcklbw xm4, xm5
6317 vinserti128 m4, m4, xm6, 1
6318 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6319 paddw m0, m6
6320 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6321 paddw m2, m6
6322 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6323 punpckhbw xm7, xm5, xm6
6324 punpcklbw xm5, xm6
6325 vinserti128 m5, m5, xm7, 1
6326 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6327 paddw m1, m7
6328 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6329 paddw m3, m7
6330 movu xm7, [r0 + r4] ; m7 = row 7
6331 punpckhbw xm8, xm6, xm7
6332 punpcklbw xm6, xm7
6333 vinserti128 m6, m6, xm8, 1
6334 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6335 paddw m0, m8
6336 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6337 paddw m2, m8
6338 lea r0, [r0 + r1 * 4]
6339 movu xm8, [r0] ; m8 = row 8
6340 punpckhbw xm9, xm7, xm8
6341 punpcklbw xm7, xm8
6342 vinserti128 m7, m7, xm9, 1
6343 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6344 paddw m1, m9
6345 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6346 paddw m3, m9
6347 movu xm9, [r0 + r1] ; m9 = row 9
6348 punpckhbw xm10, xm8, xm9
6349 punpcklbw xm8, xm9
6350 vinserti128 m8, m8, xm10, 1
6351 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6352 paddw m2, m10
6353 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6354 punpckhbw xm11, xm9, xm10
6355 punpcklbw xm9, xm10
6356 vinserti128 m9, m9, xm11, 1
6357 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6358 paddw m3, m11
6359
; m0..m3 now hold the four finished word sums: scale/round with m12, saturate
; to unsigned bytes and store.
6360 pmulhrsw m0, m12 ; m0 = word: row 0
6361 pmulhrsw m1, m12 ; m1 = word: row 1
6362 pmulhrsw m2, m12 ; m2 = word: row 2
6363 pmulhrsw m3, m12 ; m3 = word: row 3
6364 packuswb m0, m1
6365 packuswb m2, m3
; packuswb works per 128-bit lane, leaving the two rows interleaved by qword;
; vpermq with qword order 0,2,1,3 puts one full 16-byte row in each lane.
6366 vpermq m0, m0, 11011000b
6367 vpermq m2, m2, 11011000b
6368 vextracti128 xm1, m0, 1
6369 vextracti128 xm3, m2, 1
6370 movu [r2], xm0
6371 movu [r2 + r3], xm1
6372 movu [r2 + r3 * 2], xm2
6373 movu [r2 + r6], xm3
6374 RET
6375%endif
6376
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_16xN %1, %2
; Emits interp_8tap_vert_pp_%1x%2 (AVX2, built only when ARCH_X86_64 == 1).
; %1 appears only in the function name; the body always processes 16 columns.
; %2 is the height; .loop runs %2 / 16 times and each pass reads input rows
; 0..22 and writes 16 output rows of 16 bytes.
;   r0  = input pointer (stepped back three rows before the loop)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7 = 4*r1,
; r8d = remaining loop passes, m14 = pw_512 (pmulhrsw multiplier).
; Output row k accumulates interleaved input row pairs (k,k+1), (k+2,k+3),
; (k+4,k+5), (k+6,k+7) times [r5 + 0..3 * mmsize].
;-----------------------------------------------------------------------------
6377%macro FILTER_VER_LUMA_AVX2_16xN 2
6378INIT_YMM avx2
6379%if ARCH_X86_64 == 1
6380cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
6381 mov r4d, r4m
6382 shl r4d, 7
6383
6384%ifdef PIC
6385 lea r5, [tab_LumaCoeffVer_32]
6386 add r5, r4
6387%else
6388 lea r5, [tab_LumaCoeffVer_32 + r4]
6389%endif
6390
6391 lea r4, [r1 * 3]
6392 sub r0, r4
6393 lea r6, [r3 * 3]
6394 lea r7, [r1 * 4]
6395 mova m14, [pw_512]
6396 mov r8d, %2 / 16
6397
; One pass = one 16x16 tile.  Two consecutive rows are interleaved byte-wise
; (columns 0-7 in the low lane, 8-15 in the high lane) so each pmaddubsw
; applies two vertical taps per column.
6398.loop:
6399 movu xm0, [r0] ; m0 = row 0
6400 movu xm1, [r0 + r1] ; m1 = row 1
6401 punpckhbw xm2, xm0, xm1
6402 punpcklbw xm0, xm1
6403 vinserti128 m0, m0, xm2, 1
6404 pmaddubsw m0, [r5]
6405 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6406 punpckhbw xm3, xm1, xm2
6407 punpcklbw xm1, xm2
6408 vinserti128 m1, m1, xm3, 1
6409 pmaddubsw m1, [r5]
6410 movu xm3, [r0 + r4] ; m3 = row 3
6411 punpckhbw xm4, xm2, xm3
6412 punpcklbw xm2, xm3
6413 vinserti128 m2, m2, xm4, 1
6414 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6415 paddw m0, m4
6416 pmaddubsw m2, [r5]
6417 lea r0, [r0 + r1 * 4]
6418 movu xm4, [r0] ; m4 = row 4
6419 punpckhbw xm5, xm3, xm4
6420 punpcklbw xm3, xm4
6421 vinserti128 m3, m3, xm5, 1
6422 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6423 paddw m1, m5
6424 pmaddubsw m3, [r5]
6425 movu xm5, [r0 + r1] ; m5 = row 5
6426 punpckhbw xm6, xm4, xm5
6427 punpcklbw xm4, xm5
6428 vinserti128 m4, m4, xm6, 1
6429 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6430 paddw m0, m6
6431 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6432 paddw m2, m6
6433 pmaddubsw m4, [r5]
6434 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6435 punpckhbw xm7, xm5, xm6
6436 punpcklbw xm5, xm6
6437 vinserti128 m5, m5, xm7, 1
6438 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6439 paddw m1, m7
6440 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6441 paddw m3, m7
6442 pmaddubsw m5, [r5]
6443 movu xm7, [r0 + r4] ; m7 = row 7
6444 punpckhbw xm8, xm6, xm7
6445 punpcklbw xm6, xm7
6446 vinserti128 m6, m6, xm8, 1
6447 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6448 paddw m0, m8
6449 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6450 paddw m2, m8
6451 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6452 paddw m4, m8
6453 pmaddubsw m6, [r5]
6454 lea r0, [r0 + r1 * 4]
6455 movu xm8, [r0] ; m8 = row 8
6456 punpckhbw xm9, xm7, xm8
6457 punpcklbw xm7, xm8
6458 vinserti128 m7, m7, xm9, 1
6459 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6460 paddw m1, m9
6461 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6462 paddw m3, m9
6463 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6464 paddw m5, m9
6465 pmaddubsw m7, [r5]
6466 movu xm9, [r0 + r1] ; m9 = row 9
6467 punpckhbw xm10, xm8, xm9
6468 punpcklbw xm8, xm9
6469 vinserti128 m8, m8, xm10, 1
6470 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6471 paddw m2, m10
6472 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6473 paddw m4, m10
6474 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6475 paddw m6, m10
6476 pmaddubsw m8, [r5]
6477 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6478 punpckhbw xm11, xm9, xm10
6479 punpcklbw xm9, xm10
6480 vinserti128 m9, m9, xm11, 1
6481 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6482 paddw m3, m11
6483 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6484 paddw m5, m11
6485 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6486 paddw m7, m11
6487 pmaddubsw m9, [r5]
6488 movu xm11, [r0 + r4] ; m11 = row 11
6489 punpckhbw xm12, xm10, xm11
6490 punpcklbw xm10, xm11
6491 vinserti128 m10, m10, xm12, 1
6492 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6493 paddw m4, m12
6494 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6495 paddw m6, m12
6496 pmaddubsw m12, m10, [r5 + 1 * mmsize]
6497 paddw m8, m12
6498 pmaddubsw m10, [r5]
6499 lea r0, [r0 + r1 * 4]
6500 movu xm12, [r0] ; m12 = row 12
6501 punpckhbw xm13, xm11, xm12
6502 punpcklbw xm11, xm12
6503 vinserti128 m11, m11, xm13, 1
6504 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6505 paddw m5, m13
6506 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6507 paddw m7, m13
6508 pmaddubsw m13, m11, [r5 + 1 * mmsize]
6509 paddw m9, m13
6510 pmaddubsw m11, [r5]
6511
; Output rows 0-5 are complete: scale/round with m14, saturate to bytes, then
; vpermq (qword order 0,2,1,3) puts one full 16-byte row in each lane.
6512 pmulhrsw m0, m14 ; m0 = word: row 0
6513 pmulhrsw m1, m14 ; m1 = word: row 1
6514 pmulhrsw m2, m14 ; m2 = word: row 2
6515 pmulhrsw m3, m14 ; m3 = word: row 3
6516 pmulhrsw m4, m14 ; m4 = word: row 4
6517 pmulhrsw m5, m14 ; m5 = word: row 5
6518 packuswb m0, m1
6519 packuswb m2, m3
6520 packuswb m4, m5
6521 vpermq m0, m0, 11011000b
6522 vpermq m2, m2, 11011000b
6523 vpermq m4, m4, 11011000b
6524 vextracti128 xm1, m0, 1
6525 vextracti128 xm3, m2, 1
6526 vextracti128 xm5, m4, 1
6527 movu [r2], xm0
6528 movu [r2 + r3], xm1
6529 movu [r2 + r3 * 2], xm2
6530 movu [r2 + r6], xm3
6531 lea r2, [r2 + r3 * 4]
6532 movu [r2], xm4
6533 movu [r2 + r3], xm5
6534
; m0..m5 are free again and get recycled below, first as scratch and later as
; accumulators for output rows 14 and 15.
6535 movu xm13, [r0 + r1] ; m13 = row 13
6536 punpckhbw xm0, xm12, xm13
6537 punpcklbw xm12, xm13
6538 vinserti128 m12, m12, xm0, 1
6539 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6540 paddw m6, m0
6541 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6542 paddw m8, m0
6543 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6544 paddw m10, m0
6545 pmaddubsw m12, [r5]
6546 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6547 punpckhbw xm1, xm13, xm0
6548 punpcklbw xm13, xm0
6549 vinserti128 m13, m13, xm1, 1
6550 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6551 paddw m7, m1
6552 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6553 paddw m9, m1
6554 pmaddubsw m1, m13, [r5 + 1 * mmsize]
6555 paddw m11, m1
6556 pmaddubsw m13, [r5]
6557
; Output rows 6 and 7 are complete.
6558 pmulhrsw m6, m14 ; m6 = word: row 6
6559 pmulhrsw m7, m14 ; m7 = word: row 7
6560 packuswb m6, m7
6561 vpermq m6, m6, 11011000b
6562 vextracti128 xm7, m6, 1
6563 movu [r2 + r3 * 2], xm6
6564 movu [r2 + r6], xm7
6565 lea r2, [r2 + r3 * 4]
6566
; Remaining input rows 15..22 finish output rows 8..15.
6567 movu xm1, [r0 + r4] ; m1 = row 15
6568 punpckhbw xm2, xm0, xm1
6569 punpcklbw xm0, xm1
6570 vinserti128 m0, m0, xm2, 1
6571 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6572 paddw m8, m2
6573 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6574 paddw m10, m2
6575 pmaddubsw m2, m0, [r5 + 1 * mmsize]
6576 paddw m12, m2
6577 pmaddubsw m0, [r5]
6578 lea r0, [r0 + r1 * 4]
6579 movu xm2, [r0] ; m2 = row 16
6580 punpckhbw xm3, xm1, xm2
6581 punpcklbw xm1, xm2
6582 vinserti128 m1, m1, xm3, 1
6583 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6584 paddw m9, m3
6585 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6586 paddw m11, m3
6587 pmaddubsw m3, m1, [r5 + 1 * mmsize]
6588 paddw m13, m3
6589 pmaddubsw m1, [r5]
6590 movu xm3, [r0 + r1] ; m3 = row 17
6591 punpckhbw xm4, xm2, xm3
6592 punpcklbw xm2, xm3
6593 vinserti128 m2, m2, xm4, 1
6594 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6595 paddw m10, m4
6596 pmaddubsw m4, m2, [r5 + 2 * mmsize]
6597 paddw m12, m4
6598 pmaddubsw m2, [r5 + 1 * mmsize]
6599 paddw m0, m2
6600 movu xm4, [r0 + r1 * 2] ; m4 = row 18
6601 punpckhbw xm5, xm3, xm4
6602 punpcklbw xm3, xm4
6603 vinserti128 m3, m3, xm5, 1
6604 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6605 paddw m11, m5
6606 pmaddubsw m5, m3, [r5 + 2 * mmsize]
6607 paddw m13, m5
6608 pmaddubsw m3, [r5 + 1 * mmsize]
6609 paddw m1, m3
6610 movu xm5, [r0 + r4] ; m5 = row 19
6611 punpckhbw xm6, xm4, xm5
6612 punpcklbw xm4, xm5
6613 vinserti128 m4, m4, xm6, 1
6614 pmaddubsw m6, m4, [r5 + 3 * mmsize]
6615 paddw m12, m6
6616 pmaddubsw m4, [r5 + 2 * mmsize]
6617 paddw m0, m4
6618 lea r0, [r0 + r1 * 4]
6619 movu xm6, [r0] ; m6 = row 20
6620 punpckhbw xm7, xm5, xm6
6621 punpcklbw xm5, xm6
6622 vinserti128 m5, m5, xm7, 1
6623 pmaddubsw m7, m5, [r5 + 3 * mmsize]
6624 paddw m13, m7
6625 pmaddubsw m5, [r5 + 2 * mmsize]
6626 paddw m1, m5
6627 movu xm7, [r0 + r1] ; m7 = row 21
6628 punpckhbw xm2, xm6, xm7
6629 punpcklbw xm6, xm7
6630 vinserti128 m6, m6, xm2, 1
6631 pmaddubsw m6, [r5 + 3 * mmsize]
6632 paddw m0, m6
6633 movu xm2, [r0 + r1 * 2] ; m2 = row 22
6634 punpckhbw xm3, xm7, xm2
6635 punpcklbw xm7, xm2
6636 vinserti128 m7, m7, xm3, 1
6637 pmaddubsw m7, [r5 + 3 * mmsize]
6638 paddw m1, m7
6639
6640 pmulhrsw m8, m14 ; m8 = word: row 8
6641 pmulhrsw m9, m14 ; m9 = word: row 9
6642 pmulhrsw m10, m14 ; m10 = word: row 10
6643 pmulhrsw m11, m14 ; m11 = word: row 11
6644 pmulhrsw m12, m14 ; m12 = word: row 12
6645 pmulhrsw m13, m14 ; m13 = word: row 13
6646 pmulhrsw m0, m14 ; m0 = word: row 14
6647 pmulhrsw m1, m14 ; m1 = word: row 15
6648 packuswb m8, m9
6649 packuswb m10, m11
6650 packuswb m12, m13
6651 packuswb m0, m1
6652 vpermq m8, m8, 11011000b
6653 vpermq m10, m10, 11011000b
6654 vpermq m12, m12, 11011000b
6655 vpermq m0, m0, 11011000b
6656 vextracti128 xm9, m8, 1
6657 vextracti128 xm11, m10, 1
6658 vextracti128 xm13, m12, 1
6659 vextracti128 xm1, m0, 1
6660 movu [r2], xm8
6661 movu [r2 + r3], xm9
6662 movu [r2 + r3 * 2], xm10
6663 movu [r2 + r6], xm11
6664 lea r2, [r2 + r3 * 4]
6665 movu [r2], xm12
6666 movu [r2 + r3], xm13
6667 movu [r2 + r3 * 2], xm0
6668 movu [r2 + r6], xm1
6669 lea r2, [r2 + r3 * 4]
; r0 advanced 20 rows during this pass (five steps of 4*r1); back up 4 rows
; (r7) so the next pass starts 16 rows below this one.  r2 has advanced
; exactly 16 rows.
6670 sub r0, r7
6671 dec r8d
6672 jnz .loop
6673 RET
6674%endif
6675%endmacro
6676
; Instantiations: 16x32 and 16x64.
6677FILTER_VER_LUMA_AVX2_16xN 16, 32
6678FILTER_VER_LUMA_AVX2_16xN 16, 64
6679
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W16_16R (no macro parameters)
; Filters one 16-column x 16-row tile using registers prepared by the
; expanding function:
;   in:  r0 = first input row to read ("row 0"), r1 = input row step,
;        r4 = offset of row 3 (the functions in this file set it to 3*r1),
;        r2 = first output row, r3 = output row step,
;        r6 = offset of the fourth output row (set to 3*r3 by the callers),
;        r5 = coefficient set (vectors at r5 + 0..3 * mmsize),
;        m14 = pmulhrsw multiplier
;   out: r7 = r0 + 20*r1, r8 = r2 + 12*r3
;   r0 and r2 themselves are not modified; m0-m13, r7 and r8 are overwritten.
; Input rows 0..22 are read; output row k accumulates interleaved row pairs
; (k,k+1), (k+2,k+3), (k+4,k+5), (k+6,k+7) times [r5 + 0..3 * mmsize].
;-----------------------------------------------------------------------------
6680%macro PROCESS_LUMA_AVX2_W16_16R 0
6681 movu xm0, [r0] ; m0 = row 0
6682 movu xm1, [r0 + r1] ; m1 = row 1
6683 punpckhbw xm2, xm0, xm1
6684 punpcklbw xm0, xm1
6685 vinserti128 m0, m0, xm2, 1
6686 pmaddubsw m0, [r5]
6687 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6688 punpckhbw xm3, xm1, xm2
6689 punpcklbw xm1, xm2
6690 vinserti128 m1, m1, xm3, 1
6691 pmaddubsw m1, [r5]
6692 movu xm3, [r0 + r4] ; m3 = row 3
6693 punpckhbw xm4, xm2, xm3
6694 punpcklbw xm2, xm3
6695 vinserti128 m2, m2, xm4, 1
6696 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6697 paddw m0, m4
6698 pmaddubsw m2, [r5]
; r7 is the running input pointer from here on, so r0 stays untouched
6699 lea r7, [r0 + r1 * 4]
6700 movu xm4, [r7] ; m4 = row 4
6701 punpckhbw xm5, xm3, xm4
6702 punpcklbw xm3, xm4
6703 vinserti128 m3, m3, xm5, 1
6704 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6705 paddw m1, m5
6706 pmaddubsw m3, [r5]
6707 movu xm5, [r7 + r1] ; m5 = row 5
6708 punpckhbw xm6, xm4, xm5
6709 punpcklbw xm4, xm5
6710 vinserti128 m4, m4, xm6, 1
6711 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6712 paddw m0, m6
6713 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6714 paddw m2, m6
6715 pmaddubsw m4, [r5]
6716 movu xm6, [r7 + r1 * 2] ; m6 = row 6
6717 punpckhbw xm7, xm5, xm6
6718 punpcklbw xm5, xm6
6719 vinserti128 m5, m5, xm7, 1
6720 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6721 paddw m1, m7
6722 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6723 paddw m3, m7
6724 pmaddubsw m5, [r5]
6725 movu xm7, [r7 + r4] ; m7 = row 7
6726 punpckhbw xm8, xm6, xm7
6727 punpcklbw xm6, xm7
6728 vinserti128 m6, m6, xm8, 1
6729 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6730 paddw m0, m8
6731 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6732 paddw m2, m8
6733 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6734 paddw m4, m8
6735 pmaddubsw m6, [r5]
6736 lea r7, [r7 + r1 * 4]
6737 movu xm8, [r7] ; m8 = row 8
6738 punpckhbw xm9, xm7, xm8
6739 punpcklbw xm7, xm8
6740 vinserti128 m7, m7, xm9, 1
6741 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6742 paddw m1, m9
6743 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6744 paddw m3, m9
6745 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6746 paddw m5, m9
6747 pmaddubsw m7, [r5]
6748 movu xm9, [r7 + r1] ; m9 = row 9
6749 punpckhbw xm10, xm8, xm9
6750 punpcklbw xm8, xm9
6751 vinserti128 m8, m8, xm10, 1
6752 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6753 paddw m2, m10
6754 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6755 paddw m4, m10
6756 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6757 paddw m6, m10
6758 pmaddubsw m8, [r5]
6759 movu xm10, [r7 + r1 * 2] ; m10 = row 10
6760 punpckhbw xm11, xm9, xm10
6761 punpcklbw xm9, xm10
6762 vinserti128 m9, m9, xm11, 1
6763 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6764 paddw m3, m11
6765 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6766 paddw m5, m11
6767 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6768 paddw m7, m11
6769 pmaddubsw m9, [r5]
6770 movu xm11, [r7 + r4] ; m11 = row 11
6771 punpckhbw xm12, xm10, xm11
6772 punpcklbw xm10, xm11
6773 vinserti128 m10, m10, xm12, 1
6774 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6775 paddw m4, m12
6776 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6777 paddw m6, m12
6778 pmaddubsw m12, m10, [r5 + 1 * mmsize]
6779 paddw m8, m12
6780 pmaddubsw m10, [r5]
6781 lea r7, [r7 + r1 * 4]
6782 movu xm12, [r7] ; m12 = row 12
6783 punpckhbw xm13, xm11, xm12
6784 punpcklbw xm11, xm12
6785 vinserti128 m11, m11, xm13, 1
6786 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6787 paddw m5, m13
6788 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6789 paddw m7, m13
6790 pmaddubsw m13, m11, [r5 + 1 * mmsize]
6791 paddw m9, m13
6792 pmaddubsw m11, [r5]
6793
; Output rows 0-5 are complete: scale/round with m14, saturate to bytes, then
; vpermq (qword order 0,2,1,3) puts one full 16-byte row in each lane.
6794 pmulhrsw m0, m14 ; m0 = word: row 0
6795 pmulhrsw m1, m14 ; m1 = word: row 1
6796 pmulhrsw m2, m14 ; m2 = word: row 2
6797 pmulhrsw m3, m14 ; m3 = word: row 3
6798 pmulhrsw m4, m14 ; m4 = word: row 4
6799 pmulhrsw m5, m14 ; m5 = word: row 5
6800 packuswb m0, m1
6801 packuswb m2, m3
6802 packuswb m4, m5
6803 vpermq m0, m0, 11011000b
6804 vpermq m2, m2, 11011000b
6805 vpermq m4, m4, 11011000b
6806 vextracti128 xm1, m0, 1
6807 vextracti128 xm3, m2, 1
6808 vextracti128 xm5, m4, 1
6809 movu [r2], xm0
6810 movu [r2 + r3], xm1
6811 movu [r2 + r3 * 2], xm2
6812 movu [r2 + r6], xm3
; r8 is the running output pointer from here on, so r2 stays untouched
6813 lea r8, [r2 + r3 * 4]
6814 movu [r8], xm4
6815 movu [r8 + r3], xm5
6816
; m0..m5 are free again and get recycled below, first as scratch and later as
; accumulators for output rows 14 and 15.
6817 movu xm13, [r7 + r1] ; m13 = row 13
6818 punpckhbw xm0, xm12, xm13
6819 punpcklbw xm12, xm13
6820 vinserti128 m12, m12, xm0, 1
6821 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6822 paddw m6, m0
6823 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6824 paddw m8, m0
6825 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6826 paddw m10, m0
6827 pmaddubsw m12, [r5]
6828 movu xm0, [r7 + r1 * 2] ; m0 = row 14
6829 punpckhbw xm1, xm13, xm0
6830 punpcklbw xm13, xm0
6831 vinserti128 m13, m13, xm1, 1
6832 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6833 paddw m7, m1
6834 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6835 paddw m9, m1
6836 pmaddubsw m1, m13, [r5 + 1 * mmsize]
6837 paddw m11, m1
6838 pmaddubsw m13, [r5]
6839
; Output rows 6 and 7 are complete.
6840 pmulhrsw m6, m14 ; m6 = word: row 6
6841 pmulhrsw m7, m14 ; m7 = word: row 7
6842 packuswb m6, m7
6843 vpermq m6, m6, 11011000b
6844 vextracti128 xm7, m6, 1
6845 movu [r8 + r3 * 2], xm6
6846 movu [r8 + r6], xm7
6847 lea r8, [r8 + r3 * 4]
6848
; Remaining input rows 15..22 finish output rows 8..15.
6849 movu xm1, [r7 + r4] ; m1 = row 15
6850 punpckhbw xm2, xm0, xm1
6851 punpcklbw xm0, xm1
6852 vinserti128 m0, m0, xm2, 1
6853 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6854 paddw m8, m2
6855 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6856 paddw m10, m2
6857 pmaddubsw m2, m0, [r5 + 1 * mmsize]
6858 paddw m12, m2
6859 pmaddubsw m0, [r5]
6860 lea r7, [r7 + r1 * 4]
6861 movu xm2, [r7] ; m2 = row 16
6862 punpckhbw xm3, xm1, xm2
6863 punpcklbw xm1, xm2
6864 vinserti128 m1, m1, xm3, 1
6865 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6866 paddw m9, m3
6867 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6868 paddw m11, m3
6869 pmaddubsw m3, m1, [r5 + 1 * mmsize]
6870 paddw m13, m3
6871 pmaddubsw m1, [r5]
6872 movu xm3, [r7 + r1] ; m3 = row 17
6873 punpckhbw xm4, xm2, xm3
6874 punpcklbw xm2, xm3
6875 vinserti128 m2, m2, xm4, 1
6876 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6877 paddw m10, m4
6878 pmaddubsw m4, m2, [r5 + 2 * mmsize]
6879 paddw m12, m4
6880 pmaddubsw m2, [r5 + 1 * mmsize]
6881 paddw m0, m2
6882 movu xm4, [r7 + r1 * 2] ; m4 = row 18
6883 punpckhbw xm5, xm3, xm4
6884 punpcklbw xm3, xm4
6885 vinserti128 m3, m3, xm5, 1
6886 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6887 paddw m11, m5
6888 pmaddubsw m5, m3, [r5 + 2 * mmsize]
6889 paddw m13, m5
6890 pmaddubsw m3, [r5 + 1 * mmsize]
6891 paddw m1, m3
6892 movu xm5, [r7 + r4] ; m5 = row 19
6893 punpckhbw xm6, xm4, xm5
6894 punpcklbw xm4, xm5
6895 vinserti128 m4, m4, xm6, 1
6896 pmaddubsw m6, m4, [r5 + 3 * mmsize]
6897 paddw m12, m6
6898 pmaddubsw m4, [r5 + 2 * mmsize]
6899 paddw m0, m4
6900 lea r7, [r7 + r1 * 4]
6901 movu xm6, [r7] ; m6 = row 20
6902 punpckhbw xm7, xm5, xm6
6903 punpcklbw xm5, xm6
6904 vinserti128 m5, m5, xm7, 1
6905 pmaddubsw m7, m5, [r5 + 3 * mmsize]
6906 paddw m13, m7
6907 pmaddubsw m5, [r5 + 2 * mmsize]
6908 paddw m1, m5
6909 movu xm7, [r7 + r1] ; m7 = row 21
6910 punpckhbw xm2, xm6, xm7
6911 punpcklbw xm6, xm7
6912 vinserti128 m6, m6, xm2, 1
6913 pmaddubsw m6, [r5 + 3 * mmsize]
6914 paddw m0, m6
6915 movu xm2, [r7 + r1 * 2] ; m2 = row 22
6916 punpckhbw xm3, xm7, xm2
6917 punpcklbw xm7, xm2
6918 vinserti128 m7, m7, xm3, 1
6919 pmaddubsw m7, [r5 + 3 * mmsize]
6920 paddw m1, m7
6921
6922 pmulhrsw m8, m14 ; m8 = word: row 8
6923 pmulhrsw m9, m14 ; m9 = word: row 9
6924 pmulhrsw m10, m14 ; m10 = word: row 10
6925 pmulhrsw m11, m14 ; m11 = word: row 11
6926 pmulhrsw m12, m14 ; m12 = word: row 12
6927 pmulhrsw m13, m14 ; m13 = word: row 13
6928 pmulhrsw m0, m14 ; m0 = word: row 14
6929 pmulhrsw m1, m14 ; m1 = word: row 15
6930 packuswb m8, m9
6931 packuswb m10, m11
6932 packuswb m12, m13
6933 packuswb m0, m1
6934 vpermq m8, m8, 11011000b
6935 vpermq m10, m10, 11011000b
6936 vpermq m12, m12, 11011000b
6937 vpermq m0, m0, 11011000b
6938 vextracti128 xm9, m8, 1
6939 vextracti128 xm11, m10, 1
6940 vextracti128 xm13, m12, 1
6941 vextracti128 xm1, m0, 1
6942 movu [r8], xm8
6943 movu [r8 + r3], xm9
6944 movu [r8 + r3 * 2], xm10
6945 movu [r8 + r6], xm11
6946 lea r8, [r8 + r3 * 4]
6947 movu [r8], xm12
6948 movu [r8 + r3], xm13
6949 movu [r8 + r3 * 2], xm0
6950 movu [r8 + r6], xm1
6951%endmacro
6952
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W16_8R (no macro parameters)
; Filters one 16-column x 8-row tile using registers prepared by the
; expanding function:
;   in:  r0 = first input row to read ("row 0"), r1 = input row step,
;        r4 = offset of row 3 (the functions in this file set it to 3*r1),
;        r2 = first output row, r3 = output row step,
;        r6 = offset of the fourth output row (set to 3*r3 by the callers),
;        r5 = coefficient set (vectors at r5 + 0..3 * mmsize),
;        m14 = pmulhrsw multiplier
;   out: r7 = r0 + 12*r1, r8 = r2 + 4*r3
;   r0 and r2 themselves are not modified; m0-m13, r7 and r8 are overwritten.
; Input rows 0..14 are read; output row k accumulates interleaved row pairs
; (k,k+1), (k+2,k+3), (k+4,k+5), (k+6,k+7) times [r5 + 0..3 * mmsize].
;-----------------------------------------------------------------------------
6953%macro PROCESS_LUMA_AVX2_W16_8R 0
6954 movu xm0, [r0] ; m0 = row 0
6955 movu xm1, [r0 + r1] ; m1 = row 1
6956 punpckhbw xm2, xm0, xm1
6957 punpcklbw xm0, xm1
6958 vinserti128 m0, m0, xm2, 1
6959 pmaddubsw m0, [r5]
6960 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6961 punpckhbw xm3, xm1, xm2
6962 punpcklbw xm1, xm2
6963 vinserti128 m1, m1, xm3, 1
6964 pmaddubsw m1, [r5]
6965 movu xm3, [r0 + r4] ; m3 = row 3
6966 punpckhbw xm4, xm2, xm3
6967 punpcklbw xm2, xm3
6968 vinserti128 m2, m2, xm4, 1
6969 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6970 paddw m0, m4
6971 pmaddubsw m2, [r5]
; r7 is the running input pointer from here on, so r0 stays untouched
6972 lea r7, [r0 + r1 * 4]
6973 movu xm4, [r7] ; m4 = row 4
6974 punpckhbw xm5, xm3, xm4
6975 punpcklbw xm3, xm4
6976 vinserti128 m3, m3, xm5, 1
6977 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6978 paddw m1, m5
6979 pmaddubsw m3, [r5]
6980 movu xm5, [r7 + r1] ; m5 = row 5
6981 punpckhbw xm6, xm4, xm5
6982 punpcklbw xm4, xm5
6983 vinserti128 m4, m4, xm6, 1
6984 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6985 paddw m0, m6
6986 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6987 paddw m2, m6
6988 pmaddubsw m4, [r5]
6989 movu xm6, [r7 + r1 * 2] ; m6 = row 6
6990 punpckhbw xm7, xm5, xm6
6991 punpcklbw xm5, xm6
6992 vinserti128 m5, m5, xm7, 1
6993 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6994 paddw m1, m7
6995 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6996 paddw m3, m7
6997 pmaddubsw m5, [r5]
6998 movu xm7, [r7 + r4] ; m7 = row 7
6999 punpckhbw xm8, xm6, xm7
7000 punpcklbw xm6, xm7
7001 vinserti128 m6, m6, xm8, 1
7002 pmaddubsw m8, m6, [r5 + 3 * mmsize]
7003 paddw m0, m8
7004 pmaddubsw m8, m6, [r5 + 2 * mmsize]
7005 paddw m2, m8
7006 pmaddubsw m8, m6, [r5 + 1 * mmsize]
7007 paddw m4, m8
7008 pmaddubsw m6, [r5]
7009 lea r7, [r7 + r1 * 4]
7010 movu xm8, [r7] ; m8 = row 8
7011 punpckhbw xm9, xm7, xm8
7012 punpcklbw xm7, xm8
7013 vinserti128 m7, m7, xm9, 1
7014 pmaddubsw m9, m7, [r5 + 3 * mmsize]
7015 paddw m1, m9
7016 pmaddubsw m9, m7, [r5 + 2 * mmsize]
7017 paddw m3, m9
7018 pmaddubsw m9, m7, [r5 + 1 * mmsize]
7019 paddw m5, m9
7020 pmaddubsw m7, [r5]
; From row pair (8,9) on, no new output row is started: only the later
; coefficient vectors are applied, to finish output rows 2..7.
7021 movu xm9, [r7 + r1] ; m9 = row 9
7022 punpckhbw xm10, xm8, xm9
7023 punpcklbw xm8, xm9
7024 vinserti128 m8, m8, xm10, 1
7025 pmaddubsw m10, m8, [r5 + 3 * mmsize]
7026 paddw m2, m10
7027 pmaddubsw m10, m8, [r5 + 2 * mmsize]
7028 paddw m4, m10
7029 pmaddubsw m10, m8, [r5 + 1 * mmsize]
7030 paddw m6, m10
7031 movu xm10, [r7 + r1 * 2] ; m10 = row 10
7032 punpckhbw xm11, xm9, xm10
7033 punpcklbw xm9, xm10
7034 vinserti128 m9, m9, xm11, 1
7035 pmaddubsw m11, m9, [r5 + 3 * mmsize]
7036 paddw m3, m11
7037 pmaddubsw m11, m9, [r5 + 2 * mmsize]
7038 paddw m5, m11
7039 pmaddubsw m11, m9, [r5 + 1 * mmsize]
7040 paddw m7, m11
7041 movu xm11, [r7 + r4] ; m11 = row 11
7042 punpckhbw xm12, xm10, xm11
7043 punpcklbw xm10, xm11
7044 vinserti128 m10, m10, xm12, 1
7045 pmaddubsw m12, m10, [r5 + 3 * mmsize]
7046 paddw m4, m12
7047 pmaddubsw m12, m10, [r5 + 2 * mmsize]
7048 paddw m6, m12
7049 lea r7, [r7 + r1 * 4]
7050 movu xm12, [r7] ; m12 = row 12
7051 punpckhbw xm13, xm11, xm12
7052 punpcklbw xm11, xm12
7053 vinserti128 m11, m11, xm13, 1
7054 pmaddubsw m13, m11, [r5 + 3 * mmsize]
7055 paddw m5, m13
7056 pmaddubsw m13, m11, [r5 + 2 * mmsize]
7057 paddw m7, m13
7058
; Output rows 0-5 are complete: scale/round with m14, saturate to bytes, then
; vpermq (qword order 0,2,1,3) puts one full 16-byte row in each lane.
7059 pmulhrsw m0, m14 ; m0 = word: row 0
7060 pmulhrsw m1, m14 ; m1 = word: row 1
7061 pmulhrsw m2, m14 ; m2 = word: row 2
7062 pmulhrsw m3, m14 ; m3 = word: row 3
7063 pmulhrsw m4, m14 ; m4 = word: row 4
7064 pmulhrsw m5, m14 ; m5 = word: row 5
7065 packuswb m0, m1
7066 packuswb m2, m3
7067 packuswb m4, m5
7068 vpermq m0, m0, 11011000b
7069 vpermq m2, m2, 11011000b
7070 vpermq m4, m4, 11011000b
7071 vextracti128 xm1, m0, 1
7072 vextracti128 xm3, m2, 1
7073 vextracti128 xm5, m4, 1
7074 movu [r2], xm0
7075 movu [r2 + r3], xm1
7076 movu [r2 + r3 * 2], xm2
7077 movu [r2 + r6], xm3
; r8 is the output pointer for rows 4..7, so r2 stays untouched
7078 lea r8, [r2 + r3 * 4]
7079 movu [r8], xm4
7080 movu [r8 + r3], xm5
7081
; Input rows 13 and 14 supply the last tap pair of output rows 6 and 7
; (m0 and m1 are reused as scratch here).
7082 movu xm13, [r7 + r1] ; m13 = row 13
7083 punpckhbw xm0, xm12, xm13
7084 punpcklbw xm12, xm13
7085 vinserti128 m12, m12, xm0, 1
7086 pmaddubsw m0, m12, [r5 + 3 * mmsize]
7087 paddw m6, m0
7088 movu xm0, [r7 + r1 * 2] ; m0 = row 14
7089 punpckhbw xm1, xm13, xm0
7090 punpcklbw xm13, xm0
7091 vinserti128 m13, m13, xm1, 1
7092 pmaddubsw m1, m13, [r5 + 3 * mmsize]
7093 paddw m7, m1
7094
7095 pmulhrsw m6, m14 ; m6 = word: row 6
7096 pmulhrsw m7, m14 ; m7 = word: row 7
7097 packuswb m6, m7
7098 vpermq m6, m6, 11011000b
7099 vextracti128 xm7, m6, 1
7100 movu [r8 + r3 * 2], xm6
7101 movu [r8 + r6], xm7
7102%endmacro
7103
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_24x32 (AVX2, built only when ARCH_X86_64 == 1)
; Vertical filter for a 24-column x 32-row block.  .loopH runs twice; each
; pass handles 16 rows: columns 0-15 through PROCESS_LUMA_AVX2_W16_16R, then
; columns 16-23 with the inline 8-column code below.
;   r0  = input pointer (stepped back three rows before the loop)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7/r8 =
; running input/output pointers, r9d = remaining passes, r10 = 4*r1,
; m14 = pw_512 (pmulhrsw multiplier).
;-----------------------------------------------------------------------------
7104INIT_YMM avx2
7105%if ARCH_X86_64 == 1
7106cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
7107 mov r4d, r4m
7108 shl r4d, 7
7109
7110%ifdef PIC
7111 lea r5, [tab_LumaCoeffVer_32]
7112 add r5, r4
7113%else
7114 lea r5, [tab_LumaCoeffVer_32 + r4]
7115%endif
7116
7117 lea r4, [r1 * 3]
7118 sub r0, r4
7119 lea r6, [r3 * 3]
7120 lea r10, [r1 * 4]
7121 mova m14, [pw_512]
7122 mov r9d, 2
7123.loopH:
; columns 0-15 of this 16-row band; the macro leaves r0/r2 unchanged
7124 PROCESS_LUMA_AVX2_W16_16R
7125 add r2, 16
7126 add r0, 16
7127
; Columns 16-23: only 8 bytes per row are loaded (movq).  Each ymm register
; holds two interleaved row pairs - (k,k+1) in the low lane and (k+1,k+2) in
; the high lane - so every accumulator carries two consecutive output rows.
7128 movq xm1, [r0] ; m1 = row 0
7129 movq xm2, [r0 + r1] ; m2 = row 1
7130 punpcklbw xm1, xm2
7131 movq xm3, [r0 + r1 * 2] ; m3 = row 2
7132 punpcklbw xm2, xm3
7133 vinserti128 m5, m1, xm2, 1
7134 pmaddubsw m5, [r5]
7135 movq xm4, [r0 + r4] ; m4 = row 3
7136 punpcklbw xm3, xm4
7137 lea r7, [r0 + r1 * 4]
7138 movq xm1, [r7] ; m1 = row 4
7139 punpcklbw xm4, xm1
7140 vinserti128 m2, m3, xm4, 1
7141 pmaddubsw m0, m2, [r5 + 1 * mmsize]
7142 paddw m5, m0
7143 pmaddubsw m2, [r5]
7144 movq xm3, [r7 + r1] ; m3 = row 5
7145 punpcklbw xm1, xm3
7146 movq xm4, [r7 + r1 * 2] ; m4 = row 6
7147 punpcklbw xm3, xm4
7148 vinserti128 m1, m1, xm3, 1
7149 pmaddubsw m3, m1, [r5 + 2 * mmsize]
7150 paddw m5, m3
7151 pmaddubsw m0, m1, [r5 + 1 * mmsize]
7152 paddw m2, m0
7153 pmaddubsw m1, [r5]
7154 movq xm3, [r7 + r4] ; m3 = row 7
7155 punpcklbw xm4, xm3
7156 lea r7, [r7 + r1 * 4]
7157 movq xm0, [r7] ; m0 = row 8
7158 punpcklbw xm3, xm0
7159 vinserti128 m4, m4, xm3, 1
7160 pmaddubsw m3, m4, [r5 + 3 * mmsize]
7161 paddw m5, m3
7162 pmaddubsw m3, m4, [r5 + 2 * mmsize]
7163 paddw m2, m3
7164 pmaddubsw m3, m4, [r5 + 1 * mmsize]
7165 paddw m1, m3
7166 pmaddubsw m4, [r5]
7167 movq xm3, [r7 + r1] ; m3 = row 9
7168 punpcklbw xm0, xm3
7169 movq xm6, [r7 + r1 * 2] ; m6 = row 10
7170 punpcklbw xm3, xm6
7171 vinserti128 m0, m0, xm3, 1
7172 pmaddubsw m3, m0, [r5 + 3 * mmsize]
7173 paddw m2, m3
7174 pmaddubsw m3, m0, [r5 + 2 * mmsize]
7175 paddw m1, m3
7176 pmaddubsw m3, m0, [r5 + 1 * mmsize]
7177 paddw m4, m3
7178 pmaddubsw m0, [r5]
7179
7180 movq xm3, [r7 + r4] ; m3 = row 11
7181 punpcklbw xm6, xm3
7182 lea r7, [r7 + r1 * 4]
7183 movq xm7, [r7] ; m7 = row 12
7184 punpcklbw xm3, xm7
7185 vinserti128 m6, m6, xm3, 1
7186 pmaddubsw m3, m6, [r5 + 3 * mmsize]
7187 paddw m1, m3
7188 pmaddubsw m3, m6, [r5 + 2 * mmsize]
7189 paddw m4, m3
7190 pmaddubsw m3, m6, [r5 + 1 * mmsize]
7191 paddw m0, m3
7192 pmaddubsw m6, [r5]
7193 movq xm3, [r7 + r1] ; m3 = row 13
7194 punpcklbw xm7, xm3
7195 movq xm8, [r7 + r1 * 2] ; m8 = row 14
7196 punpcklbw xm3, xm8
7197 vinserti128 m7, m7, xm3, 1
7198 pmaddubsw m3, m7, [r5 + 3 * mmsize]
7199 paddw m4, m3
7200 pmaddubsw m3, m7, [r5 + 2 * mmsize]
7201 paddw m0, m3
7202 pmaddubsw m3, m7, [r5 + 1 * mmsize]
7203 paddw m6, m3
7204 pmaddubsw m7, [r5]
7205 movq xm3, [r7 + r4] ; m3 = row 15
7206 punpcklbw xm8, xm3
7207 lea r7, [r7 + r1 * 4]
7208 movq xm9, [r7] ; m9 = row 16
7209 punpcklbw xm3, xm9
7210 vinserti128 m8, m8, xm3, 1
7211 pmaddubsw m3, m8, [r5 + 3 * mmsize]
7212 paddw m0, m3
7213 pmaddubsw m3, m8, [r5 + 2 * mmsize]
7214 paddw m6, m3
7215 pmaddubsw m3, m8, [r5 + 1 * mmsize]
7216 paddw m7, m3
7217 pmaddubsw m8, [r5]
7218 movq xm3, [r7 + r1] ; m3 = row 17
7219 punpcklbw xm9, xm3
7220 movq xm10, [r7 + r1 * 2] ; m10 = row 18
7221 punpcklbw xm3, xm10
7222 vinserti128 m9, m9, xm3, 1
7223 pmaddubsw m3, m9, [r5 + 3 * mmsize]
7224 paddw m6, m3
7225 pmaddubsw m3, m9, [r5 + 2 * mmsize]
7226 paddw m7, m3
7227 pmaddubsw m3, m9, [r5 + 1 * mmsize]
7228 paddw m8, m3
7229 movq xm3, [r7 + r4] ; m3 = row 19
7230 punpcklbw xm10, xm3
7231 lea r7, [r7 + r1 * 4]
7232 movq xm9, [r7] ; m9 = row 20
7233 punpcklbw xm3, xm9
7234 vinserti128 m10, m10, xm3, 1
7235 pmaddubsw m3, m10, [r5 + 3 * mmsize]
7236 paddw m7, m3
7237 pmaddubsw m3, m10, [r5 + 2 * mmsize]
7238 paddw m8, m3
7239 movq xm3, [r7 + r1] ; m3 = row 21
7240 punpcklbw xm9, xm3
7241 movq xm10, [r7 + r1 * 2] ; m10 = row 22
7242 punpcklbw xm3, xm10
7243 vinserti128 m9, m9, xm3, 1
7244 pmaddubsw m3, m9, [r5 + 3 * mmsize]
7245 paddw m8, m3
7246
7247 pmulhrsw m5, m14 ; m5 = word: row 0, row 1
7248 pmulhrsw m2, m14 ; m2 = word: row 2, row 3
7249 pmulhrsw m1, m14 ; m1 = word: row 4, row 5
7250 pmulhrsw m4, m14 ; m4 = word: row 6, row 7
7251 pmulhrsw m0, m14 ; m0 = word: row 8, row 9
7252 pmulhrsw m6, m14 ; m6 = word: row 10, row 11
7253 pmulhrsw m7, m14 ; m7 = word: row 12, row 13
7254 pmulhrsw m8, m14 ; m8 = word: row 14, row 15
; After the per-lane packuswb the low lane holds rows (n, n+2) and the high
; lane rows (n+1, n+3), 8 bytes each; movq stores the low half of a lane and
; movhps the high half, so no cross-lane permute is needed.
7255 packuswb m5, m2
7256 packuswb m1, m4
7257 packuswb m0, m6
7258 packuswb m7, m8
7259 vextracti128 xm2, m5, 1
7260 vextracti128 xm4, m1, 1
7261 vextracti128 xm6, m0, 1
7262 vextracti128 xm8, m7, 1
7263 movq [r2], xm5
7264 movq [r2 + r3], xm2
7265 movhps [r2 + r3 * 2], xm5
7266 movhps [r2 + r6], xm2
7267 lea r8, [r2 + r3 * 4]
7268 movq [r8], xm1
7269 movq [r8 + r3], xm4
7270 movhps [r8 + r3 * 2], xm1
7271 movhps [r8 + r6], xm4
7272 lea r8, [r8 + r3 * 4]
7273 movq [r8], xm0
7274 movq [r8 + r3], xm6
7275 movhps [r8 + r3 * 2], xm0
7276 movhps [r8 + r6], xm6
7277 lea r8, [r8 + r3 * 4]
7278 movq [r8], xm7
7279 movq [r8 + r3], xm8
7280 movhps [r8 + r3 * 2], xm7
7281 movhps [r8 + r6], xm8
7282
; Here r7 = r0 + 20*r1 and r8 = r2 + 12*r3, with r0/r2 still at column 16.
; Back r7 up 4 rows, step r8 down 4 rows, and remove the 16-byte column
; offset: the next pass starts 16 rows lower at column 0.
7283 sub r7, r10
7284 lea r0, [r7 - 16]
7285 lea r2, [r8 + r3 * 4 - 16]
7286 dec r9d
7287 jnz .loopH
7288 RET
7289%endif
7290
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_32xN %1, %2
; Emits interp_8tap_vert_pp_%1x%2 (AVX2, built only when ARCH_X86_64 == 1).
; The block is covered by 16x16 tiles (PROCESS_LUMA_AVX2_W16_16R):
; .loopW runs %1 / 16 times across a band, .loopH runs %2 / 16 times.
;   r0  = input pointer (stepped back three rows before the loops)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7/r8 =
; pointers left by the tile macro, r9d/r10d = loop counters, r11 = 4*r1,
; m14 = pw_512 (pmulhrsw multiplier).
; NOTE: the "- 16" in the band-advance code matches exactly two tiles per
; band, i.e. %1 == 32.
;-----------------------------------------------------------------------------
7291%macro FILTER_VER_LUMA_AVX2_32xN 2
7292INIT_YMM avx2
7293%if ARCH_X86_64 == 1
7294cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
7295 mov r4d, r4m
7296 shl r4d, 7
7297
7298%ifdef PIC
7299 lea r5, [tab_LumaCoeffVer_32]
7300 add r5, r4
7301%else
7302 lea r5, [tab_LumaCoeffVer_32 + r4]
7303%endif
7304
7305 lea r4, [r1 * 3]
7306 sub r0, r4
7307 lea r6, [r3 * 3]
7308 lea r11, [r1 * 4]
7309 mova m14, [pw_512]
7310 mov r9d, %2 / 16
7311.loopH:
7312 mov r10d, %1 / 16
7313.loopW:
7314 PROCESS_LUMA_AVX2_W16_16R
7315 add r2, 16
7316 add r0, 16
7317 dec r10d
7318 jnz .loopW
; r7/r8 were left by the last tile (column 16): r7 = its input start + 20
; rows, r8 = its output start + 12 rows.  Rewind r7 by 4 rows, advance r8 by
; 4 rows, and subtract 16 to return to column 0 of the next 16-row band.
7319 sub r7, r11
7320 lea r0, [r7 - 16]
7321 lea r2, [r8 + r3 * 4 - 16]
7322 dec r9d
7323 jnz .loopH
7324 RET
7325%endif
7326%endmacro
7327
; Instantiations: 32x32 and 32x64.
7328FILTER_VER_LUMA_AVX2_32xN 32, 32
7329FILTER_VER_LUMA_AVX2_32xN 32, 64
7330
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_32x16 (AVX2, built only when ARCH_X86_64 == 1)
; Vertical filter for a 32-column x 16-row block: two side-by-side 16x16
; tiles, each produced by PROCESS_LUMA_AVX2_W16_16R.
;   r0  = input pointer (stepped back three rows before the loop)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7/r8 =
; scratch pointers of the tile macro, r9d = remaining tiles, m14 = pw_512.
;-----------------------------------------------------------------------------
7331INIT_YMM avx2
7332%if ARCH_X86_64 == 1
7333cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
7334 mov r4d, r4m
7335 shl r4d, 7
7336
7337%ifdef PIC
7338 lea r5, [tab_LumaCoeffVer_32]
7339 add r5, r4
7340%else
7341 lea r5, [tab_LumaCoeffVer_32 + r4]
7342%endif
7343
7344 lea r4, [r1 * 3]
7345 sub r0, r4
7346 lea r6, [r3 * 3]
7347 mova m14, [pw_512]
7348 mov r9d, 2
; the macro leaves r0/r2 unchanged, so +16 moves to the next 16-column tile
7349.loopW:
7350 PROCESS_LUMA_AVX2_W16_16R
7351 add r2, 16
7352 add r0, 16
7353 dec r9d
7354 jnz .loopW
7355 RET
7356%endif
7357
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_32x24 (AVX2, built only when ARCH_X86_64 == 1)
; Vertical filter for a 32-column x 24-row block, done in two phases:
;   .loopW: rows 0-15  as two 16x16 tiles (PROCESS_LUMA_AVX2_W16_16R)
;   .loop:  rows 16-23 as two 16x8  tiles (PROCESS_LUMA_AVX2_W16_8R)
;   r0  = input pointer (stepped back three rows before the loops)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7/r8 =
; pointers left by the tile macros, r9 = loop counter (and a temporary
; between the phases), m14 = pw_512.
;-----------------------------------------------------------------------------
7358INIT_YMM avx2
7359%if ARCH_X86_64 == 1
7360cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
7361 mov r4d, r4m
7362 shl r4d, 7
7363
7364%ifdef PIC
7365 lea r5, [tab_LumaCoeffVer_32]
7366 add r5, r4
7367%else
7368 lea r5, [tab_LumaCoeffVer_32 + r4]
7369%endif
7370
7371 lea r4, [r1 * 3]
7372 sub r0, r4
7373 lea r6, [r3 * 3]
7374 mova m14, [pw_512]
7375 mov r9d, 2
7376.loopW:
7377 PROCESS_LUMA_AVX2_W16_16R
7378 add r2, 16
7379 add r0, 16
7380 dec r9d
7381 jnz .loopW
; r9 (zero after the loop) is borrowed for 4*r1.  r7/r8 come from the tile at
; column 16: r7 = its input start + 20 rows, r8 = its output start + 12 rows.
; Rewind r7 by 4 rows, advance r8 by 4 rows and subtract 16 to reach column 0
; of row 16.
7382 lea r9, [r1 * 4]
7383 sub r7, r9
7384 lea r0, [r7 - 16]
7385 lea r2, [r8 + r3 * 4 - 16]
7386 mov r9d, 2
7387.loop:
7388 PROCESS_LUMA_AVX2_W16_8R
7389 add r2, 16
7390 add r0, 16
7391 dec r9d
7392 jnz .loop
7393 RET
7394%endif
7395
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_32x8 (AVX2, built only when ARCH_X86_64 == 1)
; Vertical filter for a 32-column x 8-row block: two side-by-side 16x8
; tiles, each produced by PROCESS_LUMA_AVX2_W16_8R.
;   r0  = input pointer (stepped back three rows before the loop)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7/r8 =
; scratch pointers of the tile macro, r9d = remaining tiles, m14 = pw_512.
;-----------------------------------------------------------------------------
7396INIT_YMM avx2
7397%if ARCH_X86_64 == 1
7398cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
7399 mov r4d, r4m
7400 shl r4d, 7
7401
7402%ifdef PIC
7403 lea r5, [tab_LumaCoeffVer_32]
7404 add r5, r4
7405%else
7406 lea r5, [tab_LumaCoeffVer_32 + r4]
7407%endif
7408
7409 lea r4, [r1 * 3]
7410 sub r0, r4
7411 lea r6, [r3 * 3]
7412 mova m14, [pw_512]
7413 mov r9d, 2
; the macro leaves r0/r2 unchanged, so +16 moves to the next 16-column tile
7414.loopW:
7415 PROCESS_LUMA_AVX2_W16_8R
7416 add r2, 16
7417 add r0, 16
7418 dec r9d
7419 jnz .loopW
7420 RET
7421%endif
7422
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_48x64 (AVX2, built only when ARCH_X86_64 == 1)
; Vertical filter for a 48-column x 64-row block, covered by 16x16 tiles
; (PROCESS_LUMA_AVX2_W16_16R): .loopW runs 3 times across a band and .loopH
; runs 4 times down the block.
;   r0  = input pointer (stepped back three rows before the loops)
;   r1  = input row step in bytes
;   r2  = output pointer
;   r3  = output row step in bytes
;   r4m = fifth argument; multiplied by 128 to select a coefficient set in
;         tab_LumaCoeffVer_32 (presumably the filter index - confirm)
; Working registers: r4 = 3*r1, r5 = coefficient set, r6 = 3*r3, r7/r8 =
; pointers left by the tile macro, r9d/r10d = loop counters, r11 = 4*r1,
; m14 = pw_512 (pmulhrsw multiplier).
;-----------------------------------------------------------------------------
7423INIT_YMM avx2
7424%if ARCH_X86_64 == 1
7425cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
; r4 = index * 128 = byte offset of the selected set (4 vectors of mmsize bytes)
7426 mov r4d, r4m
7427 shl r4d, 7
7428
7429%ifdef PIC
7430 lea r5, [tab_LumaCoeffVer_32]
7431 add r5, r4
7432%else
7433 lea r5, [tab_LumaCoeffVer_32 + r4]
7434%endif
7435
7436 lea r4, [r1 * 3]
7437 sub r0, r4
7438 lea r6, [r3 * 3]
7439 lea r11, [r1 * 4]
7440 mova m14, [pw_512]
7441 mov r9d, 4
7442.loopH:
7443 mov r10d, 3
7444.loopW:
7445 PROCESS_LUMA_AVX2_W16_16R
7446 add r2, 16
7447 add r0, 16
7448 dec r10d
7449 jnz .loopW
; r7/r8 were left by the last tile (column 32): r7 = its input start + 20
; rows, r8 = its output start + 12 rows.  Rewind r7 by 4 rows, advance r8 by
; 4 rows, and subtract 32 to return to column 0 of the next 16-row band.
7450 sub r7, r11
7451 lea r0, [r7 - 32]
7452 lea r2, [r8 + r3 * 4 - 32]
7453 dec r9d
7454 jnz .loopH
7455 RET
7456%endif
7457
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_64xN width, height
; Emits interp_8tap_vert_pp_<width>x<height> (AVX2, x86-64 only).
;   %1 = block width in pixels  (multiple of 16)
;   %2 = block height in pixels (multiple of 16)
; The block is walked in bands of 16 rows (.loopH, r9d); each band is split into
; %1 / 16 columns (.loopW, r10d) handled by PROCESS_LUMA_AVX2_W16_16R.
; Setup: r0 rewound by 3 * r1, r4 = 3 * r1, r6 = 3 * r3, r11 = 4 * r1,
;        r5 = tab_LumaCoeffVer_32 + (r4m << 7), m14 = [pw_512].
; After a band, r0/r2 are re-derived from r7/r8 and moved back to the first
; column. r7/r8 belong to the last column of the band, which starts %1 - 16
; bytes right of the first one, so the rewind is (%1 - 16) rather than a
; literal that is only valid for one width (the 32- and 48-wide kernels in this
; file use 16 and 32 for the same step).
; NOTE(review): r7/r8 are presumably maintained by PROCESS_LUMA_AVX2_W16_16R,
; which is defined outside this section -- confirm.
;-----------------------------------------------------------------------------
7458%macro FILTER_VER_LUMA_AVX2_64xN 2
7459INIT_YMM avx2
7460%if ARCH_X86_64 == 1
7461cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
7462 mov r4d, r4m
7463 shl r4d, 7 ; r4 = index * 128: byte offset of the selected coefficient set
7464
7465%ifdef PIC
7466 lea r5, [tab_LumaCoeffVer_32]
7467 add r5, r4
7468%else
7469 lea r5, [tab_LumaCoeffVer_32 + r4]
7470%endif
7471
7472 lea r4, [r1 * 3]
7473 sub r0, r4
7474 lea r6, [r3 * 3]
7475 lea r11, [r1 * 4]
7476 mova m14, [pw_512]
7477 mov r9d, %2 / 16
7478.loopH:
7479 mov r10d, %1 / 16
7480.loopW:
7481 PROCESS_LUMA_AVX2_W16_16R
7482 add r2, 16
7483 add r0, 16
7484 dec r10d
7485 jnz .loopW
7486 sub r7, r11
7487 lea r0, [r7 - (%1 - 16)]
7488 lea r2, [r8 + r3 * 4 - (%1 - 16)]
7489 dec r9d
7490 jnz .loopH
7491 RET
7492%endif
7493%endmacro
7494
7495FILTER_VER_LUMA_AVX2_64xN 64, 32
7496FILTER_VER_LUMA_AVX2_64xN 64, 48
7497FILTER_VER_LUMA_AVX2_64xN 64, 64
7498
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_64x16 -- AVX2 build, assembled only when ARCH_X86_64 == 1.
; A single 16-row band split into four 16-byte-wide columns, each produced by
; one PROCESS_LUMA_AVX2_W16_16R expansion (macro defined outside this section).
; Setup: r0 rewound by 3 * r1, r4 = 3 * r1, r6 = 3 * r3,
;        r5 = tab_LumaCoeffVer_32 + (r4m << 7), m14 = [pw_512], r9d = columns left.
;-----------------------------------------------------------------------------
7499INIT_YMM avx2
7500%if ARCH_X86_64 == 1
7501cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
7502 mov r4d, r4m
7503 shl r4d, 7 ; r4 = index * 128: byte offset of the selected coefficient set
7504
7505%ifdef PIC
7506 lea r5, [tab_LumaCoeffVer_32]
7507 add r5, r4
7508%else
7509 lea r5, [tab_LumaCoeffVer_32 + r4]
7510%endif
7511
7512 lea r4, [r1 * 3]
7513 sub r0, r4
7514 lea r6, [r3 * 3]
7515 mova m14, [pw_512]
7516 mov r9d, 4 ; 64 / 16 columns
7517.loopW:
7518 PROCESS_LUMA_AVX2_W16_16R
7519 add r2, 16
7520 add r0, 16
7521 dec r9d
7522 jnz .loopW
7523 RET
7524%endif
7525
72b9787e
JB
7526;-------------------------------------------------------------------------------------------------------------
7527; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7528;-------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA width, height, pp|ps  (SSE4)
;   %1 = block width (multiple of 8), %2 = block height (multiple of 4), %3 = output kind.
; The block is walked in bands of 4 rows (.loopH, counter kept in the stack slot at [rsp])
; and, inside a band, in columns of 8 pixels (.loopW, r4d). Each 8x4 tile is computed by
; PROCESS_LUMA_W8_4R (defined outside this section), whose results are read from m7, m6, m5, m4.
;   pp: results are scaled with pmulhrsw by [pw_512], packed to bytes and stored 8 bytes per row.
;   ps: [pw_2000] is subtracted and the 16-bit results are stored 16 bytes per row; dstStride is
;       doubled up front, so dst is presumably int16_t* in this variant -- TODO confirm.
; Register use: r0 = src (rewound by 3 * srcStride), r1 = srcStride, r2 = dst, r3 = dstStride,
;               r6 = tab_LumaCoeffVer + coeffIdx * 64, r5 = scratch, m3 = rounding/offset constant.
; NOTE(review): `add r3d, r3d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended;
; a negative or > 31-bit dstStride would not survive it.
7529%macro FILTER_VER_LUMA 3
7530INIT_XMM sse4
7531cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
7532 lea r5, [3 * r1]
7533 sub r0, r5
7534 shl r4d, 6
7535%ifidn %3,ps
7536 add r3d, r3d
7537%endif
7538
7539%ifdef PIC
7540 lea r5, [tab_LumaCoeffVer]
7541 lea r6, [r5 + r4]
7542%else
7543 lea r6, [tab_LumaCoeffVer + r4]
7544%endif
7545
7546%ifidn %3,pp
b53f7c52 7547 mova m3, [pw_512]
72b9787e
JB
7548%else
7549 mova m3, [pw_2000]
7550%endif
; all seven GPRs are in use, so the band counter lives in the reserved stack slot
7551 mov dword [rsp], %2/4
7552
7553.loopH:
7554 mov r4d, (%1/8)
7555.loopW:
7556 PROCESS_LUMA_W8_4R
7557%ifidn %3,pp
7558 pmulhrsw m7, m3
7559 pmulhrsw m6, m3
7560 pmulhrsw m5, m3
7561 pmulhrsw m4, m3
7562
; two rows per register after packing: low half = first row, high half = second row
7563 packuswb m7, m6
7564 packuswb m5, m4
7565
7566 movlps [r2], m7
7567 movhps [r2 + r3], m7
7568 lea r5, [r2 + 2 * r3]
7569 movlps [r5], m5
7570 movhps [r5 + r3], m5
7571%else
7572 psubw m7, m3
7573 psubw m6, m3
7574 psubw m5, m3
7575 psubw m4, m3
7576
7577 movu [r2], m7
7578 movu [r2 + r3], m6
7579 lea r5, [r2 + 2 * r3]
7580 movu [r5], m5
7581 movu [r5 + r3], m4
7582%endif
7583
; r0 -= 8 * srcStride - 8: presumably undoes the row advance made inside
; PROCESS_LUMA_W8_4R and moves 8 pixels right -- confirm against the macro
7584 lea r5, [8 * r1 - 8]
7585 sub r0, r5
7586%ifidn %3,pp
7587 add r2, 8
7588%else
7589 add r2, 16
7590%endif
7591 dec r4d
7592 jnz .loopW
7593
; next band: four rows down, back to the first column
7594 lea r0, [r0 + 4 * r1 - %1]
7595%ifidn %3,pp
7596 lea r2, [r2 + 4 * r3 - %1]
7597%else
7598 lea r2, [r2 + 4 * r3 - 2 * %1]
7599%endif
7600
7601 dec dword [rsp]
7602 jnz .loopH
7603
7604 RET
7605%endmacro
7606
7607FILTER_VER_LUMA 16, 4, pp
7608FILTER_VER_LUMA 16, 8, pp
7609FILTER_VER_LUMA 16, 12, pp
7610FILTER_VER_LUMA 16, 16, pp
7611FILTER_VER_LUMA 16, 32, pp
7612FILTER_VER_LUMA 16, 64, pp
7613FILTER_VER_LUMA 24, 32, pp
7614FILTER_VER_LUMA 32, 8, pp
7615FILTER_VER_LUMA 32, 16, pp
7616FILTER_VER_LUMA 32, 24, pp
7617FILTER_VER_LUMA 32, 32, pp
7618FILTER_VER_LUMA 32, 64, pp
7619FILTER_VER_LUMA 48, 64, pp
7620FILTER_VER_LUMA 64, 16, pp
7621FILTER_VER_LUMA 64, 32, pp
7622FILTER_VER_LUMA 64, 48, pp
7623FILTER_VER_LUMA 64, 64, pp
7624
7625FILTER_VER_LUMA 16, 4, ps
7626FILTER_VER_LUMA 16, 8, ps
7627FILTER_VER_LUMA 16, 12, ps
7628FILTER_VER_LUMA 16, 16, ps
7629FILTER_VER_LUMA 16, 32, ps
7630FILTER_VER_LUMA 16, 64, ps
7631FILTER_VER_LUMA 24, 32, ps
7632FILTER_VER_LUMA 32, 8, ps
7633FILTER_VER_LUMA 32, 16, ps
7634FILTER_VER_LUMA 32, 24, ps
7635FILTER_VER_LUMA 32, 32, ps
7636FILTER_VER_LUMA 32, 64, ps
7637FILTER_VER_LUMA 48, 64, ps
7638FILTER_VER_LUMA 64, 16, ps
7639FILTER_VER_LUMA 64, 32, ps
7640FILTER_VER_LUMA 64, 48, ps
7641FILTER_VER_LUMA 64, 64, ps
7642
;-----------------------------------------------------------------------------
; PROCESS_LUMA_SP_W4_4R
; Accumulates an 8-tap vertical filter for a tile 4 samples wide and 4 rows high
; from 16-bit input rows.
;   In : r0 = first of the 11 input rows read, r1 = row stride in bytes,
;        r6 = coefficient set, used as four 16-byte entries [r6 + k * 16]
;   Out: m0..m3 = 32-bit sums for output rows 1..4
;   Clobbers m4, m5, m6; r0 is advanced by 8 * r1 (four `lea r0, [r0 + 2 * r1]`).
; Each pair of adjacent rows is interleaved once with punpcklwd and then fed to
; pmaddwd for every output row that needs it, so no input row is loaded twice.
;-----------------------------------------------------------------------------
7643%macro PROCESS_LUMA_SP_W4_4R 0
7644 movq m0, [r0]
7645 movq m1, [r0 + r1]
7646 punpcklwd m0, m1 ;m0=[0 1]
7647 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
7648
7649 lea r0, [r0 + 2 * r1]
7650 movq m4, [r0]
7651 punpcklwd m1, m4 ;m1=[1 2]
7652 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
7653
7654 movq m5, [r0 + r1]
7655 punpcklwd m4, m5 ;m4=[2 3]
7656 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
7657 pmaddwd m4, [r6 + 1 * 16]
7658 paddd m0, m4 ;m0=[0+1+2+3] Row1
7659
7660 lea r0, [r0 + 2 * r1]
7661 movq m4, [r0]
7662 punpcklwd m5, m4 ;m5=[3 4]
7663 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
7664 pmaddwd m5, [r6 + 1 * 16]
7665 paddd m1, m5 ;m1 = [1+2+3+4] Row2
7666
7667 movq m5, [r0 + r1]
7668 punpcklwd m4, m5 ;m4=[4 5]
7669 pmaddwd m6, m4, [r6 + 1 * 16]
7670 paddd m2, m6 ;m2=[2+3+4+5] Row3
7671 pmaddwd m4, [r6 + 2 * 16]
7672 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
7673
7674 lea r0, [r0 + 2 * r1]
7675 movq m4, [r0]
7676 punpcklwd m5, m4 ;m5=[5 6]
7677 pmaddwd m6, m5, [r6 + 1 * 16]
7678 paddd m3, m6 ;m3=[3+4+5+6] Row4
7679 pmaddwd m5, [r6 + 2 * 16]
7680 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
7681
7682 movq m5, [r0 + r1]
7683 punpcklwd m4, m5 ;m4=[6 7]
7684 pmaddwd m6, m4, [r6 + 2 * 16]
7685 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
7686 pmaddwd m4, [r6 + 3 * 16]
7687 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
7688
7689 lea r0, [r0 + 2 * r1]
7690 movq m4, [r0]
7691 punpcklwd m5, m4 ;m5=[7 8]
7692 pmaddwd m6, m5, [r6 + 2 * 16]
7693 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
7694 pmaddwd m5, [r6 + 3 * 16]
7695 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
7696
7697 movq m5, [r0 + r1]
7698 punpcklwd m4, m5 ;m4=[8 9]
7699 pmaddwd m4, [r6 + 3 * 16]
7700 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
7701
7702 movq m4, [r0 + 2 * r1]
7703 punpcklwd m5, m4 ;m5=[9 10]
7704 pmaddwd m5, [r6 + 3 * 16]
7705 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
7706%endmacro
7707
7708;--------------------------------------------------------------------------------------------------------------
7709; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7710;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_SP width, height  (SSE4)
;   %1 = block width (multiple of 4), %2 = block height (multiple of 4).
; The block is walked in bands of 4 rows (.loopH, counter in the stack slot at [rsp]) and
; columns of 4 samples (.loopW, r4d); each 4x4 tile comes from PROCESS_LUMA_SP_W4_4R.
; The 32-bit sums in m0..m3 get [tab_c_526336] added, are shifted right by 12 and packed
; down to bytes, giving four 4-pixel rows in m0.
; Register use: r0 = src (rewound by 3 rows), r1 = srcStride in bytes (doubled on entry),
;               r2 = dst, r3 = dstStride, r6 = tab_LumaCoeffV + coeffIdx * 64, r5 = scratch.
; NOTE(review): tab_c_526336 presumably holds 526336 = (1 << 11) + (0x2000 << 6), i.e. the
; rounding term plus the removal of the intermediate offset -- confirm against the table.
; NOTE(review): `add r1d, r1d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended.
7711%macro FILTER_VER_LUMA_SP 2
7712INIT_XMM sse4
7713cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
7714
7715 add r1d, r1d
7716 lea r5, [r1 + 2 * r1]
7717 sub r0, r5
7718 shl r4d, 6
7719
7720%ifdef PIC
7721 lea r5, [tab_LumaCoeffV]
7722 lea r6, [r5 + r4]
7723%else
7724 lea r6, [tab_LumaCoeffV + r4]
7725%endif
7726
7727 mova m7, [tab_c_526336]
7728
7729 mov dword [rsp], %2/4
7730.loopH:
7731 mov r4d, (%1/4)
7732.loopW:
7733 PROCESS_LUMA_SP_W4_4R
7734
7735 paddd m0, m7
7736 paddd m1, m7
7737 paddd m2, m7
7738 paddd m3, m7
7739
7740 psrad m0, 12
7741 psrad m1, 12
7742 psrad m2, 12
7743 psrad m3, 12
7744
7745 packssdw m0, m1
7746 packssdw m2, m3
7747
7748 packuswb m0, m2
7749
; m0 now holds one 4-byte row per dword
7750 movd [r2], m0
7751 pextrd [r2 + r3], m0, 1
7752 lea r5, [r2 + 2 * r3]
7753 pextrd [r5], m0, 2
7754 pextrd [r5 + r3], m0, 3
7755
; undo the 8-row advance made by PROCESS_LUMA_SP_W4_4R and move 4 samples (8 bytes) right
7756 lea r5, [8 * r1 - 2 * 4]
7757 sub r0, r5
7758 add r2, 4
7759
7760 dec r4d
7761 jnz .loopW
7762
; next band: four rows down, back to the first column
7763 lea r0, [r0 + 4 * r1 - 2 * %1]
7764 lea r2, [r2 + 4 * r3 - %1]
7765
7766 dec dword [rsp]
7767 jnz .loopH
7768
7769 RET
7770%endmacro
7771
7772;--------------------------------------------------------------------------------------------------------------
7773; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7774;--------------------------------------------------------------------------------------------------------------
7775 FILTER_VER_LUMA_SP 4, 4
7776 FILTER_VER_LUMA_SP 8, 8
7777 FILTER_VER_LUMA_SP 8, 4
7778 FILTER_VER_LUMA_SP 4, 8
7779 FILTER_VER_LUMA_SP 16, 16
7780 FILTER_VER_LUMA_SP 16, 8
7781 FILTER_VER_LUMA_SP 8, 16
7782 FILTER_VER_LUMA_SP 16, 12
7783 FILTER_VER_LUMA_SP 12, 16
7784 FILTER_VER_LUMA_SP 16, 4
7785 FILTER_VER_LUMA_SP 4, 16
7786 FILTER_VER_LUMA_SP 32, 32
7787 FILTER_VER_LUMA_SP 32, 16
7788 FILTER_VER_LUMA_SP 16, 32
7789 FILTER_VER_LUMA_SP 32, 24
7790 FILTER_VER_LUMA_SP 24, 32
7791 FILTER_VER_LUMA_SP 32, 8
7792 FILTER_VER_LUMA_SP 8, 32
7793 FILTER_VER_LUMA_SP 64, 64
7794 FILTER_VER_LUMA_SP 64, 32
7795 FILTER_VER_LUMA_SP 32, 64
7796 FILTER_VER_LUMA_SP 64, 48
7797 FILTER_VER_LUMA_SP 48, 64
7798 FILTER_VER_LUMA_SP 64, 16
7799 FILTER_VER_LUMA_SP 16, 64
7800
7801; TODO: combining U and V would be faster, but needs more registers
7802; TODO: using separate paths for heights that are a multiple of 4 and for the rest may improve performance by about 10%, but makes the code more complex, so it is disabled
;-----------------------------------------------------------------------------
; chroma_p2s (SSSE3): converts 8-bit samples to 16-bit, two rows per iteration.
;   r0 = src, r1 = src stride, r2 = dst, r3d = width (from r3m), r4d = height (from r4m)
;   NOTE(review): the C prototype is not visible here; presumably
;   (pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) -- confirm.
; Output rows are FENC_STRIDE / 2 * 2 bytes apart.
; Each source byte is interleaved with a byte from [pb_128] and multiplied with
; [tab_c_64_n64] by pmaddubsw.
;   NOTE(review): with (64, -64) coefficient pairs this yields pixel * 64 - 8192 -- confirm the table.
; Columns are handled 8 at a time; a remainder of 4 and/or 2 columns is stored
; by the .width4 / .width2 tails.
; The row counter is decremented by 2 and tested with jnz, so height has to be
; even and non-zero for the loop to terminate as intended.
;-----------------------------------------------------------------------------
7803INIT_XMM ssse3
7804cglobal chroma_p2s, 3, 7, 4
7805
7806 ; load width and height
7807 mov r3d, r3m
7808 mov r4d, r4m
7809
7810 ; load constant
b53f7c52 7811 mova m2, [pb_128]
72b9787e
JB
7812 mova m3, [tab_c_64_n64]
7813
7814.loopH:
7815
7816 xor r5d, r5d
7817.loopW:
7818 lea r6, [r0 + r5]
7819
7820 movh m0, [r6]
7821 punpcklbw m0, m2
7822 pmaddubsw m0, m3
7823
7824 movh m1, [r6 + r1]
7825 punpcklbw m1, m2
7826 pmaddubsw m1, m3
7827
; r5 is advanced before the stores, hence the -16 in the store addresses.
; The flags from this cmp are consumed by both jg and je below; lea and movu leave them intact.
7828 add r5d, 8
7829 cmp r5d, r3d
7830 lea r6, [r2 + r5 * 2]
7831 jg .width4
7832 movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7833 movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7834 je .nextH
7835 jmp .loopW
7836
; fewer than 8 columns remain
7837.width4:
7838 test r3d, 4
7839 jz .width2
; the flags from this test are consumed by the jz .nextH below (movh/lea/pshufd do not touch them)
7840 test r3d, 2
7841 movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7842 movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7843 lea r6, [r6 + 8]
; bring samples 4 and 5 down to the low dword for the 2-column tail
7844 pshufd m0, m0, 2
7845 pshufd m1, m1, 2
7846 jz .nextH
7847
7848.width2:
7849 movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7850 movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7851
7852.nextH:
7853 lea r0, [r0 + r1 * 2]
7854 add r2, FENC_STRIDE / 2 * 4
7855
7856 sub r4d, 2
7857 jnz .loopH
7858
7859 RET
7860
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W4_4R
; Accumulates a 4-tap vertical filter for a tile 4 samples wide and 4 rows high
; from 16-bit input rows.
;   In : r0 = first of the 7 input rows read, r1 = row stride in bytes,
;        r6 = coefficient set, used as two 16-byte entries [r6 + k * 16]
;   Out: m0..m3 = 32-bit sums for output rows 1..4
;   Clobbers m4, m5; r0 is advanced by 4 * r1 (two `lea r0, [r0 + 2 * r1]`).
;-----------------------------------------------------------------------------
7861%macro PROCESS_CHROMA_SP_W4_4R 0
7862 movq m0, [r0]
7863 movq m1, [r0 + r1]
7864 punpcklwd m0, m1 ;m0=[0 1]
7865 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
7866
7867 lea r0, [r0 + 2 * r1]
7868 movq m4, [r0]
7869 punpcklwd m1, m4 ;m1=[1 2]
7870 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
7871
7872 movq m5, [r0 + r1]
7873 punpcklwd m4, m5 ;m4=[2 3]
7874 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
7875 pmaddwd m4, [r6 + 1 * 16]
7876 paddd m0, m4 ;m0=[0+1+2+3] Row1 done
7877
7878 lea r0, [r0 + 2 * r1]
7879 movq m4, [r0]
7880 punpcklwd m5, m4 ;m5=[3 4]
7881 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
7882 pmaddwd m5, [r6 + 1 * 16]
7883 paddd m1, m5 ;m1=[1+2+3+4] Row2 done
7884
7885 movq m5, [r0 + r1]
7886 punpcklwd m4, m5 ;m4=[4 5]
7887 pmaddwd m4, [r6 + 1 * 16]
7888 paddd m2, m4 ;m2=[2+3+4+5] Row3 done
7889
7890 movq m4, [r0 + 2 * r1]
7891 punpcklwd m5, m4 ;m5=[5 6]
7892 pmaddwd m5, [r6 + 1 * 16]
7893 paddd m3, m5 ;m3=[3+4+5+6] Row4 done
7894%endmacro
7895
7896;--------------------------------------------------------------------------------------------------------------
7897; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7898;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SP width, height  (SSE4)
;   %1 = block width (multiple of 4), %2 = block height (multiple of 4).
; The block is walked in bands of 4 rows (.loopH, counter in the stack slot at [rsp]) and
; columns of 4 samples (.loopW, r4d); each 4x4 tile comes from PROCESS_CHROMA_SP_W4_4R.
; The 32-bit sums in m0..m3 get [tab_c_526336] added, are shifted right by 12 and packed
; down to bytes, giving four 4-pixel rows in m0.
; Register use: r0 = src (rewound by 1 row), r1 = srcStride in bytes (doubled on entry),
;               r2 = dst, r3 = dstStride, r6 = tab_ChromaCoeffV + coeffIdx * 32, r5 = scratch.
; NOTE(review): `add r1d, r1d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended.
7899%macro FILTER_VER_CHROMA_SP 2
7900INIT_XMM sse4
7901cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
7902
7903 add r1d, r1d
7904 sub r0, r1
7905 shl r4d, 5
7906
7907%ifdef PIC
7908 lea r5, [tab_ChromaCoeffV]
7909 lea r6, [r5 + r4]
7910%else
7911 lea r6, [tab_ChromaCoeffV + r4]
7912%endif
7913
7914 mova m6, [tab_c_526336]
7915
7916 mov dword [rsp], %2/4
7917
7918.loopH:
7919 mov r4d, (%1/4)
7920.loopW:
7921 PROCESS_CHROMA_SP_W4_4R
7922
7923 paddd m0, m6
7924 paddd m1, m6
7925 paddd m2, m6
7926 paddd m3, m6
7927
7928 psrad m0, 12
7929 psrad m1, 12
7930 psrad m2, 12
7931 psrad m3, 12
7932
7933 packssdw m0, m1
7934 packssdw m2, m3
7935
7936 packuswb m0, m2
7937
; m0 now holds one 4-byte row per dword
7938 movd [r2], m0
7939 pextrd [r2 + r3], m0, 1
7940 lea r5, [r2 + 2 * r3]
7941 pextrd [r5], m0, 2
7942 pextrd [r5 + r3], m0, 3
7943
; undo the 4-row advance made by PROCESS_CHROMA_SP_W4_4R and move 4 samples (8 bytes) right
7944 lea r5, [4 * r1 - 2 * 4]
7945 sub r0, r5
7946 add r2, 4
7947
7948 dec r4d
7949 jnz .loopW
7950
; next band: four rows down, back to the first column
7951 lea r0, [r0 + 4 * r1 - 2 * %1]
7952 lea r2, [r2 + 4 * r3 - %1]
7953
7954 dec dword [rsp]
7955 jnz .loopH
7956
7957 RET
7958%endmacro
7959
7960 FILTER_VER_CHROMA_SP 4, 4
7961 FILTER_VER_CHROMA_SP 4, 8
7962 FILTER_VER_CHROMA_SP 16, 16
7963 FILTER_VER_CHROMA_SP 16, 8
7964 FILTER_VER_CHROMA_SP 16, 12
7965 FILTER_VER_CHROMA_SP 12, 16
7966 FILTER_VER_CHROMA_SP 16, 4
7967 FILTER_VER_CHROMA_SP 4, 16
7968 FILTER_VER_CHROMA_SP 32, 32
7969 FILTER_VER_CHROMA_SP 32, 16
7970 FILTER_VER_CHROMA_SP 16, 32
7971 FILTER_VER_CHROMA_SP 32, 24
7972 FILTER_VER_CHROMA_SP 24, 32
7973 FILTER_VER_CHROMA_SP 32, 8
7974
7975 FILTER_VER_CHROMA_SP 16, 24
7976 FILTER_VER_CHROMA_SP 16, 64
7977 FILTER_VER_CHROMA_SP 12, 32
7978 FILTER_VER_CHROMA_SP 4, 32
7979 FILTER_VER_CHROMA_SP 32, 64
7980 FILTER_VER_CHROMA_SP 32, 48
7981 FILTER_VER_CHROMA_SP 24, 64
7982
7983 FILTER_VER_CHROMA_SP 64, 64
7984 FILTER_VER_CHROMA_SP 64, 32
7985 FILTER_VER_CHROMA_SP 64, 48
7986 FILTER_VER_CHROMA_SP 48, 64
7987 FILTER_VER_CHROMA_SP 64, 16
7988
7989
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W2_4R coeff
; Accumulates a 4-tap vertical filter for a tile 2 samples wide and 4 rows high
; from 16-bit input rows. Two output rows share one register.
;   In : r0 = first of the 7 input rows read, r1 = row stride in bytes,
;        %1 = register holding the coefficient set (two 16-byte entries)
;   Out: m0 = 32-bit sums for rows 1-2 (low / high qword),
;        m2 = 32-bit sums for rows 3-4 (low / high qword)
;   Clobbers m1, m3, m4; r0 is advanced by 4 * r1 (two `lea r0, [r0 + 2 * r1]`).
;-----------------------------------------------------------------------------
7990%macro PROCESS_CHROMA_SP_W2_4R 1
7991 movd m0, [r0]
7992 movd m1, [r0 + r1]
7993 punpcklwd m0, m1 ;m0=[0 1]
7994
7995 lea r0, [r0 + 2 * r1]
7996 movd m2, [r0]
7997 punpcklwd m1, m2 ;m1=[1 2]
7998 punpcklqdq m0, m1 ;m0=[0 1 1 2]
7999 pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
8000
8001 movd m1, [r0 + r1]
8002 punpcklwd m2, m1 ;m2=[2 3]
8003
8004 lea r0, [r0 + 2 * r1]
8005 movd m3, [r0]
8006 punpcklwd m1, m3 ;m1=[3 4]
8007 punpcklqdq m2, m1 ;m2=[2 3 3 4]
8008
8009 pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
8010 pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
8011 paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
8012
8013 movd m1, [r0 + r1]
8014 punpcklwd m3, m1 ;m3=[4 5]
8015
8016 movd m4, [r0 + 2 * r1]
8017 punpcklwd m1, m4 ;m1=[5 6]
8018 punpcklqdq m3, m1 ;m3=[4 5 5 6]
8019 pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
8020 paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
8021%endmacro
8022
8023;-------------------------------------------------------------------------------------------------------------------
8024; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8025;-------------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SP_W2_4R width, height  (SSE4)
;   %1 = block width (only used in the symbol name; the body always produces 2 columns),
;   %2 = block height (multiple of 4).
; One PROCESS_CHROMA_SP_W2_4R expansion per band of 4 rows (.loopH, r4d). The sums in
; m0 / m2 get [tab_c_526336] added, are shifted right by 12 and packed to bytes, after
; which each 16-bit word of m0 holds one 2-pixel output row.
; Register use: r0 = src (rewound by 1 row), r1 = srcStride in bytes (doubled on entry),
;               r2 = dst, r3 = dstStride, r5 = tab_ChromaCoeffV + coeffIdx * 32.
; NOTE(review): `add r1d, r1d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended.
8026%macro FILTER_VER_CHROMA_SP_W2_4R 2
8027INIT_XMM sse4
8028cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
8029
8030 add r1d, r1d
8031 sub r0, r1
8032 shl r4d, 5
8033
8034%ifdef PIC
8035 lea r5, [tab_ChromaCoeffV]
8036 lea r5, [r5 + r4]
8037%else
8038 lea r5, [tab_ChromaCoeffV + r4]
8039%endif
8040
8041 mova m5, [tab_c_526336]
8042
8043 mov r4d, (%2/4)
8044
8045.loopH:
8046 PROCESS_CHROMA_SP_W2_4R r5
8047
8048 paddd m0, m5
8049 paddd m2, m5
8050
8051 psrad m0, 12
8052 psrad m2, 12
8053
8054 packssdw m0, m2
8055 packuswb m0, m0
8056
8057 pextrw [r2], m0, 0
8058 pextrw [r2 + r3], m0, 1
8059 lea r2, [r2 + 2 * r3]
8060 pextrw [r2], m0, 2
8061 pextrw [r2 + r3], m0, 3
8062
8063 lea r2, [r2 + 2 * r3]
8064
8065 dec r4d
8066 jnz .loopH
8067
8068 RET
8069%endmacro
8070
8071FILTER_VER_CHROMA_SP_W2_4R 2, 4
8072FILTER_VER_CHROMA_SP_W2_4R 2, 8
8073
8074FILTER_VER_CHROMA_SP_W2_4R 2, 16
8075
8076;--------------------------------------------------------------------------------------------------------------
8077; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8078;--------------------------------------------------------------------------------------------------------------
; Straight-line 4-tap vertical filter for a 4x2 block (SSE4): five 16-bit input rows
; are read, two 4-pixel output rows are written.
; Register use: r0 = src (rewound by 1 row), r1 = srcStride in bytes (doubled on entry),
;               r2 = dst, r3 = dstStride, r5 = tab_ChromaCoeffV + coeffIdx * 32,
;               m4 = [tab_c_526336], added before the shift right by 12.
; NOTE(review): `add r1d, r1d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended.
8079INIT_XMM sse4
8080cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
8081
8082 add r1d, r1d
8083 sub r0, r1
8084 shl r4d, 5
8085
8086%ifdef PIC
8087 lea r5, [tab_ChromaCoeffV]
8088 lea r5, [r5 + r4]
8089%else
8090 lea r5, [tab_ChromaCoeffV + r4]
8091%endif
8092
8093 mova m4, [tab_c_526336]
8094
8095 movq m0, [r0]
8096 movq m1, [r0 + r1]
8097 punpcklwd m0, m1 ;m0=[0 1]
8098 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
8099
8100 lea r0, [r0 + 2 * r1]
8101 movq m2, [r0]
8102 punpcklwd m1, m2 ;m1=[1 2]
8103 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
8104
8105 movq m3, [r0 + r1]
8106 punpcklwd m2, m3 ;m2=[2 3]
8107 pmaddwd m2, [r5 + 1 * 16]
8108 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
8109 paddd m0, m4
8110 psrad m0, 12
8111
8112 movq m2, [r0 + 2 * r1]
8113 punpcklwd m3, m2 ;m3=[3 4]
8114 pmaddwd m3, [r5 + 1 * 16]
8115 paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
8116 paddd m1, m4
8117 psrad m1, 12
8118
8119 packssdw m0, m1
8120 packuswb m0, m0
8121
; dword 0 = row 1, dword 1 = row 2
8122 movd [r2], m0
8123 pextrd [r2 + r3], m0, 1
8124
8125 RET
8126
8127;-------------------------------------------------------------------------------------------------------------------
8128; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8129;-------------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SP_W6_H4 width, height  (SSE4)
;   %1 is not referenced by the body (the width 6 is fixed in the symbol name),
;   %2 = block height (multiple of 4).
; Every band of 4 rows (.loopH, r4d) is built from a 4-wide tile
; (PROCESS_CHROMA_SP_W4_4R) followed by a 2-wide tile (PROCESS_CHROMA_SP_W2_4R).
; Register use: r0 = src (rewound by 1 row), r1 = srcStride in bytes (doubled on entry),
;               r2 = dst, r3 = dstStride, r6 = tab_ChromaCoeffV + coeffIdx * 32,
;               r5 = scratch, m6 = [tab_c_526336] (neither tile macro writes m6).
; NOTE(review): `add r1d, r1d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended.
8130%macro FILTER_VER_CHROMA_SP_W6_H4 2
8131INIT_XMM sse4
8132cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
8133
8134 add r1d, r1d
8135 sub r0, r1
8136 shl r4d, 5
8137
8138%ifdef PIC
8139 lea r5, [tab_ChromaCoeffV]
8140 lea r6, [r5 + r4]
8141%else
8142 lea r6, [tab_ChromaCoeffV + r4]
8143%endif
8144
8145 mova m6, [tab_c_526336]
8146
8147 mov r4d, %2/4
8148
8149.loopH:
8150 PROCESS_CHROMA_SP_W4_4R
8151
8152 paddd m0, m6
8153 paddd m1, m6
8154 paddd m2, m6
8155 paddd m3, m6
8156
8157 psrad m0, 12
8158 psrad m1, 12
8159 psrad m2, 12
8160 psrad m3, 12
8161
8162 packssdw m0, m1
8163 packssdw m2, m3
8164
8165 packuswb m0, m2
8166
8167 movd [r2], m0
8168 pextrd [r2 + r3], m0, 1
8169 lea r5, [r2 + 2 * r3]
8170 pextrd [r5], m0, 2
8171 pextrd [r5 + r3], m0, 3
8172
; back to the top of the band, 4 samples (8 bytes) right, for the 2-wide tile
8173 lea r5, [4 * r1 - 2 * 4]
8174 sub r0, r5
8175 add r2, 4
8176
8177 PROCESS_CHROMA_SP_W2_4R r6
8178
8179 paddd m0, m6
8180 paddd m2, m6
8181
8182 psrad m0, 12
8183 psrad m2, 12
8184
8185 packssdw m0, m2
8186 packuswb m0, m0
8187
8188 pextrw [r2], m0, 0
8189 pextrw [r2 + r3], m0, 1
8190 lea r2, [r2 + 2 * r3]
8191 pextrw [r2], m0, 2
8192 pextrw [r2 + r3], m0, 3
8193
; r0 is already 4 rows down after the 2-wide tile; return both pointers to column 0
8194 sub r0, 2 * 4
8195 lea r2, [r2 + 2 * r3 - 4]
8196
8197 dec r4d
8198 jnz .loopH
8199
8200 RET
8201%endmacro
8202
8203FILTER_VER_CHROMA_SP_W6_H4 6, 8
8204
8205FILTER_VER_CHROMA_SP_W6_H4 6, 16
8206
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W8_2R
; Accumulates a 4-tap vertical filter for a tile 8 samples wide and 2 rows high
; from 16-bit input rows.
;   In : r0 = first of the 5 input rows read, r1 = row stride in bytes,
;        r5 = coefficient set, used as two 16-byte entries [r5 + k * 16]
;   Out: m0 / m1 = 32-bit sums for row 1 (low / high four samples),
;        m2 / m3 = 32-bit sums for row 2 (low / high four samples)
;   Clobbers m4, m5, m6; r0 is advanced by 2 * r1.
;-----------------------------------------------------------------------------
8207%macro PROCESS_CHROMA_SP_W8_2R 0
8208 movu m1, [r0]
8209 movu m3, [r0 + r1]
8210 punpcklwd m0, m1, m3
8211 pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
8212 punpckhwd m1, m3
8213 pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
8214
8215 movu m4, [r0 + 2 * r1]
8216 punpcklwd m2, m3, m4
8217 pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
8218 punpckhwd m3, m4
8219 pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
8220
8221 lea r0, [r0 + 2 * r1]
8222 movu m5, [r0 + r1]
8223 punpcklwd m6, m4, m5
8224 pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
8225 paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
8226 punpckhwd m4, m5
8227 pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
8228 paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
8229
8230 movu m4, [r0 + 2 * r1]
8231 punpcklwd m6, m5, m4
8232 pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
8233 paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
8234 punpckhwd m5, m4
8235 pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
8236 paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
8237%endmacro
8238
8239;--------------------------------------------------------------------------------------------------------------
8240; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8241;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SP_W8_H2 width, height  (built with INIT_XMM sse2)
;   %1 = block width (only used in the symbol name; the body always produces 8 columns),
;   %2 = block height (multiple of 2).
; One PROCESS_CHROMA_SP_W8_2R expansion per pair of rows (.loopH, r4d). The sums get
; [tab_c_526336] added, are shifted right by 12 and packed to bytes; the low and high
; halves of m0 are the two 8-pixel output rows.
; Register use: r0 = src (rewound by 1 row; advanced 2 rows per iteration by the tile macro),
;               r1 = srcStride in bytes (doubled on entry), r2 = dst, r3 = dstStride,
;               r5 = tab_ChromaCoeffV + coeffIdx * 32.
; NOTE(review): `add r1d, r1d` is a 32-bit add, so on x86-64 the doubled stride is zero-extended.
8242%macro FILTER_VER_CHROMA_SP_W8_H2 2
8243INIT_XMM sse2
8244cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
8245
8246 add r1d, r1d
8247 sub r0, r1
8248 shl r4d, 5
8249
8250%ifdef PIC
8251 lea r5, [tab_ChromaCoeffV]
8252 lea r5, [r5 + r4]
8253%else
8254 lea r5, [tab_ChromaCoeffV + r4]
8255%endif
8256
8257 mova m7, [tab_c_526336]
8258
8259 mov r4d, %2/2
8260.loopH:
8261 PROCESS_CHROMA_SP_W8_2R
8262
8263 paddd m0, m7
8264 paddd m1, m7
8265 paddd m2, m7
8266 paddd m3, m7
8267
8268 psrad m0, 12
8269 psrad m1, 12
8270 psrad m2, 12
8271 psrad m3, 12
8272
8273 packssdw m0, m1
8274 packssdw m2, m3
8275
8276 packuswb m0, m2
8277
8278 movlps [r2], m0
8279 movhps [r2 + r3], m0
8280
8281 lea r2, [r2 + 2 * r3]
8282
8283 dec r4d
8284 jnz .loopH
8285
8286 RET
8287%endmacro
8288
8289FILTER_VER_CHROMA_SP_W8_H2 8, 2
8290FILTER_VER_CHROMA_SP_W8_H2 8, 4
8291FILTER_VER_CHROMA_SP_W8_H2 8, 6
8292FILTER_VER_CHROMA_SP_W8_H2 8, 8
8293FILTER_VER_CHROMA_SP_W8_H2 8, 16
8294FILTER_VER_CHROMA_SP_W8_H2 8, 32
8295
8296FILTER_VER_CHROMA_SP_W8_H2 8, 12
8297FILTER_VER_CHROMA_SP_W8_H2 8, 64
8298
8299
8300;-----------------------------------------------------------------------------------------------------------------------------
8301; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8302;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_2xN width, height  (SSE4)
;   %1 = block width (only used in the symbol name; two results are stored per row),
;   %2 = block height.
; Per row: 8 source bytes starting at src - 1 are spread into overlapping 4-byte
; windows with the first row of tab_Tm, multiplied by the replicated 4 taps
; (pmaddubsw), pair-summed (phaddw), reduced by [pw_2000] and the two low 16-bit
; results are stored.
; When isRowExt is non-zero the loop starts one row above src and runs height + 3 rows.
8303%macro FILTER_HORIZ_CHROMA_2xN 2
8304INIT_XMM sse4
8305cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
8306%define t0 m0
8307%define t1 m1
8308%define Tm0 m2
8309%define coef2 m3
8310
; dst rows are int16_t, so the stride is converted to bytes
8311 shl dststrided, 1
8312 sub srcq, 1
8313 mov r4d, r4m
8314
; constants that do not depend on coeffIdx
8315 mova Tm0, [tab_Tm]
8316 mova t1, [pw_2000]
8317
; the selected 4 taps, replicated into every dword of coef2
8318%ifdef PIC
8319 lea r6, [tab_ChromaCoeff]
8320 movd coef2, [r6 + r4 * 4]
8321%else
8322 movd coef2, [tab_ChromaCoeff + r4 * 4]
8323%endif
8324 pshufd coef2, coef2, 0
8325
; r4d becomes the row counter from here on
8326 mov r4d, %2
8327 cmp r5m, byte 0
8328 je .loopH
8329 sub srcq, srcstrideq
8330 add r4d, 3
8331
8332.loopH:
8333 movh t0, [srcq]
8334 pshufb t0, Tm0
8335 pmaddubsw t0, coef2
8336 phaddw t0, t0
8337 psubw t0, t1
8338 movd [dstq], t0
8339
8340 add srcq, srcstrideq
8341 add dstq, dststrideq
8342
8343 dec r4d
8344 jnz .loopH
8345
8346 RET
8347%endmacro
8348
8349FILTER_HORIZ_CHROMA_2xN 2, 4
8350FILTER_HORIZ_CHROMA_2xN 2, 8
8351
8352FILTER_HORIZ_CHROMA_2xN 2, 16
8353
8354;-----------------------------------------------------------------------------------------------------------------------------
8355; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8356;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_4xN width, height  (SSE4)
;   %1 = block width (only used in the symbol name; four results are stored per row),
;   %2 = block height.
; Per row: 8 source bytes starting at src - 1 are spread into overlapping 4-byte
; windows with the first row of tab_Tm, multiplied by the replicated 4 taps
; (pmaddubsw), pair-summed (phaddw), reduced by [pw_2000] and the four low 16-bit
; results are stored.
; When isRowExt is non-zero the loop starts one row above src and runs height + 3 rows.
8357%macro FILTER_HORIZ_CHROMA_4xN 2
8358INIT_XMM sse4
8359cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
8360%define t0 m0
8361%define t1 m1
8362%define Tm0 m2
8363%define coef2 m3
8364
; dst rows are int16_t, so the stride is converted to bytes
8365 shl dststrided, 1
8366 sub srcq, 1
8367 mov r4d, r4m
8368
; constants that do not depend on coeffIdx
8369 mova Tm0, [tab_Tm]
8370 mova t1, [pw_2000]
8371
; the selected 4 taps, replicated into every dword of coef2
8372%ifdef PIC
8373 lea r6, [tab_ChromaCoeff]
8374 movd coef2, [r6 + r4 * 4]
8375%else
8376 movd coef2, [tab_ChromaCoeff + r4 * 4]
8377%endif
8378 pshufd coef2, coef2, 0
8379
; r4d becomes the row counter from here on
8380 mov r4d, %2
8381 cmp r5m, byte 0
8382 je .loopH
8383 sub srcq, srcstrideq
8384 add r4d, 3
8385
8386.loopH:
8387 movh t0, [srcq]
8388 pshufb t0, Tm0
8389 pmaddubsw t0, coef2
8390 phaddw t0, t0
8391 psubw t0, t1
8392 movlps [dstq], t0
8393
8394 add srcq, srcstrideq
8395 add dstq, dststrideq
8396
8397 dec r4d
8398 jnz .loopH
8399 RET
8400%endmacro
8401
8402FILTER_HORIZ_CHROMA_4xN 4, 2
8403FILTER_HORIZ_CHROMA_4xN 4, 4
8404FILTER_HORIZ_CHROMA_4xN 4, 8
8405FILTER_HORIZ_CHROMA_4xN 4, 16
8406
8407FILTER_HORIZ_CHROMA_4xN 4, 32
8408
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W6 tmpA, tmpB, offset
; Filters one row, 6 results wide, and stores them as 16-bit values at [dstq].
;   %1, %2 = scratch vector registers (both overwritten), %3 = register subtracted from the sums.
;   Relies on the caller's aliases srcq, dstq, Tm0, Tm1 and coef2.
; Eight results are computed from 16 loaded bytes; results 0-3 are stored with
; movh and results 4-5 with movd, results 6-7 are dropped.
;-----------------------------------------------------------------------------
8409%macro PROCESS_CHROMA_W6 3
8410 movu %1, [srcq]
8411 pshufb %2, %1, Tm0
8412 pmaddubsw %2, coef2
8413 pshufb %1, %1, Tm1
8414 pmaddubsw %1, coef2
8415 phaddw %2, %1
8416 psubw %2, %3
8417 movh [dstq], %2
; move results 4-5 (dword 2) down to the low dword
8418 pshufd %2, %2, 2
8419 movd [dstq + 8], %2
8420%endmacro
8421
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W12 tmpA, tmpB, offset
; Filters one row, 12 results wide, and stores them as 16-bit values at [dstq].
;   %1, %2 = scratch vector registers (both overwritten), %3 = register subtracted from the sums.
;   Relies on the caller's aliases srcq, dstq, Tm0, Tm1 and coef2.
; Results 0-7 come from the load at [srcq]; results 8-11 come from a second load
; at [srcq + 8], using only the Tm0 shuffle.
;-----------------------------------------------------------------------------
8422%macro PROCESS_CHROMA_W12 3
8423 movu %1, [srcq]
8424 pshufb %2, %1, Tm0
8425 pmaddubsw %2, coef2
8426 pshufb %1, %1, Tm1
8427 pmaddubsw %1, coef2
8428 phaddw %2, %1
8429 psubw %2, %3
8430 movu [dstq], %2
8431 movu %1, [srcq + 8]
8432 pshufb %1, %1, Tm0
8433 pmaddubsw %1, coef2
8434 phaddw %1, %1
8435 psubw %1, %3
8436 movh [dstq + 16], %1
8437%endmacro
8438
8439;-----------------------------------------------------------------------------------------------------------------------------
8440; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8441;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA width, height  (SSE4)
;   %1 = block width; selects the row helper PROCESS_CHROMA_W%1 (instantiated below for 6 and 12),
;   %2 = block height.
; The aliases coef2, Tm0, Tm1 defined here are read by the row helper, so their names
; must stay in sync with the PROCESS_CHROMA_W* macros.
; Setup: src is moved one pixel left, dstStride is doubled (16-bit output rows),
;        coef2 = the 4 selected taps replicated into every dword, t2 = [pw_2000],
;        Tm0 / Tm1 = first two rows of tab_Tm.
; When isRowExt is non-zero the loop starts one row above src and runs height + 3 rows.
8442%macro FILTER_HORIZ_CHROMA 2
8443INIT_XMM sse4
8444cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
8445%define coef2 m5
8446%define Tm0 m4
8447%define Tm1 m3
8448%define t2 m2
8449%define t1 m1
8450%define t0 m0
8451
8452 dec srcq
8453 mov r4d, r4m
8454 add dststrided, dststrided
8455
8456%ifdef PIC
8457 lea r6, [tab_ChromaCoeff]
8458 movd coef2, [r6 + r4 * 4]
8459%else
8460 movd coef2, [tab_ChromaCoeff + r4 * 4]
8461%endif
8462
8463 pshufd coef2, coef2, 0
8464 mova t2, [pw_2000]
8465 mova Tm0, [tab_Tm]
8466 mova Tm1, [tab_Tm + 16]
8467
; r4d is reused as the row counter
8468 mov r4d, %2
8469 cmp r5m, byte 0
8470 je .loopH
8471 sub srcq, srcstrideq
8472 add r4d, 3
8473
8474.loopH:
8475 PROCESS_CHROMA_W%1 t0, t1, t2
8476 add srcq, srcstrideq
8477 add dstq, dststrideq
8478
8479 dec r4d
8480 jnz .loopH
8481
8482 RET
8483%endmacro
8484
8485FILTER_HORIZ_CHROMA 6, 8
8486FILTER_HORIZ_CHROMA 12, 16
8487
8488FILTER_HORIZ_CHROMA 6, 16
8489FILTER_HORIZ_CHROMA 12, 32
8490
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W8 tmpA, tmpB, offset
; Filters one row, 8 results wide, and stores them as 16-bit values at [dstq].
;   %1, %2 = scratch vector registers (both overwritten), %3 = register subtracted from the sums.
;   Relies on the caller's aliases srcq, dstq, Tm0, Tm1 and coef2.
;-----------------------------------------------------------------------------
8491%macro PROCESS_CHROMA_W8 3
8492 movu %1, [srcq]
8493 pshufb %2, %1, Tm0
8494 pmaddubsw %2, coef2
8495 pshufb %1, %1, Tm1
8496 pmaddubsw %1, coef2
8497 phaddw %2, %1
8498 psubw %2, %3
8499 movu [dstq], %2
8500%endmacro
8501
8502;-----------------------------------------------------------------------------------------------------------------------------
8503; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8504;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_8xN width, height  (SSE4)
;   %1 = block width (only used in the symbol name; the row helper is always PROCESS_CHROMA_W8),
;   %2 = block height.
; The aliases coef2, Tm0, Tm1 defined here are read by PROCESS_CHROMA_W8, so their
; names must stay in sync with that macro.
; Setup: src is moved one pixel left, dstStride is doubled (16-bit output rows),
;        coef2 = the 4 selected taps replicated into every dword, t2 = [pw_2000],
;        Tm0 / Tm1 = first two rows of tab_Tm.
; When isRowExt is non-zero the loop starts one row above src and runs height + 3 rows.
8505%macro FILTER_HORIZ_CHROMA_8xN 2
8506INIT_XMM sse4
8507cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
8508%define coef2 m5
8509%define Tm0 m4
8510%define Tm1 m3
8511%define t2 m2
8512%define t1 m1
8513%define t0 m0
8514
8515 dec srcq
8516 mov r4d, r4m
8517 add dststrided, dststrided
8518
8519%ifdef PIC
8520 lea r6, [tab_ChromaCoeff]
8521 movd coef2, [r6 + r4 * 4]
8522%else
8523 movd coef2, [tab_ChromaCoeff + r4 * 4]
8524%endif
8525
8526 pshufd coef2, coef2, 0
8527 mova t2, [pw_2000]
8528 mova Tm0, [tab_Tm]
8529 mova Tm1, [tab_Tm + 16]
8530
; r4d is reused as the row counter
8531 mov r4d, %2
8532 cmp r5m, byte 0
8533 je .loopH
8534 sub srcq, srcstrideq
8535 add r4d, 3
8536
8537.loopH:
8538 PROCESS_CHROMA_W8 t0, t1, t2
8539 add srcq, srcstrideq
8540 add dstq, dststrideq
8541
8542 dec r4d
8543 jnz .loopH
8544
8545 RET
8546%endmacro
8547
8548FILTER_HORIZ_CHROMA_8xN 8, 2
8549FILTER_HORIZ_CHROMA_8xN 8, 4
8550FILTER_HORIZ_CHROMA_8xN 8, 6
8551FILTER_HORIZ_CHROMA_8xN 8, 8
8552FILTER_HORIZ_CHROMA_8xN 8, 16
8553FILTER_HORIZ_CHROMA_8xN 8, 32
8554
8555FILTER_HORIZ_CHROMA_8xN 8, 12
8556FILTER_HORIZ_CHROMA_8xN 8, 64
8557
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W16 tmpA, tmpB, offset, tmpC
; Filters one row, 16 results wide, and stores them as 16-bit values at [dstq].
;   %1, %2, %4 = scratch vector registers (all overwritten),
;   %3 = register subtracted from the sums.
; This is the 16-wide step of PROCESS_CHROMA_W16o (defined further down in this
; file) applied at pixel offset 0; the helper is resolved when this macro is
; expanded, which happens after both definitions.
;-----------------------------------------------------------------------------
8558%macro PROCESS_CHROMA_W16 4
8559 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8575%endmacro
8576
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W24 tmpA, tmpB, offset, tmpC
; Filters one row, 24 results wide, and stores them as 16-bit values at [dstq].
;   %1, %2, %4 = scratch vector registers (all overwritten),
;   %3 = register subtracted from the sums.
;   Relies on the caller's aliases srcq, dstq, Tm0, Tm1 and coef2.
; Results 0-15 are the 16-wide step of PROCESS_CHROMA_W16o (defined further down
; in this file) at pixel offset 0; results 16-23 are one more 8-wide step.
;-----------------------------------------------------------------------------
8577%macro PROCESS_CHROMA_W24 4
8578 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
; results 16-23
8594 movu %1, [srcq + 16]
8595 pshufb %2, %1, Tm0
8596 pmaddubsw %2, coef2
8597 pshufb %1, %1, Tm1
8598 pmaddubsw %1, coef2
8599 phaddw %2, %1
8600 psubw %2, %3
8601 movu [dstq + 32], %2
8602%endmacro
8603
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W32 tmpA, tmpB, offset, tmpC
; Filters one row, 32 results wide, and stores them as 16-bit values at [dstq].
;   %1, %2, %4 = scratch vector registers (all overwritten),
;   %3 = register subtracted from the sums.
; Built from two 16-wide steps of PROCESS_CHROMA_W16o (defined further down in
; this file) at pixel offsets 0 and 16, in the same way PROCESS_CHROMA_W48 and
; PROCESS_CHROMA_W64 are composed.
;-----------------------------------------------------------------------------
8604%macro PROCESS_CHROMA_W32 4
8605 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8606 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8637%endmacro
8638
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W16o tmpA, tmpB, offset, tmpC, pixelOffset
; Filters 16 results of one row starting %5 pixels into the row.
;   %1, %2, %4 = scratch vector registers (all overwritten),
;   %3 = register subtracted from the sums,
;   %5 = pixel offset: source bytes are read from [srcq + %5], results are
;        written to [dstq + %5 * 2] (two bytes per result).
;   Relies on the caller's aliases srcq, dstq, Tm0, Tm1 and coef2.
;-----------------------------------------------------------------------------
8639%macro PROCESS_CHROMA_W16o 5
8640 movu %1, [srcq + %5]
8641 pshufb %2, %1, Tm0
8642 pmaddubsw %2, coef2
8643 pshufb %1, %1, Tm1
8644 pmaddubsw %1, coef2
8645 phaddw %2, %1
8646 movu %1, [srcq + %5 + 8]
8647 pshufb %4, %1, Tm0
8648 pmaddubsw %4, coef2
8649 pshufb %1, %1, Tm1
8650 pmaddubsw %1, coef2
8651 phaddw %4, %1
8652 psubw %2, %3
8653 psubw %4, %3
8654 movu [dstq + %5 * 2], %2
8655 movu [dstq + %5 * 2 + 16], %4
8656%endmacro
8657
; PROCESS_CHROMA_W48 tmpA, tmpB, offset, tmpC
; One row, 48 results wide: three 16-wide steps at pixel offsets 0, 16 and 32.
8658%macro PROCESS_CHROMA_W48 4
8659 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8660 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8661 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
8662%endmacro
8663
; PROCESS_CHROMA_W64 tmpA, tmpB, offset, tmpC
; One row, 64 results wide: four 16-wide steps at pixel offsets 0, 16, 32 and 48.
8664%macro PROCESS_CHROMA_W64 4
8665 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8666 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8667 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
8668 PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
8669%endmacro
8670
8671;------------------------------------------------------------------------------------------------------------------------------
8672; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8673;------------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_WxN width, height  (SSE4)
;   %1 = block width; selects the row helper PROCESS_CHROMA_W%1 (16, 24, 32, 48 or 64 below),
;   %2 = block height.
; The aliases coef2, Tm0, Tm1 defined here are read by the row helpers, so their names
; must stay in sync with the PROCESS_CHROMA_W* macros. t3 is the extra scratch register
; the 16-wide helpers need.
; Setup: src is moved one pixel left, dstStride is doubled (16-bit output rows),
;        coef2 = the 4 selected taps replicated into every dword, t2 = [pw_2000],
;        Tm0 / Tm1 = first two rows of tab_Tm.
; When isRowExt is non-zero the loop starts one row above src and runs height + 3 rows.
8674%macro FILTER_HORIZ_CHROMA_WxN 2
8675INIT_XMM sse4
8676cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
8677%define coef2 m6
8678%define Tm0 m5
8679%define Tm1 m4
8680%define t3 m3
8681%define t2 m2
8682%define t1 m1
8683%define t0 m0
8684
8685 dec srcq
8686 mov r4d, r4m
8687 add dststrided, dststrided
8688
8689%ifdef PIC
8690 lea r6, [tab_ChromaCoeff]
8691 movd coef2, [r6 + r4 * 4]
8692%else
8693 movd coef2, [tab_ChromaCoeff + r4 * 4]
8694%endif
8695
8696 pshufd coef2, coef2, 0
8697 mova t2, [pw_2000]
8698 mova Tm0, [tab_Tm]
8699 mova Tm1, [tab_Tm + 16]
8700
; r4d is reused as the row counter
8701 mov r4d, %2
8702 cmp r5m, byte 0
8703 je .loopH
8704 sub srcq, srcstrideq
8705 add r4d, 3
8706
8707.loopH:
8708 PROCESS_CHROMA_W%1 t0, t1, t2, t3
8709 add srcq, srcstrideq
8710 add dstq, dststrideq
8711
8712 dec r4d
8713 jnz .loopH
8714
8715 RET
8716%endmacro
8717
8718FILTER_HORIZ_CHROMA_WxN 16, 4
8719FILTER_HORIZ_CHROMA_WxN 16, 8
8720FILTER_HORIZ_CHROMA_WxN 16, 12
8721FILTER_HORIZ_CHROMA_WxN 16, 16
8722FILTER_HORIZ_CHROMA_WxN 16, 32
8723FILTER_HORIZ_CHROMA_WxN 24, 32
8724FILTER_HORIZ_CHROMA_WxN 32, 8
8725FILTER_HORIZ_CHROMA_WxN 32, 16
8726FILTER_HORIZ_CHROMA_WxN 32, 24
8727FILTER_HORIZ_CHROMA_WxN 32, 32
8728
8729FILTER_HORIZ_CHROMA_WxN 16, 24
8730FILTER_HORIZ_CHROMA_WxN 16, 64
8731FILTER_HORIZ_CHROMA_WxN 24, 64
8732FILTER_HORIZ_CHROMA_WxN 32, 48
8733FILTER_HORIZ_CHROMA_WxN 32, 64
8734
8735FILTER_HORIZ_CHROMA_WxN 64, 64
8736FILTER_HORIZ_CHROMA_WxN 64, 32
8737FILTER_HORIZ_CHROMA_WxN 64, 48
8738FILTER_HORIZ_CHROMA_WxN 48, 64
8739FILTER_HORIZ_CHROMA_WxN 64, 16
8741
8742;---------------------------------------------------------------------------------------------------------------
8743; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 = width (processed in 16-column strips), %2 = height
; (processed two output rows at a time).
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (bytes
; after doubling), r4d = row-pair counter, r5 = scratch row pointer,
; r6d = strip counter, m1/m0 = tap vectors applied to the first / second pair
; of interleaved source rows (layout comes from tab_Vm, defined elsewhere).
8744;---------------------------------------------------------------------------------------------------------------
8745%macro FILTER_V_PS_W16n 2
8746INIT_XMM sse4
8747cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
8748
8749 mov r4d, r4m ; r4d = coeffIdx (fifth argument)
8750 sub r0, r1 ; start one source row above the output row
8751 add r3d, r3d ; dst stride: int16_t elements -> bytes
8752
8753%ifdef PIC
8754 lea r5, [tab_ChromaCoeff] ; r5 is only a temporary base here; it is reused inside the loop
8755 movd m0, [r5 + r4 * 4]
8756%else
8757 movd m0, [tab_ChromaCoeff + r4 * 4]
8758%endif
8759
8760 pshufb m1, m0, [tab_Vm] ; m1 = taps arranged for the row0/row1 interleave
8761 pshufb m0, [tab_Vm + 16] ; m0 = taps arranged for the row2/row3 interleave
8762 mov r4d, %2/2 ; two output rows per outer iteration
8763
8764.loop:
8765
8766 mov r6d, %1/16 ; 16 columns per inner iteration
8767
8768.loopW:
8769
8770 movu m2, [r0] ; source row 0
8771 movu m3, [r0 + r1] ; source row 1
8772
8773 punpcklbw m4, m2, m3 ; rows 0/1 interleaved, columns 0-7
8774 punpckhbw m2, m3 ; rows 0/1 interleaved, columns 8-15
8775
8776 pmaddubsw m4, m1
8777 pmaddubsw m2, m1
8778
8779 lea r5, [r0 + 2 * r1]
8780 movu m5, [r5] ; source row 2
8781 movu m7, [r5 + r1] ; source row 3
8782
8783 punpcklbw m6, m5, m7 ; rows 2/3 interleaved, columns 0-7
8784 pmaddubsw m6, m0
8785 paddw m4, m6 ; 4-tap sums, columns 0-7
8786
8787 punpckhbw m6, m5, m7 ; rows 2/3 interleaved, columns 8-15
8788 pmaddubsw m6, m0
8789 paddw m2, m6 ; 4-tap sums, columns 8-15
8790
8791 mova m6, [pw_2000] ; m6 was scratch above, so the constant is reloaded every iteration
8792
8793 psubw m4, m6
8794 psubw m2, m6
8795
8796 movu [r2], m4 ; first output row, columns 0-7
8797 movu [r2 + 16], m2 ; first output row, columns 8-15
8798
8799 punpcklbw m4, m3, m5 ; rows 1/2 interleaved, columns 0-7
8800 punpckhbw m3, m5 ; rows 1/2 interleaved, columns 8-15
8801
8802 pmaddubsw m4, m1
8803 pmaddubsw m3, m1
8804
8805 movu m5, [r5 + 2 * r1] ; source row 4
8806
8807 punpcklbw m2, m7, m5 ; rows 3/4 interleaved, columns 0-7
8808 punpckhbw m7, m5 ; rows 3/4 interleaved, columns 8-15
8809
8810 pmaddubsw m2, m0
8811 pmaddubsw m7, m0
8812
8813 paddw m4, m2
8814 paddw m3, m7
8815
8816 psubw m4, m6
8817 psubw m3, m6
8818
8819 movu [r2 + r3], m4 ; second output row, columns 0-7
8820 movu [r2 + r3 + 16], m3 ; second output row, columns 8-15
8821
8822 add r0, 16 ; next strip: 16 source bytes
8823 add r2, 32 ; next strip: 16 int16_t results
8824 dec r6d
8825 jnz .loopW
8826
8827 lea r0, [r0 + r1 * 2 - %1] ; down two rows, back to column 0
8828 lea r2, [r2 + r3 * 2 - %1 * 2] ; down two rows, back to column 0 (2 bytes per result)
8829
8830 dec r4d
8831 jnz .loop
8832 RET
8833%endmacro
8834
; One function is generated per width x height pair listed below.
8835FILTER_V_PS_W16n 64, 64
8836FILTER_V_PS_W16n 64, 32
8837FILTER_V_PS_W16n 64, 48
8838FILTER_V_PS_W16n 48, 64
8839FILTER_V_PS_W16n 64, 16
8840
8841
8842;------------------------------------------------------------------------------------------------------------
8843;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Fully unrolled 2-column x 4-row vertical 4-tap filter producing int16_t output.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (bytes
; after doubling), r5 = 3 * srcStride, m0 = taps arranged by tab_Cm (defined
; elsewhere), m1 = pw_2000 constant subtracted from every result.
; Each movd reads 4 source bytes per row although only the first two columns
; of the result are stored.
8844;------------------------------------------------------------------------------------------------------------
8845INIT_XMM sse4
8846cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
8847
8848 mov r4d, r4m ; r4d = coeffIdx (fifth argument)
8849 sub r0, r1 ; start one source row above the output row
8850 add r3d, r3d ; dst stride: int16_t elements -> bytes
8851
8852%ifdef PIC
8853 lea r5, [tab_ChromaCoeff]
8854 movd m0, [r5 + r4 * 4]
8855%else
8856 movd m0, [tab_ChromaCoeff + r4 * 4]
8857%endif
8858
8859 pshufb m0, [tab_Cm]
8860
8861 lea r5, [3 * r1]
8862
8863 movd m2, [r0] ; source row 0
8864 movd m3, [r0 + r1] ; source row 1
8865 movd m4, [r0 + 2 * r1] ; source row 2
8866 movd m5, [r0 + r5] ; source row 3
8867
8868 punpcklbw m2, m3 ; rows 0/1 interleaved
8869 punpcklbw m6, m4, m5 ; rows 2/3 interleaved
8870 punpcklbw m2, m6 ; per column: row0, row2, row1, row3
8871
8872 pmaddubsw m2, m0 ; two partial sums per column
8873
8874 lea r0, [r0 + 4 * r1]
8875 movd m6, [r0] ; source row 4
8876
8877 punpcklbw m3, m4 ; rows 1/2 interleaved
8878 punpcklbw m1, m5, m6 ; rows 3/4 interleaved (m1 is scratch until pw_2000 is loaded)
8879 punpcklbw m3, m1
8880
8881 pmaddubsw m3, m0
8882 phaddw m2, m3 ; low 4 words = output row 0, high 4 words = output row 1
8883
8884 mova m1, [pw_2000]
8885
8886 psubw m2, m1
8887
8888 movd [r2], m2 ; output row 0: two int16_t results
8889 pextrd [r2 + r3], m2, 2 ; output row 1: dword 2 = first two words of the high half
8890
8891 movd m2, [r0 + r1] ; source row 5
8892
8893 punpcklbw m4, m5 ; rows 2/3 interleaved
8894 punpcklbw m3, m6, m2 ; rows 4/5 interleaved
8895 punpcklbw m4, m3
8896
8897 pmaddubsw m4, m0
8898
8899 movd m3, [r0 + 2 * r1] ; source row 6
8900
8901 punpcklbw m5, m6 ; rows 3/4 interleaved
8902 punpcklbw m2, m3 ; rows 5/6 interleaved
8903 punpcklbw m5, m2
8904
8905 pmaddubsw m5, m0
8906 phaddw m4, m5 ; low 4 words = output row 2, high 4 words = output row 3
8907 psubw m4, m1
8908
8909 lea r2, [r2 + 2 * r3]
8910 movd [r2], m4 ; output row 2
8911 pextrd [r2 + r3], m4, 2 ; output row 3
8912
8913 RET
8914
8915;-------------------------------------------------------------------------------------------------------------
8916; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 is not referenced by the body (the width is fixed at 2
; in the symbol name); %2 = height, processed four output rows per iteration.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (bytes
; after doubling), r4d = 4-row group counter, r5 = 3 * srcStride,
; m0 = taps arranged by tab_Cm (defined elsewhere), m1 = pw_2000 constant.
8917;-------------------------------------------------------------------------------------------------------------
8918%macro FILTER_V_PS_W2 2
8919INIT_XMM sse4
8920cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
8921
8922 mov r4d, r4m ; r4d = coeffIdx (fifth argument)
8923 sub r0, r1 ; start one source row above the output row
8924 add r3d, r3d ; dst stride: int16_t elements -> bytes
8925
8926%ifdef PIC
8927 lea r5, [tab_ChromaCoeff]
8928 movd m0, [r5 + r4 * 4]
8929%else
8930 movd m0, [tab_ChromaCoeff + r4 * 4]
8931%endif
8932
8933 pshufb m0, [tab_Cm]
8934
8935 mova m1, [pw_2000]
8936 lea r5, [3 * r1]
8937 mov r4d, %2/4 ; four output rows per iteration
8938.loop:
8939 movd m2, [r0] ; source row 0
8940 movd m3, [r0 + r1] ; source row 1
8941 movd m4, [r0 + 2 * r1] ; source row 2
8942 movd m5, [r0 + r5] ; source row 3
8943
8944 punpcklbw m2, m3 ; rows 0/1 interleaved
8945 punpcklbw m6, m4, m5 ; rows 2/3 interleaved
8946 punpcklbw m2, m6 ; per column: row0, row2, row1, row3
8947
8948 pmaddubsw m2, m0
8949
8950 lea r0, [r0 + 4 * r1] ; src advances four rows; this is the only src update per iteration
8951 movd m6, [r0] ; source row 4
8952
8953 punpcklbw m3, m4 ; rows 1/2 interleaved
8954 punpcklbw m7, m5, m6 ; rows 3/4 interleaved
8955 punpcklbw m3, m7
8956
8957 pmaddubsw m3, m0
8958
8959 phaddw m2, m3 ; low 4 words = output row 0, high 4 words = output row 1
8960 psubw m2, m1
8961
8962
8963 movd [r2], m2 ; output row 0: two int16_t results
8964 pshufd m2, m2, 2 ; bring dword 2 (start of the high half) down to dword 0
8965 movd [r2 + r3], m2 ; output row 1
8966
8967 movd m2, [r0 + r1] ; source row 5
8968
8969 punpcklbw m4, m5 ; rows 2/3 interleaved
8970 punpcklbw m3, m6, m2 ; rows 4/5 interleaved
8971 punpcklbw m4, m3
8972
8973 pmaddubsw m4, m0
8974
8975 movd m3, [r0 + 2 * r1] ; source row 6
8976
8977 punpcklbw m5, m6 ; rows 3/4 interleaved
8978 punpcklbw m2, m3 ; rows 5/6 interleaved
8979 punpcklbw m5, m2
8980
8981 pmaddubsw m5, m0
8982
8983 phaddw m4, m5 ; low 4 words = output row 2, high 4 words = output row 3
8984
8985 psubw m4, m1
8986
8987 lea r2, [r2 + 2 * r3]
8988 movd [r2], m4 ; output row 2
8989 pshufd m4 , m4 ,2 ; bring dword 2 down to dword 0
8990 movd [r2 + r3], m4 ; output row 3
8991
8992 lea r2, [r2 + 2 * r3] ; dst now four rows below where the iteration started
8993
8994 dec r4d
8995 jnz .loop
8996
8997RET
8998%endmacro
8999
; Generates interp_4tap_vert_ps_2x8 and interp_4tap_vert_ps_2x16.
9000FILTER_V_PS_W2 2, 8
9001
9002FILTER_V_PS_W2 2, 16
9003
9004;-----------------------------------------------------------------------------------------------------------------
9005; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 = width (processed in 4-column tiles), %2 = height
; (processed in 4-row tiles).
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both
; strides in bytes after doubling), r4d = coeffIdx, then the tile-column
; counter, r5 = scratch, r6 = this coeffIdx's entry in tab_ChromaCoeffV,
; [rsp] = tile-row counter.
; PROCESS_CHROMA_SP_W4_4R is defined elsewhere; the code below relies on it
; leaving four rows of 32-bit sums in m0..m3 and, judging by the rewind
; arithmetic, on it advancing r0 by four rows -- TODO confirm against the macro.
9006;-----------------------------------------------------------------------------------------------------------------
9007%macro FILTER_VER_CHROMA_SS 2
9008INIT_XMM sse2
9009cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
9010
9011 add r1d, r1d ; src stride: int16_t elements -> bytes
9012 add r3d, r3d ; dst stride: int16_t elements -> bytes
9013 sub r0, r1 ; start one source row above the output row
9014 shl r4d, 5 ; coeffIdx * 32: byte offset of the table entry
9015
9016%ifdef PIC
9017 lea r5, [tab_ChromaCoeffV]
9018 lea r6, [r5 + r4]
9019%else
9020 lea r6, [tab_ChromaCoeffV + r4]
9021%endif
9022
9023 mov dword [rsp], %2/4 ; outer counter lives on the stack; r4d is reused for the inner loop
9024
9025.loopH:
9026 mov r4d, (%1/4)
9027.loopW:
9028 PROCESS_CHROMA_SP_W4_4R
9029
9030 psrad m0, 6 ; arithmetic shift of each 32-bit sum right by 6
9031 psrad m1, 6
9032 psrad m2, 6
9033 psrad m3, 6
9034
9035 packssdw m0, m1 ; rows 0 and 1 packed to int16_t with signed saturation
9036 packssdw m2, m3 ; rows 2 and 3
9037
9038 movlps [r2], m0 ; output row 0 (4 results)
9039 movhps [r2 + r3], m0 ; output row 1
9040 lea r5, [r2 + 2 * r3]
9041 movlps [r5], m2 ; output row 2
9042 movhps [r5 + r3], m2 ; output row 3
9043
9044 lea r5, [4 * r1 - 2 * 4]
9045 sub r0, r5 ; up four rows, right four int16_t columns (8 bytes)
9046 add r2, 2 * 4 ; dst right four int16_t columns
9047
9048 dec r4d
9049 jnz .loopW
9050
9051 lea r0, [r0 + 4 * r1 - 2 * %1] ; down four rows, back to column 0
9052 lea r2, [r2 + 4 * r3 - 2 * %1] ; down four rows, back to column 0
9053
9054 dec dword [rsp]
9055 jnz .loopH
9056
9057 RET
9058%endmacro
9059
; One function is generated per width x height pair listed below.
9060 FILTER_VER_CHROMA_SS 4, 4
9061 FILTER_VER_CHROMA_SS 4, 8
9062 FILTER_VER_CHROMA_SS 16, 16
9063 FILTER_VER_CHROMA_SS 16, 8
9064 FILTER_VER_CHROMA_SS 16, 12
9065 FILTER_VER_CHROMA_SS 12, 16
9066 FILTER_VER_CHROMA_SS 16, 4
9067 FILTER_VER_CHROMA_SS 4, 16
9068 FILTER_VER_CHROMA_SS 32, 32
9069 FILTER_VER_CHROMA_SS 32, 16
9070 FILTER_VER_CHROMA_SS 16, 32
9071 FILTER_VER_CHROMA_SS 32, 24
9072 FILTER_VER_CHROMA_SS 24, 32
9073 FILTER_VER_CHROMA_SS 32, 8
9074
9075 FILTER_VER_CHROMA_SS 16, 24
9076 FILTER_VER_CHROMA_SS 12, 32
9077 FILTER_VER_CHROMA_SS 4, 32
9078 FILTER_VER_CHROMA_SS 32, 64
9079 FILTER_VER_CHROMA_SS 16, 64
9080 FILTER_VER_CHROMA_SS 32, 48
9081 FILTER_VER_CHROMA_SS 24, 64
9082
9083 FILTER_VER_CHROMA_SS 64, 64
9084 FILTER_VER_CHROMA_SS 64, 32
9085 FILTER_VER_CHROMA_SS 64, 48
9086 FILTER_VER_CHROMA_SS 48, 64
9087 FILTER_VER_CHROMA_SS 64, 16
9088
9089
9090;---------------------------------------------------------------------------------------------------------------------
9091; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 = width (instantiated with 2 only), %2 = height,
; processed four output rows per iteration.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both
; strides in bytes after doubling), r4d = coeffIdx, then the 4-row group
; counter, r5 = this coeffIdx's entry in tab_ChromaCoeffV.
; PROCESS_CHROMA_SP_W2_4R is defined elsewhere; the loop never adjusts r0
; itself, so the macro presumably advances src by four rows -- TODO confirm.
9092;---------------------------------------------------------------------------------------------------------------------
9093%macro FILTER_VER_CHROMA_SS_W2_4R 2
9094INIT_XMM sse4
9095cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
9096
9097 add r1d, r1d ; src stride: int16_t elements -> bytes
9098 add r3d, r3d ; dst stride: int16_t elements -> bytes
9099 sub r0, r1 ; start one source row above the output row
9100 shl r4d, 5 ; coeffIdx * 32: byte offset of the table entry
9101
9102%ifdef PIC
9103 lea r5, [tab_ChromaCoeffV]
9104 lea r5, [r5 + r4]
9105%else
9106 lea r5, [tab_ChromaCoeffV + r4]
9107%endif
9108
9109 mov r4d, (%2/4)
9110
9111.loopH:
9112 PROCESS_CHROMA_SP_W2_4R r5
9113
9114 psrad m0, 6 ; arithmetic shift of each 32-bit sum right by 6
9115 psrad m2, 6
9116
9117 packssdw m0, m2 ; after packing, each dword of m0 holds one output row (2 x int16_t)
9118
9119 movd [r2], m0 ; output row 0
9120 pextrd [r2 + r3], m0, 1 ; output row 1
9121 lea r2, [r2 + 2 * r3]
9122 pextrd [r2], m0, 2 ; output row 2
9123 pextrd [r2 + r3], m0, 3 ; output row 3
9124
9125 lea r2, [r2 + 2 * r3] ; dst now four rows below where the iteration started
9126
9127 dec r4d
9128 jnz .loopH
9129
9130 RET
9131%endmacro
9132
; Generates interp_4tap_vert_ss_2x4, _2x8 and _2x16.
9133FILTER_VER_CHROMA_SS_W2_4R 2, 4
9134FILTER_VER_CHROMA_SS_W2_4R 2, 8
9135
9136FILTER_VER_CHROMA_SS_W2_4R 2, 16
9137
9138;---------------------------------------------------------------------------------------------------------------
9139; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Fully unrolled 4-column x 2-row vertical 4-tap filter on int16_t input.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both
; strides in bytes after doubling), r5 = this coeffIdx's entry in
; tab_ChromaCoeffV: [r5 + 0*16] multiplies the first pair of interleaved rows,
; [r5 + 1*16] the second pair.
; In the comments below "[a b]" means source rows a and b interleaved, and
; "[a+b]" the pmaddwd partial sum over those rows.
9140;---------------------------------------------------------------------------------------------------------------
9141INIT_XMM sse2
9142cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
9143
9144 add r1d, r1d ; src stride: int16_t elements -> bytes
9145 add r3d, r3d ; dst stride: int16_t elements -> bytes
9146 sub r0, r1 ; start one source row above the output row
9147 shl r4d, 5 ; coeffIdx * 32: byte offset of the table entry
9148
9149%ifdef PIC
9150 lea r5, [tab_ChromaCoeffV]
9151 lea r5, [r5 + r4]
9152%else
9153 lea r5, [tab_ChromaCoeffV + r4]
9154%endif
9155
9156 movq m0, [r0]
9157 movq m1, [r0 + r1]
9158 punpcklwd m0, m1 ;m0=[0 1]
9159 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
9160
9161 lea r0, [r0 + 2 * r1]
9162 movq m2, [r0]
9163 punpcklwd m1, m2 ;m1=[1 2]
9164 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
9165
9166 movq m3, [r0 + r1]
9167 punpcklwd m2, m3 ;m2=[2 3]
9168 pmaddwd m2, [r5 + 1 * 16] ;m2=[2+3]
9169 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
9170 psrad m0, 6
9171
9172 movq m2, [r0 + 2 * r1]
9173 punpcklwd m3, m2 ;m3=[3 4]
9174 pmaddwd m3, [r5 + 1 * 16] ;m3=[3+4]
9175 paddd m1, m3 ;m1=[1+2+3+4] Row2 done
9176 psrad m1, 6
9177
9178 packssdw m0, m1 ;low half = Row1, high half = Row2 (int16_t, signed saturation)
9179
9180 movlps [r2], m0
9181 movhps [r2 + r3], m0
9182
9183 RET
9184
9185;-------------------------------------------------------------------------------------------------------------------
9186; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 is not referenced by the body (the width is fixed at 6
; in the symbol name); %2 = height, processed four output rows per iteration.
; Each iteration filters a 4-column tile and then a 2-column tile to its right.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both
; strides in bytes after doubling), r4d = coeffIdx, then the 4-row group
; counter, r5 = scratch, r6 = this coeffIdx's entry in tab_ChromaCoeffV.
; PROCESS_CHROMA_SP_W4_4R / PROCESS_CHROMA_SP_W2_4R are defined elsewhere; the
; pointer arithmetic below presumes each advances r0 by four rows -- TODO confirm.
9187;-------------------------------------------------------------------------------------------------------------------
9188%macro FILTER_VER_CHROMA_SS_W6_H4 2
9189INIT_XMM sse4
9190cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
9191
9192 add r1d, r1d ; src stride: int16_t elements -> bytes
9193 add r3d, r3d ; dst stride: int16_t elements -> bytes
9194 sub r0, r1 ; start one source row above the output row
9195 shl r4d, 5 ; coeffIdx * 32: byte offset of the table entry
9196
9197%ifdef PIC
9198 lea r5, [tab_ChromaCoeffV]
9199 lea r6, [r5 + r4]
9200%else
9201 lea r6, [tab_ChromaCoeffV + r4]
9202%endif
9203
9204 mov r4d, %2/4
9205
9206.loopH:
9207 PROCESS_CHROMA_SP_W4_4R
9208
9209 psrad m0, 6 ; arithmetic shift of each 32-bit sum right by 6
9210 psrad m1, 6
9211 psrad m2, 6
9212 psrad m3, 6
9213
9214 packssdw m0, m1 ; rows 0 and 1 packed to int16_t with signed saturation
9215 packssdw m2, m3 ; rows 2 and 3
9216
9217 movlps [r2], m0 ; columns 0-3, output row 0
9218 movhps [r2 + r3], m0 ; columns 0-3, output row 1
9219 lea r5, [r2 + 2 * r3]
9220 movlps [r5], m2 ; columns 0-3, output row 2
9221 movhps [r5 + r3], m2 ; columns 0-3, output row 3
9222
9223 lea r5, [4 * r1 - 2 * 4]
9224 sub r0, r5 ; up four rows, right four int16_t columns (8 bytes)
9225 add r2, 2 * 4 ; dst right four int16_t columns
9226
9227 PROCESS_CHROMA_SP_W2_4R r6
9228
9229 psrad m0, 6
9230 psrad m2, 6
9231
9232 packssdw m0, m2 ; each dword of m0 now holds one output row (2 x int16_t)
9233
9234 movd [r2], m0 ; columns 4-5, output row 0
9235 pextrd [r2 + r3], m0, 1 ; columns 4-5, output row 1
9236 lea r2, [r2 + 2 * r3]
9237 pextrd [r2], m0, 2 ; columns 4-5, output row 2
9238 pextrd [r2 + r3], m0, 3 ; columns 4-5, output row 3
9239
9240 sub r0, 2 * 4 ; src back to column 0
9241 lea r2, [r2 + 2 * r3 - 2 * 4] ; dst: two more rows down (four in total), back to column 0
9242
9243 dec r4d
9244 jnz .loopH
9245
9246 RET
9247%endmacro
9248
; Generates interp_4tap_vert_ss_6x8 and interp_4tap_vert_ss_6x16.
9249FILTER_VER_CHROMA_SS_W6_H4 6, 8
9250
9251FILTER_VER_CHROMA_SS_W6_H4 6, 16
9252
9253
9254;----------------------------------------------------------------------------------------------------------------
9255; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 = width (instantiated with 8 only), %2 = height,
; processed two output rows per iteration.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both
; strides in bytes after doubling), r4d = coeffIdx, then the row-pair counter,
; r5 = this coeffIdx's entry in tab_ChromaCoeffV.
; PROCESS_CHROMA_SP_W8_2R is defined elsewhere; the loop never adjusts r0
; itself, so the macro presumably advances src by two rows -- TODO confirm.
9256;----------------------------------------------------------------------------------------------------------------
9257%macro FILTER_VER_CHROMA_SS_W8_H2 2
9258INIT_XMM sse2
9259cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
9260
9261 add r1d, r1d ; src stride: int16_t elements -> bytes
9262 add r3d, r3d ; dst stride: int16_t elements -> bytes
9263 sub r0, r1 ; start one source row above the output row
9264 shl r4d, 5 ; coeffIdx * 32: byte offset of the table entry
9265
9266%ifdef PIC
9267 lea r5, [tab_ChromaCoeffV]
9268 lea r5, [r5 + r4]
9269%else
9270 lea r5, [tab_ChromaCoeffV + r4]
9271%endif
9272
9273 mov r4d, %2/2
9274.loopH:
9275 PROCESS_CHROMA_SP_W8_2R
9276
9277 psrad m0, 6 ; arithmetic shift of each 32-bit sum right by 6
9278 psrad m1, 6
9279 psrad m2, 6
9280 psrad m3, 6
9281
9282 packssdw m0, m1 ; first output row: 8 x int16_t with signed saturation
9283 packssdw m2, m3 ; second output row
9284
9285 movu [r2], m0
9286 movu [r2 + r3], m2
9287
9288 lea r2, [r2 + 2 * r3] ; dst down two rows
9289
9290 dec r4d
9291 jnz .loopH
9292
9293 RET
9294%endmacro
9295
; One function is generated per width x height pair listed below.
9296FILTER_VER_CHROMA_SS_W8_H2 8, 2
9297FILTER_VER_CHROMA_SS_W8_H2 8, 4
9298FILTER_VER_CHROMA_SS_W8_H2 8, 6
9299FILTER_VER_CHROMA_SS_W8_H2 8, 8
9300FILTER_VER_CHROMA_SS_W8_H2 8, 16
9301FILTER_VER_CHROMA_SS_W8_H2 8, 32
9302
9303FILTER_VER_CHROMA_SS_W8_H2 8, 12
9304FILTER_VER_CHROMA_SS_W8_H2 8, 64
9305
9306;-----------------------------------------------------------------------------------------------------------------
9307; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro parameters: %1 = width (processed in 4-column tiles), %2 = height
; (processed in 4-row tiles).
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both
; strides in bytes after doubling), r4d = coeffIdx, then the tile-column
; counter, r5 = scratch, r6 = this coeffIdx's 64-byte entry in tab_LumaCoeffV
; (four 16-byte vectors, one per pair of interleaved source rows),
; [rsp] = tile-row counter.
; m0..m3 accumulate output rows 1..4 of the tile; m4/m5 alternate as the newest
; interleaved row pair; m6 is scratch.
; In the comments below "[a b]" means source rows a and b interleaved, and
; "[a+b+...]" the running sum over those rows.
9308;-----------------------------------------------------------------------------------------------------------------
9309%macro FILTER_VER_LUMA_SS 2
9310INIT_XMM sse2
9311cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
9312
9313 add r1d, r1d ; src stride: int16_t elements -> bytes
9314 add r3d, r3d ; dst stride: int16_t elements -> bytes
9315 lea r5, [3 * r1]
9316 sub r0, r5 ; start three source rows above the output row
9317 shl r4d, 6 ; coeffIdx * 64: byte offset of the table entry
9318
9319%ifdef PIC
9320 lea r5, [tab_LumaCoeffV]
9321 lea r6, [r5 + r4]
9322%else
9323 lea r6, [tab_LumaCoeffV + r4]
9324%endif
9325
9326 mov dword [rsp], %2/4 ; outer counter lives on the stack; r4d is reused for the inner loop
9327.loopH:
9328 mov r4d, (%1/4)
9329.loopW:
9330 movq m0, [r0]
9331 movq m1, [r0 + r1]
9332 punpcklwd m0, m1 ;m0=[0 1]
9333 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
9334
9335 lea r0, [r0 + 2 * r1]
9336 movq m4, [r0]
9337 punpcklwd m1, m4 ;m1=[1 2]
9338 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
9339
9340 movq m5, [r0 + r1]
9341 punpcklwd m4, m5 ;m4=[2 3]
9342 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
9343 pmaddwd m4, [r6 + 1 * 16]
9344 paddd m0, m4 ;m0=[0+1+2+3] Row1
9345
9346 lea r0, [r0 + 2 * r1]
9347 movq m4, [r0]
9348 punpcklwd m5, m4 ;m5=[3 4]
9349 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
9350 pmaddwd m5, [r6 + 1 * 16]
9351 paddd m1, m5 ;m1 = [1+2+3+4] Row2
9352
9353 movq m5, [r0 + r1]
9354 punpcklwd m4, m5 ;m4=[4 5]
9355 pmaddwd m6, m4, [r6 + 1 * 16]
9356 paddd m2, m6 ;m2=[2+3+4+5] Row3
9357 pmaddwd m4, [r6 + 2 * 16]
9358 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
9359
9360 lea r0, [r0 + 2 * r1]
9361 movq m4, [r0]
9362 punpcklwd m5, m4 ;m5=[5 6]
9363 pmaddwd m6, m5, [r6 + 1 * 16]
9364 paddd m3, m6 ;m3=[3+4+5+6] Row4
9365 pmaddwd m5, [r6 + 2 * 16]
9366 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
9367
9368 movq m5, [r0 + r1]
9369 punpcklwd m4, m5 ;m4=[6 7]
9370 pmaddwd m6, m4, [r6 + 2 * 16]
9371 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
9372 pmaddwd m4, [r6 + 3 * 16]
9373 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
9374 psrad m0, 6
9375
9376 lea r0, [r0 + 2 * r1] ;r0 is now eight rows below the tile's first source row
9377 movq m4, [r0]
9378 punpcklwd m5, m4 ;m5=[7 8]
9379 pmaddwd m6, m5, [r6 + 2 * 16]
9380 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
9381 pmaddwd m5, [r6 + 3 * 16]
9382 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
9383 psrad m1, 6
9384
9385 packssdw m0, m1 ;low half = Row1, high half = Row2 (int16_t, signed saturation)
9386
9387 movlps [r2], m0
9388 movhps [r2 + r3], m0
9389
9390 movq m5, [r0 + r1]
9391 punpcklwd m4, m5 ;m4=[8 9]
9392 pmaddwd m4, [r6 + 3 * 16]
9393 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
9394 psrad m2, 6
9395
9396 movq m4, [r0 + 2 * r1]
9397 punpcklwd m5, m4 ;m5=[9 10]
9398 pmaddwd m5, [r6 + 3 * 16]
9399 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
9400 psrad m3, 6
9401
9402 packssdw m2, m3 ;low half = Row3, high half = Row4
9403
9404 movlps [r2 + 2 * r3], m2
9405 lea r5, [3 * r3]
9406 movhps [r2 + r5], m2
9407
9408 lea r5, [8 * r1 - 2 * 4]
9409 sub r0, r5 ;up eight rows, right four int16_t columns (8 bytes)
9410 add r2, 2 * 4 ;dst right four int16_t columns
9411
9412 dec r4d
9413 jnz .loopW
9414
9415 lea r0, [r0 + 4 * r1 - 2 * %1] ;down four rows, back to column 0
9416 lea r2, [r2 + 4 * r3 - 2 * %1] ;down four rows, back to column 0
9417
9418 dec dword [rsp]
9419 jnz .loopH
9420
9421 RET
9422%endmacro
9423
; One function is generated per width x height pair listed below.
9424 FILTER_VER_LUMA_SS 4, 4
9425 FILTER_VER_LUMA_SS 8, 8
9426 FILTER_VER_LUMA_SS 8, 4
9427 FILTER_VER_LUMA_SS 4, 8
9428 FILTER_VER_LUMA_SS 16, 16
9429 FILTER_VER_LUMA_SS 16, 8
9430 FILTER_VER_LUMA_SS 8, 16
9431 FILTER_VER_LUMA_SS 16, 12
9432 FILTER_VER_LUMA_SS 12, 16
9433 FILTER_VER_LUMA_SS 16, 4
9434 FILTER_VER_LUMA_SS 4, 16
9435 FILTER_VER_LUMA_SS 32, 32
9436 FILTER_VER_LUMA_SS 32, 16
9437 FILTER_VER_LUMA_SS 16, 32
9438 FILTER_VER_LUMA_SS 32, 24
9439 FILTER_VER_LUMA_SS 24, 32
9440 FILTER_VER_LUMA_SS 32, 8
9441 FILTER_VER_LUMA_SS 8, 32
9442 FILTER_VER_LUMA_SS 64, 64
9443 FILTER_VER_LUMA_SS 64, 32
9444 FILTER_VER_LUMA_SS 32, 64
9445 FILTER_VER_LUMA_SS 64, 48
9446 FILTER_VER_LUMA_SS 48, 64
9447 FILTER_VER_LUMA_SS 64, 16
9448 FILTER_VER_LUMA_SS 16, 64