Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / ipfilter8.asm
CommitLineData
72b9787e
JB
1;*****************************************************************************
2;* Copyright (C) 2013 x265 project
3;*
4;* Authors: Min Chen <chenm003@163.com>
5;* Nabajit Deka <nabajit@multicorewareinc.com>
6;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7;*
8;* This program is free software; you can redistribute it and/or modify
9;* it under the terms of the GNU General Public License as published by
10;* the Free Software Foundation; either version 2 of the License, or
11;* (at your option) any later version.
12;*
13;* This program is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16;* GNU General Public License for more details.
17;*
18;* You should have received a copy of the GNU General Public License
19;* along with this program; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21;*
22;* This program is also available under a commercial proprietary license.
23;* For more information, contact us at license @ x265.com.
24;*****************************************************************************/
25
26%include "x86inc.asm"
27%include "x86util.asm"
28
; Read-only constants (section aligned to 32 bytes).
;
; tab_Tm: three 16-byte pshufb masks. Each mask gathers four overlapping
;         4-byte windows; the windows start at byte 0..3 in row 0, 4..7 in
;         row 1 and 8..11 in row 2.
29SECTION_RODATA 32
30tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
31 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
32 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
33
; tab_Lm: four 16-byte pshufb masks. Each mask gathers two overlapping
;         8-byte windows; window start offsets are 0/1, 2/3, 4/5 and 6/7.
34ALIGN 32
35tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
36 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
37 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
38 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
39
; tab_Vm: byte pair (0,1) repeated eight times, then byte pair (2,3) repeated
;         eight times. Not referenced in the part of the file visible here.
40tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
41 db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
42
; tab_Cm: byte order 0,2,1,3 repeated; the vertical 4-tap filters apply it with
;         pshufb to the four taps loaded from tab_ChromaCoeff.
43tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
44
; tab_c_512:    eight words of 512, used as the pmulhrsw multiplier
;               (pmulhrsw by 512 yields (x + 32) >> 6).
; tab_c_526336: four dwords of 8192*64 + 2048, added before the ">> 12" in
;               FILTER_HV8_END. NOTE(review): presumably 2048 is the rounding
;               term and 8192*64 undoes the pw_2000 bias subtracted in the
;               horizontal pass -- confirm against the value of pw_2000.
45tab_c_512: times 8 dw 512
46tab_c_526336: times 4 dd 8192*64+2048
47
; tab_ChromaCoeff: eight 4-tap filters stored as signed bytes, one 4-byte row
; per filter. Functions index it as [tab_ChromaCoeff + coeffIdx * 4].
; Every row sums to 64.
48tab_ChromaCoeff: db 0, 64, 0, 0
49 db -2, 58, 10, -2
50 db -4, 54, 16, -2
51 db -6, 46, 28, -4
52 db -4, 36, 36, -4
53 db -4, 28, 46, -6
54 db -2, 16, 54, -4
55 db -2, 10, 58, -2
56
; tab_ChromaCoeffV: the same eight 4-tap filters as tab_ChromaCoeff, widened to
; words. Each filter occupies 32 bytes: taps (0,1) as a word pair repeated four
; times, followed by taps (2,3) repeated four times.
; Not referenced in the part of the file visible here.
57tab_ChromaCoeffV: times 4 dw 0, 64
58 times 4 dw 0, 0
59
60 times 4 dw -2, 58
61 times 4 dw 10, -2
62
63 times 4 dw -4, 54
64 times 4 dw 16, -2
65
66 times 4 dw -6, 46
67 times 4 dw 28, -4
68
69 times 4 dw -4, 36
70 times 4 dw 36, -4
71
72 times 4 dw -4, 28
73 times 4 dw 46, -6
74
75 times 4 dw -2, 16
76 times 4 dw 54, -4
77
78 times 4 dw -2, 10
79 times 4 dw 58, -2
80
; tab_LumaCoeff: four 8-tap filters stored as signed bytes, one 8-byte row per
; filter. Functions index it as [tab_LumaCoeff + coeffIdx * 8].
; Every row sums to 64.
81tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
82 db -1, 4, -10, 58, 17, -5, 1, 0
83 db -1, 4, -11, 40, 40, -11, 4, -1
84 db 0, 1, -5, 17, 58, -10, 4, -1
85
; tab_LumaCoeffV: the four 8-tap filters of tab_LumaCoeff widened to words.
; Each filter occupies 64 bytes: four 16-byte rows, each row holding one tap
; pair repeated four times (layout suited to pmaddwd on interleaved rows).
; interp_8tap_hv_pp_8x8 selects a filter with "idxY << 6".
86tab_LumaCoeffV: times 4 dw 0, 0
87 times 4 dw 0, 64
88 times 4 dw 0, 0
89 times 4 dw 0, 0
90
91 times 4 dw -1, 4
92 times 4 dw -10, 58
93 times 4 dw 17, -5
94 times 4 dw 1, 0
95
96 times 4 dw -1, 4
97 times 4 dw -11, 40
98 times 4 dw 40, -11
99 times 4 dw 4, -1
100
101 times 4 dw 0, 1
102 times 4 dw -5, 17
103 times 4 dw 58, -10
104 times 4 dw 4, -1
105
; tab_LumaCoeffVer: the four 8-tap filters of tab_LumaCoeff kept as bytes.
; Each filter occupies 64 bytes: four 16-byte rows, each row holding one tap
; pair repeated eight times.
; Not referenced in the part of the file visible here.
106tab_LumaCoeffVer: times 8 db 0, 0
107 times 8 db 0, 64
108 times 8 db 0, 0
109 times 8 db 0, 0
110
111 times 8 db -1, 4
112 times 8 db -10, 58
113 times 8 db 17, -5
114 times 8 db 1, 0
115
116 times 8 db -1, 4
117 times 8 db -11, 40
118 times 8 db 40, -11
119 times 8 db 4, -1
120
121 times 8 db 0, 1
122 times 8 db -5, 17
123 times 8 db 58, -10
124 times 8 db 4, -1
125
; tab_c_128:    sixteen bytes of 0x80.
; tab_c_64_n64: the byte pair (64, -64) repeated eight times.
; Neither constant is referenced in the part of the file visible here.
126tab_c_128: times 16 db 0x80
127tab_c_64_n64: times 8 db 64, -64
128
129
130SECTION .text
131
132cextern idct4_shuf1
133cextern pw_1
134cextern pw_512
135cextern pw_2000
136
; FILTER_H4_w2_2 tmpA, tmpB, round
; Horizontal 4-tap filter for two consecutive rows of a 2-pixel-wide block.
; Relies on names defined by the expanding function: srcq, srcstrideq, dstq,
; dststrideq, Tm0 (pshufb mask) and coef2 (broadcast taps).
; Clobbers both temporaries and r4; does not advance srcq/dstq.
137%macro FILTER_H4_w2_2 3
138 movh %2, [srcq - 1]   ; row 0: 8 bytes starting one pixel left of src
139 pshufb %2, %2, Tm0   ; gather overlapping 4-byte windows
140 movh %1, [srcq + srcstrideq - 1]   ; row 1, same treatment
141 pshufb %1, %1, Tm0
142 punpcklqdq %2, %1   ; low qword = row 0 windows 0-1, high qword = row 1
143 pmaddubsw %2, coef2   ; pairwise tap products
144 phaddw %2, %2   ; finish the 4-tap sums: words r0p0 r0p1 r1p0 r1p1
145 pmulhrsw %2, %3   ; rounding scale by the third argument
146 packuswb %2, %2   ; saturate to unsigned bytes
147 movd r4, %2
148 mov [dstq], r4w   ; two pixels of row 0
149 shr r4, 16
150 mov [dstq + dststrideq], r4w   ; two pixels of row 1
151%endmacro
152
153;-----------------------------------------------------------------------------
154; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
155;-----------------------------------------------------------------------------
; 2x4 block: FILTER_H4_w2_2 expanded twice, each expansion handling two rows.
; Register aliases: coef2 = taps, Tm0 = shuffle mask, t2 = rounding constant,
; t0/t1 = temporaries. r4 holds coeffIdx until the macro reuses it as scratch.
156INIT_XMM sse4
157cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
158%define coef2 m4
159%define Tm0 m3
160%define t2 m2
161%define t1 m1
162%define t0 m0
163
164mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
165
166%ifdef PIC
167lea r5, [tab_ChromaCoeff]
168movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
169%else
170movd coef2, [tab_ChromaCoeff + r4 * 4]
171%endif
172
173pshufd coef2, coef2, 0   ; replicate the four taps into every dword
174mova t2, [tab_c_512]
175mova Tm0, [tab_Tm]
176
177%rep 2
178FILTER_H4_w2_2 t0, t1, t2
179lea srcq, [srcq + srcstrideq * 2]   ; advance two rows
180lea dstq, [dstq + dststrideq * 2]
181%endrep
182
183RET
184
185;-----------------------------------------------------------------------------
186; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
187;-----------------------------------------------------------------------------
; 2x8 block: FILTER_H4_w2_2 expanded four times, each expansion handling two rows.
; Register aliases: coef2 = taps, Tm0 = shuffle mask, t2 = rounding constant,
; t0/t1 = temporaries. r4 holds coeffIdx until the macro reuses it as scratch.
188INIT_XMM sse4
189cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
190%define coef2 m4
191%define Tm0 m3
192%define t2 m2
193%define t1 m1
194%define t0 m0
195
196mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
197
198%ifdef PIC
199lea r5, [tab_ChromaCoeff]
200movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
201%else
202movd coef2, [tab_ChromaCoeff + r4 * 4]
203%endif
204
205pshufd coef2, coef2, 0   ; replicate the four taps into every dword
206mova t2, [tab_c_512]
207mova Tm0, [tab_Tm]
208
209%rep 4
210FILTER_H4_w2_2 t0, t1, t2
211lea srcq, [srcq + srcstrideq * 2]   ; advance two rows
212lea dstq, [dstq + dststrideq * 2]
213%endrep
214
215RET
216
217;-----------------------------------------------------------------------------
218; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
219;-----------------------------------------------------------------------------
; 2x16 block: a run-time loop of 16/2 iterations, two rows per iteration.
; r5 is the table base during setup (PIC only) and then the loop counter.
220INIT_XMM sse4
221cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
222%define coef2 m4
223%define Tm0 m3
224%define t2 m2
225%define t1 m1
226%define t0 m0
227
228mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
229
230%ifdef PIC
231lea r5, [tab_ChromaCoeff]
232movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
233%else
234movd coef2, [tab_ChromaCoeff + r4 * 4]
235%endif
236
237pshufd coef2, coef2, 0   ; replicate the four taps into every dword
238mova t2, [tab_c_512]
239mova Tm0, [tab_Tm]
240
241mov r5d, 16/2   ; iteration count: two rows per pass
242
243.loop:
244FILTER_H4_w2_2 t0, t1, t2
245lea srcq, [srcq + srcstrideq * 2]
246lea dstq, [dstq + dststrideq * 2]
247dec r5d
248jnz .loop
249
250RET
251
; FILTER_H4_w4_2 tmpA, tmpB, round
; Horizontal 4-tap filter for two consecutive rows of a 4-pixel-wide block.
; Relies on srcq, srcstrideq, dstq, dststrideq, Tm0 and coef2 defined by the
; expanding function. Clobbers both temporaries; does not advance srcq/dstq.
252%macro FILTER_H4_w4_2 3
253 movh %2, [srcq - 1]   ; row 0: 8 bytes starting one pixel left of src
254 pshufb %2, %2, Tm0   ; four overlapping 4-byte windows
255 pmaddubsw %2, coef2
256 movh %1, [srcq + srcstrideq - 1]   ; row 1
257 pshufb %1, %1, Tm0
258 pmaddubsw %1, coef2
259 phaddw %2, %1   ; low 4 words = row 0 sums, high 4 words = row 1 sums
260 pmulhrsw %2, %3   ; rounding scale by the third argument
261 packuswb %2, %2   ; saturate to unsigned bytes
262 movd [dstq], %2   ; four pixels of row 0
263 palignr %2, %2, 4   ; bring the row 1 bytes down to the low dword
264 movd [dstq + dststrideq], %2
265%endmacro
266
267;-----------------------------------------------------------------------------
268; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
269;-----------------------------------------------------------------------------
; 4x2 block: a single FILTER_H4_w4_2 expansion covers both rows.
; Register aliases: coef2 = taps, Tm0 = shuffle mask, t2 = rounding constant,
; t0/t1 = temporaries.
270INIT_XMM sse4
271cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
272%define coef2 m4
273%define Tm0 m3
274%define t2 m2
275%define t1 m1
276%define t0 m0
277
278mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
279
280%ifdef PIC
281lea r5, [tab_ChromaCoeff]
282movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
283%else
284movd coef2, [tab_ChromaCoeff + r4 * 4]
285%endif
286
287pshufd coef2, coef2, 0   ; replicate the four taps into every dword
288mova t2, [tab_c_512]
289mova Tm0, [tab_Tm]
290
291FILTER_H4_w4_2 t0, t1, t2
292
293RET
294
295;-----------------------------------------------------------------------------
296; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
297;-----------------------------------------------------------------------------
; 4x4 block: FILTER_H4_w4_2 expanded twice, each expansion handling two rows.
; Register aliases: coef2 = taps, Tm0 = shuffle mask, t2 = rounding constant,
; t0/t1 = temporaries.
298INIT_XMM sse4
299cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
300%define coef2 m4
301%define Tm0 m3
302%define t2 m2
303%define t1 m1
304%define t0 m0
305
306mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
307
308%ifdef PIC
309lea r5, [tab_ChromaCoeff]
310movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
311%else
312movd coef2, [tab_ChromaCoeff + r4 * 4]
313%endif
314
315pshufd coef2, coef2, 0   ; replicate the four taps into every dword
316mova t2, [tab_c_512]
317mova Tm0, [tab_Tm]
318
319%rep 2
320FILTER_H4_w4_2 t0, t1, t2
321lea srcq, [srcq + srcstrideq * 2]   ; advance two rows
322lea dstq, [dstq + dststrideq * 2]
323%endrep
324
325RET
326
327;-----------------------------------------------------------------------------
328; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
329;-----------------------------------------------------------------------------
; 4x8 block: FILTER_H4_w4_2 expanded four times, each expansion handling two rows.
; Register aliases: coef2 = taps, Tm0 = shuffle mask, t2 = rounding constant,
; t0/t1 = temporaries.
330INIT_XMM sse4
331cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
332%define coef2 m4
333%define Tm0 m3
334%define t2 m2
335%define t1 m1
336%define t0 m0
337
338mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
339
340%ifdef PIC
341lea r5, [tab_ChromaCoeff]
342movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
343%else
344movd coef2, [tab_ChromaCoeff + r4 * 4]
345%endif
346
347pshufd coef2, coef2, 0   ; replicate the four taps into every dword
348mova t2, [tab_c_512]
349mova Tm0, [tab_Tm]
350
351%rep 4
352FILTER_H4_w4_2 t0, t1, t2
353lea srcq, [srcq + srcstrideq * 2]   ; advance two rows
354lea dstq, [dstq + dststrideq * 2]
355%endrep
356
357RET
358
359;-----------------------------------------------------------------------------
360; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
361;-----------------------------------------------------------------------------
; 4x16 block: FILTER_H4_w4_2 expanded eight times, each expansion handling two rows.
; Register aliases: coef2 = taps, Tm0 = shuffle mask, t2 = rounding constant,
; t0/t1 = temporaries.
362INIT_XMM sse4
363cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
364%define coef2 m4
365%define Tm0 m3
366%define t2 m2
367%define t1 m1
368%define t0 m0
369
370mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
371
372%ifdef PIC
373lea r5, [tab_ChromaCoeff]
374movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
375%else
376movd coef2, [tab_ChromaCoeff + r4 * 4]
377%endif
378
379pshufd coef2, coef2, 0   ; replicate the four taps into every dword
380mova t2, [tab_c_512]
381mova Tm0, [tab_Tm]
382
383%rep 8
384FILTER_H4_w4_2 t0, t1, t2
385lea srcq, [srcq + srcstrideq * 2]   ; advance two rows
386lea dstq, [dstq + dststrideq * 2]
387%endrep
388
389RET
390
391;-----------------------------------------------------------------------------
392; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
393;-----------------------------------------------------------------------------
; 4x32 block: a run-time loop of 32/2 iterations, two rows per iteration.
; r5 is the table base during setup (PIC only) and then the loop counter.
394INIT_XMM sse4
395cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
396%define coef2 m4
397%define Tm0 m3
398%define t2 m2
399%define t1 m1
400%define t0 m0
401
402mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
403
404%ifdef PIC
405lea r5, [tab_ChromaCoeff]
406movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
407%else
408movd coef2, [tab_ChromaCoeff + r4 * 4]
409%endif
410
411pshufd coef2, coef2, 0   ; replicate the four taps into every dword
412mova t2, [tab_c_512]
413mova Tm0, [tab_Tm]
414
415mov r5d, 32/2   ; iteration count: two rows per pass
416
417.loop:
418FILTER_H4_w4_2 t0, t1, t2
419lea srcq, [srcq + srcstrideq * 2]
420lea dstq, [dstq + dststrideq * 2]
421dec r5d
422jnz .loop
423
424RET
425
425
426
; FILTER_H4_w6 tmpA, tmpB, round
; Horizontal 4-tap filter for one row, storing 6 output pixels.
; Relies on srcq, dstq, Tm0, Tm1 and coef2 defined by the expanding function.
; Eight results are computed; only the first six bytes are written.
427%macro FILTER_H4_w6 3
428 movu %1, [srcq - 1]   ; 16 source bytes starting one pixel left of src
429 pshufb %2, %1, Tm0   ; windows for pixels 0-3
430 pmaddubsw %2, coef2
431 pshufb %1, %1, Tm1   ; windows for pixels 4-7
432 pmaddubsw %1, coef2
433 phaddw %2, %1   ; eight 4-tap sums as words
434 pmulhrsw %2, %3   ; rounding scale by the third argument
435 packuswb %2, %2   ; saturate to unsigned bytes
436 movd [dstq], %2   ; pixels 0-3
437 pextrw [dstq + 4], %2, 2   ; pixels 4-5
438%endmacro
439
; FILTER_H4_w8 tmpA, tmpB, round
; Horizontal 4-tap filter for one row, storing 8 output pixels.
; Relies on srcq, dstq, Tm0, Tm1 and coef2 defined by the expanding function.
440%macro FILTER_H4_w8 3
441 movu %1, [srcq - 1]   ; 16 source bytes starting one pixel left of src
442 pshufb %2, %1, Tm0   ; windows for pixels 0-3
443 pmaddubsw %2, coef2
444 pshufb %1, %1, Tm1   ; windows for pixels 4-7
445 pmaddubsw %1, coef2
446 phaddw %2, %1   ; eight 4-tap sums as words
447 pmulhrsw %2, %3   ; rounding scale by the third argument
448 packuswb %2, %2   ; saturate to unsigned bytes
449 movh [dstq], %2   ; pixels 0-7
450%endmacro
451
; FILTER_H4_w12 tmpA, tmpB, round
; Horizontal 4-tap filter for one row, storing 12 output pixels
; (eight from the first load, four from a second load at offset 8).
; Relies on srcq, dstq, Tm0, Tm1 and coef2 defined by the expanding function.
452%macro FILTER_H4_w12 3
453 movu %1, [srcq - 1]
454 pshufb %2, %1, Tm0   ; windows for pixels 0-3
455 pmaddubsw %2, coef2
456 pshufb %1, %1, Tm1   ; windows for pixels 4-7
457 pmaddubsw %1, coef2
458 phaddw %2, %1   ; sums for pixels 0-7
459 pmulhrsw %2, %3
460 movu %1, [srcq - 1 + 8]
461 pshufb %1, %1, Tm0   ; windows for pixels 8-11
462 pmaddubsw %1, coef2
463 phaddw %1, %1   ; sums for pixels 8-11 (duplicated in both halves)
464 pmulhrsw %1, %3
465 packuswb %2, %1   ; bytes 0-7 = pixels 0-7, bytes 8-11 = pixels 8-11
466 movh [dstq], %2
467 pextrd [dstq + 8], %2, 2
468%endmacro
469
; FILTER_H4_w16 tmpLoad, tmpLo, round, tmpHi
; Horizontal 4-tap filter for one row, storing 16 output pixels.
; Relies on srcq, dstq, Tm0, Tm1 and coef2 defined by the expanding function.
470%macro FILTER_H4_w16 4
471 movu %1, [srcq - 1]
472 pshufb %2, %1, Tm0
473 pmaddubsw %2, coef2
474 pshufb %1, %1, Tm1
475 pmaddubsw %1, coef2
476 phaddw %2, %1   ; sums for pixels 0-7
477 movu %1, [srcq - 1 + 8]
478 pshufb %4, %1, Tm0
479 pmaddubsw %4, coef2
480 pshufb %1, %1, Tm1
481 pmaddubsw %1, coef2
482 phaddw %4, %1   ; sums for pixels 8-15
483 pmulhrsw %2, %3   ; rounding scale by the third argument
484 pmulhrsw %4, %3
485 packuswb %2, %4   ; saturate both halves to 16 unsigned bytes
486 movu [dstq], %2
487%endmacro
488
; FILTER_H4_w24 tmpLoad, tmpLo, round, tmpHi
; Horizontal 4-tap filter for one row, storing 24 output pixels:
; a 16-pixel group followed by an 8-pixel group at offset 16.
; Relies on srcq, dstq, Tm0, Tm1 and coef2 defined by the expanding function.
489%macro FILTER_H4_w24 4
490 movu %1, [srcq - 1]
491 pshufb %2, %1, Tm0
492 pmaddubsw %2, coef2
493 pshufb %1, %1, Tm1
494 pmaddubsw %1, coef2
495 phaddw %2, %1   ; sums for pixels 0-7
496 movu %1, [srcq - 1 + 8]
497 pshufb %4, %1, Tm0
498 pmaddubsw %4, coef2
499 pshufb %1, %1, Tm1
500 pmaddubsw %1, coef2
501 phaddw %4, %1   ; sums for pixels 8-15
502 pmulhrsw %2, %3
503 pmulhrsw %4, %3
504 packuswb %2, %4
505 movu [dstq], %2   ; pixels 0-15
506 movu %1, [srcq - 1 + 16]
507 pshufb %2, %1, Tm0
508 pmaddubsw %2, coef2
509 pshufb %1, %1, Tm1
510 pmaddubsw %1, coef2
511 phaddw %2, %1   ; sums for pixels 16-23
512 pmulhrsw %2, %3
513 packuswb %2, %2
514 movh [dstq + 16], %2   ; pixels 16-23
515%endmacro
516
; FILTER_H4_w32 tmpLoad, tmpLo, round, tmpHi
; Horizontal 4-tap filter for one row, storing 32 output pixels as two
; 16-pixel groups (offsets 0 and 16).
; Relies on srcq, dstq, Tm0, Tm1 and coef2 defined by the expanding function.
517%macro FILTER_H4_w32 4
518 movu %1, [srcq - 1]
519 pshufb %2, %1, Tm0
520 pmaddubsw %2, coef2
521 pshufb %1, %1, Tm1
522 pmaddubsw %1, coef2
523 phaddw %2, %1   ; sums for pixels 0-7
524 movu %1, [srcq - 1 + 8]
525 pshufb %4, %1, Tm0
526 pmaddubsw %4, coef2
527 pshufb %1, %1, Tm1
528 pmaddubsw %1, coef2
529 phaddw %4, %1   ; sums for pixels 8-15
530 pmulhrsw %2, %3
531 pmulhrsw %4, %3
532 packuswb %2, %4
533 movu [dstq], %2   ; pixels 0-15
534 movu %1, [srcq - 1 + 16]
535 pshufb %2, %1, Tm0
536 pmaddubsw %2, coef2
537 pshufb %1, %1, Tm1
538 pmaddubsw %1, coef2
539 phaddw %2, %1   ; sums for pixels 16-23
540 movu %1, [srcq - 1 + 24]
541 pshufb %4, %1, Tm0
542 pmaddubsw %4, coef2
543 pshufb %1, %1, Tm1
544 pmaddubsw %1, coef2
545 phaddw %4, %1   ; sums for pixels 24-31
546 pmulhrsw %2, %3
547 pmulhrsw %4, %3
548 packuswb %2, %4
549 movu [dstq + 16], %2   ; pixels 16-31
550%endmacro
551
; FILTER_H4_w16o tmpLoad, tmpLo, round, tmpHi, offset
; Same computation as FILTER_H4_w16, but source and destination are both
; displaced by the byte offset given as the fifth argument.
552%macro FILTER_H4_w16o 5
553 movu %1, [srcq + %5 - 1]
554 pshufb %2, %1, Tm0
555 pmaddubsw %2, coef2
556 pshufb %1, %1, Tm1
557 pmaddubsw %1, coef2
558 phaddw %2, %1   ; sums for the first 8 pixels of the group
559 movu %1, [srcq + %5 - 1 + 8]
560 pshufb %4, %1, Tm0
561 pmaddubsw %4, coef2
562 pshufb %1, %1, Tm1
563 pmaddubsw %1, coef2
564 phaddw %4, %1   ; sums for the last 8 pixels of the group
565 pmulhrsw %2, %3
566 pmulhrsw %4, %3
567 packuswb %2, %4
568 movu [dstq + %5], %2   ; 16 output pixels at the given offset
569%endmacro
570
; FILTER_H4_w48 tmpLoad, tmpLo, round, tmpHi
; One 48-pixel row built from three 16-pixel groups at offsets 0, 16 and 32.
571%macro FILTER_H4_w48 4
572 FILTER_H4_w16o %1, %2, %3, %4, 0
573 FILTER_H4_w16o %1, %2, %3, %4, 16
574 FILTER_H4_w16o %1, %2, %3, %4, 32
575%endmacro
576
; FILTER_H4_w64 tmpLoad, tmpLo, round, tmpHi
; One 64-pixel row built from four 16-pixel groups at offsets 0, 16, 32 and 48.
577%macro FILTER_H4_w64 4
578 FILTER_H4_w16o %1, %2, %3, %4, 0
579 FILTER_H4_w16o %1, %2, %3, %4, 16
580 FILTER_H4_w16o %1, %2, %3, %4, 32
581 FILTER_H4_w16o %1, %2, %3, %4, 48
582%endmacro
583
584;-----------------------------------------------------------------------------
585; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
586;-----------------------------------------------------------------------------
; IPFILTER_CHROMA width, height
; Emits interp_4tap_horiz_pp_<width>x<height> for the narrow widths whose row
; macro takes three arguments (FILTER_H4_w6 / w8 / w12). One row is filtered
; per loop iteration; r5 is the PIC table base first and the row counter after.
587%macro IPFILTER_CHROMA 2
588INIT_XMM sse4
589cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
590%define coef2 m5
591%define Tm0 m4
592%define Tm1 m3
593%define t2 m2
594%define t1 m1
595%define t0 m0
596
597mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
598
599%ifdef PIC
600lea r5, [tab_ChromaCoeff]
601movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
602%else
603movd coef2, [tab_ChromaCoeff + r4 * 4]
604%endif
605
606mov r5d, %2   ; row counter = block height
607
608pshufd coef2, coef2, 0   ; replicate the four taps into every dword
609mova t2, [tab_c_512]
610mova Tm0, [tab_Tm]   ; windows starting at bytes 0-3
611mova Tm1, [tab_Tm + 16]   ; windows starting at bytes 4-7
612
613.loop:
614FILTER_H4_w%1 t0, t1, t2
615add srcq, srcstrideq
616add dstq, dststrideq
617
618dec r5d
619jnz .loop
620
621RET
622%endmacro
623
624
; Instantiations for widths 6, 8 and 12.
625IPFILTER_CHROMA 6, 8
626IPFILTER_CHROMA 8, 2
627IPFILTER_CHROMA 8, 4
628IPFILTER_CHROMA 8, 6
629IPFILTER_CHROMA 8, 8
630IPFILTER_CHROMA 8, 16
631IPFILTER_CHROMA 8, 32
632IPFILTER_CHROMA 12, 16
633
634IPFILTER_CHROMA 6, 16
635IPFILTER_CHROMA 8, 12
636IPFILTER_CHROMA 8, 64
637IPFILTER_CHROMA 12, 32
638
639;-----------------------------------------------------------------------------
640; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
641;-----------------------------------------------------------------------------
; IPFILTER_CHROMA_W width, height
; Emits interp_4tap_horiz_pp_<width>x<height> for the wide widths whose row
; macro takes four arguments (FILTER_H4_w16 / w24 / w32 / w48 / w64), hence the
; extra temporary t3 and seven vector registers. One row is filtered per loop
; iteration; r5 is the PIC table base first and the row counter after.
642%macro IPFILTER_CHROMA_W 2
643INIT_XMM sse4
644cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
645%define coef2 m6
646%define Tm0 m5
647%define Tm1 m4
648%define t3 m3
649%define t2 m2
650%define t1 m1
651%define t0 m0
652
653mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
654
655%ifdef PIC
656lea r5, [tab_ChromaCoeff]
657movd coef2, [r5 + r4 * 4]   ; four taps of the selected filter
658%else
659movd coef2, [tab_ChromaCoeff + r4 * 4]
660%endif
661
662mov r5d, %2   ; row counter = block height
663
664pshufd coef2, coef2, 0   ; replicate the four taps into every dword
665mova t2, [tab_c_512]
666mova Tm0, [tab_Tm]   ; windows starting at bytes 0-3
667mova Tm1, [tab_Tm + 16]   ; windows starting at bytes 4-7
668
669.loop:
670FILTER_H4_w%1 t0, t1, t2, t3
671add srcq, srcstrideq
672add dstq, dststrideq
673
674dec r5d
675jnz .loop
676
677RET
678%endmacro
679
; Instantiations for widths 16, 24, 32, 48 and 64.
680IPFILTER_CHROMA_W 16, 4
681IPFILTER_CHROMA_W 16, 8
682IPFILTER_CHROMA_W 16, 12
683IPFILTER_CHROMA_W 16, 16
684IPFILTER_CHROMA_W 16, 32
685IPFILTER_CHROMA_W 32, 8
686IPFILTER_CHROMA_W 32, 16
687IPFILTER_CHROMA_W 32, 24
688IPFILTER_CHROMA_W 24, 32
689IPFILTER_CHROMA_W 32, 32
690
691IPFILTER_CHROMA_W 16, 24
692IPFILTER_CHROMA_W 16, 64
693IPFILTER_CHROMA_W 32, 48
694IPFILTER_CHROMA_W 24, 64
695IPFILTER_CHROMA_W 32, 64
696
697IPFILTER_CHROMA_W 64, 64
698IPFILTER_CHROMA_W 64, 32
699IPFILTER_CHROMA_W 64, 48
700IPFILTER_CHROMA_W 48, 64
701IPFILTER_CHROMA_W 64, 16
702
703
; FILTER_H8_W8: horizontal 8-tap filter producing 8 results from one 16-byte load.
; The coefficient register is expected to hold the 8 taps in both qwords.
; With 7 arguments the eight unscaled word sums are left in the second
; argument (t1) and the c512 argument is not used. With the optional 8th
; argument the sums are rounded with pmulhrsw, packed to bytes and 8 bytes are
; stored to dst. All four temporaries are clobbered.
704%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
705 movu %1, %7
706 pshufb %2, %1, [tab_Lm + 0]   ; 8-byte windows for pixels 0 and 1
707 pmaddubsw %2, %5
708 pshufb %3, %1, [tab_Lm + 16]   ; pixels 2 and 3
709 pmaddubsw %3, %5
710 phaddw %2, %3
711 pshufb %4, %1, [tab_Lm + 32]   ; pixels 4 and 5
712 pmaddubsw %4, %5
713 pshufb %1, %1, [tab_Lm + 48]   ; pixels 6 and 7
714 pmaddubsw %1, %5
715 phaddw %4, %1
716 phaddw %2, %4   ; one complete 8-tap sum per word, pixels 0-7
717 %if %0 == 8
718 pmulhrsw %2, %6
719 packuswb %2, %2
720 movh %8, %2
721 %endif
722%endmacro
723
; FILTER_H8_W4 tmpLoad, result
; Horizontal 8-tap filter producing 4 word sums (duplicated in both halves of
; the result register). Unlike FILTER_H8_W8 this macro has hard-wired
; dependencies: it reads the source at [r0 - 3 + r5], takes the taps from m3
; and clobbers m7 as scratch.
724%macro FILTER_H8_W4 2
725 movu %1, [r0 - 3 + r5]
726 pshufb %2, %1, [tab_Lm]   ; 8-byte windows for pixels 0 and 1
727 pmaddubsw %2, m3
728 pshufb m7, %1, [tab_Lm + 16]   ; pixels 2 and 3
729 pmaddubsw m7, m3
730 phaddw %2, m7
731 phaddw %2, %2   ; one complete 8-tap sum per word, pixels 0-3
732%endmacro
733
734;----------------------------------------------------------------------------------------------------------------------------
735; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
736;----------------------------------------------------------------------------------------------------------------------------
; IPFILTER_LUMA width, height, pp|ps
; Emits interp_8tap_horiz_<pp|ps>_<width>x<height>.
;   pp: results are rounded with tab_c_512 and stored as bytes.
;   ps: pw_2000 is subtracted from the unscaled sums and words are stored; the
;       destination stride is doubled, and when the sixth argument (isRowExt)
;       is non-zero the source starts 3 rows higher and 7 extra rows are done.
; Register use: m3 = taps (both qwords), m2 = rounding/offset constant,
; r4 = row counter, r5 = byte offset of the current column group.
737%macro IPFILTER_LUMA 3
738INIT_XMM sse4
739cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
740
741 mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
742
743%ifdef PIC
744 lea r6, [tab_LumaCoeff]
745 movh m3, [r6 + r4 * 8]   ; eight taps of the selected filter
746%else
747 movh m3, [tab_LumaCoeff + r4 * 8]
748%endif
749 punpcklqdq m3, m3   ; duplicate the taps into the high qword
750
751%ifidn %3, pp
752 mova m2, [tab_c_512]
753%else
754 mova m2, [pw_2000]
755%endif
756
757 mov r4d, %2   ; row counter = block height
758%ifidn %3, ps
759 add r3, r3   ; destination elements are words: double the stride
760 cmp r5m, byte 0   ; isRowExt == 0 ?
761 je .loopH
762 lea r6, [r1 + 2 * r1]
763 sub r0, r6   ; start three rows above src
764 add r4d, 7   ; and process seven additional rows
765%endif
766
767.loopH:
768 xor r5, r5   ; column offset restarts at 0 for every row
769%rep %1 / 8
770 %ifidn %3, pp
771 FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
772 %else
773 FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
774 psubw m1, m2
775 movu [r2 + 2 * r5], m1
776 %endif
777 add r5, 8
778%endrep
779
; Remaining group of 4 pixels when the width is not a multiple of 8.
780%rep (%1 % 8) / 4
781 FILTER_H8_W4 m0, m1
782 %ifidn %3, pp
783 pmulhrsw m1, m2
784 packuswb m1, m1
785 movd [r2 + r5], m1
786 %else
787 psubw m1, m2
788 movh [r2 + 2 * r5], m1
789 %endif
790%endrep
791
792 add r0, r1
793 add r2, r3
794
795 dec r4d
796 jnz .loopH
797 RET
798%endmacro
799
800
; AVX2 version of the 8-tap horizontal pp filter for a 4x4 block.
; Each source row is broadcast to both 128-bit lanes so that one tab_Lm mask
; (32 bytes) yields the 8-byte windows of all four pixels of that row. The
; four rows are combined in a single register and stored with four dword writes.
801INIT_YMM avx2
802cglobal interp_8tap_horiz_pp_4x4, 4,6,6
803 mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
804
805%ifdef PIC
806 lea r5, [tab_LumaCoeff]
807 vpbroadcastq m0, [r5 + r4 * 8]
808%else
809 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
810%endif
811
812 mova m1, [tab_Lm]
813 vpbroadcastd m2, [pw_1]
814
815 ; register map
816 ; m0 - interpolate coeff
817 ; m1 - shuffle order table
818 ; m2 - constant word 1
819
820 sub r0, 3   ; the 8-tap window starts three pixels left of src
821 ; Row 0-1
822 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
823 pshufb m3, m1
824 pmaddubsw m3, m0
825 pmaddwd m3, m2   ; add adjacent word pairs into dwords
826 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
827 pshufb m4, m1
828 pmaddubsw m4, m0
829 pmaddwd m4, m2
830 phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
831
832 ; Row 2-3
833 lea r0, [r0 + r1 * 2]
834 vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
835 pshufb m4, m1
836 pmaddubsw m4, m0
837 pmaddwd m4, m2
838 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
839 pshufb m5, m1
840 pmaddubsw m5, m0
841 pmaddwd m5, m2
842 phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
843
844 packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
845 pmulhrsw m3, [pw_512]
846 vextracti128 xm4, m3, 1
847 packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
848 pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
849
850 lea r0, [r3 * 3]   ; r0 is free now: reuse it as 3 * dstStride
851 movd [r2], xm3   ; row 0
852 pextrd [r2+r3], xm3, 2   ; row 1 sits in dword 2
853 pextrd [r2+r3*2], xm3, 1   ; row 2 sits in dword 1
854 pextrd [r2+r0], xm3, 3   ; row 3
855 RET
856
857
858;--------------------------------------------------------------------------------------------------------------
859; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
860;--------------------------------------------------------------------------------------------------------------
; pp instantiations of IPFILTER_LUMA for the widths 4 and 12; the other pp
; sizes come from IPFILTER_LUMA_PP_W8.
861 IPFILTER_LUMA 4, 4, pp
862 IPFILTER_LUMA 4, 8, pp
863 IPFILTER_LUMA 12, 16, pp
864 IPFILTER_LUMA 4, 16, pp
865
866;--------------------------------------------------------------------------------------------------------------
867; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
868;--------------------------------------------------------------------------------------------------------------
; IPFILTER_LUMA_PP_W8 width, height
; Emits interp_8tap_horiz_pp_<width>x<height> for widths that are multiples of 8.
; The 8 taps are split into two 4-tap halves (m0 = taps 0-3, m1 = taps 4-7,
; each broadcast to every dword) and applied to the 4-byte windows of tab_Tm;
; a pixel's result is the low-half product of its window plus the high-half
; product of the window four bytes further on.
; r5 = address of tab_Tm, r4 = row counter, x = assemble-time column offset.
869%macro IPFILTER_LUMA_PP_W8 2
870INIT_XMM sse4
871cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
872 mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
873
874%ifdef PIC
875 lea r5, [tab_LumaCoeff]
876 movh m3, [r5 + r4 * 8]
877%else
878 movh m3, [tab_LumaCoeff + r4 * 8]
879%endif
880 pshufd m0, m3, 0 ; m0 = coeff-L
881 pshufd m1, m3, 0x55 ; m1 = coeff-H
882 lea r5, [tab_Tm] ; r5 = shuffle
883 mova m2, [pw_512] ; m2 = 512
884
885 mov r4d, %2   ; row counter = block height
886.loopH:
887%assign x 0
888%rep %1 / 8
889 movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
890 pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
891 pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
892 pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
893 pmaddubsw m4, m0   ; pixels 0-3, taps 0-3
894 pmaddubsw m6, m5, m1   ; pixels 0-3, taps 4-7
895 pmaddubsw m5, m0   ; pixels 4-7, taps 0-3
896 pmaddubsw m3, m1   ; pixels 4-7, taps 4-7
897 paddw m4, m6
898 paddw m5, m3
899 phaddw m4, m5   ; one complete 8-tap sum per word, pixels 0-7
900 pmulhrsw m4, m2
901 packuswb m4, m4
902 movh [r2 + x], m4
903%assign x x+8
904%endrep
905
906 add r0, r1
907 add r2, r3
908
909 dec r4d
910 jnz .loopH
911 RET
912%endmacro
913
; Instantiations for widths 8, 16, 24, 32, 48 and 64.
914IPFILTER_LUMA_PP_W8 8, 4
915IPFILTER_LUMA_PP_W8 8, 8
916IPFILTER_LUMA_PP_W8 8, 16
917IPFILTER_LUMA_PP_W8 8, 32
918IPFILTER_LUMA_PP_W8 16, 4
919IPFILTER_LUMA_PP_W8 16, 8
920IPFILTER_LUMA_PP_W8 16, 12
921IPFILTER_LUMA_PP_W8 16, 16
922IPFILTER_LUMA_PP_W8 16, 32
923IPFILTER_LUMA_PP_W8 16, 64
924IPFILTER_LUMA_PP_W8 24, 32
925IPFILTER_LUMA_PP_W8 32, 8
926IPFILTER_LUMA_PP_W8 32, 16
927IPFILTER_LUMA_PP_W8 32, 24
928IPFILTER_LUMA_PP_W8 32, 32
929IPFILTER_LUMA_PP_W8 32, 64
930IPFILTER_LUMA_PP_W8 48, 64
931IPFILTER_LUMA_PP_W8 64, 16
932IPFILTER_LUMA_PP_W8 64, 32
933IPFILTER_LUMA_PP_W8 64, 48
934IPFILTER_LUMA_PP_W8 64, 64
935
936;----------------------------------------------------------------------------------------------------------------------------
937; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
938;----------------------------------------------------------------------------------------------------------------------------
; ps instantiations of IPFILTER_LUMA (word output, optional row extension).
939 IPFILTER_LUMA 4, 4, ps
940 IPFILTER_LUMA 8, 8, ps
941 IPFILTER_LUMA 8, 4, ps
942 IPFILTER_LUMA 4, 8, ps
943 IPFILTER_LUMA 16, 16, ps
944 IPFILTER_LUMA 16, 8, ps
945 IPFILTER_LUMA 8, 16, ps
946 IPFILTER_LUMA 16, 12, ps
947 IPFILTER_LUMA 12, 16, ps
948 IPFILTER_LUMA 16, 4, ps
949 IPFILTER_LUMA 4, 16, ps
950 IPFILTER_LUMA 32, 32, ps
951 IPFILTER_LUMA 32, 16, ps
952 IPFILTER_LUMA 16, 32, ps
953 IPFILTER_LUMA 32, 24, ps
954 IPFILTER_LUMA 24, 32, ps
955 IPFILTER_LUMA 32, 8, ps
956 IPFILTER_LUMA 8, 32, ps
957 IPFILTER_LUMA 64, 64, ps
958 IPFILTER_LUMA 64, 32, ps
959 IPFILTER_LUMA 32, 64, ps
960 IPFILTER_LUMA 64, 48, ps
961 IPFILTER_LUMA 48, 64, ps
962 IPFILTER_LUMA 64, 16, ps
963 IPFILTER_LUMA 16, 64, ps
964
965;-----------------------------------------------------------------------------
966; Interpolate HV
967;-----------------------------------------------------------------------------
; FILTER_HV8_START: first stage of the vertical pass over the word buffer.
; Loads three consecutive 16-byte rows starting at row off_src of the buffer
; at r0, interleaves neighbouring rows word-wise and multiplies each pair with
; coefficient row off_coeff of the table at r5. This starts the low/high dword
; accumulators of two adjacent output rows.
968%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
969 mova %5, [r0 + (%6 + 0) * 16]
970 mova %1, [r0 + (%6 + 1) * 16]
971 mova %2, [r0 + (%6 + 2) * 16]
972 punpcklwd %3, %5, %1
973 punpckhwd %5, %1
974 pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
975 pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
976 punpcklwd %4, %1, %2
977 punpckhwd %1, %2
978 pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
979 pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
980%endmacro ; FILTER_HV8_START
981
; FILTER_HV8_MID: accumulation stage of the vertical pass.
; Loads the next two buffer rows (off_src and off_src + 1), pairs the previous
; row with the first and the first with the second, multiplies each pair with
; coefficient row off_coeff and adds the products to the four accumulators.
; The second loaded row is left in the first argument for the next stage.
982%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
983 mova %8, [r0 + (%9 + 0) * 16]
984 mova %1, [r0 + (%9 + 1) * 16]
985 punpcklwd %7, %2, %8
986 punpckhwd %2, %8
987 pmaddwd %7, [r5 + %10 * 16]
988 pmaddwd %2, [r5 + %10 * 16]
989 paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
990 paddd %5, %2 ; R0 = H[0+1+2+3]
991 punpcklwd %7, %8, %1
992 punpckhwd %8, %1
993 pmaddwd %7, [r5 + %10 * 16]
994 pmaddwd %8, [r5 + %10 * 16]
995 paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
996 paddd %6, %8 ; R1 = H[1+2+3+4]
997%endmacro ; FILTER_HV8_MID
998
999; Round and Saturate
; FILTER_HV8_END: final stage of the vertical pass.
; Adds tab_c_526336 to the four dword accumulators, shifts them right by 12,
; then packs first/second and third/fourth to words and finally both pairs to
; unsigned bytes. The 16 result bytes end up in the first argument.
1000%macro FILTER_HV8_END 4 ; output in [1, 3]
1001 paddd %1, [tab_c_526336]
1002 paddd %2, [tab_c_526336]
1003 paddd %3, [tab_c_526336]
1004 paddd %4, [tab_c_526336]
1005 psrad %1, 12
1006 psrad %2, 12
1007 psrad %3, 12
1008 psrad %4, 12
1009 packssdw %1, %2
1010 packssdw %3, %4
1011
1012 ; TODO: is merge better? I think this way is short dependency link
1013 packuswb %1, %3
1014%endmacro ; FILTER_HV8_END
1015
1016;-----------------------------------------------------------------------------
1017; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
1018;-----------------------------------------------------------------------------
; Two-pass 8x8 filter.
; Pass 1 (.loopH) runs the 8-tap horizontal filter over 8+7 source rows,
; starting three rows above src, and stores 8 words per row (with pw_2000
; subtracted) into a 15*16-byte stack buffer.
; Pass 2 (.loopV) runs the 8-tap vertical filter over that buffer with the
; word coefficients of tab_LumaCoeffV and writes two output rows per iteration.
; r4 = idxX, then buffer write pointer; r5 = idxY, then coefficient pointer.
1019INIT_XMM ssse3
1020cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
1021%define coef m7
1022%define stk_buf rsp
1023
1024 mov r4d, r4m   ; r4d = idxX (fifth argument)
1025 mov r5d, r5m   ; r5d = idxY (sixth argument)
1026
1027%ifdef PIC
1028 lea r6, [tab_LumaCoeff]
1029 movh coef, [r6 + r4 * 8]
1030%else
1031 movh coef, [tab_LumaCoeff + r4 * 8]
1032%endif
1033 punpcklqdq coef, coef   ; duplicate the 8 taps into the high qword
1034
1035 ; move to row -3
1036 lea r6, [r1 + r1 * 2]
1037 sub r0, r6
1038
1039 xor r6, r6   ; r6 = horizontal row counter
1040 mov r4, rsp   ; r4 = write pointer into the stack buffer
1041
1042.loopH:
; Seven-argument form: the c512 operand is ignored and the unscaled sums stay in m1.
1043 FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
1044 psubw m1, [pw_2000]
1045 mova [r4], m1
1046
1047 add r0, r1
1048 add r4, 16
1049 inc r6
1050 cmp r6, 8+7
1051 jnz .loopH
1052
1053 ; ready to phase V
1054 ; Here all of mN is free
1055
1056 ; load coeff table
1057 shl r5, 6   ; 64 bytes of word coefficients per filter index
1058 lea r6, [tab_LumaCoeffV]
1059 lea r5, [r5 + r6]
1060
1061 ; load intermediate buffer
1062 mov r0, stk_buf
1063
1064 ; register mapping
1065 ; r0 - src
1066 ; r5 - coeff
1067 ; r6 - loop_i
1068
1069 ; let's go
1070 xor r6, r6
1071
1072 ; TODO: this loop has more than 70 instructions, I think it is more than Intel loop decode cache
1073.loopV:
1074
1075 FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
1076 FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
1077 FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
1078 FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
1079 FILTER_HV8_END m3, m0, m4, m1
1080
1081 movh [r2], m3   ; first output row of this pair
1082 movhps [r2 + r3], m3   ; second output row of this pair
1083
1084 lea r0, [r0 + 16 * 2]   ; advance two buffer rows
1085 lea r2, [r2 + r3 * 2]
1086
1087 inc r6
1088 cmp r6, 8/2
1089 jnz .loopV
1090
1091 RET
1092
1093;-----------------------------------------------------------------------------
1094; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1095;-----------------------------------------------------------------------------
; Vertical 4-tap filter for a 2x4 block.
; Seven source rows (one above src through five below) are loaded with movd.
; For each output row the four contributing rows are byte-interleaved so that
; pmaddubsw with the reordered taps (tab_Cm order 0,2,1,3) followed by phaddw
; produces the 4-tap sums. Four pixels per row are computed; two are stored.
; r4 = coeffIdx, then 3 * srcStride; r5 = table base (PIC), then src + 4 rows.
1096INIT_XMM sse4
1097cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
1098
1099mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
1100sub r0, r1   ; start one row above src
1101
1102%ifdef PIC
1103lea r5, [tab_ChromaCoeff]
1104movd m0, [r5 + r4 * 4]
1105%else
1106movd m0, [tab_ChromaCoeff + r4 * 4]
1107%endif
1108lea r4, [r1 * 3]
1109lea r5, [r0 + 4 * r1]
1110pshufb m0, [tab_Cm]   ; taps reordered to 0,2,1,3 and replicated
1111mova m1, [tab_c_512]
1112
1113movd m2, [r0]
1114movd m3, [r0 + r1]
1115movd m4, [r0 + 2 * r1]
1116movd m5, [r0 + r4]
1117
1118punpcklbw m2, m3
1119punpcklbw m6, m4, m5
1120punpcklbw m2, m6
1121
1122pmaddubsw m2, m0   ; output row 0, partial sums
1123
1124movd m6, [r5]
1125
1126punpcklbw m3, m4
1127punpcklbw m7, m5, m6
1128punpcklbw m3, m7
1129
1130pmaddubsw m3, m0   ; output row 1, partial sums
1131
1132phaddw m2, m3   ; low 4 words = row 0, high 4 words = row 1
1133
1134pmulhrsw m2, m1
1135
1136movd m7, [r5 + r1]
1137
1138punpcklbw m4, m5
1139punpcklbw m3, m6, m7
1140punpcklbw m4, m3
1141
1142pmaddubsw m4, m0   ; output row 2, partial sums
1143
1144movd m3, [r5 + 2 * r1]
1145
1146punpcklbw m5, m6
1147punpcklbw m7, m3
1148punpcklbw m5, m7
1149
1150pmaddubsw m5, m0   ; output row 3, partial sums
1151
1152phaddw m4, m5   ; low 4 words = row 2, high 4 words = row 3
1153
1154pmulhrsw m4, m1
1155packuswb m2, m4   ; four bytes per output row, rows 0-3 in order
1156
1157pextrw [r2], m2, 0   ; first two pixels of row 0
1158pextrw [r2 + r3], m2, 2   ; row 1
1159lea r2, [r2 + 2 * r3]
1160pextrw [r2], m2, 4   ; row 2
1161pextrw [r2 + r3], m2, 6   ; row 3
1162
1163RET
1164
1165;-----------------------------------------------------------------------------
1166; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1167;-----------------------------------------------------------------------------
; FILTER_V4_W2_H4 width, height
; Emits interp_4tap_vert_pp_2x<height>. The first macro argument is accepted
; but not used: the width 2 is hard-coded in the function name and the stores.
; Each loop iteration produces four output rows with the same instruction
; sequence as interp_4tap_vert_pp_2x4, then the row counter drops by 4.
; r4 = coeffIdx, then remaining rows; r5 = table base (PIC), then 3 * srcStride.
1168%macro FILTER_V4_W2_H4 2
1169INIT_XMM sse4
1170cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
1171
1172mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
1173sub r0, r1   ; start one row above src
1174
1175%ifdef PIC
1176lea r5, [tab_ChromaCoeff]
1177movd m0, [r5 + r4 * 4]
1178%else
1179movd m0, [tab_ChromaCoeff + r4 * 4]
1180%endif
1181
1182pshufb m0, [tab_Cm]   ; taps reordered to 0,2,1,3 and replicated
1183
1184mova m1, [tab_c_512]
1185
1186mov r4d, %2   ; remaining rows = block height
1187lea r5, [3 * r1]
1188
1189.loop:
1190movd m2, [r0]
1191movd m3, [r0 + r1]
1192movd m4, [r0 + 2 * r1]
1193movd m5, [r0 + r5]
1194
1195punpcklbw m2, m3
1196punpcklbw m6, m4, m5
1197punpcklbw m2, m6
1198
1199pmaddubsw m2, m0   ; output row 0 of this group, partial sums
1200
1201lea r0, [r0 + 4 * r1]   ; r0 now points four rows further down
1202movd m6, [r0]
1203
1204punpcklbw m3, m4
1205punpcklbw m7, m5, m6
1206punpcklbw m3, m7
1207
1208pmaddubsw m3, m0   ; output row 1, partial sums
1209
1210phaddw m2, m3
1211
1212pmulhrsw m2, m1
1213
1214movd m7, [r0 + r1]
1215
1216punpcklbw m4, m5
1217punpcklbw m3, m6, m7
1218punpcklbw m4, m3
1219
1220pmaddubsw m4, m0   ; output row 2, partial sums
1221
1222movd m3, [r0 + 2 * r1]
1223
1224punpcklbw m5, m6
1225punpcklbw m7, m3
1226punpcklbw m5, m7
1227
1228pmaddubsw m5, m0   ; output row 3, partial sums
1229
1230phaddw m4, m5
1231
1232pmulhrsw m4, m1
1233packuswb m2, m4   ; four bytes per output row, rows 0-3 in order
1234
1235pextrw [r2], m2, 0
1236pextrw [r2 + r3], m2, 2
1237lea r2, [r2 + 2 * r3]
1238pextrw [r2], m2, 4
1239pextrw [r2 + r3], m2, 6
1240
1241lea r2, [r2 + 2 * r3]
1242
1243sub r4, 4   ; four rows done per iteration
1244jnz .loop
1245RET
1246%endmacro
1247
1248FILTER_V4_W2_H4 2, 8
1249
1250FILTER_V4_W2_H4 2, 16
1251
1252;-----------------------------------------------------------------------------
1253; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1254;-----------------------------------------------------------------------------
; Vertical 4-tap filter for a 4x2 block.
; Five source rows (one above src through three below) are loaded; each output
; row is formed by byte-interleaving its four contributing rows, pmaddubsw with
; the reordered taps and phaddw.
; r4 = coeffIdx; r5 = table base (PIC), then address of the third loaded row.
1255INIT_XMM sse4
1256cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
1257
1258mov r4d, r4m   ; r4d = coeffIdx (fifth argument)
1259sub r0, r1   ; start one row above src
1260
1261%ifdef PIC
1262lea r5, [tab_ChromaCoeff]
1263movd m0, [r5 + r4 * 4]
1264%else
1265movd m0, [tab_ChromaCoeff + r4 * 4]
1266%endif
1267
1268pshufb m0, [tab_Cm]   ; taps reordered to 0,2,1,3 and replicated
1269lea r5, [r0 + 2 * r1]
1270
1271movd m2, [r0]
1272movd m3, [r0 + r1]
1273movd m4, [r5]
1274movd m5, [r5 + r1]
1275
1276punpcklbw m2, m3
1277punpcklbw m1, m4, m5
1278punpcklbw m2, m1
1279
1280pmaddubsw m2, m0   ; output row 0, partial sums
1281
1282movd m1, [r0 + 4 * r1]   ; fifth loaded row
1283
1284punpcklbw m3, m4
1285punpcklbw m5, m1
1286punpcklbw m3, m5
1287
1288pmaddubsw m3, m0   ; output row 1, partial sums
1289
1290phaddw m2, m3   ; low 4 words = row 0, high 4 words = row 1
1291
1292pmulhrsw m2, [tab_c_512]
1293packuswb m2, m2
1294movd [r2], m2   ; four pixels of row 0
1295pextrd [r2 + r3], m2, 1   ; four pixels of row 1
1296
1297RET
1298
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter: 4 pixels wide, 4 output rows, byte pixels in and out.
;   r0 = src   r1 = srcStride   r2 = dst   r3 = dstStride
;   r4 = coeffIdx, later 3 * srcStride      r5 = address of source row 4
;   m0 = taps in (c0, c2, c1, c3) byte order, m1 = tab_c_512
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x4, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                    ; taps -> c0,c2,c1,c3 pattern
    mova        m1, [tab_c_512]                 ; rounding multiplier for pmulhrsw
    lea         r5, [r0 + 4 * r1]               ; r5 -> source row 4
    lea         r4, [r1 * 3]                    ; coeffIdx no longer needed

    ; Source rows 0..3.
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; Output row 0 from rows 0..3 (samples interleaved in row order 0,2,1,3).
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    movd        m6, [r5]                        ; source row 4

    ; Output row 1 from rows 1..4.
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                          ; rows 0 and 1 as 4 + 4 words

    pmulhrsw    m2, m1

    movd        m7, [r5 + r1]                   ; source row 5

    ; Output row 2 from rows 2..5.
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r5 + 2 * r1]               ; source row 6

    ; Output row 3 from rows 3..6.
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7

    pmaddubsw   m5, m0

    phaddw      m4, m5                          ; rows 2 and 3 as 4 + 4 words

    pmulhrsw    m4, m1

    ; Pack all four rows into one register and store one dword per row.
    packuswb    m2, m4
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3

    RET
1371
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows.
; Each loop pass filters four output rows of 4 byte pixels.
;   r0 = src (advanced 4 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 4 rows per pass)   r3 = dstStride
;   r4 = coeffIdx, then remaining rows    r5 = 3 * srcStride
;   m0 = taps in (c0, c2, c1, c3) byte order, m1 = tab_c_512
;-----------------------------------------------------------------------------
%macro FILTER_V4_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                    ; taps -> c0,c2,c1,c3 pattern

    mova        m1, [tab_c_512]                 ; rounding multiplier for pmulhrsw

    mov         r4d, %2                         ; rows left (32-bit write clears the upper half)

    lea         r5, [3 * r1]

.loop:
    ; Source rows 0..3 of this group.
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    ; Output row 0 from rows 0..3 (samples interleaved in row order 0,2,1,3).
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]               ; r0 -> row 4, i.e. next group's row 0
    movd        m6, [r0]

    ; Output row 1 from rows 1..4.
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                          ; rows 0 and 1 as 4 + 4 words

    pmulhrsw    m2, m1

    movd        m7, [r0 + r1]                   ; row 5

    ; Output row 2 from rows 2..5.
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]               ; row 6

    ; Output row 3 from rows 3..6.
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7

    pmaddubsw   m5, m0

    phaddw      m4, m5                          ; rows 2 and 3 as 4 + 4 words

    pmulhrsw    m4, m1
    packuswb    m2, m4                          ; four rows, one dword each
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3

    lea         r2, [r2 + 2 * r3]

    sub         r4, 4                           ; four rows done
    jnz         .loop
    RET
%endmacro

; Heights must be multiples of 4: the loop retires four rows per pass.
FILTER_V4_W4_H4 4, 8
FILTER_V4_W4_H4 4, 16

FILTER_V4_W4_H4 4, 32
1459
; One 8-pixel output row from source rows held as m1, m2, m3, m0 (oldest first).
; Expects m6 = tap pair for the first two rows, m5 = tap pair for the last two,
; m4 = pmulhrsw multiplier.  Result: 8 packed bytes in the low half of m1.
; Clobbers m7; m2, m3 and m0 are left untouched.
%macro FILTER_V4_W8_H2 0
    punpcklbw   m1, m2                          ; interleave the first two rows
    punpcklbw   m7, m3, m0                      ; interleave the last two rows

    pmaddubsw   m1, m6
    pmaddubsw   m7, m5

    paddw       m1, m7                          ; full 4-tap sum per pixel

    pmulhrsw    m1, m4
    packuswb    m1, m1
%endmacro
1472
; One 8-pixel output row from source rows held as m2, m3, m0, m1 (oldest first).
; Expects m6 = tap pair for the first two rows, m5 = tap pair for the last two,
; m4 = pmulhrsw multiplier.  Result: 8 packed bytes in the low half of m2.
; Clobbers m7; m3, m0 and m1 are left untouched.
%macro FILTER_V4_W8_H3 0
    punpcklbw   m2, m3                          ; interleave the first two rows
    punpcklbw   m7, m0, m1                      ; interleave the last two rows

    pmaddubsw   m2, m6
    pmaddubsw   m7, m5

    paddw       m2, m7                          ; full 4-tap sum per pixel

    pmulhrsw    m2, m4
    packuswb    m2, m2
%endmacro
1485
; One 8-pixel output row from source rows held as m3, m0, m1, m2 (oldest first).
; Expects m6 = tap pair for the first two rows, m5 = tap pair for the last two,
; m4 = pmulhrsw multiplier.  Result: 8 packed bytes in the low half of m3.
; Clobbers m7; m0, m1 and m2 are left untouched.
%macro FILTER_V4_W8_H4 0
    punpcklbw   m3, m0                          ; interleave the first two rows
    punpcklbw   m7, m1, m2                      ; interleave the last two rows

    pmaddubsw   m3, m6
    pmaddubsw   m7, m5

    paddw       m3, m7                          ; full 4-tap sum per pixel

    pmulhrsw    m3, m4
    packuswb    m3, m3
%endmacro
1498
; One 8-pixel output row from source rows held as m0, m1, m2, m3 (oldest first).
; Expects m6 = tap pair for the first two rows, m5 = tap pair for the last two,
; m4 = pmulhrsw multiplier.  Result: 8 packed bytes in the low half of m0.
; Clobbers m7; m1, m2 and m3 are left untouched.
%macro FILTER_V4_W8_H5 0
    punpcklbw   m0, m1                          ; interleave the first two rows
    punpcklbw   m7, m2, m3                      ; interleave the last two rows

    pmaddubsw   m0, m6
    pmaddubsw   m7, m5

    paddw       m0, m7                          ; full 4-tap sum per pixel

    pmulhrsw    m0, m4
    packuswb    m0, m0
%endmacro
1511
; Function prologue plus output rows 0 and 1 of an 8-wide pp block.
; FILTER_V4_W8 emits the cglobal header and stores row 0; on exit it leaves
; source rows 1..3 in m1..m3, so only source row 4 has to be fetched here.
; The caller supplies the trailing RET (or continues with more rows).
%macro FILTER_V4_W8_8x2 2
    FILTER_V4_W8 %1, %2
    movq        m0, [r0 + 4 * r1]               ; source row 4

    FILTER_V4_W8_H2                             ; rows 1..4 -> m1

    movh        [r2 + r3], m1                   ; output row 1
%endmacro
1520
; Extends FILTER_V4_W8_8x2 with output rows 2 and 3.
; Leaves r6 = r0 + 4 * srcStride and r5 = r2 + 2 * dstStride behind.
%macro FILTER_V4_W8_8x4 2
    FILTER_V4_W8_8x2 %1, %2

    ; output row 2 (source rows 2..5)
    lea         r6, [r0 + 4 * r1]
    movq        m1, [r6 + r1]                   ; source row 5

    FILTER_V4_W8_H3

    movh        [r2 + 2 * r3], m2

    ; output row 3 (source rows 3..6)
    movq        m2, [r6 + 2 * r1]               ; source row 6

    FILTER_V4_W8_H4

    lea         r5, [r2 + 2 * r3]
    movh        [r5 + r3], m3
%endmacro
1539
; Extends FILTER_V4_W8_8x4 with output rows 4 and 5.
; Relies on r6 = r0 + 4 * srcStride as left by FILTER_V4_W8_8x4.
%macro FILTER_V4_W8_8x6 2
    FILTER_V4_W8_8x4 %1, %2

    ; output row 4 (source rows 4..7)
    lea         r6, [r6 + 2 * r1]               ; r6 -> source row 6
    movq        m3, [r6 + r1]                   ; source row 7

    FILTER_V4_W8_H5

    movh        [r2 + 4 * r3], m0

    ; output row 5 (source rows 5..8)
    movq        m0, [r0 + 8 * r1]               ; source row 8

    FILTER_V4_W8_H2

    lea         r5, [r2 + 4 * r3]
    movh        [r5 + r3], m1
%endmacro
1558
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Function header plus the first output row of an 8-wide pp block.  There is
; no RET here: the FILTER_V4_W8_8x* macros append further rows and the
; instantiation site ends the function.
;
; State on exit:
;   r0 = src - srcStride (source row 0)   r1 = srcStride
;   r2 = dst                              r3 = dstStride
;   m1, m2, m3 = source rows 1, 2, 3 (8 bytes each)
;   m6 = tap bytes (c0, c1) repeated, m5 = tap bytes (c2, c3) repeated
;   m4 = tab_c_512
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument

    sub         r0, r1                          ; start one row above src
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    lea         r5, [r0 + 2 * r1]
    movq        m3, [r5 + r1]

    punpcklbw   m0, m1                          ; rows 0,1 interleaved per pixel
    punpcklbw   m4, m2, m3                      ; rows 2,3 interleaved per pixel

%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        m5, [r6 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]                ; (c0, c1) in every word
    pmaddubsw   m0, m6

    pshufb      m5, [tab_Vm + 16]               ; (c2, c3) in every word
    pmaddubsw   m4, m5

    paddw       m0, m4                          ; full 4-tap sum per pixel

    mova        m4, [tab_c_512]                 ; rounding multiplier for pmulhrsw

    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0                        ; output row 0
%endmacro
1599
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; The macro expands to the cglobal header and both output rows; RET ends it.
;-----------------------------------------------------------------------------
FILTER_V4_W8_8x2 8, 2

    RET
1606
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; The macro expands to the cglobal header and all four output rows; RET ends it.
;-----------------------------------------------------------------------------
FILTER_V4_W8_8x4 8, 4

    RET
1613
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; The macro expands to the cglobal header and all six output rows; RET ends it.
;-----------------------------------------------------------------------------
FILTER_V4_W8_8x6 8, 6

    RET
1620
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter: 4 byte pixels wide, 2 output rows of 16-bit words.
;   r0 = src   r1 = srcStride   r2 = dst   r3 = dstStride (converted to bytes)
;   m0 = taps in (c0, c2, c1, c3) byte order
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x2, 4, 6, 6

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                    ; taps -> c0,c2,c1,c3 pattern

    ; Source rows 0..3.
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    lea         r5, [r0 + 2 * r1]
    movd        m4, [r5]
    movd        m5, [r5 + r1]

    ; Output row 0 from rows 0..3 (samples interleaved in row order 0,2,1,3).
    punpcklbw   m2, m3
    punpcklbw   m1, m4, m5
    punpcklbw   m2, m1

    pmaddubsw   m2, m0

    movd        m1, [r0 + 4 * r1]               ; source row 4

    ; Output row 1 from rows 1..4.
    punpcklbw   m3, m4
    punpcklbw   m5, m1
    punpcklbw   m3, m5

    pmaddubsw   m3, m0

    phaddw      m2, m3                          ; row 0 in low half, row 1 in high half

    psubw       m2, [pw_2000]                   ; apply the pw_2000 offset (defined elsewhere)
    movh        [r2], m2                        ; 4 words of row 0
    movhps      [r2 + r3], m2                   ; 4 words of row 1

    RET
1667
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter: 4 byte pixels wide, 4 output rows of 16-bit words.
;   r0 = src   r1 = srcStride   r2 = dst   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, later 3 * srcStride      r5 = address of source row 4
;   m0 = taps in (c0, c2, c1, c3) byte order, m1 = pw_2000
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x4, 4, 6, 7

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                    ; taps -> c0,c2,c1,c3 pattern

    lea         r4, [r1 * 3]                    ; coeffIdx no longer needed
    lea         r5, [r0 + 4 * r1]               ; r5 -> source row 4

    ; Source rows 0..3.
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; Output row 0 from rows 0..3 (samples interleaved in row order 0,2,1,3).
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    movd        m6, [r5]                        ; source row 4

    ; Output row 1 from rows 1..4.
    punpcklbw   m3, m4
    punpcklbw   m1, m5, m6
    punpcklbw   m3, m1

    pmaddubsw   m3, m0

    phaddw      m2, m3                          ; rows 0 and 1 as 4 + 4 words

    mova        m1, [pw_2000]                   ; offset constant (defined elsewhere)

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    movd        m2, [r5 + r1]                   ; source row 5

    ; Output row 2 from rows 2..5.
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r5 + 2 * r1]               ; source row 6

    ; Output row 3 from rows 3..6.
    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                          ; rows 2 and 3 as 4 + 4 words

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    RET
1741
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows
; (a multiple of 4).  Each loop pass produces four rows of 4 16-bit words.
;   r0 = src (advanced 4 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 4 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter      r5 = 3 * srcStride
;   m0 = taps in (c0, c2, c1, c3) byte order, m1 = pw_2000
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                    ; taps -> c0,c2,c1,c3 pattern

    mova        m1, [pw_2000]                   ; offset constant (defined elsewhere)

    mov         r4d, %2/4                       ; four rows per pass
    lea         r5, [3 * r1]

.loop:
    ; Source rows 0..3 of this group.
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    ; Output row 0 from rows 0..3 (samples interleaved in row order 0,2,1,3).
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]               ; r0 -> row 4, i.e. next group's row 0
    movd        m6, [r0]

    ; Output row 1 from rows 1..4.
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                          ; rows 0 and 1 as 4 + 4 words

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    movd        m2, [r0 + r1]                   ; row 5

    ; Output row 2 from rows 2..5.
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]               ; row 6

    ; Output row 3 from rows 3..6.
    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                          ; rows 2 and 3 as 4 + 4 words

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W4_H4 4, 8
FILTER_V_PS_W4_H4 4, 16

FILTER_V_PS_W4_H4 4, 32
1828
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows
; (a multiple of 2).  Each loop pass produces two rows of 8 16-bit words.
;   r0 = src (advanced 2 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 2 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter      r5 = 3 * srcStride
;   m6 = tap bytes (c0, c1) repeated, m5 = tap bytes (c2, c3) repeated
;   m4 = pw_2000
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m5, [tab_Vm + 16]               ; (c2, c3) in every word
    mova        m4, [pw_2000]                   ; offset constant (defined elsewhere)

    mov         r4d, %2/2                       ; two rows per pass
    lea         r5, [3 * r1]

.loopH:
    ; Source rows 0..3 of this pair.
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                          ; rows 0,1
    punpcklbw   m1, m2                          ; rows 1,2
    punpcklbw   m2, m3                          ; rows 2,3

    ; Output row 0 = rows(0,1) * (c0,c1) + rows(2,3) * (c2,c3).
    pmaddubsw   m0, m6
    pmaddubsw   m2, m5

    paddw       m0, m2

    psubw       m0, m4
    movu        [r2], m0

    movq        m0, [r0 + 4 * r1]               ; row 4

    punpcklbw   m3, m0                          ; rows 3,4

    ; Output row 1 = rows(1,2) * (c0,c1) + rows(3,4) * (c2,c3).
    pmaddubsw   m1, m6
    pmaddubsw   m3, m5

    paddw       m1, m3
    psubw       m1, m4

    movu        [r2 + r3], m1

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro

FILTER_V_PS_W8_H8_H16_H2 8, 2
FILTER_V_PS_W8_H8_H16_H2 8, 4
FILTER_V_PS_W8_H8_H16_H2 8, 6

FILTER_V_PS_W8_H8_H16_H2 8, 12
FILTER_V_PS_W8_H8_H16_H2 8, 64
1899
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows
; (a multiple of 4).  Each loop pass produces four rows of 8 16-bit words.
;   r0 = src (advanced 4 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 4 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter      r5 = 3 * srcStride
;   m6 = tap bytes (c0, c1) repeated, m5 = tap bytes (c2, c3) repeated
;   m4 = pw_2000, m7 = scratch
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H32 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m5, [tab_Vm + 16]               ; (c2, c3) in every word
    mova        m4, [pw_2000]                   ; offset constant (defined elsewhere)

    mov         r4d, %2/4                       ; four rows per pass
    lea         r5, [3 * r1]

.loop:
    ; Source rows 0..3 of this group.
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                          ; rows 0,1
    punpcklbw   m1, m2                          ; rows 1,2
    punpcklbw   m2, m3                          ; rows 2,3

    ; Output row 0: rows(0,1) and rows(2,3); m2 is kept for output row 2.
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7

    psubw       m0, m4
    movu        [r2], m0

    lea         r0, [r0 + 4 * r1]               ; r0 -> row 4, i.e. next group's row 0
    movq        m0, [r0]

    punpcklbw   m3, m0                          ; rows 3,4

    ; Output row 1: rows(1,2) and rows(3,4); m3 is kept for output row 3.
    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7

    psubw       m1, m4
    movu        [r2 + r3], m1

    movq        m1, [r0 + r1]                   ; row 5

    punpcklbw   m0, m1                          ; rows 4,5

    ; Output row 2: rows(2,3) and rows(4,5).
    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0

    psubw       m2, m4
    lea         r2, [r2 + 2 * r3]
    movu        [r2], m2

    movq        m2, [r0 + 2 * r1]               ; row 6

    punpcklbw   m1, m2                          ; rows 5,6

    ; Output row 3: rows(3,4) and rows(5,6).
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1
    psubw       m3, m4

    movu        [r2 + r3], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W8_H8_H16_H32 8, 8
FILTER_V_PS_W8_H8_H16_H32 8, 16
FILTER_V_PS_W8_H8_H16_H32 8, 32
1991
;------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 is not used in the symbol name (width is fixed at 6),
; %2 = number of rows (a multiple of 4).  Each pass computes 8 columns per row
; and stores only the first 6 words (8 bytes + 4 bytes).
;   r0 = src (advanced 4 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 4 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter      r5 = 3 * srcStride
;   m6 = tap bytes (c0, c1) repeated, m5 = tap bytes (c2, c3) repeated
;   m4 = pw_2000, m7 = scratch
;------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W6 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m5, [tab_Vm + 16]               ; (c2, c3) in every word
    mova        m4, [pw_2000]                   ; offset constant (defined elsewhere)
    lea         r5, [3 * r1]
    mov         r4d, %2/4                       ; four rows per pass

.loop:
    ; Source rows 0..3 of this group.
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                          ; rows 0,1
    punpcklbw   m1, m2                          ; rows 1,2
    punpcklbw   m2, m3                          ; rows 2,3

    ; Output row 0.
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7
    psubw       m0, m4

    movh        [r2], m0                        ; words 0..3
    pshufd      m0, m0, 2                       ; bring dword 2 (words 4,5) down
    movd        [r2 + 8], m0

    lea         r0, [r0 + 4 * r1]               ; r0 -> row 4, i.e. next group's row 0
    movq        m0, [r0]
    punpcklbw   m3, m0                          ; rows 3,4

    ; Output row 1.
    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7
    psubw       m1, m4

    movh        [r2 + r3], m1
    pshufd      m1, m1, 2
    movd        [r2 + r3 + 8], m1

    movq        m1, [r0 + r1]                   ; row 5
    punpcklbw   m0, m1                          ; rows 4,5

    ; Output row 2.
    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0
    psubw       m2, m4

    lea         r2, [r2 + 2 * r3]
    movh        [r2], m2
    pshufd      m2, m2, 2
    movd        [r2 + 8], m2

    movq        m2, [r0 + 2 * r1]               ; row 6
    punpcklbw   m1, m2                          ; rows 5,6

    ; Output row 3.
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1
    psubw       m3, m4

    movh        [r2 + r3], m3
    pshufd      m3, m3, 2
    movd        [r2 + r3 + 8], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W6 6, 8
FILTER_V_PS_W6 6, 16
2086
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 is not used in the symbol name (width is fixed at 12),
; %2 = number of rows (a multiple of 2).  Each pass loads 16 source bytes per
; row, computes 16 columns and stores the first 12 words (16 + 8 bytes).
;   r0 = src (advanced 2 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 2 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter
;   m1 = tap bytes (c0, c1) repeated, m0 = tap bytes (c2, c3) repeated
;   m6 = scratch, then pw_2000 (reloaded every pass; all 8 registers are in use)
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W12 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m0, [tab_Vm + 16]               ; (c2, c3) in every word

    mov         r4d, %2/2                       ; two rows per pass

.loop:
    movu        m2, [r0]                        ; row 0
    movu        m3, [r0 + r1]                   ; row 1

    punpcklbw   m4, m2, m3                      ; rows 0,1, columns 0..7
    punpckhbw   m2, m3                          ; rows 0,1, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]                        ; row 2
    movu        m7, [r0 + r1]                   ; row 3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6                          ; output row 0, columns 0..7

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6                          ; output row 0, columns 8..15

    mova        m6, [pw_2000]                   ; offset constant (defined elsewhere)

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                        ; words 0..7
    movh        [r2 + 16], m2                   ; words 8..11

    punpcklbw   m4, m3, m5                      ; rows 1,2, columns 0..7
    punpckhbw   m3, m5                          ; rows 1,2, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r0 + 2 * r1]               ; row 4

    punpcklbw   m5, m7, m2                      ; rows 3,4, columns 0..7
    punpckhbw   m7, m2                          ; rows 3,4, columns 8..15

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5                          ; output row 1
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4
    movh        [r2 + r3 + 16], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W12 12, 16
FILTER_V_PS_W12 12, 32
2172
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows
; (a multiple of 2).  Each pass produces two rows of 16 16-bit words.
;   r0 = src (advanced 2 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 2 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter
;   m1 = tap bytes (c0, c1) repeated, m0 = tap bytes (c2, c3) repeated
;   m6 = scratch, then pw_2000 (reloaded every pass; all 8 registers are in use)
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W16 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m0, [tab_Vm + 16]               ; (c2, c3) in every word
    mov         r4d, %2/2                       ; two rows per pass

.loop:
    movu        m2, [r0]                        ; row 0
    movu        m3, [r0 + r1]                   ; row 1

    punpcklbw   m4, m2, m3                      ; rows 0,1, columns 0..7
    punpckhbw   m2, m3                          ; rows 0,1, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]                        ; row 2
    movu        m7, [r0 + r1]                   ; row 3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6                          ; output row 0, columns 0..7

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6                          ; output row 0, columns 8..15

    mova        m6, [pw_2000]                   ; offset constant (defined elsewhere)

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4
    movu        [r2 + 16], m2

    punpcklbw   m4, m3, m5                      ; rows 1,2, columns 0..7
    punpckhbw   m3, m5                          ; rows 1,2, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]               ; row 4

    punpcklbw   m2, m7, m5                      ; rows 3,4, columns 0..7
    punpckhbw   m7, m5                          ; rows 3,4, columns 8..15

    pmaddubsw   m2, m0
    pmaddubsw   m7, m0

    paddw       m4, m2                          ; output row 1
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4
    movu        [r2 + r3 + 16], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W16 16, 4
FILTER_V_PS_W16 16, 8
FILTER_V_PS_W16 16, 12
FILTER_V_PS_W16 16, 16
FILTER_V_PS_W16 16, 32

FILTER_V_PS_W16 16, 24
FILTER_V_PS_W16 16, 64
2263
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 is not used in the symbol name (width is fixed at 24),
; %2 = number of rows (a multiple of 2).  Each pass produces two rows: columns
; 0..15 from 16-byte loads, then columns 16..23 from 8-byte loads.
;   r0 = src (advanced 2 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 2 rows per pass)   r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then pass counter      r5 = r0 + 2 * srcStride
;   m1 = tap bytes (c0, c1) repeated, m0 = tap bytes (c2, c3) repeated
;   m6 = scratch, then pw_2000 (reloaded every pass)
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V4_PS_W24 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m0, [tab_Vm + 16]               ; (c2, c3) in every word

    mov         r4d, %2/2                       ; two rows per pass

.loop:
    ; ---- columns 0..15 ----
    movu        m2, [r0]                        ; row 0
    movu        m3, [r0 + r1]                   ; row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]

    movu        m5, [r5]                        ; row 2
    movu        m7, [r5 + r1]                   ; row 3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]                   ; offset constant (defined elsewhere)

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                        ; output row 0, words 0..7
    movu        [r2 + 16], m2                   ; output row 0, words 8..15

    punpcklbw   m4, m3, m5                      ; rows 1,2
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r5 + 2 * r1]               ; row 4

    punpcklbw   m5, m7, m2                      ; rows 3,4
    punpckhbw   m7, m2

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4                   ; output row 1, words 0..7
    movu        [r2 + r3 + 16], m3              ; output row 1, words 8..15

    ; ---- columns 16..23 ----
    movq        m2, [r0 + 16]                   ; row 0
    movq        m3, [r0 + r1 + 16]              ; row 1
    movq        m4, [r5 + 16]                   ; row 2
    movq        m5, [r5 + r1 + 16]              ; row 3

    punpcklbw   m2, m3
    punpcklbw   m7, m4, m5                      ; m4, m5 stay intact for row 1

    pmaddubsw   m2, m1
    pmaddubsw   m7, m0

    paddw       m2, m7
    psubw       m2, m6

    movu        [r2 + 32], m2                   ; output row 0, words 16..23

    movq        m2, [r5 + 2 * r1 + 16]          ; row 4

    punpcklbw   m3, m4
    punpcklbw   m5, m2

    pmaddubsw   m3, m1
    pmaddubsw   m5, m0

    paddw       m3, m5
    psubw       m3, m6

    movu        [r2 + r3 + 32], m3              ; output row 1, words 16..23

    mov         r0, r5                          ; advance source by two rows
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V4_PS_W24 24, 32

FILTER_V4_PS_W24 24, 64
2381
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows.
; Each pass produces ONE row of 32 16-bit words (two 16-column halves), so all
; four source rows are reloaded every pass.
;   r0 = src (advanced 1 row per pass)    r1 = srcStride
;   r2 = dst (advanced 1 row per pass)    r3 = dstStride (converted to bytes)
;   r4 = coeffIdx, then rows remaining    r5 = r0 + 2 * srcStride
;   m1 = tap bytes (c0, c1) repeated, m0 = tap bytes (c2, c3) repeated
;   m7 = pw_2000
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W32 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src
    add         r3, r3                          ; int16_t stride -> bytes; full width so an
                                                ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m0, [tab_Vm + 16]               ; (c2, c3) in every word

    mova        m7, [pw_2000]                   ; offset constant (defined elsewhere)

    mov         r4d, %2                         ; one row per pass

.loop:
    ; ---- columns 0..15 ----
    movu        m2, [r0]                        ; row 0
    movu        m3, [r0 + r1]                   ; row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]
    movu        m3, [r5]                        ; row 2
    movu        m5, [r5 + r1]                   ; row 3

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    psubw       m4, m7
    psubw       m2, m7

    movu        [r2], m4                        ; words 0..7
    movu        [r2 + 16], m2                   ; words 8..15

    ; ---- columns 16..31 ----
    movu        m2, [r0 + 16]
    movu        m3, [r0 + r1 + 16]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    movu        m3, [r5 + 16]
    movu        m5, [r5 + r1 + 16]

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    psubw       m4, m7
    psubw       m2, m7

    movu        [r2 + 32], m4                   ; words 16..23
    movu        [r2 + 48], m2                   ; words 24..31

    lea         r0, [r0 + r1]
    lea         r2, [r2 + r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W32 32, 8
FILTER_V_PS_W32 32, 16
FILTER_V_PS_W32 32, 24
FILTER_V_PS_W32 32, 32

FILTER_V_PS_W32 32, 48
FILTER_V_PS_W32 32, 64
2478
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width used in the symbol name, %2 = number of rows
; (a multiple of 4).  Each loop pass produces four rows of 8 byte pixels.
;   r0 = src (advanced 4 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 4 rows per pass)   r3 = dstStride
;   r4 = coeffIdx, then rows remaining    r5 = 3 * srcStride
;   m6 = tap bytes (c0, c1) repeated, m5 = tap bytes (c2, c3) repeated
;   m4 = tab_c_512, m7 = scratch
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8_H8_H16_H32 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m5, [tab_Vm + 16]               ; (c2, c3) in every word
    mova        m4, [tab_c_512]                 ; rounding multiplier for pmulhrsw
    lea         r5, [r1 * 3]

    mov         r4d, %2                         ; rows left (32-bit write clears the upper half)

.loop:
    ; Source rows 0..3 of this group.
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                          ; rows 0,1
    punpcklbw   m1, m2                          ; rows 1,2
    punpcklbw   m2, m3                          ; rows 2,3

    ; Output row 0: rows(0,1) and rows(2,3); m2 is kept for output row 2.
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7

    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0

    lea         r0, [r0 + 4 * r1]               ; r0 -> row 4, i.e. next group's row 0
    movq        m0, [r0]

    punpcklbw   m3, m0                          ; rows 3,4

    ; Output row 1: rows(1,2) and rows(3,4); m3 is kept for output row 3.
    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7

    pmulhrsw    m1, m4
    packuswb    m1, m1
    movh        [r2 + r3], m1

    movq        m1, [r0 + r1]                   ; row 5

    punpcklbw   m0, m1                          ; rows 4,5

    ; Output row 2: rows(2,3) and rows(4,5).
    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0

    pmulhrsw    m2, m4

    movq        m7, [r0 + 2 * r1]               ; row 6
    punpcklbw   m1, m7                          ; rows 5,6

    ; Output row 3: rows(3,4) and rows(5,6).
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1

    pmulhrsw    m3, m4
    packuswb    m2, m3                          ; row 2 in low half, row 3 in high half

    lea         r2, [r2 + 2 * r3]
    movh        [r2], m2
    movhps      [r2 + r3], m2

    lea         r2, [r2 + 2 * r3]

    sub         r4, 4                           ; four rows done
    jnz         .loop
    RET
%endmacro

FILTER_V4_W8_H8_H16_H32 8, 8
FILTER_V4_W8_H8_H16_H32 8, 16
FILTER_V4_W8_H8_H16_H32 8, 32

FILTER_V4_W8_H8_H16_H32 8, 12
FILTER_V4_W8_H8_H16_H32 8, 64
2575
2576
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 is not used in the symbol name (width is fixed at 6),
; %2 = number of rows (a multiple of 4).  Each pass computes 8 columns per row
; and stores only the first 6 bytes (4 bytes + 2 bytes).
;   r0 = src (advanced 4 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 4 rows per pass)   r3 = dstStride
;   r4 = coeffIdx, then rows remaining    r5 = 3 * srcStride
;   m6 = tap bytes (c0, c1) repeated, m5 = tap bytes (c2, c3) repeated
;   m4 = tab_c_512, m7 = scratch
;-----------------------------------------------------------------------------
%macro FILTER_V4_W6_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m5, [tab_Vm + 16]               ; (c2, c3) in every word
    mova        m4, [tab_c_512]                 ; rounding multiplier for pmulhrsw

    mov         r4d, %2                         ; rows left (32-bit write clears the upper half)
    lea         r5, [3 * r1]

.loop:
    ; Source rows 0..3 of this group.
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                          ; rows 0,1
    punpcklbw   m1, m2                          ; rows 1,2
    punpcklbw   m2, m3                          ; rows 2,3

    ; Output row 0.
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7

    pmulhrsw    m0, m4
    packuswb    m0, m0
    movd        [r2], m0                        ; pixels 0..3
    pextrw      [r2 + 4], m0, 2                 ; pixels 4..5

    lea         r0, [r0 + 4 * r1]               ; r0 -> row 4, i.e. next group's row 0

    movq        m0, [r0]
    punpcklbw   m3, m0                          ; rows 3,4

    ; Output row 1.
    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7

    pmulhrsw    m1, m4
    packuswb    m1, m1
    movd        [r2 + r3], m1
    pextrw      [r2 + r3 + 4], m1, 2

    movq        m1, [r0 + r1]                   ; row 5
    punpcklbw   m7, m0, m1                      ; rows 4,5

    ; Output row 2.
    pmaddubsw   m2, m6
    pmaddubsw   m7, m5

    paddw       m2, m7

    pmulhrsw    m2, m4
    packuswb    m2, m2
    lea         r2, [r2 + 2 * r3]
    movd        [r2], m2
    pextrw      [r2 + 4], m2, 2

    movq        m2, [r0 + 2 * r1]               ; row 6
    punpcklbw   m1, m2                          ; rows 5,6

    ; Output row 3.
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1

    pmulhrsw    m3, m4
    packuswb    m3, m3

    movd        [r2 + r3], m3
    pextrw      [r2 + r3 + 4], m3, 2

    lea         r2, [r2 + 2 * r3]

    sub         r4, 4                           ; four rows done
    jnz         .loop
    RET
%endmacro

FILTER_V4_W6_H4 6, 8

FILTER_V4_W6_H4 6, 16
2674
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 is not used in the symbol name (width is fixed at 12),
; %2 = number of rows (a multiple of 2).  Each pass loads 16 source bytes per
; row, computes 16 columns and stores the first 12 bytes (8 + 4).
;   r0 = src (advanced 2 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 2 rows per pass)   r3 = dstStride
;   r4 = coeffIdx, then rows remaining
;   m1 = tap bytes (c0, c1) repeated, m0 = tap bytes (c2, c3) repeated
;   m6 = scratch, then tab_c_512 (reloaded every pass)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W12_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m0, [tab_Vm + 16]               ; (c2, c3) in every word

    mov         r4d, %2                         ; rows left (32-bit write clears the upper half)

.loop:
    movu        m2, [r0]                        ; row 0
    movu        m3, [r0 + r1]                   ; row 1

    punpcklbw   m4, m2, m3                      ; rows 0,1, columns 0..7
    punpckhbw   m2, m3                          ; rows 0,1, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]                        ; row 2
    movu        m7, [r0 + r1]                   ; row 3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6                          ; output row 0, columns 0..7

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6                          ; output row 0, columns 8..15

    mova        m6, [tab_c_512]                 ; rounding multiplier for pmulhrsw

    pmulhrsw    m4, m6
    pmulhrsw    m2, m6

    packuswb    m4, m2                          ; 16 result bytes

    movh        [r2], m4                        ; pixels 0..7
    pextrd      [r2 + 8], m4, 2                 ; pixels 8..11

    punpcklbw   m4, m3, m5                      ; rows 1,2, columns 0..7
    punpckhbw   m3, m5                          ; rows 1,2, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]               ; row 4

    punpcklbw   m2, m7, m5                      ; rows 3,4, columns 0..7
    punpckhbw   m7, m5                          ; rows 3,4, columns 8..15

    pmaddubsw   m2, m0
    pmaddubsw   m7, m0

    paddw       m4, m2                          ; output row 1
    paddw       m3, m7

    pmulhrsw    m4, m6
    pmulhrsw    m3, m6

    packuswb    m4, m3

    movh        [r2 + r3], m4
    pextrd      [r2 + r3 + 8], m4, 2

    lea         r2, [r2 + 2 * r3]

    sub         r4, 2                           ; two rows done
    jnz         .loop
    RET
%endmacro

FILTER_V4_W12_H2 12, 16

FILTER_V4_W12_H2 12, 32
2764
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 is not used in the symbol name (width is fixed at 16),
; %2 = number of rows (a multiple of 2).  Each pass produces two rows of
; 16 byte pixels.
;   r0 = src (advanced 2 rows per pass)   r1 = srcStride
;   r2 = dst (advanced 2 rows per pass)   r3 = dstStride
;   r4 = coeffIdx, then pass counter
;   m1 = tap bytes (c0, c1) repeated, m0 = tap bytes (c2, c3) repeated
;   m7 = scratch, then tab_c_512 (reloaded every pass)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8

    mov         r4d, r4m                        ; coeffIdx is the 5th argument
    sub         r0, r1                          ; start one row above src

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]               ; 4 tap bytes per table entry
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]                ; (c0, c1) in every word
    pshufb      m0, [tab_Vm + 16]               ; (c2, c3) in every word

    mov         r4d, %2/2                       ; two rows per pass

.loop:
    movu        m2, [r0]                        ; row 0
    movu        m3, [r0 + r1]                   ; row 1

    punpcklbw   m4, m2, m3                      ; rows 0,1, columns 0..7
    punpckhbw   m2, m3                          ; rows 0,1, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]                        ; row 2
    movu        m6, [r0 + r1]                   ; row 3

    punpckhbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m2, m7                          ; output row 0, columns 8..15

    punpcklbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m4, m7                          ; output row 0, columns 0..7

    mova        m7, [tab_c_512]                 ; rounding multiplier for pmulhrsw

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7

    packuswb    m4, m2

    movu        [r2], m4

    punpcklbw   m4, m3, m5                      ; rows 1,2, columns 0..7
    punpckhbw   m3, m5                          ; rows 1,2, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]               ; row 4

    punpcklbw   m2, m6, m5                      ; rows 3,4, columns 0..7
    punpckhbw   m6, m5                          ; rows 3,4, columns 8..15

    pmaddubsw   m2, m0
    pmaddubsw   m6, m0

    paddw       m4, m2                          ; output row 1
    paddw       m3, m6

    pmulhrsw    m4, m7
    pmulhrsw    m3, m7

    packuswb    m4, m3

    movu        [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V4_W16_H2 16, 4
FILTER_V4_W16_H2 16, 8
FILTER_V4_W16_H2 16, 12
FILTER_V4_W16_H2 16, 16
FILTER_V4_W16_H2 16, 32

FILTER_V4_W16_H2 16, 24
FILTER_V4_W16_H2 16, 64
2857
2858;-----------------------------------------------------------------------------
2859;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2860;-----------------------------------------------------------------------------
2861%macro FILTER_V4_W24 2 ; 4-tap vertical pixel-to-pixel filter for 24-wide blocks, two output rows per pass; %2 = block height (%1 is never referenced in the body)
2862INIT_XMM sse4
2863cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
2864; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = coeffIdx, later the remaining-row counter; r5 = src + 2 rows inside the loop
2865mov r4d, r4m
2866sub r0, r1 ; start one row above src (first filter tap)
2867
2868%ifdef PIC
2869lea r5, [tab_ChromaCoeff]
2870movd m0, [r5 + r4 * 4] ; 4 coefficient bytes selected by coeffIdx
2871%else
2872movd m0, [tab_ChromaCoeff + r4 * 4]
2873%endif
2874
2875pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes 0,1 repeated in every word
2876pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes 2,3 repeated in every word
2877
2878mov r4d, %2 ; rows left; decremented by 2 per pass
2879
2880.loop:
2881movu m2, [r0] ; columns 0..15: window row 0
2882movu m3, [r0 + r1] ; window row 1
2883
2884punpcklbw m4, m2, m3
2885punpckhbw m2, m3
2886
2887pmaddubsw m4, m1
2888pmaddubsw m2, m1
2889
2890lea r5, [r0 + 2 * r1] ; r5 = window row 2; r0 itself is left unchanged until the end of the pass
2891movu m5, [r5] ; window row 2
2892movu m7, [r5 + r1] ; window row 3
2893
2894punpcklbw m6, m5, m7
2895pmaddubsw m6, m0
2896paddw m4, m6
2897
2898punpckhbw m6, m5, m7
2899pmaddubsw m6, m0
2900paddw m2, m6
2901
2902mova m6, [tab_c_512] ; loaded inside the loop because m6 serves as scratch above
2903
2904pmulhrsw m4, m6 ; presumably a rounded >>6, assuming tab_c_512 holds words of 512 - confirm against the table
2905pmulhrsw m2, m6
2906
2907packuswb m4, m2
2908
2909movu [r2], m4 ; first output row, columns 0..15
2910
2911punpcklbw m4, m3, m5 ; second output row: rows 1/2 take coefficient bytes 0,1
2912punpckhbw m3, m5
2913
2914pmaddubsw m4, m1
2915pmaddubsw m3, m1
2916
2917movu m2, [r5 + 2 * r1] ; window row 4
2918
2919punpcklbw m5, m7, m2 ; rows 3/4 take coefficient bytes 2,3
2920punpckhbw m7, m2
2921
2922pmaddubsw m5, m0
2923pmaddubsw m7, m0
2924
2925paddw m4, m5
2926paddw m3, m7
2927
2928pmulhrsw m4, m6
2929pmulhrsw m3, m6
2930
2931packuswb m4, m3
2932
2933movu [r2 + r3], m4 ; second output row, columns 0..15
2934
2935movq m2, [r0 + 16] ; columns 16..23: window rows 0..3, 8 pixels each
2936movq m3, [r0 + r1 + 16]
2937movq m4, [r5 + 16]
2938movq m5, [r5 + r1 + 16]
2939
2940punpcklbw m2, m3
2941punpcklbw m4, m5
2942
2943pmaddubsw m2, m1
2944pmaddubsw m4, m0
2945
2946paddw m2, m4
2947
2948pmulhrsw m2, m6 ; m2 = first output row, columns 16..23, as words
2949
2950movq m3, [r0 + r1 + 16] ; NOTE(review): m3 and m5 still hold these rows from the loads above; only m4 was overwritten
2951movq m4, [r5 + 16]
2952movq m5, [r5 + r1 + 16]
2953movq m7, [r5 + 2 * r1 + 16] ; window row 4
2954
2955punpcklbw m3, m4
2956punpcklbw m5, m7
2957
2958pmaddubsw m3, m1
2959pmaddubsw m5, m0
2960
2961paddw m3, m5
2962
2963pmulhrsw m3, m6
2964packuswb m2, m3 ; low 8 bytes = first output row, high 8 bytes = second output row
2965
2966movh [r2 + 16], m2
2967movhps [r2 + r3 + 16], m2
2968
2969mov r0, r5 ; advance src by two rows
2970lea r2, [r2 + 2 * r3] ; advance dst by two rows
2971
2972sub r4, 2
2973jnz .loop
2974RET
2975%endmacro
2976
2977FILTER_V4_W24 24, 32 ; emits interp_4tap_vert_pp_24x32
2978
2979FILTER_V4_W24 24, 64 ; emits interp_4tap_vert_pp_24x64
2980
2981;-----------------------------------------------------------------------------
2982; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2983;-----------------------------------------------------------------------------
2984%macro FILTER_V4_W32 2 ; 4-tap vertical pixel-to-pixel filter, 32 pixels per row, one output row per pass; %1 = width (used only in the symbol name), %2 = height
2985INIT_XMM sse4
2986cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
2987; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = coeffIdx, later the row counter; r5 = src + 2 rows inside the loop
2988mov r4d, r4m
2989sub r0, r1 ; start one row above src (first filter tap)
2990
2991%ifdef PIC
2992lea r5, [tab_ChromaCoeff]
2993movd m0, [r5 + r4 * 4] ; 4 coefficient bytes selected by coeffIdx
2994%else
2995movd m0, [tab_ChromaCoeff + r4 * 4]
2996%endif
2997
2998pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes 0,1 repeated in every word
2999pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes 2,3 repeated in every word
3000
3001mova m7, [tab_c_512] ; kept in m7 for the whole loop (m7 is not used as scratch here)
3002
3003mov r4d, %2 ; one output row per pass
3004
3005.loop:
3006movu m2, [r0] ; columns 0..15: window row 0
3007movu m3, [r0 + r1] ; window row 1
3008
3009punpcklbw m4, m2, m3
3010punpckhbw m2, m3
3011
3012pmaddubsw m4, m1
3013pmaddubsw m2, m1
3014
3015lea r5, [r0 + 2 * r1] ; r5 = window row 2
3016movu m3, [r5] ; window row 2
3017movu m5, [r5 + r1] ; window row 3
3018
3019punpcklbw m6, m3, m5
3020punpckhbw m3, m5
3021
3022pmaddubsw m6, m0
3023pmaddubsw m3, m0
3024
3025paddw m4, m6
3026paddw m2, m3
3027
3028pmulhrsw m4, m7 ; presumably a rounded >>6, assuming tab_c_512 holds words of 512 - confirm against the table
3029pmulhrsw m2, m7
3030
3031packuswb m4, m2
3032
3033movu [r2], m4 ; output columns 0..15
3034
3035movu m2, [r0 + 16] ; columns 16..31: same four-row window
3036movu m3, [r0 + r1 + 16]
3037
3038punpcklbw m4, m2, m3
3039punpckhbw m2, m3
3040
3041pmaddubsw m4, m1
3042pmaddubsw m2, m1
3043
3044movu m3, [r5 + 16]
3045movu m5, [r5 + r1 + 16]
3046
3047punpcklbw m6, m3, m5
3048punpckhbw m3, m5
3049
3050pmaddubsw m6, m0
3051pmaddubsw m3, m0
3052
3053paddw m4, m6
3054paddw m2, m3
3055
3056pmulhrsw m4, m7
3057pmulhrsw m2, m7
3058
3059packuswb m4, m2
3060
3061movu [r2 + 16], m4 ; output columns 16..31
3062
3063lea r0, [r0 + r1] ; advance src by one row
3064lea r2, [r2 + r3] ; advance dst by one row
3065
3066dec r4
3067jnz .loop
3068RET
3069%endmacro
3070
3071FILTER_V4_W32 32, 8 ; each line emits interp_4tap_vert_pp_32x<height>
3072FILTER_V4_W32 32, 16
3073FILTER_V4_W32 32, 24
3074FILTER_V4_W32 32, 32
3075
3076FILTER_V4_W32 32, 48
3077FILTER_V4_W32 32, 64
3078
3079
3080;-----------------------------------------------------------------------------
3081; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3082;-----------------------------------------------------------------------------
3083%macro FILTER_V4_W16n_H2 2 ; 4-tap vertical pixel-to-pixel filter for widths that are multiples of 16, two output rows per outer pass; %1 = width, %2 = height
3084INIT_XMM sse4
3085cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
3086; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = coeffIdx, later the row-pair counter; r5 = src + 2 rows; r6 = 16-column strip counter
3087mov r4d, r4m
3088sub r0, r1 ; start one row above src (first filter tap)
3089
3090%ifdef PIC
3091lea r5, [tab_ChromaCoeff]
3092movd m0, [r5 + r4 * 4] ; 4 coefficient bytes selected by coeffIdx
3093%else
3094movd m0, [tab_ChromaCoeff + r4 * 4]
3095%endif
3096
3097pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes 0,1 repeated in every word
3098pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes 2,3 repeated in every word
3099
3100mov r4d, %2/2 ; number of two-row passes
3101
3102.loop:
3103
3104mov r6d, %1/16 ; 16-column strips per row
3105
3106.loopW:
3107
3108movu m2, [r0] ; window row 0
3109movu m3, [r0 + r1] ; window row 1
3110
3111punpcklbw m4, m2, m3
3112punpckhbw m2, m3
3113
3114pmaddubsw m4, m1
3115pmaddubsw m2, m1
3116
3117lea r5, [r0 + 2 * r1] ; r5 = window row 2; r0 stays on row 0 of this strip
3118movu m5, [r5] ; window row 2
3119movu m6, [r5 + r1] ; window row 3
3120
3121punpckhbw m7, m5, m6
3122pmaddubsw m7, m0
3123paddw m2, m7
3124
3125punpcklbw m7, m5, m6
3126pmaddubsw m7, m0
3127paddw m4, m7
3128
3129mova m7, [tab_c_512] ; loaded inside the loop because m7 serves as scratch above
3130
3131pmulhrsw m4, m7 ; presumably a rounded >>6, assuming tab_c_512 holds words of 512 - confirm against the table
3132pmulhrsw m2, m7
3133
3134packuswb m4, m2
3135
3136movu [r2], m4 ; first output row of this strip
3137
3138punpcklbw m4, m3, m5 ; second output row: rows 1/2 take coefficient bytes 0,1
3139punpckhbw m3, m5
3140
3141pmaddubsw m4, m1
3142pmaddubsw m3, m1
3143
3144movu m5, [r5 + 2 * r1] ; window row 4
3145
3146punpcklbw m2, m6, m5 ; rows 3/4 take coefficient bytes 2,3
3147punpckhbw m6, m5
3148
3149pmaddubsw m2, m0
3150pmaddubsw m6, m0
3151
3152paddw m4, m2
3153paddw m3, m6
3154
3155pmulhrsw m4, m7
3156pmulhrsw m3, m7
3157
3158packuswb m4, m3
3159
3160movu [r2 + r3], m4 ; second output row of this strip
3161
3162add r0, 16 ; next 16-column strip
3163add r2, 16
3164dec r6d
3165jnz .loopW
3166
3167lea r0, [r0 + r1 * 2 - %1] ; rewind the %1 columns walked above and step down two rows
3168lea r2, [r2 + r3 * 2 - %1]
3169
3170dec r4d
3171jnz .loop
3172RET
3173%endmacro
3174
3175FILTER_V4_W16n_H2 64, 64 ; each line emits interp_4tap_vert_pp_<width>x<height>
3176FILTER_V4_W16n_H2 64, 32
3177FILTER_V4_W16n_H2 64, 48
3178FILTER_V4_W16n_H2 48, 64
3179FILTER_V4_W16n_H2 64, 16
3180
3181
3182;-----------------------------------------------------------------------------
3183; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
3184;-----------------------------------------------------------------------------
3185INIT_XMM ssse3
3186cglobal luma_p2s, 3, 7, 6
3187; Converts 8-bit pixels to 16-bit words, four rows per outer pass. r0 = src, r1 = srcStride, r2 = dst; dst rows are FENC_STRIDE words apart (no dst stride argument)
3188 ; load width and height
3189 mov r3d, r3m
3190 mov r4d, r4m
3191
3192 ; load constant
3193 mova m4, [tab_c_128] ; interleaved with each pixel byte below
3194 mova m5, [tab_c_64_n64] ; presumably byte pairs (64, -64), giving pixel*64 - 128*64 per word - confirm against the table
3195
3196.loopH:
3197
3198 xor r5d, r5d ; r5 = column offset within the row
3199.loopW:
3200 lea r6, [r0 + r5]
3201
3202 movh m0, [r6] ; 8 pixels of row 0
3203 punpcklbw m0, m4
3204 pmaddubsw m0, m5
3205
3206 movh m1, [r6 + r1] ; row 1
3207 punpcklbw m1, m4
3208 pmaddubsw m1, m5
3209
3210 movh m2, [r6 + r1 * 2] ; row 2
3211 punpcklbw m2, m4
3212 pmaddubsw m2, m5
3213
3214 lea r6, [r6 + r1 * 2]
3215 movh m3, [r6 + r1] ; row 3
3216 punpcklbw m3, m4
3217 pmaddubsw m3, m5
3218
3219 add r5, 8 ; r5 now points past the 8 columns just converted, hence the -16 byte bias in the stores
3220 cmp r5, r3 ; flags are consumed by jg here and by je after the stores (movu leaves flags untouched)
3221 jg .width4 ; fewer than 8 columns were left in this row
3222 movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
3223 movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
3224 movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
3225 movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
3226 je .nextH ; row finished exactly on an 8-column boundary
3227 jmp .loopW
3228
3229.width4:
3230 movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 ; store only 4 words; presumably width is a multiple of 4 - confirm with callers
3231 movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
3232 movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
3233 movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
3234
3235.nextH:
3236 lea r0, [r0 + r1 * 4] ; advance src by four rows
3237 add r2, FENC_STRIDE * 8 ; advance dst by four rows of FENC_STRIDE words
3238
3239 sub r4d, 4 ; loop ends only when the count hits zero exactly, so height is assumed to be a multiple of 4
3240 jnz .loopH
3241
3242 RET
3243
3244%macro PROCESS_LUMA_W4_4R 0 ; 8-tap vertical sums for a 4-pixel-wide, 4-row tile. In: r0 = top window row, r1 = stride, r6 = coefficient table (four 16-byte entries). Out: m4 = rows 1-2, m5 = rows 3-4 (words). Uses m0-m2 as scratch; leaves r0 advanced by 8 rows
3245 movd m0, [r0]
3246 movd m1, [r0 + r1]
3247 punpcklbw m2, m0, m1 ; m2=[0 1]
3248
3249 lea r0, [r0 + 2 * r1]
3250 movd m0, [r0]
3251 punpcklbw m1, m0 ; m1=[1 2]
3252 punpcklqdq m2, m1 ; m2=[0 1 1 2]
3253 pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]
3254
3255 movd m1, [r0 + r1]
3256 punpcklbw m5, m0, m1 ; m5=[2 3]
3257 lea r0, [r0 + 2 * r1]
3258 movd m0, [r0]
3259 punpcklbw m1, m0 ; m1=[3 4]
3260 punpcklqdq m5, m1 ; m5=[2 3 3 4]
3261 pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]
3262 paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2
3263 pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4
3264
3265 movd m1, [r0 + r1]
3266 punpcklbw m2, m0, m1 ; m2=[4 5]
3267 lea r0, [r0 + 2 * r1]
3268 movd m0, [r0]
3269 punpcklbw m1, m0 ; m1=[5 6]
3270 punpcklqdq m2, m1 ; m2=[4 5 5 6]
3271 pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]
3272 paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
3273 pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]
3274 paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4
3275
3276 movd m1, [r0 + r1]
3277 punpcklbw m2, m0, m1 ; m2=[6 7]
3278 lea r0, [r0 + 2 * r1]
3279 movd m0, [r0]
3280 punpcklbw m1, m0 ; m1=[7 8]
3281 punpcklqdq m2, m1 ; m2=[6 7 7 8]
3282 pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]
3283 paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
3284 pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]
3285 paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
3286
3287 movd m1, [r0 + r1]
3288 punpcklbw m2, m0, m1 ; m2=[8 9]
3289 movd m0, [r0 + 2 * r1]
3290 punpcklbw m1, m0 ; m1=[9 10]
3291 punpcklqdq m2, m1 ; m2=[8 9 9 10]
3292 pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]
3293 paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
3294%endmacro
3295
3296%macro PROCESS_LUMA_W8_4R 0 ; 8-tap vertical sums for an 8-pixel-wide, 4-row tile. In: r0 = top window row, r1 = stride, r6 = coefficient table (four 16-byte entries). Out: m7, m6, m5, m4 = rows 1..4 (8 words each). Uses m0-m2 as scratch, never touches m3; leaves r0 advanced by 8 rows
3297 movq m0, [r0]
3298 movq m1, [r0 + r1]
3299 punpcklbw m0, m1
3300 pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1
3301
3302 lea r0, [r0 + 2 * r1]
3303 movq m0, [r0]
3304 punpcklbw m1, m0
3305 pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2
3306
3307 movq m1, [r0 + r1]
3308 punpcklbw m0, m1
3309 pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3
3310 pmaddubsw m0, [r6 + 1 * 16]
3311 paddw m7, m0 ;m7=[0+1+2+3] Row1
3312
3313 lea r0, [r0 + 2 * r1]
3314 movq m0, [r0]
3315 punpcklbw m1, m0
3316 pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4
3317 pmaddubsw m1, [r6 + 1 * 16]
3318 paddw m6, m1 ;m6 = [1+2+3+4] Row2
3319
3320 movq m1, [r0 + r1]
3321 punpcklbw m0, m1
3322 pmaddubsw m2, m0, [r6 + 1 * 16]
3323 pmaddubsw m0, [r6 + 2 * 16]
3324 paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1
3325 paddw m5, m2 ;m5=[2+3+4+5] Row3
3326
3327 lea r0, [r0 + 2 * r1]
3328 movq m0, [r0]
3329 punpcklbw m1, m0
3330 pmaddubsw m2, m1, [r6 + 1 * 16]
3331 pmaddubsw m1, [r6 + 2 * 16]
3332 paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2
3333 paddw m4, m2 ;m4=[3+4+5+6] Row4
3334
3335 movq m1, [r0 + r1]
3336 punpcklbw m0, m1
3337 pmaddubsw m2, m0, [r6 + 2 * 16]
3338 pmaddubsw m0, [r6 + 3 * 16]
3339 paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end
3340 paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3
3341
3342 lea r0, [r0 + 2 * r1]
3343 movq m0, [r0]
3344 punpcklbw m1, m0
3345 pmaddubsw m2, m1, [r6 + 2 * 16]
3346 pmaddubsw m1, [r6 + 3 * 16]
3347 paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end
3348 paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4
3349
3350 movq m1, [r0 + r1]
3351 punpcklbw m0, m1
3352 pmaddubsw m0, [r6 + 3 * 16]
3353 paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end
3354
3355 movq m0, [r0 + 2 * r1]
3356 punpcklbw m1, m0
3357 pmaddubsw m1, [r6 + 3 * 16]
3358 paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end
3359%endmacro
3360
3361;-------------------------------------------------------------------------------------------------------------
3362; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3363;-------------------------------------------------------------------------------------------------------------
3364%macro FILTER_VER_LUMA_4xN 3 ; 8-tap vertical luma filter, 4 pixels wide, 4 rows per pass; %1 = width (symbol name only), %2 = height, %3 = pp (byte output) or ps (16-bit word output)
3365INIT_XMM sse4
3366cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
3367 lea r5, [3 * r1]
3368 sub r0, r5 ; start three rows above src (first of the eight taps)
3369 shl r4d, 6 ; coeffIdx * 64 = byte offset of this filter's four 16-byte table entries
3370%ifidn %3,ps
3371 add r3d, r3d ; dst stride in bytes for 16-bit output
3372%endif
3373
3374%ifdef PIC
3375 lea r5, [tab_LumaCoeffVer]
3376 lea r6, [r5 + r4]
3377%else
3378 lea r6, [tab_LumaCoeffVer + r4]
3379%endif
3380
3381%ifidn %3,pp
3382 mova m3, [tab_c_512] ; pmulhrsw multiplier; presumably words of 512 (rounded >>6) - confirm against the table
3383%else
3384 mova m3, [pw_2000] ; word constant subtracted from every ps result
3385%endif
3386
3387 mov r4d, %2/4 ; number of four-row passes
3388 lea r5, [4 * r1] ; rewind amount applied after each pass
3389
3390.loopH:
3391 PROCESS_LUMA_W4_4R
3392
3393%ifidn %3,pp
3394 pmulhrsw m4, m3
3395 pmulhrsw m5, m3
3396
3397 packuswb m4, m5 ; four dwords = rows 1..4, four pixels each
3398
3399 movd [r2], m4
3400 pextrd [r2 + r3], m4, 1
3401 lea r2, [r2 + 2 * r3]
3402 pextrd [r2], m4, 2
3403 pextrd [r2 + r3], m4, 3
3404%else
3405 psubw m4, m3
3406 psubw m5, m3
3407
3408 movlps [r2], m4 ; row 1: four words
3409 movhps [r2 + r3], m4 ; row 2
3410 lea r2, [r2 + 2 * r3]
3411 movlps [r2], m5 ; row 3
3412 movhps [r2 + r3], m5 ; row 4
3413%endif
3414
3415 sub r0, r5 ; the tile macro advanced r0 by 8 rows; net step is 4 rows
3416 lea r2, [r2 + 2 * r3]
3417
3418 dec r4d
3419 jnz .loopH
3420
3421 RET
3422%endmacro
3423
3424;-------------------------------------------------------------------------------------------------------------
3425; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3426;-------------------------------------------------------------------------------------------------------------
3427FILTER_VER_LUMA_4xN 4, 4, pp
3428
3429;-------------------------------------------------------------------------------------------------------------
3430; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3431;-------------------------------------------------------------------------------------------------------------
3432FILTER_VER_LUMA_4xN 4, 8, pp
3433
3434;-------------------------------------------------------------------------------------------------------------
3435; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3436;-------------------------------------------------------------------------------------------------------------
3437FILTER_VER_LUMA_4xN 4, 16, pp
3438
3439;-------------------------------------------------------------------------------------------------------------
3440; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3441;-------------------------------------------------------------------------------------------------------------
3442FILTER_VER_LUMA_4xN 4, 4, ps
3443
3444;-------------------------------------------------------------------------------------------------------------
3445; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3446;-------------------------------------------------------------------------------------------------------------
3447FILTER_VER_LUMA_4xN 4, 8, ps
3448
3449;-------------------------------------------------------------------------------------------------------------
3450; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3451;-------------------------------------------------------------------------------------------------------------
3452FILTER_VER_LUMA_4xN 4, 16, ps
3453
3454;-------------------------------------------------------------------------------------------------------------
3455; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3456;-------------------------------------------------------------------------------------------------------------
3457%macro FILTER_VER_LUMA_8xN 3 ; 8-tap vertical luma filter, 8 pixels wide, 4 rows per pass; %1 = width (symbol name only), %2 = height, %3 = pp (byte output) or ps (16-bit word output)
3458INIT_XMM sse4
3459cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
3460 lea r5, [3 * r1]
3461 sub r0, r5 ; start three rows above src (first of the eight taps)
3462 shl r4d, 6 ; coeffIdx * 64 = byte offset of this filter's four 16-byte table entries
3463
3464%ifidn %3,ps
3465 add r3d, r3d ; dst stride in bytes for 16-bit output
3466%endif
3467
3468%ifdef PIC
3469 lea r5, [tab_LumaCoeffVer]
3470 lea r6, [r5 + r4]
3471%else
3472 lea r6, [tab_LumaCoeffVer + r4]
3473%endif
3474
3475 %ifidn %3,pp
3476 mova m3, [tab_c_512] ; pmulhrsw multiplier; presumably words of 512 (rounded >>6) - confirm against the table
3477%else
3478 mova m3, [pw_2000] ; word constant subtracted from every ps result
3479%endif
3480
3481 mov r4d, %2/4 ; number of four-row passes
3482 lea r5, [4 * r1] ; rewind amount applied after each pass
3483
3484.loopH:
3485 PROCESS_LUMA_W8_4R
3486
3487%ifidn %3,pp
3488 pmulhrsw m7, m3
3489 pmulhrsw m6, m3
3490 pmulhrsw m5, m3
3491 pmulhrsw m4, m3
3492
3493 packuswb m7, m6 ; low 8 bytes = row 1, high 8 bytes = row 2
3494 packuswb m5, m4 ; low 8 bytes = row 3, high 8 bytes = row 4
3495
3496 movlps [r2], m7
3497 movhps [r2 + r3], m7
3498 lea r2, [r2 + 2 * r3]
3499 movlps [r2], m5
3500 movhps [r2 + r3], m5
3501%else
3502 psubw m7, m3
3503 psubw m6, m3
3504 psubw m5, m3
3505 psubw m4, m3
3506
3507 movu [r2], m7 ; each row is 8 words = 16 bytes
3508 movu [r2 + r3], m6
3509 lea r2, [r2 + 2 * r3]
3510 movu [r2], m5
3511 movu [r2 + r3], m4
3512%endif
3513
3514 sub r0, r5 ; the tile macro advanced r0 by 8 rows; net step is 4 rows
3515 lea r2, [r2 + 2 * r3]
3516
3517 dec r4d
3518 jnz .loopH
3519
3520 RET
3521%endmacro
3522
3523;-------------------------------------------------------------------------------------------------------------
3524; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3525;-------------------------------------------------------------------------------------------------------------
3526FILTER_VER_LUMA_8xN 8, 4, pp
3527
3528;-------------------------------------------------------------------------------------------------------------
3529; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3530;-------------------------------------------------------------------------------------------------------------
3531FILTER_VER_LUMA_8xN 8, 8, pp
3532
3533;-------------------------------------------------------------------------------------------------------------
3534; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3535;-------------------------------------------------------------------------------------------------------------
3536FILTER_VER_LUMA_8xN 8, 16, pp
3537
3538;-------------------------------------------------------------------------------------------------------------
3539; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3540;-------------------------------------------------------------------------------------------------------------
3541FILTER_VER_LUMA_8xN 8, 32, pp
3542
3543;-------------------------------------------------------------------------------------------------------------
3544; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3545;-------------------------------------------------------------------------------------------------------------
3546FILTER_VER_LUMA_8xN 8, 4, ps
3547
3548;-------------------------------------------------------------------------------------------------------------
3549; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3550;-------------------------------------------------------------------------------------------------------------
3551FILTER_VER_LUMA_8xN 8, 8, ps
3552
3553;-------------------------------------------------------------------------------------------------------------
3554; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3555;-------------------------------------------------------------------------------------------------------------
3556FILTER_VER_LUMA_8xN 8, 16, ps
3557
3558;-------------------------------------------------------------------------------------------------------------
3559; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3560;-------------------------------------------------------------------------------------------------------------
3561FILTER_VER_LUMA_8xN 8, 32, ps
3562
3563;-------------------------------------------------------------------------------------------------------------
3564; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3565;-------------------------------------------------------------------------------------------------------------
3566%macro FILTER_VER_LUMA_12xN 3 ; 8-tap vertical luma filter, 12 pixels wide (an 8-wide tile plus a 4-wide tile), 4 rows per pass; %1 = width (symbol name only), %2 = height, %3 = pp or ps
3567INIT_XMM sse4
3568cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
3569 lea r5, [3 * r1]
3570 sub r0, r5 ; start three rows above src (first of the eight taps)
3571 shl r4d, 6 ; coeffIdx * 64 = byte offset of this filter's four 16-byte table entries
3572%ifidn %3,ps
3573 add r3d, r3d ; dst stride in bytes for 16-bit output
3574%endif
3575
3576%ifdef PIC
3577 lea r5, [tab_LumaCoeffVer]
3578 lea r6, [r5 + r4]
3579%else
3580 lea r6, [tab_LumaCoeffVer + r4]
3581%endif
3582
3583 %ifidn %3,pp
3584 mova m3, [tab_c_512] ; pmulhrsw multiplier; presumably words of 512 (rounded >>6) - confirm against the table
3585%else
3586 mova m3, [pw_2000] ; word constant subtracted from every ps result
3587%endif
3588
3589 mov r4d, %2/4 ; number of four-row passes
3590
3591.loopH:
3592 PROCESS_LUMA_W8_4R ; columns 0..7
3593
3594%ifidn %3,pp
3595 pmulhrsw m7, m3
3596 pmulhrsw m6, m3
3597 pmulhrsw m5, m3
3598 pmulhrsw m4, m3
3599
3600 packuswb m7, m6
3601 packuswb m5, m4
3602
3603 movlps [r2], m7
3604 movhps [r2 + r3], m7
3605 lea r5, [r2 + 2 * r3] ; r5 = dst row 3; r2 itself stays on row 1
3606 movlps [r5], m5
3607 movhps [r5 + r3], m5
3608%else
3609 psubw m7, m3
3610 psubw m6, m3
3611 psubw m5, m3
3612 psubw m4, m3
3613
3614 movu [r2], m7
3615 movu [r2 + r3], m6
3616 lea r5, [r2 + 2 * r3]
3617 movu [r5], m5
3618 movu [r5 + r3], m4
3619%endif
3620
3621 lea r5, [8 * r1 - 8]
3622 sub r0, r5 ; undo the 8-row advance of the tile macro and move 8 columns right
3623%ifidn %3,pp
3624 add r2, 8 ; 8 output pixels = 8 bytes
3625%else
3626 add r2, 16 ; 8 output words = 16 bytes
3627%endif
3628
3629 PROCESS_LUMA_W4_4R ; columns 8..11
3630
3631%ifidn %3,pp
3632 pmulhrsw m4, m3
3633 pmulhrsw m5, m3
3634
3635 packuswb m4, m5
3636
3637 movd [r2], m4
3638 pextrd [r2 + r3], m4, 1
3639 lea r5, [r2 + 2 * r3]
3640 pextrd [r5], m4, 2
3641 pextrd [r5 + r3], m4, 3
3642%else
3643 psubw m4, m3
3644 psubw m5, m3
3645
3646 movlps [r2], m4
3647 movhps [r2 + r3], m4
3648 lea r5, [r2 + 2 * r3]
3649 movlps [r5], m5
3650 movhps [r5 + r3], m5
3651%endif
3652
3653 lea r5, [4 * r1 + 8]
3654 sub r0, r5 ; net effect of the pass: back to column 0, four rows down
3655%ifidn %3,pp
3656 lea r2, [r2 + 4 * r3 - 8] ; dst: back to column 0, four rows down
3657%else
3658 lea r2, [r2 + 4 * r3 - 16]
3659%endif
3660
3661 dec r4d
3662 jnz .loopH
3663
3664 RET
3665%endmacro
3666
3667;-------------------------------------------------------------------------------------------------------------
3668; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3669;-------------------------------------------------------------------------------------------------------------
3670FILTER_VER_LUMA_12xN 12, 16, pp
3671
3672;-------------------------------------------------------------------------------------------------------------
3673; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3674;-------------------------------------------------------------------------------------------------------------
3675FILTER_VER_LUMA_12xN 12, 16, ps
3676
3677;-------------------------------------------------------------------------------------------------------------
3678; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3679;-------------------------------------------------------------------------------------------------------------
3680%macro FILTER_VER_LUMA 3 ; 8-tap vertical luma filter for widths that are multiples of 8, processed as 8x4 tiles; %1 = width, %2 = height, %3 = pp (byte output) or ps (16-bit word output)
3681INIT_XMM sse4
3682cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
3683 lea r5, [3 * r1]
3684 sub r0, r5 ; start three rows above src (first of the eight taps)
3685 shl r4d, 6 ; coeffIdx * 64 = byte offset of this filter's four 16-byte table entries
3686%ifidn %3,ps
3687 add r3d, r3d ; dst stride in bytes for 16-bit output
3688%endif
3689
3690%ifdef PIC
3691 lea r5, [tab_LumaCoeffVer]
3692 lea r6, [r5 + r4]
3693%else
3694 lea r6, [tab_LumaCoeffVer + r4]
3695%endif
3696
3697%ifidn %3,pp
3698 mova m3, [tab_c_512] ; pmulhrsw multiplier; presumably words of 512 (rounded >>6) - confirm against the table
3699%else
3700 mova m3, [pw_2000] ; word constant subtracted from every ps result
3701%endif
3702 mov dword [rsp], %2/4 ; row-group counter lives on the stack; r4 is reused as the column counter
3703
3704.loopH:
3705 mov r4d, (%1/8) ; 8-column tiles per row group
3706.loopW:
3707 PROCESS_LUMA_W8_4R
3708%ifidn %3,pp
3709 pmulhrsw m7, m3
3710 pmulhrsw m6, m3
3711 pmulhrsw m5, m3
3712 pmulhrsw m4, m3
3713
3714 packuswb m7, m6 ; low 8 bytes = row 1, high 8 bytes = row 2
3715 packuswb m5, m4 ; low 8 bytes = row 3, high 8 bytes = row 4
3716
3717 movlps [r2], m7
3718 movhps [r2 + r3], m7
3719 lea r5, [r2 + 2 * r3] ; r5 = dst row 3; r2 itself stays on row 1
3720 movlps [r5], m5
3721 movhps [r5 + r3], m5
3722%else
3723 psubw m7, m3
3724 psubw m6, m3
3725 psubw m5, m3
3726 psubw m4, m3
3727
3728 movu [r2], m7
3729 movu [r2 + r3], m6
3730 lea r5, [r2 + 2 * r3]
3731 movu [r5], m5
3732 movu [r5 + r3], m4
3733%endif
3734
3735 lea r5, [8 * r1 - 8]
3736 sub r0, r5 ; undo the 8-row advance of the tile macro and move 8 columns right
3737%ifidn %3,pp
3738 add r2, 8 ; 8 output pixels = 8 bytes
3739%else
3740 add r2, 16 ; 8 output words = 16 bytes
3741%endif
3742 dec r4d
3743 jnz .loopW
3744
3745 lea r0, [r0 + 4 * r1 - %1] ; back to column 0, four rows down
3746%ifidn %3,pp
3747 lea r2, [r2 + 4 * r3 - %1]
3748%else
3749 lea r2, [r2 + 4 * r3 - 2 * %1] ; ps rows are 2 * %1 bytes wide
3750%endif
3751
3752 dec dword [rsp]
3753 jnz .loopH
3754
3755 RET
3756%endmacro
3757
3758FILTER_VER_LUMA 16, 4, pp ; each line emits interp_8tap_vert_<pp|ps>_<width>x<height>
3759FILTER_VER_LUMA 16, 8, pp
3760FILTER_VER_LUMA 16, 12, pp
3761FILTER_VER_LUMA 16, 16, pp
3762FILTER_VER_LUMA 16, 32, pp
3763FILTER_VER_LUMA 16, 64, pp
3764FILTER_VER_LUMA 24, 32, pp
3765FILTER_VER_LUMA 32, 8, pp
3766FILTER_VER_LUMA 32, 16, pp
3767FILTER_VER_LUMA 32, 24, pp
3768FILTER_VER_LUMA 32, 32, pp
3769FILTER_VER_LUMA 32, 64, pp
3770FILTER_VER_LUMA 48, 64, pp
3771FILTER_VER_LUMA 64, 16, pp
3772FILTER_VER_LUMA 64, 32, pp
3773FILTER_VER_LUMA 64, 48, pp
3774FILTER_VER_LUMA 64, 64, pp
3775
3776FILTER_VER_LUMA 16, 4, ps
3777FILTER_VER_LUMA 16, 8, ps
3778FILTER_VER_LUMA 16, 12, ps
3779FILTER_VER_LUMA 16, 16, ps
3780FILTER_VER_LUMA 16, 32, ps
3781FILTER_VER_LUMA 16, 64, ps
3782FILTER_VER_LUMA 24, 32, ps
3783FILTER_VER_LUMA 32, 8, ps
3784FILTER_VER_LUMA 32, 16, ps
3785FILTER_VER_LUMA 32, 24, ps
3786FILTER_VER_LUMA 32, 32, ps
3787FILTER_VER_LUMA 32, 64, ps
3788FILTER_VER_LUMA 48, 64, ps
3789FILTER_VER_LUMA 64, 16, ps
3790FILTER_VER_LUMA 64, 32, ps
3791FILTER_VER_LUMA 64, 48, ps
3792FILTER_VER_LUMA 64, 64, ps
3793
3794%macro PROCESS_LUMA_SP_W4_4R 0 ; 8-tap vertical sums over 16-bit samples for a 4-wide, 4-row tile. In: r0 = top window row, r1 = stride in bytes, r6 = coefficient table (four 16-byte entries). Out: m0..m3 = rows 1..4 (4 dwords each). Uses m4-m6 as scratch; leaves r0 advanced by 8 rows
3795 movq m0, [r0]
3796 movq m1, [r0 + r1]
3797 punpcklwd m0, m1 ;m0=[0 1]
3798 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
3799
3800 lea r0, [r0 + 2 * r1]
3801 movq m4, [r0]
3802 punpcklwd m1, m4 ;m1=[1 2]
3803 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
3804
3805 movq m5, [r0 + r1]
3806 punpcklwd m4, m5 ;m4=[2 3]
3807 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
3808 pmaddwd m4, [r6 + 1 * 16]
3809 paddd m0, m4 ;m0=[0+1+2+3] Row1
3810
3811 lea r0, [r0 + 2 * r1]
3812 movq m4, [r0]
3813 punpcklwd m5, m4 ;m5=[3 4]
3814 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
3815 pmaddwd m5, [r6 + 1 * 16]
3816 paddd m1, m5 ;m1 = [1+2+3+4] Row2
3817
3818 movq m5, [r0 + r1]
3819 punpcklwd m4, m5 ;m4=[4 5]
3820 pmaddwd m6, m4, [r6 + 1 * 16]
3821 paddd m2, m6 ;m2=[2+3+4+5] Row3
3822 pmaddwd m4, [r6 + 2 * 16]
3823 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
3824
3825 lea r0, [r0 + 2 * r1]
3826 movq m4, [r0]
3827 punpcklwd m5, m4 ;m5=[5 6]
3828 pmaddwd m6, m5, [r6 + 1 * 16]
3829 paddd m3, m6 ;m3=[3+4+5+6] Row4
3830 pmaddwd m5, [r6 + 2 * 16]
3831 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
3832
3833 movq m5, [r0 + r1]
3834 punpcklwd m4, m5 ;m4=[6 7]
3835 pmaddwd m6, m4, [r6 + 2 * 16]
3836 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
3837 pmaddwd m4, [r6 + 3 * 16]
3838 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
3839
3840 lea r0, [r0 + 2 * r1]
3841 movq m4, [r0]
3842 punpcklwd m5, m4 ;m5=[7 8]
3843 pmaddwd m6, m5, [r6 + 2 * 16]
3844 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
3845 pmaddwd m5, [r6 + 3 * 16]
3846 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
3847
3848 movq m5, [r0 + r1]
3849 punpcklwd m4, m5 ;m4=[8 9]
3850 pmaddwd m4, [r6 + 3 * 16]
3851 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
3852
3853 movq m4, [r0 + 2 * r1]
3854 punpcklwd m5, m4 ;m5=[9 10]
3855 pmaddwd m5, [r6 + 3 * 16]
3856 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
3857%endmacro
3858
3859;--------------------------------------------------------------------------------------------------------------
3860; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3861;--------------------------------------------------------------------------------------------------------------
3862%macro FILTER_VER_LUMA_SP 2 ; 8-tap vertical luma filter from 16-bit samples to 8-bit pixels, processed as 4x4 tiles; %1 = width, %2 = height
3863INIT_XMM sse4
3864cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
3865; r0 = src (16-bit samples), r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx then column counter, r6 = coefficient table
3866 add r1d, r1d ; source stride in bytes (2 bytes per sample)
3867 lea r5, [r1 + 2 * r1]
3868 sub r0, r5 ; start three rows above src (first of the eight taps)
3869 shl r4d, 6 ; coeffIdx * 64 = byte offset of this filter's four 16-byte table entries
3870
3871%ifdef PIC
3872 lea r5, [tab_LumaCoeffV]
3873 lea r6, [r5 + r4]
3874%else
3875 lea r6, [tab_LumaCoeffV + r4]
3876%endif
3877
3878 mova m7, [tab_c_526336] ; dword constant added before the >>12 below; presumably rounding plus an offset correction - confirm against the table
3879
3880 mov dword [rsp], %2/4 ; row-group counter lives on the stack
3881.loopH:
3882 mov r4d, (%1/4) ; 4-column tiles per row group
3883.loopW:
3884 PROCESS_LUMA_SP_W4_4R
3885
3886 paddd m0, m7
3887 paddd m1, m7
3888 paddd m2, m7
3889 paddd m3, m7
3890
3891 psrad m0, 12
3892 psrad m1, 12
3893 psrad m2, 12
3894 psrad m3, 12
3895
3896 packssdw m0, m1 ; rows 1-2 as words
3897 packssdw m2, m3 ; rows 3-4 as words
3898
3899 packuswb m0, m2 ; four dwords = rows 1..4, four pixels each
3900
3901 movd [r2], m0
3902 pextrd [r2 + r3], m0, 1
3903 lea r5, [r2 + 2 * r3]
3904 pextrd [r5], m0, 2
3905 pextrd [r5 + r3], m0, 3
3906
3907 lea r5, [8 * r1 - 2 * 4]
3908 sub r0, r5 ; undo the 8-row advance of the tile macro and move 4 samples (8 bytes) right
3909 add r2, 4
3910
3911 dec r4d
3912 jnz .loopW
3913
3914 lea r0, [r0 + 4 * r1 - 2 * %1] ; back to column 0 (rows are 2 * %1 bytes wide), four rows down
3915 lea r2, [r2 + 4 * r3 - %1]
3916
3917 dec dword [rsp]
3918 jnz .loopH
3919
3920 RET
3921%endmacro
3922
3923;--------------------------------------------------------------------------------------------------------------
3924; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3925;--------------------------------------------------------------------------------------------------------------
3926 FILTER_VER_LUMA_SP 4, 4 ; each line emits interp_8tap_vert_sp_<width>x<height>
3927 FILTER_VER_LUMA_SP 8, 8
3928 FILTER_VER_LUMA_SP 8, 4
3929 FILTER_VER_LUMA_SP 4, 8
3930 FILTER_VER_LUMA_SP 16, 16
3931 FILTER_VER_LUMA_SP 16, 8
3932 FILTER_VER_LUMA_SP 8, 16
3933 FILTER_VER_LUMA_SP 16, 12
3934 FILTER_VER_LUMA_SP 12, 16
3935 FILTER_VER_LUMA_SP 16, 4
3936 FILTER_VER_LUMA_SP 4, 16
3937 FILTER_VER_LUMA_SP 32, 32
3938 FILTER_VER_LUMA_SP 32, 16
3939 FILTER_VER_LUMA_SP 16, 32
3940 FILTER_VER_LUMA_SP 32, 24
3941 FILTER_VER_LUMA_SP 24, 32
3942 FILTER_VER_LUMA_SP 32, 8
3943 FILTER_VER_LUMA_SP 8, 32
3944 FILTER_VER_LUMA_SP 64, 64
3945 FILTER_VER_LUMA_SP 64, 32
3946 FILTER_VER_LUMA_SP 32, 64
3947 FILTER_VER_LUMA_SP 64, 48
3948 FILTER_VER_LUMA_SP 48, 64
3949 FILTER_VER_LUMA_SP 64, 16
3950 FILTER_VER_LUMA_SP 16, 64
3951
3952; TODO: combining the U and V planes in one pass would be faster, but needs more registers
3953; TODO: using two code paths (height aligned to 4, and otherwise) may improve performance by about 10%, but the code is more complex, so it is disabled
3954INIT_XMM ssse3
3955cglobal chroma_p2s, 3, 7, 4
3956; Converts 8-bit pixels to 16-bit words, two rows per outer pass. r0 = src, r1 = srcStride, r2 = dst; dst rows are FENC_STRIDE / 2 words apart (no dst stride argument)
3957 ; load width and height
3958 mov r3d, r3m
3959 mov r4d, r4m
3960
3961 ; load constant
3962 mova m2, [tab_c_128] ; interleaved with each pixel byte below
3963 mova m3, [tab_c_64_n64] ; presumably byte pairs (64, -64), giving pixel*64 - 128*64 per word - confirm against the table
3964
3965.loopH:
3966
3967 xor r5d, r5d ; r5 = column offset within the row
3968.loopW:
3969 lea r6, [r0 + r5]
3970
3971 movh m0, [r6] ; 8 pixels of row 0
3972 punpcklbw m0, m2
3973 pmaddubsw m0, m3
3974
3975 movh m1, [r6 + r1] ; row 1
3976 punpcklbw m1, m2
3977 pmaddubsw m1, m3
3978
3979 add r5d, 8 ; r5 now points past the 8 columns just converted, hence the -16 byte bias in the stores
3980 cmp r5d, r3d ; flags survive the lea and the movu stores; consumed by jg and je below
3981 lea r6, [r2 + r5 * 2]
3982 jg .width4 ; fewer than 8 columns were left in this row
3983 movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
3984 movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
3985 je .nextH ; row finished exactly on an 8-column boundary
3986 jmp .loopW
3987
3988.width4:
3989 test r3d, 4
3990 jz .width2 ; no 4-column part: only a 2-column remainder to store
3991 test r3d, 2 ; flags are consumed by the jz at the end of this section (movh, lea and pshufd leave flags untouched)
3992 movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 ; store 4 words
3993 movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
3994 lea r6, [r6 + 8] ; step past the 4 words just stored
3995 pshufd m0, m0, 2 ; bring words 4..5 down to the low dword for the 2-column store
3996 pshufd m1, m1, 2
3997 jz .nextH ; width bit 1 clear: nothing more to store
3998
3999.width2:
4000 movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 ; store 2 words
4001 movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
4002
4003.nextH:
4004 lea r0, [r0 + r1 * 2] ; advance src by two rows
4005 add r2, FENC_STRIDE / 2 * 4 ; advance dst by two rows of FENC_STRIDE / 2 words
4006
4007 sub r4d, 2 ; loop ends only when the count hits zero exactly, so height is assumed to be even
4008 jnz .loopH
4009
4010 RET
4011
4012%macro PROCESS_CHROMA_SP_W4_4R 0 ; 4-tap vertical sums over 16-bit samples for a 4-wide, 4-row tile. In: r0 = top window row, r1 = stride in bytes, r6 = coefficient table (two 16-byte entries). Out: m0..m3 = rows 1..4 (4 dwords each). Uses m4-m5 as scratch; leaves r0 advanced by 4 rows
4013 movq m0, [r0]
4014 movq m1, [r0 + r1]
4015 punpcklwd m0, m1 ;m0=[0 1]
4016 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
4017
4018 lea r0, [r0 + 2 * r1]
4019 movq m4, [r0]
4020 punpcklwd m1, m4 ;m1=[1 2]
4021 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
4022
4023 movq m5, [r0 + r1]
4024 punpcklwd m4, m5 ;m4=[2 3]
4025 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
4026 pmaddwd m4, [r6 + 1 * 16]
4027 paddd m0, m4 ;m0=[0+1+2+3] Row1 done
4028
4029 lea r0, [r0 + 2 * r1]
4030 movq m4, [r0]
4031 punpcklwd m5, m4 ;m5=[3 4]
4032 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
4033 pmaddwd m5, [r6 + 1 * 16]
4034 paddd m1, m5 ;m1 = [1+2+3+4] Row2 done
4035
4036 movq m5, [r0 + r1]
4037 punpcklwd m4, m5 ;m4=[4 5]
4038 pmaddwd m4, [r6 + 1 * 16]
4039 paddd m2, m4 ;m2=[2+3+4+5] Row3 done
4040
4041 movq m4, [r0 + 2 * r1]
4042 punpcklwd m5, m4 ;m5=[5 6]
4043 pmaddwd m5, [r6 + 1 * 16]
4044 paddd m3, m5 ;m3=[3+4+5+6] Row4 done
4045%endmacro
4046
4047;--------------------------------------------------------------------------------------------------------------
4048; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4049;--------------------------------------------------------------------------------------------------------------
4050%macro FILTER_VER_CHROMA_SP 2 ; 4-tap vertical chroma filter from 16-bit samples to 8-bit pixels, processed as 4x4 tiles; %1 = width, %2 = height
4051INIT_XMM sse4
4052cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
4053; r0 = src (16-bit samples), r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx then column counter, r6 = coefficient table
4054 add r1d, r1d ; source stride in bytes (2 bytes per sample)
4055 sub r0, r1 ; start one row above src (first of the four taps)
4056 shl r4d, 5 ; coeffIdx * 32 = byte offset of this filter's two 16-byte table entries
4057
4058%ifdef PIC
4059 lea r5, [tab_ChromaCoeffV]
4060 lea r6, [r5 + r4]
4061%else
4062 lea r6, [tab_ChromaCoeffV + r4]
4063%endif
4064
4065 mova m6, [tab_c_526336] ; dword constant added before the >>12 below; presumably rounding plus an offset correction - confirm against the table
4066
4067 mov dword [rsp], %2/4 ; row-group counter lives on the stack
4068
4069.loopH:
4070 mov r4d, (%1/4) ; 4-column tiles per row group
4071.loopW:
4072 PROCESS_CHROMA_SP_W4_4R
4073
4074 paddd m0, m6
4075 paddd m1, m6
4076 paddd m2, m6
4077 paddd m3, m6
4078
4079 psrad m0, 12
4080 psrad m1, 12
4081 psrad m2, 12
4082 psrad m3, 12
4083
4084 packssdw m0, m1 ; rows 1-2 as words
4085 packssdw m2, m3 ; rows 3-4 as words
4086
4087 packuswb m0, m2 ; four dwords = rows 1..4, four pixels each
4088
4089 movd [r2], m0
4090 pextrd [r2 + r3], m0, 1
4091 lea r5, [r2 + 2 * r3]
4092 pextrd [r5], m0, 2
4093 pextrd [r5 + r3], m0, 3
4094
4095 lea r5, [4 * r1 - 2 * 4]
4096 sub r0, r5 ; undo the 4-row advance of the tile macro and move 4 samples (8 bytes) right
4097 add r2, 4
4098
4099 dec r4d
4100 jnz .loopW
4101
4102 lea r0, [r0 + 4 * r1 - 2 * %1] ; back to column 0 (rows are 2 * %1 bytes wide), four rows down
4103 lea r2, [r2 + 4 * r3 - %1]
4104
4105 dec dword [rsp]
4106 jnz .loopH
4107
4108 RET
4109%endmacro
4110
4111 FILTER_VER_CHROMA_SP 4, 4
4112 FILTER_VER_CHROMA_SP 4, 8
4113 FILTER_VER_CHROMA_SP 16, 16
4114 FILTER_VER_CHROMA_SP 16, 8
4115 FILTER_VER_CHROMA_SP 16, 12
4116 FILTER_VER_CHROMA_SP 12, 16
4117 FILTER_VER_CHROMA_SP 16, 4
4118 FILTER_VER_CHROMA_SP 4, 16
4119 FILTER_VER_CHROMA_SP 32, 32
4120 FILTER_VER_CHROMA_SP 32, 16
4121 FILTER_VER_CHROMA_SP 16, 32
4122 FILTER_VER_CHROMA_SP 32, 24
4123 FILTER_VER_CHROMA_SP 24, 32
4124 FILTER_VER_CHROMA_SP 32, 8
4125
4126 FILTER_VER_CHROMA_SP 16, 24
4127 FILTER_VER_CHROMA_SP 16, 64
4128 FILTER_VER_CHROMA_SP 12, 32
4129 FILTER_VER_CHROMA_SP 4, 32
4130 FILTER_VER_CHROMA_SP 32, 64
4131 FILTER_VER_CHROMA_SP 32, 48
4132 FILTER_VER_CHROMA_SP 24, 64
4133
4134 FILTER_VER_CHROMA_SP 64, 64
4135 FILTER_VER_CHROMA_SP 64, 32
4136 FILTER_VER_CHROMA_SP 64, 48
4137 FILTER_VER_CHROMA_SP 48, 64
4138 FILTER_VER_CHROMA_SP 64, 16
4139
4140
4141%macro PROCESS_CHROMA_SP_W2_4R 1
; Vertical 4-tap accumulation for a tile 2 samples wide and 4 rows high (16-bit input).
; In:  %1 = register pointing at two 16-byte coefficient vectors ([%1] and [%1 + 16]),
;      r0 = first source row, r1 = row stride in bytes
; Out: m0 = 32-bit sums for rows 1-2 (two dwords per row), m2 = sums for rows 3-4
; Side effects: r0 ends up advanced by 4 * r1; m1, m3 and m4 are scratch
4142 movd m0, [r0]
4143 movd m1, [r0 + r1]
4144 punpcklwd m0, m1 ;m0=[0 1]
4145
4146 lea r0, [r0 + 2 * r1]
4147 movd m2, [r0]
4148 punpcklwd m1, m2 ;m1=[1 2]
4149 punpcklqdq m0, m1 ;m0=[0 1 1 2]
4150 pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
4151
4152 movd m1, [r0 + r1]
4153 punpcklwd m2, m1 ;m2=[2 3]
4154
4155 lea r0, [r0 + 2 * r1]
4156 movd m3, [r0]
4157 punpcklwd m1, m3 ;m1=[3 4]
4158 punpcklqdq m2, m1 ;m2=[2 3 3 4]
4159
4160 pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
4161 pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
4162 paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
4163
4164 movd m1, [r0 + r1]
4165 punpcklwd m3, m1 ;m3=[4 5]
4166
4167 movd m4, [r0 + 2 * r1]
4168 punpcklwd m1, m4 ;m1=[5 6]
4169 punpcklqdq m3, m1 ;m3=[4 5 5 6]
4170 pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
4171 paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
4172%endmacro
4173
4174;-------------------------------------------------------------------------------------------------------------------
4175; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4176;-------------------------------------------------------------------------------------------------------------------
4177%macro FILTER_VER_CHROMA_SP_W2_4R 2
; %1 = block width (instantiated with 2 only), %2 = block height.
; Each loop pass writes 4 rows of 2 output pixels.
4178INIT_XMM sse4
4179cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
4180
4181 add r1d, r1d ; srcStride *= 2 (16-bit source samples)
4182 sub r0, r1 ; start one row above the current row
4183 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
4184
4185%ifdef PIC
4186 lea r5, [tab_ChromaCoeffV]
4187 lea r5, [r5 + r4]
4188%else
4189 lea r5, [tab_ChromaCoeffV + r4]
4190%endif
4191
4192 mova m5, [tab_c_526336] ; constant added before the >> 12 (defined outside this chunk)
4193
4194 mov r4d, (%2/4) ; r4d is free once r5 is set: reuse it as the row-group counter
4195
4196.loopH:
4197 PROCESS_CHROMA_SP_W2_4R r5
4198
4199 paddd m0, m5
4200 paddd m2, m5
4201
4202 psrad m0, 12
4203 psrad m2, 12
4204
4205 packssdw m0, m2 ; four rows x two 16-bit results
4206 packuswb m0, m0 ; saturate to bytes: word n of m0 = the two pixels of row n
4207
4208 pextrw [r2], m0, 0
4209 pextrw [r2 + r3], m0, 1
4210 lea r2, [r2 + 2 * r3]
4211 pextrw [r2], m0, 2
4212 pextrw [r2 + r3], m0, 3
4213
4214 lea r2, [r2 + 2 * r3] ; r0 was already advanced 4 rows inside the macro
4215
4216 dec r4d
4217 jnz .loopH
4218
4219 RET
4220%endmacro
4221
4222FILTER_VER_CHROMA_SP_W2_4R 2, 4
4223FILTER_VER_CHROMA_SP_W2_4R 2, 8
4224
4225FILTER_VER_CHROMA_SP_W2_4R 2, 16
4226
4227;--------------------------------------------------------------------------------------------------------------
4228; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4229;--------------------------------------------------------------------------------------------------------------
4230INIT_XMM sse4
4231cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
; Straight-line 4x2 case: two output rows are built from five source rows, no loop.
4232
4233 add r1d, r1d ; srcStride *= 2 (16-bit source samples)
4234 sub r0, r1 ; start one row above the current row
4235 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
4236
4237%ifdef PIC
4238 lea r5, [tab_ChromaCoeffV]
4239 lea r5, [r5 + r4]
4240%else
4241 lea r5, [tab_ChromaCoeffV + r4]
4242%endif
4243
4244 mova m4, [tab_c_526336] ; constant added before the >> 12 (defined outside this chunk)
4245
4246 movq m0, [r0]
4247 movq m1, [r0 + r1]
4248 punpcklwd m0, m1 ;m0=[0 1]
4249 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
4250
4251 lea r0, [r0 + 2 * r1]
4252 movq m2, [r0]
4253 punpcklwd m1, m2 ;m1=[1 2]
4254 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
4255
4256 movq m3, [r0 + r1]
4257 punpcklwd m2, m3 ;m2=[2 3]
4258 pmaddwd m2, [r5 + 1 * 16]
4259 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
4260 paddd m0, m4
4261 psrad m0, 12
4262
4263 movq m2, [r0 + 2 * r1]
4264 punpcklwd m3, m2 ;m3=[3 4]
4265 pmaddwd m3, [r5 + 1 * 16]
4266 paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
4267 paddd m1, m4
4268 psrad m1, 12
4269
4270 packssdw m0, m1
4271 packuswb m0, m0 ; dword 0 = row 1 pixels, dword 1 = row 2 pixels
4272
4273 movd [r2], m0
4274 pextrd [r2 + r3], m0, 1
4275
4276 RET
4277
4278;-------------------------------------------------------------------------------------------------------------------
4279; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4280;-------------------------------------------------------------------------------------------------------------------
4281%macro FILTER_VER_CHROMA_SP_W6_H4 2
; %2 = block height; %1 is not referenced (the symbol name hard-codes width 6).
; Each loop pass handles 4 rows: a 4-wide tile followed by a 2-wide tile.
4282INIT_XMM sse4
4283cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
4284
4285 add r1d, r1d ; srcStride *= 2 (16-bit source samples)
4286 sub r0, r1 ; start one row above the current row
4287 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
4288
4289%ifdef PIC
4290 lea r5, [tab_ChromaCoeffV]
4291 lea r6, [r5 + r4] ; r6 = coefficient vectors; r5 is reused as scratch below
4292%else
4293 lea r6, [tab_ChromaCoeffV + r4]
4294%endif
4295
4296 mova m6, [tab_c_526336] ; constant added before the >> 12 (defined outside this chunk)
4297
4298 mov r4d, %2/4 ; row-group counter
4299
4300.loopH:
4301 PROCESS_CHROMA_SP_W4_4R
4302
4303 paddd m0, m6
4304 paddd m1, m6
4305 paddd m2, m6
4306 paddd m3, m6
4307
4308 psrad m0, 12
4309 psrad m1, 12
4310 psrad m2, 12
4311 psrad m3, 12
4312
4313 packssdw m0, m1
4314 packssdw m2, m3
4315
4316 packuswb m0, m2 ; one dword (4 pixels) per output row
4317
4318 movd [r2], m0
4319 pextrd [r2 + r3], m0, 1
4320 lea r5, [r2 + 2 * r3]
4321 pextrd [r5], m0, 2
4322 pextrd [r5 + r3], m0, 3
4323
4324 lea r5, [4 * r1 - 2 * 4]
4325 sub r0, r5 ; back up 4 rows, step 4 samples (8 bytes) right
4326 add r2, 4 ; dst: columns 4-5
4327
4328 PROCESS_CHROMA_SP_W2_4R r6
4329
4330 paddd m0, m6
4331 paddd m2, m6
4332
4333 psrad m0, 12
4334 psrad m2, 12
4335
4336 packssdw m0, m2
4337 packuswb m0, m0 ; word n of m0 = the two pixels of row n
4338
4339 pextrw [r2], m0, 0
4340 pextrw [r2 + r3], m0, 1
4341 lea r2, [r2 + 2 * r3]
4342 pextrw [r2], m0, 2
4343 pextrw [r2 + r3], m0, 3
4344
4345 sub r0, 2 * 4 ; src: back to column 0 (already 4 rows further down)
4346 lea r2, [r2 + 2 * r3 - 4] ; dst: next 4-row group, back to column 0
4347
4348 dec r4d
4349 jnz .loopH
4350
4351 RET
4352%endmacro
4353
4354FILTER_VER_CHROMA_SP_W6_H4 6, 8
4355
4356FILTER_VER_CHROMA_SP_W6_H4 6, 16
4357
4358%macro PROCESS_CHROMA_SP_W8_2R 0
; Vertical 4-tap accumulation for a tile 8 samples wide and 2 rows high (16-bit input).
; In:  r0 = first source row, r1 = row stride in bytes,
;      r5 = two 16-byte coefficient vectors at [r5] and [r5 + 16]
; Out: m0/m1 = 32-bit sums for row 1 (low/high 4 samples), m2/m3 = same for row 2
; Side effects: r0 ends up advanced by 2 * r1; m4, m5 and m6 are scratch
4359 movu m1, [r0]
4360 movu m3, [r0 + r1]
4361 punpcklwd m0, m1, m3
4362 pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
4363 punpckhwd m1, m3
4364 pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
4365
4366 movu m4, [r0 + 2 * r1]
4367 punpcklwd m2, m3, m4
4368 pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
4369 punpckhwd m3, m4
4370 pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
4371
4372 lea r0, [r0 + 2 * r1]
4373 movu m5, [r0 + r1]
4374 punpcklwd m6, m4, m5
4375 pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
4376 paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
4377 punpckhwd m4, m5
4378 pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
4379 paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
4380
4381 movu m4, [r0 + 2 * r1]
4382 punpcklwd m6, m5, m4
4383 pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
4384 paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
4385 punpckhwd m5, m4
4386 pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
4387 paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
4388%endmacro
4389
4390;--------------------------------------------------------------------------------------------------------------
4391; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4392;--------------------------------------------------------------------------------------------------------------
4393%macro FILTER_VER_CHROMA_SP_W8_H2 2
; %1 = block width (instantiated with 8 only), %2 = block height.
; Each loop pass writes 2 rows of 8 output pixels.
4394INIT_XMM sse2
4395cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
4396
4397 add r1d, r1d ; srcStride *= 2 (16-bit source samples)
4398 sub r0, r1 ; start one row above the current row
4399 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
4400
4401%ifdef PIC
4402 lea r5, [tab_ChromaCoeffV]
4403 lea r5, [r5 + r4]
4404%else
4405 lea r5, [tab_ChromaCoeffV + r4]
4406%endif
4407
4408 mova m7, [tab_c_526336] ; constant added before the >> 12 (defined outside this chunk)
4409
4410 mov r4d, %2/2 ; row-pair counter
4411.loopH:
4412 PROCESS_CHROMA_SP_W8_2R
4413
4414 paddd m0, m7
4415 paddd m1, m7
4416 paddd m2, m7
4417 paddd m3, m7
4418
4419 psrad m0, 12
4420 psrad m1, 12
4421 psrad m2, 12
4422 psrad m3, 12
4423
4424 packssdw m0, m1 ; row 1: eight 16-bit results
4425 packssdw m2, m3 ; row 2: eight 16-bit results
4426
4427 packuswb m0, m2 ; low 8 bytes = row 1, high 8 bytes = row 2
4428
4429 movlps [r2], m0
4430 movhps [r2 + r3], m0
4431
4432 lea r2, [r2 + 2 * r3] ; r0 was already advanced 2 rows inside the macro
4433
4434 dec r4d
4435 jnz .loopH
4436
4437 RET
4438%endmacro
4439
4440FILTER_VER_CHROMA_SP_W8_H2 8, 2
4441FILTER_VER_CHROMA_SP_W8_H2 8, 4
4442FILTER_VER_CHROMA_SP_W8_H2 8, 6
4443FILTER_VER_CHROMA_SP_W8_H2 8, 8
4444FILTER_VER_CHROMA_SP_W8_H2 8, 16
4445FILTER_VER_CHROMA_SP_W8_H2 8, 32
4446
4447FILTER_VER_CHROMA_SP_W8_H2 8, 12
4448FILTER_VER_CHROMA_SP_W8_H2 8, 64
4449
4450
4451;-----------------------------------------------------------------------------------------------------------------------------
4452; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4453;-----------------------------------------------------------------------------------------------------------------------------
4454%macro FILTER_HORIZ_CHROMA_2xN 2
; %1 = block width (instantiated with 2 only), %2 = block height.
; One row per loop pass: 8-bit pixels in, two 16-bit results out.
4455INIT_XMM sse4
4456cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
4457%define coef2 m3
4458%define Tm0 m2
4459%define t1 m1
4460%define t0 m0
4461
4462 dec srcq ; filter window starts one pixel to the left
4463 mov r4d, r4m ; coeffIdx (5th argument)
4464 add dststrided, dststrided ; dst is int16_t: convert stride to bytes
4465
4466%ifdef PIC
4467 lea r6, [tab_ChromaCoeff]
4468 movd coef2, [r6 + r4 * 4] ; 4 bytes per table entry
4469%else
4470 movd coef2, [tab_ChromaCoeff + r4 * 4]
4471%endif
4472
4473 pshufd coef2, coef2, 0 ; broadcast the 4 taps to every dword
4474 mova t1, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
4475 mova Tm0, [tab_Tm] ; shuffle mask: 4-byte windows starting at bytes 0, 1, 2, 3
4476
4477 mov r4d, %2 ; row counter
4478 cmp r5m, byte 0 ; isRowExt (6th argument)
4479 je .loopH
4480 sub srcq, srcstrideq ; row extension: start one row earlier...
4481 add r4d, 3 ; ...and produce 3 extra rows
4482
4483.loopH:
4484 movh t0, [srcq]
4485 pshufb t0, t0, Tm0
4486 pmaddubsw t0, coef2 ; two partial sums per pixel
4487 phaddw t0, t0 ; one 16-bit sum per pixel
4488 psubw t0, t1
4489 movd [dstq], t0 ; store 2 results
4490
4491 lea srcq, [srcq + srcstrideq]
4492 lea dstq, [dstq + dststrideq]
4493
4494 dec r4d
4495 jnz .loopH
4496
4497 RET
4498%endmacro
4499
4500FILTER_HORIZ_CHROMA_2xN 2, 4
4501FILTER_HORIZ_CHROMA_2xN 2, 8
4502
4503FILTER_HORIZ_CHROMA_2xN 2, 16
4504
4505;-----------------------------------------------------------------------------------------------------------------------------
4506; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4507;-----------------------------------------------------------------------------------------------------------------------------
4508%macro FILTER_HORIZ_CHROMA_4xN 2
; %1 = block width (instantiated with 4 only), %2 = block height.
; One row per loop pass: 8-bit pixels in, four 16-bit results out.
4509INIT_XMM sse4
4510cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
4511%define coef2 m3
4512%define Tm0 m2
4513%define t1 m1
4514%define t0 m0
4515
4516 dec srcq ; filter window starts one pixel to the left
4517 mov r4d, r4m ; coeffIdx (5th argument)
4518 add dststrided, dststrided ; dst is int16_t: convert stride to bytes
4519
4520%ifdef PIC
4521 lea r6, [tab_ChromaCoeff]
4522 movd coef2, [r6 + r4 * 4] ; 4 bytes per table entry
4523%else
4524 movd coef2, [tab_ChromaCoeff + r4 * 4]
4525%endif
4526
4527 pshufd coef2, coef2, 0 ; broadcast the 4 taps to every dword
4528 mova t1, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
4529 mova Tm0, [tab_Tm] ; shuffle mask: 4-byte windows starting at bytes 0, 1, 2, 3
4530
4531 mov r4d, %2 ; row counter
4532 cmp r5m, byte 0 ; isRowExt (6th argument)
4533 je .loopH
4534 sub srcq, srcstrideq ; row extension: start one row earlier...
4535 add r4d, 3 ; ...and produce 3 extra rows
4536
4537.loopH:
4538 movh t0, [srcq]
4539 pshufb t0, t0, Tm0
4540 pmaddubsw t0, coef2 ; two partial sums per pixel
4541 phaddw t0, t0 ; one 16-bit sum per pixel
4542 psubw t0, t1
4543 movlps [dstq], t0 ; store 4 results
4544
4545 lea srcq, [srcq + srcstrideq]
4546 lea dstq, [dstq + dststrideq]
4547
4548 dec r4d
4549 jnz .loopH
4550 RET
4551%endmacro
4552
4553FILTER_HORIZ_CHROMA_4xN 4, 2
4554FILTER_HORIZ_CHROMA_4xN 4, 4
4555FILTER_HORIZ_CHROMA_4xN 4, 8
4556FILTER_HORIZ_CHROMA_4xN 4, 16
4557
4558FILTER_HORIZ_CHROMA_4xN 4, 32
4559
4560%macro PROCESS_CHROMA_W6 3
; One row, 6 outputs. %1 = scratch, %2 = result (clobbered), %3 = offset to subtract.
; Expects srcq/dstq and the Tm0, Tm1, coef2 defines of the enclosing function.
4561 movu %1, [srcq]
4562 pshufb %2, %1, Tm0 ; windows for pixels 0-3
4563 pmaddubsw %2, coef2
4564 pshufb %1, %1, Tm1 ; windows for pixels 4-7
4565 pmaddubsw %1, coef2
4566 phaddw %2, %1 ; eight 16-bit sums
4567 psubw %2, %3
4568 movh [dstq], %2 ; results 0-3
4569 pshufd %2, %2, 2 ; bring dword 2 (results 4-5) down to dword 0
4570 movd [dstq + 8], %2 ; results 4-5
4571%endmacro
4572
4573%macro PROCESS_CHROMA_W12 3
; One row, 12 outputs. %1 = scratch, %2 = result register, %3 = offset to subtract.
; Expects srcq/dstq and the Tm0, Tm1, coef2 defines of the enclosing function.
4574 movu %1, [srcq]
4575 pshufb %2, %1, Tm0 ; windows for pixels 0-3
4576 pmaddubsw %2, coef2
4577 pshufb %1, %1, Tm1 ; windows for pixels 4-7
4578 pmaddubsw %1, coef2
4579 phaddw %2, %1
4580 psubw %2, %3
4581 movu [dstq], %2 ; results 0-7
4582 movu %1, [srcq + 8]
4583 pshufb %1, %1, Tm0 ; windows for pixels 8-11
4584 pmaddubsw %1, coef2
4585 phaddw %1, %1
4586 psubw %1, %3
4587 movh [dstq + 16], %1 ; results 8-11
4588%endmacro
4589
4590;-----------------------------------------------------------------------------------------------------------------------------
4591; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4592;-----------------------------------------------------------------------------------------------------------------------------
4593%macro FILTER_HORIZ_CHROMA 2
; %1 = block width (selects PROCESS_CHROMA_W%1; instantiated for 6 and 12), %2 = block height.
4594INIT_XMM sse4
4595cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
4596%define coef2 m5
4597%define Tm0 m4
4598%define Tm1 m3
4599%define t2 m2
4600%define t1 m1
4601%define t0 m0
4602
4603 dec srcq ; filter window starts one pixel to the left
4604 mov r4d, r4m ; coeffIdx (5th argument)
4605 add dststrided, dststrided ; dst is int16_t: convert stride to bytes
4606
4607%ifdef PIC
4608 lea r6, [tab_ChromaCoeff]
4609 movd coef2, [r6 + r4 * 4] ; 4 bytes per table entry
4610%else
4611 movd coef2, [tab_ChromaCoeff + r4 * 4]
4612%endif
4613
4614 pshufd coef2, coef2, 0 ; broadcast the 4 taps to every dword
4615 mova t2, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
4616 mova Tm0, [tab_Tm] ; shuffle mask for pixels 0-3
4617 mova Tm1, [tab_Tm + 16] ; shuffle mask for pixels 4-7
4618
4619 mov r4d, %2 ; row counter
4620 cmp r5m, byte 0 ; isRowExt (6th argument)
4621 je .loopH
4622 sub srcq, srcstrideq ; row extension: start one row earlier...
4623 add r4d, 3 ; ...and produce 3 extra rows
4624
4625.loopH:
4626 PROCESS_CHROMA_W%1 t0, t1, t2
4627 add srcq, srcstrideq
4628 add dstq, dststrideq
4629
4630 dec r4d
4631 jnz .loopH
4632
4633 RET
4634%endmacro
4635
4636FILTER_HORIZ_CHROMA 6, 8
4637FILTER_HORIZ_CHROMA 12, 16
4638
4639FILTER_HORIZ_CHROMA 6, 16
4640FILTER_HORIZ_CHROMA 12, 32
4641
4642%macro PROCESS_CHROMA_W8 3
; One row, 8 outputs. %1 = scratch, %2 = result register, %3 = offset to subtract.
; Expects srcq/dstq and the Tm0, Tm1, coef2 defines of the enclosing function.
4643 movu %1, [srcq]
4644 pshufb %2, %1, Tm0 ; windows for pixels 0-3
4645 pmaddubsw %2, coef2
4646 pshufb %1, %1, Tm1 ; windows for pixels 4-7
4647 pmaddubsw %1, coef2
4648 phaddw %2, %1 ; eight 16-bit sums
4649 psubw %2, %3
4650 movu [dstq], %2
4651%endmacro
4652
4653;-----------------------------------------------------------------------------------------------------------------------------
4654; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4655;-----------------------------------------------------------------------------------------------------------------------------
4656%macro FILTER_HORIZ_CHROMA_8xN 2
; %1 = block width (instantiated with 8 only), %2 = block height.
4657INIT_XMM sse4
4658cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
4659%define coef2 m5
4660%define Tm0 m4
4661%define Tm1 m3
4662%define t2 m2
4663%define t1 m1
4664%define t0 m0
4665
4666 dec srcq ; filter window starts one pixel to the left
4667 mov r4d, r4m ; coeffIdx (5th argument)
4668 add dststrided, dststrided ; dst is int16_t: convert stride to bytes
4669
4670%ifdef PIC
4671 lea r6, [tab_ChromaCoeff]
4672 movd coef2, [r6 + r4 * 4] ; 4 bytes per table entry
4673%else
4674 movd coef2, [tab_ChromaCoeff + r4 * 4]
4675%endif
4676
4677 pshufd coef2, coef2, 0 ; broadcast the 4 taps to every dword
4678 mova t2, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
4679 mova Tm0, [tab_Tm] ; shuffle mask for pixels 0-3
4680 mova Tm1, [tab_Tm + 16] ; shuffle mask for pixels 4-7
4681
4682 mov r4d, %2 ; row counter
4683 cmp r5m, byte 0 ; isRowExt (6th argument)
4684 je .loopH
4685 sub srcq, srcstrideq ; row extension: start one row earlier...
4686 add r4d, 3 ; ...and produce 3 extra rows
4687
4688.loopH:
4689 PROCESS_CHROMA_W8 t0, t1, t2
4690 add srcq, srcstrideq
4691 add dstq, dststrideq
4692
4693 dec r4d
4694 jnz .loopH
4695
4696 RET
4697%endmacro
4698
4699FILTER_HORIZ_CHROMA_8xN 8, 2
4700FILTER_HORIZ_CHROMA_8xN 8, 4
4701FILTER_HORIZ_CHROMA_8xN 8, 6
4702FILTER_HORIZ_CHROMA_8xN 8, 8
4703FILTER_HORIZ_CHROMA_8xN 8, 16
4704FILTER_HORIZ_CHROMA_8xN 8, 32
4705
4706FILTER_HORIZ_CHROMA_8xN 8, 12
4707FILTER_HORIZ_CHROMA_8xN 8, 64
4708
4709%macro PROCESS_CHROMA_W16 4
; One row, 16 outputs. %1 = scratch, %2 and %4 = result registers, %3 = offset to subtract.
; Expects srcq/dstq and the Tm0, Tm1, coef2 defines of the enclosing function.
4710 movu %1, [srcq]
4711 pshufb %2, %1, Tm0
4712 pmaddubsw %2, coef2
4713 pshufb %1, %1, Tm1
4714 pmaddubsw %1, coef2
4715 phaddw %2, %1 ; %2 = sums for pixels 0-7
4716 movu %1, [srcq + 8]
4717 pshufb %4, %1, Tm0
4718 pmaddubsw %4, coef2
4719 pshufb %1, %1, Tm1
4720 pmaddubsw %1, coef2
4721 phaddw %4, %1 ; %4 = sums for pixels 8-15
4722 psubw %2, %3
4723 psubw %4, %3
4724 movu [dstq], %2
4725 movu [dstq + 16], %4
4726%endmacro
4727
4728%macro PROCESS_CHROMA_W24 4
; One row, 24 outputs. %1 = scratch, %2 and %4 = result registers, %3 = offset to subtract.
; Expects srcq/dstq and the Tm0, Tm1, coef2 defines of the enclosing function.
4729 movu %1, [srcq]
4730 pshufb %2, %1, Tm0
4731 pmaddubsw %2, coef2
4732 pshufb %1, %1, Tm1
4733 pmaddubsw %1, coef2
4734 phaddw %2, %1 ; %2 = sums for pixels 0-7
4735 movu %1, [srcq + 8]
4736 pshufb %4, %1, Tm0
4737 pmaddubsw %4, coef2
4738 pshufb %1, %1, Tm1
4739 pmaddubsw %1, coef2
4740 phaddw %4, %1 ; %4 = sums for pixels 8-15
4741 psubw %2, %3
4742 psubw %4, %3
4743 movu [dstq], %2
4744 movu [dstq + 16], %4
4745 movu %1, [srcq + 16]
4746 pshufb %2, %1, Tm0
4747 pmaddubsw %2, coef2
4748 pshufb %1, %1, Tm1
4749 pmaddubsw %1, coef2
4750 phaddw %2, %1 ; %2 = sums for pixels 16-23
4751 psubw %2, %3
4752 movu [dstq + 32], %2
4753%endmacro
4754
4755%macro PROCESS_CHROMA_W32 4
; One row, 32 outputs. %1 = scratch, %2 and %4 = result registers, %3 = offset to subtract.
; Expects srcq/dstq and the Tm0, Tm1, coef2 defines of the enclosing function.
4756 movu %1, [srcq]
4757 pshufb %2, %1, Tm0
4758 pmaddubsw %2, coef2
4759 pshufb %1, %1, Tm1
4760 pmaddubsw %1, coef2
4761 phaddw %2, %1 ; %2 = sums for pixels 0-7
4762 movu %1, [srcq + 8]
4763 pshufb %4, %1, Tm0
4764 pmaddubsw %4, coef2
4765 pshufb %1, %1, Tm1
4766 pmaddubsw %1, coef2
4767 phaddw %4, %1 ; %4 = sums for pixels 8-15
4768 psubw %2, %3
4769 psubw %4, %3
4770 movu [dstq], %2
4771 movu [dstq + 16], %4
4772 movu %1, [srcq + 16]
4773 pshufb %2, %1, Tm0
4774 pmaddubsw %2, coef2
4775 pshufb %1, %1, Tm1
4776 pmaddubsw %1, coef2
4777 phaddw %2, %1 ; %2 = sums for pixels 16-23
4778 movu %1, [srcq + 24]
4779 pshufb %4, %1, Tm0
4780 pmaddubsw %4, coef2
4781 pshufb %1, %1, Tm1
4782 pmaddubsw %1, coef2
4783 phaddw %4, %1 ; %4 = sums for pixels 24-31
4784 psubw %2, %3
4785 psubw %4, %3
4786 movu [dstq + 32], %2
4787 movu [dstq + 48], %4
4788%endmacro
4789
4790%macro PROCESS_CHROMA_W16o 5
; Same as a 16-wide row step, but at pixel offset %5 within the row.
; %1 = scratch, %2 and %4 = result registers, %3 = offset to subtract,
; %5 = column offset in pixels (source bytes); the destination offset is %5 * 2 bytes.
4791 movu %1, [srcq + %5]
4792 pshufb %2, %1, Tm0
4793 pmaddubsw %2, coef2
4794 pshufb %1, %1, Tm1
4795 pmaddubsw %1, coef2
4796 phaddw %2, %1 ; %2 = sums for pixels %5 .. %5+7
4797 movu %1, [srcq + %5 + 8]
4798 pshufb %4, %1, Tm0
4799 pmaddubsw %4, coef2
4800 pshufb %1, %1, Tm1
4801 pmaddubsw %1, coef2
4802 phaddw %4, %1 ; %4 = sums for pixels %5+8 .. %5+15
4803 psubw %2, %3
4804 psubw %4, %3
4805 movu [dstq + %5 * 2], %2
4806 movu [dstq + %5 * 2 + 16], %4
4807%endmacro
4808
4809%macro PROCESS_CHROMA_W48 4
; One row, 48 outputs: three 16-wide steps at pixel offsets 0, 16 and 32.
4810 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
4811 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
4812 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
4813%endmacro
4814
4815%macro PROCESS_CHROMA_W64 4
; One row, 64 outputs: four 16-wide steps at pixel offsets 0, 16, 32 and 48.
4816 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
4817 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
4818 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
4819 PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
4820%endmacro
4821
4822;------------------------------------------------------------------------------------------------------------------------------
4823; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4824;------------------------------------------------------------------------------------------------------------------------------
4825%macro FILTER_HORIZ_CHROMA_WxN 2
; %1 = block width (selects PROCESS_CHROMA_W%1: 16, 24, 32, 48 or 64), %2 = block height.
4826INIT_XMM sse4
4827cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
4828%define coef2 m6
4829%define Tm0 m5
4830%define Tm1 m4
4831%define t3 m3
4832%define t2 m2
4833%define t1 m1
4834%define t0 m0
4835
4836 dec srcq ; filter window starts one pixel to the left
4837 mov r4d, r4m ; coeffIdx (5th argument)
4838 add dststrided, dststrided ; dst is int16_t: convert stride to bytes
4839
4840%ifdef PIC
4841 lea r6, [tab_ChromaCoeff]
4842 movd coef2, [r6 + r4 * 4] ; 4 bytes per table entry
4843%else
4844 movd coef2, [tab_ChromaCoeff + r4 * 4]
4845%endif
4846
4847 pshufd coef2, coef2, 0 ; broadcast the 4 taps to every dword
4848 mova t2, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
4849 mova Tm0, [tab_Tm] ; shuffle mask for pixels 0-3
4850 mova Tm1, [tab_Tm + 16] ; shuffle mask for pixels 4-7
4851
4852 mov r4d, %2 ; row counter
4853 cmp r5m, byte 0 ; isRowExt (6th argument)
4854 je .loopH
4855 sub srcq, srcstrideq ; row extension: start one row earlier...
4856 add r4d, 3 ; ...and produce 3 extra rows
4857
4858.loopH:
4859 PROCESS_CHROMA_W%1 t0, t1, t2, t3 ; t0 = scratch, t1/t3 = results, t2 = offset
4860 add srcq, srcstrideq
4861 add dstq, dststrideq
4862
4863 dec r4d
4864 jnz .loopH
4865
4866 RET
4867%endmacro
4868
4869FILTER_HORIZ_CHROMA_WxN 16, 4
4870FILTER_HORIZ_CHROMA_WxN 16, 8
4871FILTER_HORIZ_CHROMA_WxN 16, 12
4872FILTER_HORIZ_CHROMA_WxN 16, 16
4873FILTER_HORIZ_CHROMA_WxN 16, 32
4874FILTER_HORIZ_CHROMA_WxN 24, 32
4875FILTER_HORIZ_CHROMA_WxN 32, 8
4876FILTER_HORIZ_CHROMA_WxN 32, 16
4877FILTER_HORIZ_CHROMA_WxN 32, 24
4878FILTER_HORIZ_CHROMA_WxN 32, 32
4879
4880FILTER_HORIZ_CHROMA_WxN 16, 24
4881FILTER_HORIZ_CHROMA_WxN 16, 64
4882FILTER_HORIZ_CHROMA_WxN 24, 64
4883FILTER_HORIZ_CHROMA_WxN 32, 48
4884FILTER_HORIZ_CHROMA_WxN 32, 64
4885
4886FILTER_HORIZ_CHROMA_WxN 64, 64
4887FILTER_HORIZ_CHROMA_WxN 64, 32
4888FILTER_HORIZ_CHROMA_WxN 64, 48
4889FILTER_HORIZ_CHROMA_WxN 48, 64
4890FILTER_HORIZ_CHROMA_WxN 64, 16
4891
4892
4893;---------------------------------------------------------------------------------------------------------------
4894; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4895;---------------------------------------------------------------------------------------------------------------
4896%macro FILTER_V_PS_W16n 2
; %1 = block width (multiple of 16), %2 = block height.
; Each inner pass filters 16 columns x 2 rows of 8-bit pixels into 16-bit results.
4897INIT_XMM sse4
4898cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
4899
4900 mov r4d, r4m ; coeffIdx (5th argument)
4901 sub r0, r1 ; start one row above the current row
4902 add r3d, r3d ; dst is int16_t: convert stride to bytes
4903
4904%ifdef PIC
4905 lea r5, [tab_ChromaCoeff]
4906 movd m0, [r5 + r4 * 4] ; 4 bytes per table entry
4907%else
4908 movd m0, [tab_ChromaCoeff + r4 * 4]
4909%endif
4910
4911 pshufb m1, m0, [tab_Vm] ; m1 = taps 0,1 repeated in every word
4912 pshufb m0, [tab_Vm + 16] ; m0 = taps 2,3 repeated in every word
4913 mov r4d, %2/2 ; row-pair counter
4914
4915.loop:
4916
4917 mov r6d, %1/16 ; 16-column group counter
4918
4919.loopW:
4920
4921 movu m2, [r0] ; row 0
4922 movu m3, [r0 + r1] ; row 1
4923
4924 punpcklbw m4, m2, m3
4925 punpckhbw m2, m3
4926
4927 pmaddubsw m4, m1 ; rows 0,1 x taps 0,1 (low 8 columns)
4928 pmaddubsw m2, m1 ; rows 0,1 x taps 0,1 (high 8 columns)
4929
4930 lea r5, [r0 + 2 * r1]
4931 movu m5, [r5] ; row 2
4932 movu m7, [r5 + r1] ; row 3
4933
4934 punpcklbw m6, m5, m7
4935 pmaddubsw m6, m0 ; rows 2,3 x taps 2,3 (low)
4936 paddw m4, m6
4937
4938 punpckhbw m6, m5, m7
4939 pmaddubsw m6, m0 ; rows 2,3 x taps 2,3 (high)
4940 paddw m2, m6
4941
4942 mova m6, [pw_2000] ; reloaded every pass because m6 doubles as a temporary above
4943
4944 psubw m4, m6
4945 psubw m2, m6
4946
4947 movu [r2], m4 ; output row 0, columns 0-7
4948 movu [r2 + 16], m2 ; output row 0, columns 8-15
4949
4950 punpcklbw m4, m3, m5
4951 punpckhbw m3, m5
4952
4953 pmaddubsw m4, m1 ; rows 1,2 x taps 0,1 (low)
4954 pmaddubsw m3, m1 ; rows 1,2 x taps 0,1 (high)
4955
4956 movu m5, [r5 + 2 * r1] ; row 4
4957
4958 punpcklbw m2, m7, m5
4959 punpckhbw m7, m5
4960
4961 pmaddubsw m2, m0 ; rows 3,4 x taps 2,3 (low)
4962 pmaddubsw m7, m0 ; rows 3,4 x taps 2,3 (high)
4963
4964 paddw m4, m2
4965 paddw m3, m7
4966
4967 psubw m4, m6
4968 psubw m3, m6
4969
4970 movu [r2 + r3], m4 ; output row 1, columns 0-7
4971 movu [r2 + r3 + 16], m3 ; output row 1, columns 8-15
4972
4973 add r0, 16 ; src: 16 pixels right
4974 add r2, 32 ; dst: 16 int16_t right
4975 dec r6d
4976 jnz .loopW
4977
4978 lea r0, [r0 + r1 * 2 - %1] ; src: down 2 rows, back to the first column
4979 lea r2, [r2 + r3 * 2 - %1 * 2] ; dst: down 2 rows, back to the first column
4980
4981 dec r4d
4982 jnz .loop
4983 RET
4984%endmacro
4985
4986FILTER_V_PS_W16n 64, 64
4987FILTER_V_PS_W16n 64, 32
4988FILTER_V_PS_W16n 64, 48
4989FILTER_V_PS_W16n 48, 64
4990FILTER_V_PS_W16n 64, 16
4991
4992
4993;------------------------------------------------------------------------------------------------------------
4994;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4995;------------------------------------------------------------------------------------------------------------
4996INIT_XMM sse4
4997cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
; Straight-line 2x4 case: seven source rows are read, four output rows of two
; 16-bit results are written. Comments below number the source rows 0..6 from r0.
4998
4999 mov r4d, r4m ; coeffIdx (5th argument)
5000 sub r0, r1 ; start one row above the current row
5001 add r3d, r3d ; dst is int16_t: convert stride to bytes
5002
5003%ifdef PIC
5004 lea r5, [tab_ChromaCoeff]
5005 movd m0, [r5 + r4 * 4] ; 4 bytes per table entry
5006%else
5007 movd m0, [tab_ChromaCoeff + r4 * 4]
5008%endif
5009
5010 pshufb m0, [tab_Cm] ; reorder taps to 0,2,1,3 to match the byte interleave below
5011
5012 lea r5, [3 * r1]
5013
5014 movd m2, [r0] ; row 0
5015 movd m3, [r0 + r1] ; row 1
5016 movd m4, [r0 + 2 * r1] ; row 2
5017 movd m5, [r0 + r5] ; row 3
5018
5019 punpcklbw m2, m3
5020 punpcklbw m6, m4, m5
5021 punpcklbw m2, m6 ; per pixel: bytes from rows 0,2,1,3
5022
5023 pmaddubsw m2, m0
5024
5025 lea r0, [r0 + 4 * r1]
5026 movd m6, [r0] ; row 4
5027
5028 punpcklbw m3, m4
5029 punpcklbw m1, m5, m6
5030 punpcklbw m3, m1 ; per pixel: bytes from rows 1,3,2,4
5031
5032 pmaddubsw m3, m0
5033 phaddw m2, m3 ; low half = output row 0, high half = output row 1
5034
5035 mova m1, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
5036
5037 psubw m2, m1
5038
5039 movd [r2], m2
5040 pextrd [r2 + r3], m2, 2
5041
5042 movd m2, [r0 + r1] ; row 5
5043
5044 punpcklbw m4, m5
5045 punpcklbw m3, m6, m2
5046 punpcklbw m4, m3 ; per pixel: bytes from rows 2,4,3,5
5047
5048 pmaddubsw m4, m0
5049
5050 movd m3, [r0 + 2 * r1] ; row 6
5051
5052 punpcklbw m5, m6
5053 punpcklbw m2, m3
5054 punpcklbw m5, m2 ; per pixel: bytes from rows 3,5,4,6
5055
5056 pmaddubsw m5, m0
5057 phaddw m4, m5 ; low half = output row 2, high half = output row 3
5058 psubw m4, m1
5059
5060 lea r2, [r2 + 2 * r3]
5061 movd [r2], m4
5062 pextrd [r2 + r3], m4, 2
5063
5064 RET
5065
5066;-------------------------------------------------------------------------------------------------------------
5067; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5068;-------------------------------------------------------------------------------------------------------------
5069%macro FILTER_V_PS_W2 2
; %2 = block height; %1 is not referenced (the symbol name hard-codes width 2).
; Each loop pass writes 4 output rows of two 16-bit results.
; Comments below number the source rows 0..6 from r0 at the top of the pass.
5070INIT_XMM sse4
5071cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
5072
5073 mov r4d, r4m ; coeffIdx (5th argument)
5074 sub r0, r1 ; start one row above the current row
5075 add r3d, r3d ; dst is int16_t: convert stride to bytes
5076
5077%ifdef PIC
5078 lea r5, [tab_ChromaCoeff]
5079 movd m0, [r5 + r4 * 4] ; 4 bytes per table entry
5080%else
5081 movd m0, [tab_ChromaCoeff + r4 * 4]
5082%endif
5083
5084 pshufb m0, [tab_Cm] ; reorder taps to 0,2,1,3 to match the byte interleave below
5085
5086 mova m1, [pw_2000] ; subtracted from every result (constant defined outside this chunk)
5087 lea r5, [3 * r1]
5088 mov r4d, %2/4 ; row-group counter
5089.loop:
5090 movd m2, [r0] ; row 0
5091 movd m3, [r0 + r1] ; row 1
5092 movd m4, [r0 + 2 * r1] ; row 2
5093 movd m5, [r0 + r5] ; row 3
5094
5095 punpcklbw m2, m3
5096 punpcklbw m6, m4, m5
5097 punpcklbw m2, m6 ; per pixel: bytes from rows 0,2,1,3
5098
5099 pmaddubsw m2, m0
5100
5101 lea r0, [r0 + 4 * r1] ; the only src advance per pass: 4 rows
5102 movd m6, [r0] ; row 4
5103
5104 punpcklbw m3, m4
5105 punpcklbw m7, m5, m6
5106 punpcklbw m3, m7 ; per pixel: bytes from rows 1,3,2,4
5107
5108 pmaddubsw m3, m0
5109
5110 phaddw m2, m3 ; low half = output row 0, high half = output row 1
5111 psubw m2, m1
5112
5113
5114 movd [r2], m2
5115 pshufd m2, m2, 2 ; bring dword 2 (output row 1) down to dword 0
5116 movd [r2 + r3], m2
5117
5118 movd m2, [r0 + r1] ; row 5
5119
5120 punpcklbw m4, m5
5121 punpcklbw m3, m6, m2
5122 punpcklbw m4, m3 ; per pixel: bytes from rows 2,4,3,5
5123
5124 pmaddubsw m4, m0
5125
5126 movd m3, [r0 + 2 * r1] ; row 6
5127
5128 punpcklbw m5, m6
5129 punpcklbw m2, m3
5130 punpcklbw m5, m2 ; per pixel: bytes from rows 3,5,4,6
5131
5132 pmaddubsw m5, m0
5133
5134 phaddw m4, m5 ; low half = output row 2, high half = output row 3
5135
5136 psubw m4, m1
5137
5138 lea r2, [r2 + 2 * r3]
5139 movd [r2], m4
5140 pshufd m4 , m4 ,2
5141 movd [r2 + r3], m4
5142
5143 lea r2, [r2 + 2 * r3]
5144
5145 dec r4d
5146 jnz .loop
5147
5148RET
5149%endmacro
5150
5151FILTER_V_PS_W2 2, 8
5152
5153FILTER_V_PS_W2 2, 16
5154
5155;-----------------------------------------------------------------------------------------------------------------
5156; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5157;-----------------------------------------------------------------------------------------------------------------
5158%macro FILTER_VER_CHROMA_SS 2
; %1 = block width, %2 = block height. 16-bit in, 16-bit out; each inner pass
; produces a 4x4 tile, so the loops run %1/4 column steps inside %2/4 row steps.
5159INIT_XMM sse2
5160cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
5161
5162 add r1d, r1d ; srcStride *= 2 (int16_t source)
5163 add r3d, r3d ; dstStride *= 2 (int16_t destination)
5164 sub r0, r1 ; start one row above the current row
5165 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
5166
5167%ifdef PIC
5168 lea r5, [tab_ChromaCoeffV]
5169 lea r6, [r5 + r4] ; r6 = coefficient vectors; r5 is reused as scratch below
5170%else
5171 lea r6, [tab_ChromaCoeffV + r4]
5172%endif
5173
5174 mov dword [rsp], %2/4 ; outer (row-group) counter lives on the stack
5175
5176.loopH:
5177 mov r4d, (%1/4) ; inner (column-group) counter
5178.loopW:
5179 PROCESS_CHROMA_SP_W4_4R
5180
5181 psrad m0, 6 ; plain >> 6, no rounding constant is added in this variant
5182 psrad m1, 6
5183 psrad m2, 6
5184 psrad m3, 6
5185
5186 packssdw m0, m1 ; rows 1-2 as 16-bit
5187 packssdw m2, m3 ; rows 3-4 as 16-bit
5188
5189 movlps [r2], m0
5190 movhps [r2 + r3], m0
5191 lea r5, [r2 + 2 * r3]
5192 movlps [r5], m2
5193 movhps [r5 + r3], m2
5194
5195 lea r5, [4 * r1 - 2 * 4]
5196 sub r0, r5 ; undo the 4-row advance of the macro and step 4 samples (8 bytes) right
5197 add r2, 2 * 4 ; dst: 4 int16_t right
5198
5199 dec r4d
5200 jnz .loopW
5201
5202 lea r0, [r0 + 4 * r1 - 2 * %1] ; src: down 4 rows, back to the first column
5203 lea r2, [r2 + 4 * r3 - 2 * %1] ; dst: down 4 rows, back to the first column
5204
5205 dec dword [rsp]
5206 jnz .loopH
5207
5208 RET
5209%endmacro
5210
5211 FILTER_VER_CHROMA_SS 4, 4
5212 FILTER_VER_CHROMA_SS 4, 8
5213 FILTER_VER_CHROMA_SS 16, 16
5214 FILTER_VER_CHROMA_SS 16, 8
5215 FILTER_VER_CHROMA_SS 16, 12
5216 FILTER_VER_CHROMA_SS 12, 16
5217 FILTER_VER_CHROMA_SS 16, 4
5218 FILTER_VER_CHROMA_SS 4, 16
5219 FILTER_VER_CHROMA_SS 32, 32
5220 FILTER_VER_CHROMA_SS 32, 16
5221 FILTER_VER_CHROMA_SS 16, 32
5222 FILTER_VER_CHROMA_SS 32, 24
5223 FILTER_VER_CHROMA_SS 24, 32
5224 FILTER_VER_CHROMA_SS 32, 8
5225
5226 FILTER_VER_CHROMA_SS 16, 24
5227 FILTER_VER_CHROMA_SS 12, 32
5228 FILTER_VER_CHROMA_SS 4, 32
5229 FILTER_VER_CHROMA_SS 32, 64
5230 FILTER_VER_CHROMA_SS 16, 64
5231 FILTER_VER_CHROMA_SS 32, 48
5232 FILTER_VER_CHROMA_SS 24, 64
5233
5234 FILTER_VER_CHROMA_SS 64, 64
5235 FILTER_VER_CHROMA_SS 64, 32
5236 FILTER_VER_CHROMA_SS 64, 48
5237 FILTER_VER_CHROMA_SS 48, 64
5238 FILTER_VER_CHROMA_SS 64, 16
5239
5240
5241;---------------------------------------------------------------------------------------------------------------------
5242; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5243;---------------------------------------------------------------------------------------------------------------------
5244%macro FILTER_VER_CHROMA_SS_W2_4R 2
; %1 = block width (instantiated with 2 only), %2 = block height.
; 16-bit in, 16-bit out; each loop pass writes 4 rows of 2 results.
5245INIT_XMM sse4
5246cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
5247
5248 add r1d, r1d ; srcStride *= 2 (int16_t source)
5249 add r3d, r3d ; dstStride *= 2 (int16_t destination)
5250 sub r0, r1 ; start one row above the current row
5251 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
5252
5253%ifdef PIC
5254 lea r5, [tab_ChromaCoeffV]
5255 lea r5, [r5 + r4]
5256%else
5257 lea r5, [tab_ChromaCoeffV + r4]
5258%endif
5259
5260 mov r4d, (%2/4) ; r4d is free once r5 is set: reuse it as the row-group counter
5261
5262.loopH:
5263 PROCESS_CHROMA_SP_W2_4R r5
5264
5265 psrad m0, 6 ; plain >> 6, no rounding constant is added in this variant
5266 psrad m2, 6
5267
5268 packssdw m0, m2 ; dword n of m0 = the two 16-bit results of row n
5269
5270 movd [r2], m0
5271 pextrd [r2 + r3], m0, 1
5272 lea r2, [r2 + 2 * r3]
5273 pextrd [r2], m0, 2
5274 pextrd [r2 + r3], m0, 3
5275
5276 lea r2, [r2 + 2 * r3] ; r0 was already advanced 4 rows inside the macro
5277
5278 dec r4d
5279 jnz .loopH
5280
5281 RET
5282%endmacro
5283
5284FILTER_VER_CHROMA_SS_W2_4R 2, 4
5285FILTER_VER_CHROMA_SS_W2_4R 2, 8
5286
5287FILTER_VER_CHROMA_SS_W2_4R 2, 16
5288
5289;---------------------------------------------------------------------------------------------------------------
5290; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5291;---------------------------------------------------------------------------------------------------------------
5292INIT_XMM sse2
5293cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
; Straight-line 4x2 case, 16-bit in and out: two output rows from five source rows.
5294
5295 add r1d, r1d ; srcStride *= 2 (int16_t source)
5296 add r3d, r3d ; dstStride *= 2 (int16_t destination)
5297 sub r0, r1 ; start one row above the current row
5298 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
5299
5300%ifdef PIC
5301 lea r5, [tab_ChromaCoeffV]
5302 lea r5, [r5 + r4]
5303%else
5304 lea r5, [tab_ChromaCoeffV + r4]
5305%endif
5306
5307 movq m0, [r0]
5308 movq m1, [r0 + r1]
5309 punpcklwd m0, m1 ;m0=[0 1]
5310 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
5311
5312 lea r0, [r0 + 2 * r1]
5313 movq m2, [r0]
5314 punpcklwd m1, m2 ;m1=[1 2]
5315 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
5316
5317 movq m3, [r0 + r1]
5318 punpcklwd m2, m3 ;m2=[2 3]
5319 pmaddwd m2, [r5 + 1 * 16]
5320 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
5321 psrad m0, 6
5322
5323 movq m2, [r0 + 2 * r1]
5324 punpcklwd m3, m2 ;m3=[3 4]
5325 pmaddwd m3, [r5 + 1 * 16]
5326 paddd m1, m3 ;m1=[1+2+3+4] Row2 done
5327 psrad m1, 6
5328
5329 packssdw m0, m1 ; low 8 bytes = row 1, high 8 bytes = row 2
5330
5331 movlps [r2], m0
5332 movhps [r2 + r3], m0
5333
5334 RET
5335
5336;-------------------------------------------------------------------------------------------------------------------
5337; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5338;-------------------------------------------------------------------------------------------------------------------
5339%macro FILTER_VER_CHROMA_SS_W6_H4 2
; %2 = block height; %1 is not referenced (the symbol name hard-codes width 6).
; 16-bit in, 16-bit out; each loop pass handles 4 rows: a 4-wide tile then a 2-wide tile.
5340INIT_XMM sse4
5341cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
5342
5343 add r1d, r1d ; srcStride *= 2 (int16_t source)
5344 add r3d, r3d ; dstStride *= 2 (int16_t destination)
5345 sub r0, r1 ; start one row above the current row
5346 shl r4d, 5 ; coeffIdx * 32 = byte offset of the two coefficient vectors
5347
5348%ifdef PIC
5349 lea r5, [tab_ChromaCoeffV]
5350 lea r6, [r5 + r4] ; r6 = coefficient vectors; r5 is reused as scratch below
5351%else
5352 lea r6, [tab_ChromaCoeffV + r4]
5353%endif
5354
5355 mov r4d, %2/4 ; row-group counter
5356
5357.loopH:
5358 PROCESS_CHROMA_SP_W4_4R
5359
5360 psrad m0, 6 ; plain >> 6, no rounding constant is added in this variant
5361 psrad m1, 6
5362 psrad m2, 6
5363 psrad m3, 6
5364
5365 packssdw m0, m1 ; rows 1-2 as 16-bit
5366 packssdw m2, m3 ; rows 3-4 as 16-bit
5367
5368 movlps [r2], m0
5369 movhps [r2 + r3], m0
5370 lea r5, [r2 + 2 * r3]
5371 movlps [r5], m2
5372 movhps [r5 + r3], m2
5373
5374 lea r5, [4 * r1 - 2 * 4]
5375 sub r0, r5 ; back up 4 rows, step 4 samples (8 bytes) right
5376 add r2, 2 * 4 ; dst: columns 4-5
5377
5378 PROCESS_CHROMA_SP_W2_4R r6
5379
5380 psrad m0, 6
5381 psrad m2, 6
5382
5383 packssdw m0, m2 ; dword n of m0 = the two 16-bit results of row n
5384
5385 movd [r2], m0
5386 pextrd [r2 + r3], m0, 1
5387 lea r2, [r2 + 2 * r3]
5388 pextrd [r2], m0, 2
5389 pextrd [r2 + r3], m0, 3
5390
5391 sub r0, 2 * 4 ; src: back to column 0 (already 4 rows further down)
5392 lea r2, [r2 + 2 * r3 - 2 * 4] ; dst: next 4-row group, back to column 0
5393
5394 dec r4d
5395 jnz .loopH
5396
5397 RET
5398%endmacro
5399
5400FILTER_VER_CHROMA_SS_W6_H4 6, 8
5401
5402FILTER_VER_CHROMA_SS_W6_H4 6, 16
5403
5404
5405;----------------------------------------------------------------------------------------------------------------
5406; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Emits the 8-column vertical chroma filter (int16_t in, int16_t out).
; Macro arguments: %1 appears only in the symbol name; the body always stores
; one 16-byte vector (8 int16 values) per row. %2 is the block height; the
; row loop runs %2/2 times and writes 2 output rows per pass.
; Registers (x86inc argument order): r0 = src, r1 = srcStride, r2 = dst,
; r3 = dstStride, r4 = coeffIdx (later reused as the row-pair counter),
; r5 = address of the selected tab_ChromaCoeffV entry.
; NOTE(review): both strides are doubled with 32-bit adds, which zero the
; upper half of the register on x86-64; this assumes non-negative strides
; that still fit in 32 bits after doubling - confirm against callers.
5407;----------------------------------------------------------------------------------------------------------------
5408%macro FILTER_VER_CHROMA_SS_W8_H2 2
5409INIT_XMM sse2
; cglobal operands after the name: 5 arguments, 6 GPRs, 7 XMM registers.
5410cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
5411
5412 add r1d, r1d ; srcStride *= 2 (int16_t elements -> bytes)
5413 add r3d, r3d ; dstStride *= 2 (int16_t elements -> bytes)
5414 sub r0, r1 ; src -= one row
5415 shl r4d, 5 ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV
5416
; r5 = &tab_ChromaCoeffV[byte offset]; under PIC the table address is loaded
; on its own first and the offset is added afterwards.
5417%ifdef PIC
5418 lea r5, [tab_ChromaCoeffV]
5419 lea r5, [r5 + r4]
5420%else
5421 lea r5, [tab_ChromaCoeffV + r4]
5422%endif
5423
5424 mov r4d, %2/2 ; r4 = number of 2-row groups left
5425.loopH:
; PROCESS_CHROMA_SP_W8_2R is defined outside this chunk; r0 is never advanced
; in this loop body, so presumably the macro steps it by 2 rows and leaves the
; 32-bit sums in m0/m1 (first row) and m2/m3 (second row) - confirm.
5426 PROCESS_CHROMA_SP_W8_2R
5427
; Arithmetic shift right by 6 on every 32-bit sum.
5428 psrad m0, 6
5429 psrad m1, 6
5430 psrad m2, 6
5431 psrad m3, 6
5432
; Pack to int16 with signed saturation: one full 8-value row per register.
5433 packssdw m0, m1
5434 packssdw m2, m3
5435
5436 movu [r2], m0 ; first row of the pair
5437 movu [r2 + r3], m2 ; second row of the pair
5438
5439 lea r2, [r2 + 2 * r3] ; dst += 2 rows
5440
5441 dec r4d
5442 jnz .loopH
5443
5444 RET
5445%endmacro
5446
; Instantiate the 8-column filter for each listed height; every height is
; even, matching the %2/2 loop count used by the macro.
5447FILTER_VER_CHROMA_SS_W8_H2 8, 2
5448FILTER_VER_CHROMA_SS_W8_H2 8, 4
5449FILTER_VER_CHROMA_SS_W8_H2 8, 6
5450FILTER_VER_CHROMA_SS_W8_H2 8, 8
5451FILTER_VER_CHROMA_SS_W8_H2 8, 16
5452FILTER_VER_CHROMA_SS_W8_H2 8, 32
5453
5454FILTER_VER_CHROMA_SS_W8_H2 8, 12
5455FILTER_VER_CHROMA_SS_W8_H2 8, 64
5456
5457;-----------------------------------------------------------------------------------------------------------------
5458; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Emits the 8-tap vertical luma filter (int16_t in, int16_t out) for a
; %1-wide, %2-high block. The block is processed in tiles of 4 columns x 4
; rows: .loopW runs %1/4 times across a row group, .loopH runs %2/4 times.
; Registers (x86inc argument order): r0 = src, r1 = srcStride, r2 = dst,
; r3 = dstStride, r4 = coeffIdx (later reused as the column-tile counter),
; r5 = scratch, r6 = address of the selected tab_LumaCoeffV entry.
; All 7 GPRs are in use, so the row-group counter is kept in a dword at [rsp].
; In the inline comments, [a b] means rows a and b (relative to the first row
; loaded for this tile) interleaved word by word, and RowN is the N-th output
; row of the tile.
; NOTE(review): both strides are doubled with 32-bit adds, which zero the
; upper half of the register on x86-64; this assumes non-negative strides
; that still fit in 32 bits after doubling - confirm against callers.
5459;-----------------------------------------------------------------------------------------------------------------
5460%macro FILTER_VER_LUMA_SS 2
5461INIT_XMM sse2
; cglobal operands after the name: 5 arguments, 7 GPRs, 7 XMM registers, and
; a stack request of gprsize bytes. NOTE(review): in x86inc a negative size is
; understood to mean "do not reserve an extra register for the original stack
; pointer" - confirm against the bundled x86inc.asm.
5462cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
5463
5464 add r1d, r1d ; srcStride *= 2 (int16_t elements -> bytes)
5465 add r3d, r3d ; dstStride *= 2 (int16_t elements -> bytes)
5466 lea r5, [3 * r1]
5467 sub r0, r5 ; src -= 3 rows
5468 shl r4d, 6 ; coeffIdx * 64 = byte offset into tab_LumaCoeffV
5469
; r6 = &tab_LumaCoeffV[byte offset]; the four 16-byte vectors at r6 + 0/16/32/48
; are the multiplicands for row pairs 1..4 of each output row.
5470%ifdef PIC
5471 lea r5, [tab_LumaCoeffV]
5472 lea r6, [r5 + r4]
5473%else
5474 lea r6, [tab_LumaCoeffV + r4]
5475%endif
5476
5477 mov dword [rsp], %2/4 ; row-group counter lives on the stack
5478.loopH:
5479 mov r4d, (%1/4) ; r4 = number of 4-column tiles left in this row group
5480.loopW:
; Each movq loads 4 int16 values (one tile row). Rows are interleaved in
; pairs so that one pmaddwd applies two filter taps at once.
5481 movq m0, [r0]
5482 movq m1, [r0 + r1]
5483 punpcklwd m0, m1 ;m0=[0 1]
5484 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
5485
5486 lea r0, [r0 + 2 * r1]
5487 movq m4, [r0]
5488 punpcklwd m1, m4 ;m1=[1 2]
5489 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
5490
5491 movq m5, [r0 + r1]
5492 punpcklwd m4, m5 ;m4=[2 3]
5493 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
5494 pmaddwd m4, [r6 + 1 * 16]
5495 paddd m0, m4 ;m0=[0+1+2+3] Row1
5496
5497 lea r0, [r0 + 2 * r1]
5498 movq m4, [r0]
5499 punpcklwd m5, m4 ;m5=[3 4]
5500 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
5501 pmaddwd m5, [r6 + 1 * 16]
5502 paddd m1, m5 ;m1 = [1+2+3+4] Row2
5503
5504 movq m5, [r0 + r1]
5505 punpcklwd m4, m5 ;m4=[4 5]
5506 pmaddwd m6, m4, [r6 + 1 * 16]
5507 paddd m2, m6 ;m2=[2+3+4+5] Row3
5508 pmaddwd m4, [r6 + 2 * 16]
5509 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
5510
5511 lea r0, [r0 + 2 * r1]
5512 movq m4, [r0]
5513 punpcklwd m5, m4 ;m5=[5 6]
5514 pmaddwd m6, m5, [r6 + 1 * 16]
5515 paddd m3, m6 ;m3=[3+4+5+6] Row4
5516 pmaddwd m5, [r6 + 2 * 16]
5517 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
5518
5519 movq m5, [r0 + r1]
5520 punpcklwd m4, m5 ;m4=[6 7]
5521 pmaddwd m6, m4, [r6 + 2 * 16]
5522 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
5523 pmaddwd m4, [r6 + 3 * 16]
5524 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
5525 psrad m0, 6 ; arithmetic >> 6 on the finished 32-bit sums
5526
5527 lea r0, [r0 + 2 * r1]
5528 movq m4, [r0]
5529 punpcklwd m5, m4 ;m5=[7 8]
5530 pmaddwd m6, m5, [r6 + 2 * 16]
5531 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
5532 pmaddwd m5, [r6 + 3 * 16]
5533 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
5534 psrad m1, 6
5535
; Pack Row1|Row2 to int16 with signed saturation and store them.
5536 packssdw m0, m1
5537
5538 movlps [r2], m0
5539 movhps [r2 + r3], m0
5540
5541 movq m5, [r0 + r1]
5542 punpcklwd m4, m5 ;m4=[8 9]
5543 pmaddwd m4, [r6 + 3 * 16]
5544 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
5545 psrad m2, 6
5546
5547 movq m4, [r0 + 2 * r1]
5548 punpcklwd m5, m4 ;m5=[9 10]
5549 pmaddwd m5, [r6 + 3 * 16]
5550 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
5551 psrad m3, 6
5552
; Pack Row3|Row4 and store them at dst + 2 and dst + 3 rows.
5553 packssdw m2, m3
5554
5555 movlps [r2 + 2 * r3], m2
5556 lea r5, [3 * r3]
5557 movhps [r2 + r5], m2
5558
; r0 has moved down 8 rows in this tile: rewind them and step 8 bytes
; (4 int16 columns) to the right; dst steps right by the same amount.
5559 lea r5, [8 * r1 - 2 * 4]
5560 sub r0, r5
5561 add r2, 2 * 4
5562
5563 dec r4d
5564 jnz .loopW
5565
; End of a row group: move both pointers down 4 rows and back to column 0
; (the tile loop advanced them by 2 * %1 bytes in total).
5566 lea r0, [r0 + 4 * r1 - 2 * %1]
5567 lea r2, [r2 + 4 * r3 - 2 * %1]
5568
5569 dec dword [rsp]
5570 jnz .loopH
5571
5572 RET
5573%endmacro
5574
; Instantiate interp_8tap_vert_ss_<width>x<height> for each block size listed.
; Every width and height here is a multiple of 4, matching the %1/4 and %2/4
; loop counts used by FILTER_VER_LUMA_SS.
5575 FILTER_VER_LUMA_SS 4, 4
5576 FILTER_VER_LUMA_SS 8, 8
5577 FILTER_VER_LUMA_SS 8, 4
5578 FILTER_VER_LUMA_SS 4, 8
5579 FILTER_VER_LUMA_SS 16, 16
5580 FILTER_VER_LUMA_SS 16, 8
5581 FILTER_VER_LUMA_SS 8, 16
5582 FILTER_VER_LUMA_SS 16, 12
5583 FILTER_VER_LUMA_SS 12, 16
5584 FILTER_VER_LUMA_SS 16, 4
5585 FILTER_VER_LUMA_SS 4, 16
5586 FILTER_VER_LUMA_SS 32, 32
5587 FILTER_VER_LUMA_SS 32, 16
5588 FILTER_VER_LUMA_SS 16, 32
5589 FILTER_VER_LUMA_SS 32, 24
5590 FILTER_VER_LUMA_SS 24, 32
5591 FILTER_VER_LUMA_SS 32, 8
5592 FILTER_VER_LUMA_SS 8, 32
5593 FILTER_VER_LUMA_SS 64, 64
5594 FILTER_VER_LUMA_SS 64, 32
5595 FILTER_VER_LUMA_SS 32, 64
5596 FILTER_VER_LUMA_SS 64, 48
5597 FILTER_VER_LUMA_SS 48, 64
5598 FILTER_VER_LUMA_SS 64, 16
5599 FILTER_VER_LUMA_SS 16, 64