1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Min Chen <chenm003@163.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
6 ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7 ;*
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
12 ;*
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;*
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at license @ x265.com.
24 ;*****************************************************************************/
25
26 %include "x86inc.asm"
27 %include "x86util.asm"
28
29 SECTION_RODATA 32
30 tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
31 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
32 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
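; tab_Tm rows are pshufb control masks: each 16-byte row gathers four overlapping
; 4-byte source windows, so one pmaddubsw against the broadcast 4-tap chroma
; coefficients evaluates four adjacent horizontal output positions at once.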
33
34 ALIGN 32
35 const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
36
37 ALIGN 32
38 const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
39 dd 2, 3, 3, 4, 4, 5, 5, 6
40
41 ALIGN 32
42 tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
43 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
44 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
45 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
46
47 tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
48 db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
49
50 tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
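; tab_Vm and tab_Cm replicate the 4-tap chroma coefficients for the vertical filters:
; tab_Vm broadcasts the (c0,c1) and (c2,c3) byte pairs, while tab_Cm reorders them as
; (c0,c2,c1,c3) to match the double-interleaved row layout used by the narrow kernels.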
51
52 tab_c_526336: times 4 dd 8192*64+2048
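; 526336 = 8192*64 + 2048: it undoes the -8192 (pw_2000) bias carried by the horizontal
; intermediate samples (the vertical taps sum to 64) and adds the rounding term for the
; final arithmetic shift right by 12 in FILTER_HV8_END.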
53
54 tab_ChromaCoeff: db 0, 64, 0, 0
55 db -2, 58, 10, -2
56 db -4, 54, 16, -2
57 db -6, 46, 28, -4
58 db -4, 36, 36, -4
59 db -4, 28, 46, -6
60 db -2, 16, 54, -4
61 db -2, 10, 58, -2
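; The eight rows above are the HEVC 4-tap chroma interpolation filters for fractional
; positions 0..7; coeffIdx selects one row.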
62
63 tab_ChromaCoeffV: times 4 dw 0, 64
64 times 4 dw 0, 0
65
66 times 4 dw -2, 58
67 times 4 dw 10, -2
68
69 times 4 dw -4, 54
70 times 4 dw 16, -2
71
72 times 4 dw -6, 46
73 times 4 dw 28, -4
74
75 times 4 dw -4, 36
76 times 4 dw 36, -4
77
78 times 4 dw -4, 28
79 times 4 dw 46, -6
80
81 times 4 dw -2, 16
82 times 4 dw 54, -4
83
84 times 4 dw -2, 10
85 times 4 dw 58, -2
86
87 tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
88 db -1, 4, -10, 58, 17, -5, 1, 0
89 db -1, 4, -11, 40, 40, -11, 4, -1
90 db 0, 1, -5, 17, 58, -10, 4, -1
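; The four rows above are the HEVC 8-tap luma interpolation filters for fractional
; positions 0..3 (integer, 1/4, 1/2 and 3/4 pel); coeffIdx selects one row.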
91
92 tab_LumaCoeffV: times 4 dw 0, 0
93 times 4 dw 0, 64
94 times 4 dw 0, 0
95 times 4 dw 0, 0
96
97 times 4 dw -1, 4
98 times 4 dw -10, 58
99 times 4 dw 17, -5
100 times 4 dw 1, 0
101
102 times 4 dw -1, 4
103 times 4 dw -11, 40
104 times 4 dw 40, -11
105 times 4 dw 4, -1
106
107 times 4 dw 0, 1
108 times 4 dw -5, 17
109 times 4 dw 58, -10
110 times 4 dw 4, -1
111
112 tab_LumaCoeffVer: times 8 db 0, 0
113 times 8 db 0, 64
114 times 8 db 0, 0
115 times 8 db 0, 0
116
117 times 8 db -1, 4
118 times 8 db -10, 58
119 times 8 db 17, -5
120 times 8 db 1, 0
121
122 times 8 db -1, 4
123 times 8 db -11, 40
124 times 8 db 40, -11
125 times 8 db 4, -1
126
127 times 8 db 0, 1
128 times 8 db -5, 17
129 times 8 db 58, -10
130 times 8 db 4, -1
131
132 ALIGN 32
133 tab_LumaCoeffVer_32: times 16 db 0, 0
134 times 16 db 0, 64
135 times 16 db 0, 0
136 times 16 db 0, 0
137
138 times 16 db -1, 4
139 times 16 db -10, 58
140 times 16 db 17, -5
141 times 16 db 1, 0
142
143 times 16 db -1, 4
144 times 16 db -11, 40
145 times 16 db 40, -11
146 times 16 db 4, -1
147
148 times 16 db 0, 1
149 times 16 db -5, 17
150 times 16 db 58, -10
151 times 16 db 4, -1
152
153 ALIGN 32
154 tab_ChromaCoeffVer_32: times 16 db 0, 64
155 times 16 db 0, 0
156
157 times 16 db -2, 58
158 times 16 db 10, -2
159
160 times 16 db -4, 54
161 times 16 db 16, -2
162
163 times 16 db -6, 46
164 times 16 db 28, -4
165
166 times 16 db -4, 36
167 times 16 db 36, -4
168
169 times 16 db -4, 28
170 times 16 db 46, -6
171
172 times 16 db -2, 16
173 times 16 db 54, -4
174
175 times 16 db -2, 10
176 times 16 db 58, -2
177
178 tab_c_64_n64: times 8 db 64, -64
179
180 const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
181
182 ALIGN 32
183 interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
184 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
185
186 SECTION .text
187
188 cextern pb_128
189 cextern pw_1
190 cextern pw_512
191 cextern pw_2000
192
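; FILTER_H4_w2_2 filters a 2-pixel-wide block over two rows: each row is shuffled with
; tab_Tm, multiplied by the 4-tap coefficients (pmaddubsw + phaddw) and rounded with
; pmulhrsw against pw_512, which computes (x*512 + 0x4000) >> 15, i.e. (x + 32) >> 6,
; the 6-bit rounded downshift used by the *_pp kernels in this file.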
193 %macro FILTER_H4_w2_2 3
194 movh %2, [srcq - 1]
195 pshufb %2, %2, Tm0
196 movh %1, [srcq + srcstrideq - 1]
197 pshufb %1, %1, Tm0
198 punpcklqdq %2, %1
199 pmaddubsw %2, coef2
200 phaddw %2, %2
201 pmulhrsw %2, %3
202 packuswb %2, %2
203 movd r4, %2
204 mov [dstq], r4w
205 shr r4, 16
206 mov [dstq + dststrideq], r4w
207 %endmacro
208
209 ;-----------------------------------------------------------------------------
210 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
211 ;-----------------------------------------------------------------------------
212 INIT_XMM sse4
213 cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
214 %define coef2 m4
215 %define Tm0 m3
216 %define t2 m2
217 %define t1 m1
218 %define t0 m0
219
220 mov r4d, r4m
221
222 %ifdef PIC
223 lea r5, [tab_ChromaCoeff]
224 movd coef2, [r5 + r4 * 4]
225 %else
226 movd coef2, [tab_ChromaCoeff + r4 * 4]
227 %endif
228
229 pshufd coef2, coef2, 0
230 mova t2, [pw_512]
231 mova Tm0, [tab_Tm]
232
233 %rep 2
234 FILTER_H4_w2_2 t0, t1, t2
235 lea srcq, [srcq + srcstrideq * 2]
236 lea dstq, [dstq + dststrideq * 2]
237 %endrep
238
239 RET
240
241 ;-----------------------------------------------------------------------------
242 ; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
243 ;-----------------------------------------------------------------------------
244 INIT_XMM sse4
245 cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
246 %define coef2 m4
247 %define Tm0 m3
248 %define t2 m2
249 %define t1 m1
250 %define t0 m0
251
252 mov r4d, r4m
253
254 %ifdef PIC
255 lea r5, [tab_ChromaCoeff]
256 movd coef2, [r5 + r4 * 4]
257 %else
258 movd coef2, [tab_ChromaCoeff + r4 * 4]
259 %endif
260
261 pshufd coef2, coef2, 0
262 mova t2, [pw_512]
263 mova Tm0, [tab_Tm]
264
265 %rep 4
266 FILTER_H4_w2_2 t0, t1, t2
267 lea srcq, [srcq + srcstrideq * 2]
268 lea dstq, [dstq + dststrideq * 2]
269 %endrep
270
271 RET
272
273 ;-----------------------------------------------------------------------------
274 ; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
275 ;-----------------------------------------------------------------------------
276 INIT_XMM sse4
277 cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
278 %define coef2 m4
279 %define Tm0 m3
280 %define t2 m2
281 %define t1 m1
282 %define t0 m0
283
284 mov r4d, r4m
285
286 %ifdef PIC
287 lea r5, [tab_ChromaCoeff]
288 movd coef2, [r5 + r4 * 4]
289 %else
290 movd coef2, [tab_ChromaCoeff + r4 * 4]
291 %endif
292
293 pshufd coef2, coef2, 0
294 mova t2, [pw_512]
295 mova Tm0, [tab_Tm]
296
297 mov r5d, 16/2
298
299 .loop:
300 FILTER_H4_w2_2 t0, t1, t2
301 lea srcq, [srcq + srcstrideq * 2]
302 lea dstq, [dstq + dststrideq * 2]
303 dec r5d
304 jnz .loop
305
306 RET
307
308 %macro FILTER_H4_w4_2 3
309 movh %2, [srcq - 1]
310 pshufb %2, %2, Tm0
311 pmaddubsw %2, coef2
312 movh %1, [srcq + srcstrideq - 1]
313 pshufb %1, %1, Tm0
314 pmaddubsw %1, coef2
315 phaddw %2, %1
316 pmulhrsw %2, %3
317 packuswb %2, %2
318 movd [dstq], %2
319 palignr %2, %2, 4
320 movd [dstq + dststrideq], %2
321 %endmacro
322
323 ;-----------------------------------------------------------------------------
324 ; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
325 ;-----------------------------------------------------------------------------
326 INIT_XMM sse4
327 cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
328 %define coef2 m4
329 %define Tm0 m3
330 %define t2 m2
331 %define t1 m1
332 %define t0 m0
333
334 mov r4d, r4m
335
336 %ifdef PIC
337 lea r5, [tab_ChromaCoeff]
338 movd coef2, [r5 + r4 * 4]
339 %else
340 movd coef2, [tab_ChromaCoeff + r4 * 4]
341 %endif
342
343 pshufd coef2, coef2, 0
344 mova t2, [pw_512]
345 mova Tm0, [tab_Tm]
346
347 FILTER_H4_w4_2 t0, t1, t2
348
349 RET
350
351 ;-----------------------------------------------------------------------------
352 ; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
353 ;-----------------------------------------------------------------------------
354 INIT_XMM sse4
355 cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
356 %define coef2 m4
357 %define Tm0 m3
358 %define t2 m2
359 %define t1 m1
360 %define t0 m0
361
362 mov r4d, r4m
363
364 %ifdef PIC
365 lea r5, [tab_ChromaCoeff]
366 movd coef2, [r5 + r4 * 4]
367 %else
368 movd coef2, [tab_ChromaCoeff + r4 * 4]
369 %endif
370
371 pshufd coef2, coef2, 0
372 mova t2, [pw_512]
373 mova Tm0, [tab_Tm]
374
375 %rep 2
376 FILTER_H4_w4_2 t0, t1, t2
377 lea srcq, [srcq + srcstrideq * 2]
378 lea dstq, [dstq + dststrideq * 2]
379 %endrep
380
381 RET
382
383 ;-----------------------------------------------------------------------------
384 ; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
385 ;-----------------------------------------------------------------------------
386 INIT_XMM sse4
387 cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
388 %define coef2 m4
389 %define Tm0 m3
390 %define t2 m2
391 %define t1 m1
392 %define t0 m0
393
394 mov r4d, r4m
395
396 %ifdef PIC
397 lea r5, [tab_ChromaCoeff]
398 movd coef2, [r5 + r4 * 4]
399 %else
400 movd coef2, [tab_ChromaCoeff + r4 * 4]
401 %endif
402
403 pshufd coef2, coef2, 0
404 mova t2, [pw_512]
405 mova Tm0, [tab_Tm]
406
407 %rep 4
408 FILTER_H4_w4_2 t0, t1, t2
409 lea srcq, [srcq + srcstrideq * 2]
410 lea dstq, [dstq + dststrideq * 2]
411 %endrep
412
413 RET
414
415 ;-----------------------------------------------------------------------------
416 ; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
417 ;-----------------------------------------------------------------------------
418 INIT_XMM sse4
419 cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
420 %define coef2 m4
421 %define Tm0 m3
422 %define t2 m2
423 %define t1 m1
424 %define t0 m0
425
426 mov r4d, r4m
427
428 %ifdef PIC
429 lea r5, [tab_ChromaCoeff]
430 movd coef2, [r5 + r4 * 4]
431 %else
432 movd coef2, [tab_ChromaCoeff + r4 * 4]
433 %endif
434
435 pshufd coef2, coef2, 0
436 mova t2, [pw_512]
437 mova Tm0, [tab_Tm]
438
439 %rep 8
440 FILTER_H4_w4_2 t0, t1, t2
441 lea srcq, [srcq + srcstrideq * 2]
442 lea dstq, [dstq + dststrideq * 2]
443 %endrep
444
445 RET
446
447 ;-----------------------------------------------------------------------------
448 ; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
449 ;-----------------------------------------------------------------------------
450 INIT_XMM sse4
451 cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
452 %define coef2 m4
453 %define Tm0 m3
454 %define t2 m2
455 %define t1 m1
456 %define t0 m0
457
458 mov r4d, r4m
459
460 %ifdef PIC
461 lea r5, [tab_ChromaCoeff]
462 movd coef2, [r5 + r4 * 4]
463 %else
464 movd coef2, [tab_ChromaCoeff + r4 * 4]
465 %endif
466
467 pshufd coef2, coef2, 0
468 mova t2, [pw_512]
469 mova Tm0, [tab_Tm]
470
471 mov r5d, 32/2
472
473 .loop:
474 FILTER_H4_w4_2 t0, t1, t2
475 lea srcq, [srcq + srcstrideq * 2]
476 lea dstq, [dstq + dststrideq * 2]
477 dec r5d
478 jnz .loop
479
480 RET
481
482 ALIGN 32
483 const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
484
485
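; The FILTER_H4_w%N macros below each produce one row of N horizontally filtered pixels:
; Tm0/Tm1 select the low and high groups of four overlapping 4-byte windows from a
; 16-byte load, and wider blocks repeat the same pattern at 8-pixel offsets.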
486 %macro FILTER_H4_w6 3
487 movu %1, [srcq - 1]
488 pshufb %2, %1, Tm0
489 pmaddubsw %2, coef2
490 pshufb %1, %1, Tm1
491 pmaddubsw %1, coef2
492 phaddw %2, %1
493 pmulhrsw %2, %3
494 packuswb %2, %2
495 movd [dstq], %2
496 pextrw [dstq + 4], %2, 2
497 %endmacro
498
499 %macro FILTER_H4_w8 3
500 movu %1, [srcq - 1]
501 pshufb %2, %1, Tm0
502 pmaddubsw %2, coef2
503 pshufb %1, %1, Tm1
504 pmaddubsw %1, coef2
505 phaddw %2, %1
506 pmulhrsw %2, %3
507 packuswb %2, %2
508 movh [dstq], %2
509 %endmacro
510
511 %macro FILTER_H4_w12 3
512 movu %1, [srcq - 1]
513 pshufb %2, %1, Tm0
514 pmaddubsw %2, coef2
515 pshufb %1, %1, Tm1
516 pmaddubsw %1, coef2
517 phaddw %2, %1
518 pmulhrsw %2, %3
519 movu %1, [srcq - 1 + 8]
520 pshufb %1, %1, Tm0
521 pmaddubsw %1, coef2
522 phaddw %1, %1
523 pmulhrsw %1, %3
524 packuswb %2, %1
525 movh [dstq], %2
526 pextrd [dstq + 8], %2, 2
527 %endmacro
528
529 %macro FILTER_H4_w16 4
530 movu %1, [srcq - 1]
531 pshufb %2, %1, Tm0
532 pmaddubsw %2, coef2
533 pshufb %1, %1, Tm1
534 pmaddubsw %1, coef2
535 phaddw %2, %1
536 movu %1, [srcq - 1 + 8]
537 pshufb %4, %1, Tm0
538 pmaddubsw %4, coef2
539 pshufb %1, %1, Tm1
540 pmaddubsw %1, coef2
541 phaddw %4, %1
542 pmulhrsw %2, %3
543 pmulhrsw %4, %3
544 packuswb %2, %4
545 movu [dstq], %2
546 %endmacro
547
548 %macro FILTER_H4_w24 4
549 movu %1, [srcq - 1]
550 pshufb %2, %1, Tm0
551 pmaddubsw %2, coef2
552 pshufb %1, %1, Tm1
553 pmaddubsw %1, coef2
554 phaddw %2, %1
555 movu %1, [srcq - 1 + 8]
556 pshufb %4, %1, Tm0
557 pmaddubsw %4, coef2
558 pshufb %1, %1, Tm1
559 pmaddubsw %1, coef2
560 phaddw %4, %1
561 pmulhrsw %2, %3
562 pmulhrsw %4, %3
563 packuswb %2, %4
564 movu [dstq], %2
565 movu %1, [srcq - 1 + 16]
566 pshufb %2, %1, Tm0
567 pmaddubsw %2, coef2
568 pshufb %1, %1, Tm1
569 pmaddubsw %1, coef2
570 phaddw %2, %1
571 pmulhrsw %2, %3
572 packuswb %2, %2
573 movh [dstq + 16], %2
574 %endmacro
575
576 %macro FILTER_H4_w32 4
577 movu %1, [srcq - 1]
578 pshufb %2, %1, Tm0
579 pmaddubsw %2, coef2
580 pshufb %1, %1, Tm1
581 pmaddubsw %1, coef2
582 phaddw %2, %1
583 movu %1, [srcq - 1 + 8]
584 pshufb %4, %1, Tm0
585 pmaddubsw %4, coef2
586 pshufb %1, %1, Tm1
587 pmaddubsw %1, coef2
588 phaddw %4, %1
589 pmulhrsw %2, %3
590 pmulhrsw %4, %3
591 packuswb %2, %4
592 movu [dstq], %2
593 movu %1, [srcq - 1 + 16]
594 pshufb %2, %1, Tm0
595 pmaddubsw %2, coef2
596 pshufb %1, %1, Tm1
597 pmaddubsw %1, coef2
598 phaddw %2, %1
599 movu %1, [srcq - 1 + 24]
600 pshufb %4, %1, Tm0
601 pmaddubsw %4, coef2
602 pshufb %1, %1, Tm1
603 pmaddubsw %1, coef2
604 phaddw %4, %1
605 pmulhrsw %2, %3
606 pmulhrsw %4, %3
607 packuswb %2, %4
608 movu [dstq + 16], %2
609 %endmacro
610
611 %macro FILTER_H4_w16o 5
612 movu %1, [srcq + %5 - 1]
613 pshufb %2, %1, Tm0
614 pmaddubsw %2, coef2
615 pshufb %1, %1, Tm1
616 pmaddubsw %1, coef2
617 phaddw %2, %1
618 movu %1, [srcq + %5 - 1 + 8]
619 pshufb %4, %1, Tm0
620 pmaddubsw %4, coef2
621 pshufb %1, %1, Tm1
622 pmaddubsw %1, coef2
623 phaddw %4, %1
624 pmulhrsw %2, %3
625 pmulhrsw %4, %3
626 packuswb %2, %4
627 movu [dstq + %5], %2
628 %endmacro
629
630 %macro FILTER_H4_w48 4
631 FILTER_H4_w16o %1, %2, %3, %4, 0
632 FILTER_H4_w16o %1, %2, %3, %4, 16
633 FILTER_H4_w16o %1, %2, %3, %4, 32
634 %endmacro
635
636 %macro FILTER_H4_w64 4
637 FILTER_H4_w16o %1, %2, %3, %4, 0
638 FILTER_H4_w16o %1, %2, %3, %4, 16
639 FILTER_H4_w16o %1, %2, %3, %4, 32
640 FILTER_H4_w16o %1, %2, %3, %4, 48
641 %endmacro
642
643 ;-----------------------------------------------------------------------------
644 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
645 ;-----------------------------------------------------------------------------
646 %macro IPFILTER_CHROMA 2
647 INIT_XMM sse4
648 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
649 %define coef2 m5
650 %define Tm0 m4
651 %define Tm1 m3
652 %define t2 m2
653 %define t1 m1
654 %define t0 m0
655
656 mov r4d, r4m
657
658 %ifdef PIC
659 lea r5, [tab_ChromaCoeff]
660 movd coef2, [r5 + r4 * 4]
661 %else
662 movd coef2, [tab_ChromaCoeff + r4 * 4]
663 %endif
664
665 mov r5d, %2
666
667 pshufd coef2, coef2, 0
668 mova t2, [pw_512]
669 mova Tm0, [tab_Tm]
670 mova Tm1, [tab_Tm + 16]
671
672 .loop:
673 FILTER_H4_w%1 t0, t1, t2
674 add srcq, srcstrideq
675 add dstq, dststrideq
676
677 dec r5d
678 jnz .loop
679
680 RET
681 %endmacro
682
683
684 IPFILTER_CHROMA 6, 8
685 IPFILTER_CHROMA 8, 2
686 IPFILTER_CHROMA 8, 4
687 IPFILTER_CHROMA 8, 6
688 IPFILTER_CHROMA 8, 8
689 IPFILTER_CHROMA 8, 16
690 IPFILTER_CHROMA 8, 32
691 IPFILTER_CHROMA 12, 16
692
693 IPFILTER_CHROMA 6, 16
694 IPFILTER_CHROMA 8, 12
695 IPFILTER_CHROMA 8, 64
696 IPFILTER_CHROMA 12, 32
697
698 ;-----------------------------------------------------------------------------
699 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
700 ;-----------------------------------------------------------------------------
701 %macro IPFILTER_CHROMA_W 2
702 INIT_XMM sse4
703 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
704 %define coef2 m6
705 %define Tm0 m5
706 %define Tm1 m4
707 %define t3 m3
708 %define t2 m2
709 %define t1 m1
710 %define t0 m0
711
712 mov r4d, r4m
713
714 %ifdef PIC
715 lea r5, [tab_ChromaCoeff]
716 movd coef2, [r5 + r4 * 4]
717 %else
718 movd coef2, [tab_ChromaCoeff + r4 * 4]
719 %endif
720
721 mov r5d, %2
722
723 pshufd coef2, coef2, 0
724 mova t2, [pw_512]
725 mova Tm0, [tab_Tm]
726 mova Tm1, [tab_Tm + 16]
727
728 .loop:
729 FILTER_H4_w%1 t0, t1, t2, t3
730 add srcq, srcstrideq
731 add dstq, dststrideq
732
733 dec r5d
734 jnz .loop
735
736 RET
737 %endmacro
738
739 IPFILTER_CHROMA_W 16, 4
740 IPFILTER_CHROMA_W 16, 8
741 IPFILTER_CHROMA_W 16, 12
742 IPFILTER_CHROMA_W 16, 16
743 IPFILTER_CHROMA_W 16, 32
744 IPFILTER_CHROMA_W 32, 8
745 IPFILTER_CHROMA_W 32, 16
746 IPFILTER_CHROMA_W 32, 24
747 IPFILTER_CHROMA_W 24, 32
748 IPFILTER_CHROMA_W 32, 32
749
750 IPFILTER_CHROMA_W 16, 24
751 IPFILTER_CHROMA_W 16, 64
752 IPFILTER_CHROMA_W 32, 48
753 IPFILTER_CHROMA_W 24, 64
754 IPFILTER_CHROMA_W 32, 64
755
756 IPFILTER_CHROMA_W 64, 64
757 IPFILTER_CHROMA_W 64, 32
758 IPFILTER_CHROMA_W 64, 48
759 IPFILTER_CHROMA_W 48, 64
760 IPFILTER_CHROMA_W 64, 16
761
762
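; FILTER_H8_W8 computes eight 8-tap luma outputs for one row: the four tab_Lm shuffles
; extract eight overlapping 8-pixel windows from a single 16-byte load, pmaddubsw/phaddw
; reduce them to eight words in %2, and the optional 8th argument additionally rounds
; (pmulhrsw) and packs the result to bytes for the pp path.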
763 %macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
764 movu %1, %7
765 pshufb %2, %1, [tab_Lm + 0]
766 pmaddubsw %2, %5
767 pshufb %3, %1, [tab_Lm + 16]
768 pmaddubsw %3, %5
769 phaddw %2, %3
770 pshufb %4, %1, [tab_Lm + 32]
771 pmaddubsw %4, %5
772 pshufb %1, %1, [tab_Lm + 48]
773 pmaddubsw %1, %5
774 phaddw %4, %1
775 phaddw %2, %4
776 %if %0 == 8
777 pmulhrsw %2, %6
778 packuswb %2, %2
779 movh %8, %2
780 %endif
781 %endmacro
782
783 %macro FILTER_H8_W4 2
784 movu %1, [r0 - 3 + r5]
785 pshufb %2, %1, [tab_Lm]
786 pmaddubsw %2, m3
787 pshufb m7, %1, [tab_Lm + 16]
788 pmaddubsw m7, m3
789 phaddw %2, m7
790 phaddw %2, %2
791 %endmacro
792
793 ;----------------------------------------------------------------------------------------------------------------------------
794 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
795 ;----------------------------------------------------------------------------------------------------------------------------
796 %macro IPFILTER_LUMA 3
797 INIT_XMM sse4
798 cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
799
800 mov r4d, r4m
801
802 %ifdef PIC
803 lea r6, [tab_LumaCoeff]
804 movh m3, [r6 + r4 * 8]
805 %else
806 movh m3, [tab_LumaCoeff + r4 * 8]
807 %endif
808 punpcklqdq m3, m3
809
810 %ifidn %3, pp
811 mova m2, [pw_512]
812 %else
813 mova m2, [pw_2000]
814 %endif
815
816 mov r4d, %2
817 %ifidn %3, ps
818 add r3, r3
819 cmp r5m, byte 0
820 je .loopH
821 lea r6, [r1 + 2 * r1]
822 sub r0, r6
823 add r4d, 7
824 %endif
825
826 .loopH:
827 xor r5, r5
828 %rep %1 / 8
829 %ifidn %3, pp
830 FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
831 %else
832 FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
833 psubw m1, m2
834 movu [r2 + 2 * r5], m1
835 %endif
836 add r5, 8
837 %endrep
838
839 %rep (%1 % 8) / 4
840 FILTER_H8_W4 m0, m1
841 %ifidn %3, pp
842 pmulhrsw m1, m2
843 packuswb m1, m1
844 movd [r2 + r5], m1
845 %else
846 psubw m1, m2
847 movh [r2 + 2 * r5], m1
848 %endif
849 %endrep
850
851 add r0, r1
852 add r2, r3
853
854 dec r4d
855 jnz .loopH
856 RET
857 %endmacro
858
859
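; AVX2 4x4 luma: vbroadcasti128 copies a source row into both 128-bit lanes and the
; 32-byte tab_Lm shuffle extracts four overlapping 8-pixel windows per row; pmaddubsw
; and pmaddwd-with-pw_1 accumulate the tap products, and phaddd/packssdw complete the
; per-pixel sums and merge four rows ahead of the pw_512 rounding.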
860 INIT_YMM avx2
861 cglobal interp_8tap_horiz_pp_4x4, 4,6,6
862 mov r4d, r4m
863
864 %ifdef PIC
865 lea r5, [tab_LumaCoeff]
866 vpbroadcastq m0, [r5 + r4 * 8]
867 %else
868 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
869 %endif
870
871 mova m1, [tab_Lm]
872 vpbroadcastd m2, [pw_1]
873
874 ; register map
875 ; m0 - interpolate coeff
876 ; m1 - shuffle order table
877 ; m2 - constant word 1
878
879 sub r0, 3
880 ; Row 0-1
881 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
882 pshufb m3, m1
883 pmaddubsw m3, m0
884 pmaddwd m3, m2
885 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
886 pshufb m4, m1
887 pmaddubsw m4, m0
888 pmaddwd m4, m2
889 phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
890
891 ; Row 2-3
892 lea r0, [r0 + r1 * 2]
893 vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
894 pshufb m4, m1
895 pmaddubsw m4, m0
896 pmaddwd m4, m2
897 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
898 pshufb m5, m1
899 pmaddubsw m5, m0
900 pmaddwd m5, m2
901 phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
902
903 packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
904 pmulhrsw m3, [pw_512]
905 vextracti128 xm4, m3, 1
906 packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
907 pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0]
908
909 lea r0, [r3 * 3]
910 movd [r2], xm3
911 pextrd [r2+r3], xm3, 2
912 pextrd [r2+r3*2], xm3, 1
913 pextrd [r2+r0], xm3, 3
914 RET
915
916 INIT_YMM avx2
917 cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
918 mov r4d, r4m
919
920 %ifdef PIC
921 lea r5, [tab_LumaCoeff]
922 vpbroadcastq m0, [r5 + r4 * 8]
923 %else
924 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
925 %endif
926
927 mova m1, [tab_Lm]
928 mova m2, [tab_Lm + 32]
929
930 ; register map
931 ; m0 - interpolate coeff
932 ; m1, m2 - shuffle order table
933
934 sub r0, 3
935 lea r5, [r1 * 3]
936 lea r4, [r3 * 3]
937
938 ; Row 0
939 vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
940 pshufb m4, m3, m2
941 pshufb m3, m1
942 pmaddubsw m3, m0
943 pmaddubsw m4, m0
944 phaddw m3, m4
945 ; Row 1
946 vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
947 pshufb m5, m4, m2
948 pshufb m4, m1
949 pmaddubsw m4, m0
950 pmaddubsw m5, m0
951 phaddw m4, m5
952
953 phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
954 pmulhrsw m3, [pw_512]
955
956 ; Row 2
957 vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
958 pshufb m5, m4, m2
959 pshufb m4, m1
960 pmaddubsw m4, m0
961 pmaddubsw m5, m0
962 phaddw m4, m5
963 ; Row 3
964 vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
965 pshufb m6, m5, m2
966 pshufb m5, m1
967 pmaddubsw m5, m0
968 pmaddubsw m6, m0
969 phaddw m5, m6
970
971 phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
972 pmulhrsw m4, [pw_512]
973
974 packuswb m3, m4
975 vextracti128 xm4, m3, 1
976 punpcklwd xm5, xm3, xm4
977
978 movq [r2], xm5
979 movhps [r2 + r3], xm5
980
981 punpckhwd xm5, xm3, xm4
982 movq [r2 + r3 * 2], xm5
983 movhps [r2 + r4], xm5
984 RET
985
986 %macro IPFILTER_LUMA_AVX2_8xN 2
987 INIT_YMM avx2
988 cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
989 mov r4d, r4m
990
991 %ifdef PIC
992 lea r5, [tab_LumaCoeff]
993 vpbroadcastq m0, [r5 + r4 * 8]
994 %else
995 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
996 %endif
997
998 mova m1, [tab_Lm]
999 mova m2, [tab_Lm + 32]
1000
1001 ; register map
1002 ; m0 - interpolate coeff
1003 ; m1, m2 - shuffle order table
1004
1005 sub r0, 3
1006 lea r5, [r1 * 3]
1007 lea r6, [r3 * 3]
1008 mov r4d, %2 / 4
1009 .loop:
1010 ; Row 0
1011 vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1012 pshufb m4, m3, m2
1013 pshufb m3, m1
1014 pmaddubsw m3, m0
1015 pmaddubsw m4, m0
1016 phaddw m3, m4
1017 ; Row 1
1018 vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1019 pshufb m5, m4, m2
1020 pshufb m4, m1
1021 pmaddubsw m4, m0
1022 pmaddubsw m5, m0
1023 phaddw m4, m5
1024
1025 phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
1026 pmulhrsw m3, [pw_512]
1027
1028 ; Row 2
1029 vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1030 pshufb m5, m4, m2
1031 pshufb m4, m1
1032 pmaddubsw m4, m0
1033 pmaddubsw m5, m0
1034 phaddw m4, m5
1035 ; Row 3
1036 vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1037 pshufb m6, m5, m2
1038 pshufb m5, m1
1039 pmaddubsw m5, m0
1040 pmaddubsw m6, m0
1041 phaddw m5, m6
1042
1043 phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
1044 pmulhrsw m4, [pw_512]
1045
1046 packuswb m3, m4
1047 vextracti128 xm4, m3, 1
1048 punpcklwd xm5, xm3, xm4
1049
1050 movq [r2], xm5
1051 movhps [r2 + r3], xm5
1052
1053 punpckhwd xm5, xm3, xm4
1054 movq [r2 + r3 * 2], xm5
1055 movhps [r2 + r6], xm5
1056
1057 lea r0, [r0 + r1 * 4]
1058 lea r2, [r2 + r3 * 4]
1059 dec r4d
1060 jnz .loop
1061 RET
1062 %endmacro
1063
1064 IPFILTER_LUMA_AVX2_8xN 8, 8
1065 IPFILTER_LUMA_AVX2_8xN 8, 16
1066 IPFILTER_LUMA_AVX2_8xN 8, 32
1067
1068 %macro IPFILTER_LUMA_AVX2 2
1069 INIT_YMM avx2
1070 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
1071 sub r0, 3
1072 mov r4d, r4m
1073 %ifdef PIC
1074 lea r5, [tab_LumaCoeff]
1075 vpbroadcastd m0, [r5 + r4 * 8]
1076 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1077 %else
1078 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1079 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1080 %endif
1081 movu m3, [tab_Tm + 16]
1082 vpbroadcastd m7, [pw_1]
1083
1084 ; register map
1085 ; m0 , m1 interpolate coeff
1086 ; m3 - shuffle order table (tab_Tm + 16); tab_Tm is read directly from memory
1087 ; m7 - pw_1
1088
1089 mov r4d, %2/2
1090 .loop:
1091 ; Row 0
1092 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1093 pshufb m5, m4, m3
1094 pshufb m4, [tab_Tm]
1095 pmaddubsw m4, m0
1096 pmaddubsw m5, m1
1097 paddw m4, m5
1098 pmaddwd m4, m7
1099 vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0
1100 pshufb m6, m5, m3
1101 pshufb m5, [tab_Tm]
1102 pmaddubsw m5, m0
1103 pmaddubsw m6, m1
1104 paddw m5, m6
1105 pmaddwd m5, m7
1106 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1107 pmulhrsw m4, [pw_512]
1108 vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1109 pshufb m5, m2, m3
1110 pshufb m2, [tab_Tm]
1111 pmaddubsw m2, m0
1112 pmaddubsw m5, m1
1113 paddw m2, m5
1114 pmaddwd m2, m7
1115 vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row 1
1116 pshufb m6, m5, m3
1117 pshufb m5, [tab_Tm]
1118 pmaddubsw m5, m0
1119 pmaddubsw m6, m1
1120 paddw m5, m6
1121 pmaddwd m5, m7
1122 packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1123 pmulhrsw m2, [pw_512]
1124 packuswb m4, m2
1125 vpermq m4, m4, 11011000b
1126 vextracti128 xm5, m4, 1
1127 pshufd xm4, xm4, 11011000b
1128 pshufd xm5, xm5, 11011000b
1129 movu [r2], xm4
1130 movu [r2+r3], xm5
1131 lea r0, [r0 + r1 * 2]
1132 lea r2, [r2 + r3 * 2]
1133 dec r4d
1134 jnz .loop
1135 RET
1136 %endmacro
1137
1138 %macro IPFILTER_LUMA_32x_avx2 2
1139 INIT_YMM avx2
1140 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
1141 sub r0, 3
1142 mov r4d, r4m
1143 %ifdef PIC
1144 lea r5, [tab_LumaCoeff]
1145 vpbroadcastd m0, [r5 + r4 * 8]
1146 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1147 %else
1148 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1149 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1150 %endif
1151 movu m3, [tab_Tm + 16]
1152 vpbroadcastd m7, [pw_1]
1153
1154 ; register map
1155 ; m0 , m1 interpolate coeff
1156 ; m3 - shuffle order table (tab_Tm + 16); tab_Tm is read directly from memory
1157 ; m7 - pw_1
1158
1159 mov r4d, %2
1160 .loop:
1161 ; Row 0
1162 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1163 pshufb m5, m4, m3
1164 pshufb m4, [tab_Tm]
1165 pmaddubsw m4, m0
1166 pmaddubsw m5, m1
1167 paddw m4, m5
1168 pmaddwd m4, m7
1169 vbroadcasti128 m5, [r0 + 8]
1170 pshufb m6, m5, m3
1171 pshufb m5, [tab_Tm]
1172 pmaddubsw m5, m0
1173 pmaddubsw m6, m1
1174 paddw m5, m6
1175 pmaddwd m5, m7
1176 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1177 pmulhrsw m4, [pw_512]
1178 vbroadcasti128 m2, [r0 + 16]
1179 pshufb m5, m2, m3
1180 pshufb m2, [tab_Tm]
1181 pmaddubsw m2, m0
1182 pmaddubsw m5, m1
1183 paddw m2, m5
1184 pmaddwd m2, m7
1185 vbroadcasti128 m5, [r0 + 24]
1186 pshufb m6, m5, m3
1187 pshufb m5, [tab_Tm]
1188 pmaddubsw m5, m0
1189 pmaddubsw m6, m1
1190 paddw m5, m6
1191 pmaddwd m5, m7
1192 packssdw m2, m5
1193 pmulhrsw m2, [pw_512]
1194 packuswb m4, m2
1195 vpermq m4, m4, 11011000b
1196 vextracti128 xm5, m4, 1
1197 pshufd xm4, xm4, 11011000b
1198 pshufd xm5, xm5, 11011000b
1199 movu [r2], xm4
1200 movu [r2 + 16], xm5
1201 lea r0, [r0 + r1]
1202 lea r2, [r2 + r3]
1203 dec r4d
1204 jnz .loop
1205 RET
1206 %endmacro
1207
1208 %macro IPFILTER_LUMA_64x_avx2 2
1209 INIT_YMM avx2
1210 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
1211 sub r0, 3
1212 mov r4d, r4m
1213 %ifdef PIC
1214 lea r5, [tab_LumaCoeff]
1215 vpbroadcastd m0, [r5 + r4 * 8]
1216 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1217 %else
1218 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1219 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1220 %endif
1221 movu m3, [tab_Tm + 16]
1222 vpbroadcastd m7, [pw_1]
1223
1224 ; register map
1225 ; m0 , m1 interpolate coeff
1226 ; m3 - shuffle order table (tab_Tm + 16); tab_Tm is read directly from memory
1227 ; m7 - pw_1
1228
1229 mov r4d, %2
1230 .loop:
1231 ; Row 0
1232 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1233 pshufb m5, m4, m3
1234 pshufb m4, [tab_Tm]
1235 pmaddubsw m4, m0
1236 pmaddubsw m5, m1
1237 paddw m4, m5
1238 pmaddwd m4, m7
1239 vbroadcasti128 m5, [r0 + 8]
1240 pshufb m6, m5, m3
1241 pshufb m5, [tab_Tm]
1242 pmaddubsw m5, m0
1243 pmaddubsw m6, m1
1244 paddw m5, m6
1245 pmaddwd m5, m7
1246 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1247 pmulhrsw m4, [pw_512]
1248 vbroadcasti128 m2, [r0 + 16]
1249 pshufb m5, m2, m3
1250 pshufb m2, [tab_Tm]
1251 pmaddubsw m2, m0
1252 pmaddubsw m5, m1
1253 paddw m2, m5
1254 pmaddwd m2, m7
1255 vbroadcasti128 m5, [r0 + 24]
1256 pshufb m6, m5, m3
1257 pshufb m5, [tab_Tm]
1258 pmaddubsw m5, m0
1259 pmaddubsw m6, m1
1260 paddw m5, m6
1261 pmaddwd m5, m7
1262 packssdw m2, m5
1263 pmulhrsw m2, [pw_512]
1264 packuswb m4, m2
1265 vpermq m4, m4, 11011000b
1266 vextracti128 xm5, m4, 1
1267 pshufd xm4, xm4, 11011000b
1268 pshufd xm5, xm5, 11011000b
1269 movu [r2], xm4
1270 movu [r2 + 16], xm5
1271
1272 vbroadcasti128 m4, [r0 + 32]
1273 pshufb m5, m4, m3
1274 pshufb m4, [tab_Tm]
1275 pmaddubsw m4, m0
1276 pmaddubsw m5, m1
1277 paddw m4, m5
1278 pmaddwd m4, m7
1279 vbroadcasti128 m5, [r0 + 40]
1280 pshufb m6, m5, m3
1281 pshufb m5, [tab_Tm]
1282 pmaddubsw m5, m0
1283 pmaddubsw m6, m1
1284 paddw m5, m6
1285 pmaddwd m5, m7
1286 packssdw m4, m5
1287 pmulhrsw m4, [pw_512]
1288 vbroadcasti128 m2, [r0 + 48]
1289 pshufb m5, m2, m3
1290 pshufb m2, [tab_Tm]
1291 pmaddubsw m2, m0
1292 pmaddubsw m5, m1
1293 paddw m2, m5
1294 pmaddwd m2, m7
1295 vbroadcasti128 m5, [r0 + 56]
1296 pshufb m6, m5, m3
1297 pshufb m5, [tab_Tm]
1298 pmaddubsw m5, m0
1299 pmaddubsw m6, m1
1300 paddw m5, m6
1301 pmaddwd m5, m7
1302 packssdw m2, m5
1303 pmulhrsw m2, [pw_512]
1304 packuswb m4, m2
1305 vpermq m4, m4, 11011000b
1306 vextracti128 xm5, m4, 1
1307 pshufd xm4, xm4, 11011000b
1308 pshufd xm5, xm5, 11011000b
1309 movu [r2 +32], xm4
1310 movu [r2 + 48], xm5
1311
1312 lea r0, [r0 + r1]
1313 lea r2, [r2 + r3]
1314 dec r4d
1315 jnz .loop
1316 RET
1317 %endmacro
1318
1319 INIT_YMM avx2
1320 cglobal interp_8tap_horiz_pp_48x64, 4,6,8
1321 sub r0, 3
1322 mov r4d, r4m
1323 %ifdef PIC
1324 lea r5, [tab_LumaCoeff]
1325 vpbroadcastd m0, [r5 + r4 * 8]
1326 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1327 %else
1328 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1329 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1330 %endif
1331 movu m3, [tab_Tm + 16]
1332 vpbroadcastd m7, [pw_1]
1333
1334 ; register map
1335 ; m0 , m1 interpolate coeff
1336 ; m3 - shuffle order table (tab_Tm + 16); tab_Tm is read directly from memory
1337 ; m7 - pw_1
1338
1339 mov r4d, 64
1340 .loop:
1341 ; Row 0
1342 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1343 pshufb m5, m4, m3
1344 pshufb m4, [tab_Tm]
1345 pmaddubsw m4, m0
1346 pmaddubsw m5, m1
1347 paddw m4, m5
1348 pmaddwd m4, m7
1349 vbroadcasti128 m5, [r0 + 8]
1350 pshufb m6, m5, m3
1351 pshufb m5, [tab_Tm]
1352 pmaddubsw m5, m0
1353 pmaddubsw m6, m1
1354 paddw m5, m6
1355 pmaddwd m5, m7
1356 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1357 pmulhrsw m4, [pw_512]
1358
1359 vbroadcasti128 m2, [r0 + 16]
1360 pshufb m5, m2, m3
1361 pshufb m2, [tab_Tm]
1362 pmaddubsw m2, m0
1363 pmaddubsw m5, m1
1364 paddw m2, m5
1365 pmaddwd m2, m7
1366 vbroadcasti128 m5, [r0 + 24]
1367 pshufb m6, m5, m3
1368 pshufb m5, [tab_Tm]
1369 pmaddubsw m5, m0
1370 pmaddubsw m6, m1
1371 paddw m5, m6
1372 pmaddwd m5, m7
1373 packssdw m2, m5
1374 pmulhrsw m2, [pw_512]
1375 packuswb m4, m2
1376 vpermq m4, m4, 11011000b
1377 vextracti128 xm5, m4, 1
1378 pshufd xm4, xm4, 11011000b
1379 pshufd xm5, xm5, 11011000b
1380 movu [r2], xm4
1381 movu [r2 + 16], xm5
1382
1383 vbroadcasti128 m4, [r0 + 32]
1384 pshufb m5, m4, m3
1385 pshufb m4, [tab_Tm]
1386 pmaddubsw m4, m0
1387 pmaddubsw m5, m1
1388 paddw m4, m5
1389 pmaddwd m4, m7
1390 vbroadcasti128 m5, [r0 + 40]
1391 pshufb m6, m5, m3
1392 pshufb m5, [tab_Tm]
1393 pmaddubsw m5, m0
1394 pmaddubsw m6, m1
1395 paddw m5, m6
1396 pmaddwd m5, m7
1397 packssdw m4, m5
1398 pmulhrsw m4, [pw_512]
1399 packuswb m4, m4
1400 vpermq m4, m4, 11011000b
1401 pshufd xm4, xm4, 11011000b
1402 movu [r2 + 32], xm4
1403
1404 lea r0, [r0 + r1]
1405 lea r2, [r2 + r3]
1406 dec r4d
1407 jnz .loop
1408 RET
1409
1410 INIT_YMM avx2
1411 cglobal interp_4tap_horiz_pp_4x4, 4,6,6
1412 mov r4d, r4m
1413
1414 %ifdef PIC
1415 lea r5, [tab_ChromaCoeff]
1416 vpbroadcastd m0, [r5 + r4 * 4]
1417 %else
1418 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1419 %endif
1420
1421 vpbroadcastd m2, [pw_1]
1422 vbroadcasti128 m1, [tab_Tm]
1423
1424 ; register map
1425 ; m0 - interpolate coeff
1426 ; m1 - shuffle order table
1427 ; m2 - constant word 1
1428
1429 dec r0
1430
1431 ; Row 0-1
1432 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1433 vinserti128 m3, m3, [r0 + r1], 1
1434 pshufb m3, m1
1435 pmaddubsw m3, m0
1436 pmaddwd m3, m2
1437
1438 ; Row 2-3
1439 lea r0, [r0 + r1 * 2]
1440 vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1441 vinserti128 m4, m4, [r0 + r1], 1
1442 pshufb m4, m1
1443 pmaddubsw m4, m0
1444 pmaddwd m4, m2
1445
1446 packssdw m3, m4
1447 pmulhrsw m3, [pw_512]
1448 vextracti128 xm4, m3, 1
1449 packuswb xm3, xm4
1450
1451 lea r0, [r3 * 3]
1452 movd [r2], xm3
1453 pextrd [r2+r3], xm3, 2
1454 pextrd [r2+r3*2], xm3, 1
1455 pextrd [r2+r0], xm3, 3
1456 RET
1457
1458 INIT_YMM avx2
1459 cglobal interp_4tap_horiz_pp_32x32, 4,6,7
1460 mov r4d, r4m
1461
1462 %ifdef PIC
1463 lea r5, [tab_ChromaCoeff]
1464 vpbroadcastd m0, [r5 + r4 * 4]
1465 %else
1466 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1467 %endif
1468
1469 mova m1, [interp4_horiz_shuf1]
1470 vpbroadcastd m2, [pw_1]
1471 mova m6, [pw_512]
1472 ; register map
1473 ; m0 - interpolate coeff
1474 ; m1 - shuffle order table
1475 ; m2 - constant word 1
1476
1477 dec r0
1478 mov r4d, 32
1479
1480 .loop:
1481 ; Row 0
1482 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1483 pshufb m3, m1
1484 pmaddubsw m3, m0
1485 pmaddwd m3, m2
1486 vbroadcasti128 m4, [r0 + 4]
1487 pshufb m4, m1
1488 pmaddubsw m4, m0
1489 pmaddwd m4, m2
1490 packssdw m3, m4
1491 pmulhrsw m3, m6
1492
1493 vbroadcasti128 m4, [r0 + 16]
1494 pshufb m4, m1
1495 pmaddubsw m4, m0
1496 pmaddwd m4, m2
1497 vbroadcasti128 m5, [r0 + 20]
1498 pshufb m5, m1
1499 pmaddubsw m5, m0
1500 pmaddwd m5, m2
1501 packssdw m4, m5
1502 pmulhrsw m4, m6
1503
1504 packuswb m3, m4
1505 vpermq m3, m3, 11011000b
1506
1507 movu [r2], m3
1508 lea r2, [r2 + r3]
1509 lea r0, [r0 + r1]
1510 dec r4d
1511 jnz .loop
1512 RET
1513
1514
1515 INIT_YMM avx2
1516 cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
1517 mov r4d, r4m
1518
1519 %ifdef PIC
1520 lea r5, [tab_ChromaCoeff]
1521 vpbroadcastd m0, [r5 + r4 * 4]
1522 %else
1523 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1524 %endif
1525
1526 mova m6, [pw_512]
1527 mova m1, [interp4_horiz_shuf1]
1528 vpbroadcastd m2, [pw_1]
1529
1530 ; register map
1531 ; m0 - interpolate coeff
1532 ; m1 - shuffle order table
1533 ; m2 - constant word 1
1534
1535 dec r0
1536 mov r4d, 8
1537
1538 .loop:
1539 ; Row 0
1540 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1541 pshufb m3, m1
1542 pmaddubsw m3, m0
1543 pmaddwd m3, m2
1544 vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1545 pshufb m4, m1
1546 pmaddubsw m4, m0
1547 pmaddwd m4, m2
1548 packssdw m3, m4
1549 pmulhrsw m3, m6
1550
1551 ; Row 1
1552 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1553 pshufb m4, m1
1554 pmaddubsw m4, m0
1555 pmaddwd m4, m2
1556 vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1557 pshufb m5, m1
1558 pmaddubsw m5, m0
1559 pmaddwd m5, m2
1560 packssdw m4, m5
1561 pmulhrsw m4, m6
1562
1563 packuswb m3, m4
1564 vpermq m3, m3, 11011000b
1565
1566 vextracti128 xm4, m3, 1
1567 movu [r2], xm3
1568 movu [r2 + r3], xm4
1569 lea r2, [r2 + r3 * 2]
1570 lea r0, [r0 + r1 * 2]
1571 dec r4d
1572 jnz .loop
1573 RET
1574 ;--------------------------------------------------------------------------------------------------------------
1575 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1576 ;--------------------------------------------------------------------------------------------------------------
1577 IPFILTER_LUMA 4, 4, pp
1578 IPFILTER_LUMA 4, 8, pp
1579 IPFILTER_LUMA 12, 16, pp
1580 IPFILTER_LUMA 4, 16, pp
1581
1582 INIT_YMM avx2
1583 cglobal interp_4tap_horiz_pp_8x8, 4,6,6
1584 mov r4d, r4m
1585
1586 %ifdef PIC
1587 lea r5, [tab_ChromaCoeff]
1588 vpbroadcastd m0, [r5 + r4 * 4]
1589 %else
1590 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1591 %endif
1592
1593 movu m1, [tab_Tm]
1594 vpbroadcastd m2, [pw_1]
1595
1596 ; register map
1597 ; m0 - interpolate coeff
1598 ; m1 - shuffle order table
1599 ; m2 - constant word 1
1600
1601 sub r0, 1
1602 mov r4d, 2
1603
1604 .loop:
1605 ; Row 0
1606 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1607 pshufb m3, m1
1608 pmaddubsw m3, m0
1609 pmaddwd m3, m2
1610
1611 ; Row 1
1612 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1613 pshufb m4, m1
1614 pmaddubsw m4, m0
1615 pmaddwd m4, m2
1616 packssdw m3, m4
1617 pmulhrsw m3, [pw_512]
1618 lea r0, [r0 + r1 * 2]
1619
1620 ; Row 2
1621 vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1622 pshufb m4, m1
1623 pmaddubsw m4, m0
1624 pmaddwd m4, m2
1625
1626 ; Row 3
1627 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1628 pshufb m5, m1
1629 pmaddubsw m5, m0
1630 pmaddwd m5, m2
1631 packssdw m4, m5
1632 pmulhrsw m4, [pw_512]
1633
1634 packuswb m3, m4
1635 mova m5, [interp_4tap_8x8_horiz_shuf]
1636 vpermd m3, m5, m3
1637 vextracti128 xm4, m3, 1
1638 movq [r2], xm3
1639 movhps [r2 + r3], xm3
1640 lea r2, [r2 + r3 * 2]
1641 movq [r2], xm4
1642 movhps [r2 + r3], xm4
1643 lea r2, [r2 + r3 * 2]
1644 lea r0, [r0 + r1*2]
1645 dec r4d
1646 jnz .loop
1647 RET
1648
1649 IPFILTER_LUMA_AVX2 16, 4
1650 IPFILTER_LUMA_AVX2 16, 8
1651 IPFILTER_LUMA_AVX2 16, 12
1652 IPFILTER_LUMA_AVX2 16, 16
1653 IPFILTER_LUMA_AVX2 16, 32
1654 IPFILTER_LUMA_AVX2 16, 64
1655
1656 IPFILTER_LUMA_32x_avx2 32, 8
1657 IPFILTER_LUMA_32x_avx2 32, 16
1658 IPFILTER_LUMA_32x_avx2 32, 24
1659 IPFILTER_LUMA_32x_avx2 32, 32
1660 IPFILTER_LUMA_32x_avx2 32, 64
1661
1662 IPFILTER_LUMA_64x_avx2 64, 64
1663 IPFILTER_LUMA_64x_avx2 64, 48
1664 IPFILTER_LUMA_64x_avx2 64, 32
1665 IPFILTER_LUMA_64x_avx2 64, 16
1666
1667 ;--------------------------------------------------------------------------------------------------------------
1668 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1669 ;--------------------------------------------------------------------------------------------------------------
1670 %macro IPFILTER_LUMA_PP_W8 2
1671 INIT_XMM sse4
1672 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
1673 mov r4d, r4m
1674
1675 %ifdef PIC
1676 lea r5, [tab_LumaCoeff]
1677 movh m3, [r5 + r4 * 8]
1678 %else
1679 movh m3, [tab_LumaCoeff + r4 * 8]
1680 %endif
1681 pshufd m0, m3, 0 ; m0 = coeff-L
1682 pshufd m1, m3, 0x55 ; m1 = coeff-H
1683 lea r5, [tab_Tm] ; r5 = shuffle
1684 mova m2, [pw_512] ; m2 = 512
1685
1686 mov r4d, %2
1687 .loopH:
1688 %assign x 0
1689 %rep %1 / 8
1690 movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
1691 pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
1692 pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
1693 pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
1694 pmaddubsw m4, m0
1695 pmaddubsw m6, m5, m1
1696 pmaddubsw m5, m0
1697 pmaddubsw m3, m1
1698 paddw m4, m6
1699 paddw m5, m3
1700 phaddw m4, m5
1701 pmulhrsw m4, m2
1702 packuswb m4, m4
1703 movh [r2 + x], m4
1704 %assign x x+8
1705 %endrep
1706
1707 add r0, r1
1708 add r2, r3
1709
1710 dec r4d
1711 jnz .loopH
1712 RET
1713 %endmacro
1714
1715 IPFILTER_LUMA_PP_W8 8, 4
1716 IPFILTER_LUMA_PP_W8 8, 8
1717 IPFILTER_LUMA_PP_W8 8, 16
1718 IPFILTER_LUMA_PP_W8 8, 32
1719 IPFILTER_LUMA_PP_W8 16, 4
1720 IPFILTER_LUMA_PP_W8 16, 8
1721 IPFILTER_LUMA_PP_W8 16, 12
1722 IPFILTER_LUMA_PP_W8 16, 16
1723 IPFILTER_LUMA_PP_W8 16, 32
1724 IPFILTER_LUMA_PP_W8 16, 64
1725 IPFILTER_LUMA_PP_W8 24, 32
1726 IPFILTER_LUMA_PP_W8 32, 8
1727 IPFILTER_LUMA_PP_W8 32, 16
1728 IPFILTER_LUMA_PP_W8 32, 24
1729 IPFILTER_LUMA_PP_W8 32, 32
1730 IPFILTER_LUMA_PP_W8 32, 64
1731 IPFILTER_LUMA_PP_W8 48, 64
1732 IPFILTER_LUMA_PP_W8 64, 16
1733 IPFILTER_LUMA_PP_W8 64, 32
1734 IPFILTER_LUMA_PP_W8 64, 48
1735 IPFILTER_LUMA_PP_W8 64, 64
1736
1737 ;----------------------------------------------------------------------------------------------------------------------------
1738 ; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1739 ;----------------------------------------------------------------------------------------------------------------------------
1740 IPFILTER_LUMA 4, 4, ps
1741 IPFILTER_LUMA 8, 8, ps
1742 IPFILTER_LUMA 8, 4, ps
1743 IPFILTER_LUMA 4, 8, ps
1744 IPFILTER_LUMA 16, 16, ps
1745 IPFILTER_LUMA 16, 8, ps
1746 IPFILTER_LUMA 8, 16, ps
1747 IPFILTER_LUMA 16, 12, ps
1748 IPFILTER_LUMA 12, 16, ps
1749 IPFILTER_LUMA 16, 4, ps
1750 IPFILTER_LUMA 4, 16, ps
1751 IPFILTER_LUMA 32, 32, ps
1752 IPFILTER_LUMA 32, 16, ps
1753 IPFILTER_LUMA 16, 32, ps
1754 IPFILTER_LUMA 32, 24, ps
1755 IPFILTER_LUMA 24, 32, ps
1756 IPFILTER_LUMA 32, 8, ps
1757 IPFILTER_LUMA 8, 32, ps
1758 IPFILTER_LUMA 64, 64, ps
1759 IPFILTER_LUMA 64, 32, ps
1760 IPFILTER_LUMA 32, 64, ps
1761 IPFILTER_LUMA 64, 48, ps
1762 IPFILTER_LUMA 48, 64, ps
1763 IPFILTER_LUMA 64, 16, ps
1764 IPFILTER_LUMA 16, 64, ps
1765
1766 ;-----------------------------------------------------------------------------
1767 ; Interpolate HV
1768 ;-----------------------------------------------------------------------------
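; The 8x8 HV kernel below runs in two passes: the horizontal pass stores 8+7 rows of
; 16-bit intermediates (biased by -pw_2000) into a stack buffer, and the vertical pass
; filters those words with tab_LumaCoeffV, rounding with tab_c_526336 and >> 12 before
; packing to bytes.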
1769 %macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
1770 mova %5, [r0 + (%6 + 0) * 16]
1771 mova %1, [r0 + (%6 + 1) * 16]
1772 mova %2, [r0 + (%6 + 2) * 16]
1773 punpcklwd %3, %5, %1
1774 punpckhwd %5, %1
1775 pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
1776 pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
1777 punpcklwd %4, %1, %2
1778 punpckhwd %1, %2
1779 pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
1780 pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
1781 %endmacro ; FILTER_HV8_START
1782
1783 %macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
1784 mova %8, [r0 + (%9 + 0) * 16]
1785 mova %1, [r0 + (%9 + 1) * 16]
1786 punpcklwd %7, %2, %8
1787 punpckhwd %2, %8
1788 pmaddwd %7, [r5 + %10 * 16]
1789 pmaddwd %2, [r5 + %10 * 16]
1790 paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
1791 paddd %5, %2 ; R0 = H[0+1+2+3]
1792 punpcklwd %7, %8, %1
1793 punpckhwd %8, %1
1794 pmaddwd %7, [r5 + %10 * 16]
1795 pmaddwd %8, [r5 + %10 * 16]
1796 paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
1797 paddd %6, %8 ; R1 = H[1+2+3+4]
1798 %endmacro ; FILTER_HV8_MID
1799
1800 ; Round and Saturate
1801 %macro FILTER_HV8_END 4 ; output in [1, 3]
1802 paddd %1, [tab_c_526336]
1803 paddd %2, [tab_c_526336]
1804 paddd %3, [tab_c_526336]
1805 paddd %4, [tab_c_526336]
1806 psrad %1, 12
1807 psrad %2, 12
1808 psrad %3, 12
1809 psrad %4, 12
1810 packssdw %1, %2
1811 packssdw %3, %4
1812
1813 ; TODO: would merging these be better? This ordering keeps the dependency chains short
1814 packuswb %1, %3
1815 %endmacro ; FILTER_HV8_END
1816
1817 ;-----------------------------------------------------------------------------
1818 ; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
1819 ;-----------------------------------------------------------------------------
1820 INIT_XMM ssse3
1821 cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
1822 %define coef m7
1823 %define stk_buf rsp
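; rsp holds the intermediate buffer: 15 rows (8 output rows + 7 filter-support rows)
; of eight 16-bit samples each, i.e. the 15*16 bytes reserved above.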
1824
1825 mov r4d, r4m
1826 mov r5d, r5m
1827
1828 %ifdef PIC
1829 lea r6, [tab_LumaCoeff]
1830 movh coef, [r6 + r4 * 8]
1831 %else
1832 movh coef, [tab_LumaCoeff + r4 * 8]
1833 %endif
1834 punpcklqdq coef, coef
1835
1836 ; move to row -3
1837 lea r6, [r1 + r1 * 2]
1838 sub r0, r6
1839
1840 xor r6, r6
1841 mov r4, rsp
1842
1843 .loopH:
1844 FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
1845 psubw m1, [pw_2000]
1846 mova [r4], m1
1847
1848 add r0, r1
1849 add r4, 16
1850 inc r6
1851 cmp r6, 8+7
1852 jnz .loopH
1853
1854 ; ready for the vertical (V) pass
1855 ; all mN registers are free here
1856
1857 ; load coeff table
1858 shl r5, 6
1859 lea r6, [tab_LumaCoeffV]
1860 lea r5, [r5 + r6]
1861
1862 ; load intermediate buffer
1863 mov r0, stk_buf
1864
1865 ; register mapping
1866 ; r0 - src
1867 ; r5 - coeff
1868 ; r6 - loop_i
1869
1870 ; let's go
1871 xor r6, r6
1872
1873 ; TODO: this loop has more than 70 instructions; it likely exceeds the capacity of the Intel loop (uop) decode cache
1874 .loopV:
1875
1876 FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
1877 FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
1878 FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
1879 FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
1880 FILTER_HV8_END m3, m0, m4, m1
1881
1882 movh [r2], m3
1883 movhps [r2 + r3], m3
1884
1885 lea r0, [r0 + 16 * 2]
1886 lea r2, [r2 + r3 * 2]
1887
1888 inc r6
1889 cmp r6, 8/2
1890 jnz .loopV
1891
1892 RET
1893
1894 ;-----------------------------------------------------------------------------
1895 ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1896 ;-----------------------------------------------------------------------------
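; Vertical chroma filtering here works by byte interleaving: rows n/n+1 and n+2/n+3 are
; interleaved twice so the four taps of every output pixel end up contiguous; pmaddubsw
; with the tab_Cm-ordered coefficients then forms the (c0,c2)/(c1,c3) partial pairs and
; phaddw completes each 4-tap sum before the pw_512 rounding.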
1897 INIT_XMM sse4
1898 cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
1899
1900 mov r4d, r4m
1901 sub r0, r1
1902
1903 %ifdef PIC
1904 lea r5, [tab_ChromaCoeff]
1905 movd m0, [r5 + r4 * 4]
1906 %else
1907 movd m0, [tab_ChromaCoeff + r4 * 4]
1908 %endif
1909 lea r4, [r1 * 3]
1910 lea r5, [r0 + 4 * r1]
1911 pshufb m0, [tab_Cm]
1912 mova m1, [pw_512]
1913
1914 movd m2, [r0]
1915 movd m3, [r0 + r1]
1916 movd m4, [r0 + 2 * r1]
1917 movd m5, [r0 + r4]
1918
1919 punpcklbw m2, m3
1920 punpcklbw m6, m4, m5
1921 punpcklbw m2, m6
1922
1923 pmaddubsw m2, m0
1924
1925 movd m6, [r5]
1926
1927 punpcklbw m3, m4
1928 punpcklbw m7, m5, m6
1929 punpcklbw m3, m7
1930
1931 pmaddubsw m3, m0
1932
1933 phaddw m2, m3
1934
1935 pmulhrsw m2, m1
1936
1937 movd m7, [r5 + r1]
1938
1939 punpcklbw m4, m5
1940 punpcklbw m3, m6, m7
1941 punpcklbw m4, m3
1942
1943 pmaddubsw m4, m0
1944
1945 movd m3, [r5 + 2 * r1]
1946
1947 punpcklbw m5, m6
1948 punpcklbw m7, m3
1949 punpcklbw m5, m7
1950
1951 pmaddubsw m5, m0
1952
1953 phaddw m4, m5
1954
1955 pmulhrsw m4, m1
1956 packuswb m2, m4
1957
1958 pextrw [r2], m2, 0
1959 pextrw [r2 + r3], m2, 2
1960 lea r2, [r2 + 2 * r3]
1961 pextrw [r2], m2, 4
1962 pextrw [r2 + r3], m2, 6
1963
1964 RET
1965
1966 ;-----------------------------------------------------------------------------
1967 ; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1968 ;-----------------------------------------------------------------------------
1969 %macro FILTER_V4_W2_H4 2
1970 INIT_XMM sse4
1971 cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
1972
1973 mov r4d, r4m
1974 sub r0, r1
1975
1976 %ifdef PIC
1977 lea r5, [tab_ChromaCoeff]
1978 movd m0, [r5 + r4 * 4]
1979 %else
1980 movd m0, [tab_ChromaCoeff + r4 * 4]
1981 %endif
1982
1983 pshufb m0, [tab_Cm]
1984
1985 mova m1, [pw_512]
1986
1987 mov r4d, %2
1988 lea r5, [3 * r1]
1989
1990 .loop:
1991 movd m2, [r0]
1992 movd m3, [r0 + r1]
1993 movd m4, [r0 + 2 * r1]
1994 movd m5, [r0 + r5]
1995
1996 punpcklbw m2, m3
1997 punpcklbw m6, m4, m5
1998 punpcklbw m2, m6
1999
2000 pmaddubsw m2, m0
2001
2002 lea r0, [r0 + 4 * r1]
2003 movd m6, [r0]
2004
2005 punpcklbw m3, m4
2006 punpcklbw m7, m5, m6
2007 punpcklbw m3, m7
2008
2009 pmaddubsw m3, m0
2010
2011 phaddw m2, m3
2012
2013 pmulhrsw m2, m1
2014
2015 movd m7, [r0 + r1]
2016
2017 punpcklbw m4, m5
2018 punpcklbw m3, m6, m7
2019 punpcklbw m4, m3
2020
2021 pmaddubsw m4, m0
2022
2023 movd m3, [r0 + 2 * r1]
2024
2025 punpcklbw m5, m6
2026 punpcklbw m7, m3
2027 punpcklbw m5, m7
2028
2029 pmaddubsw m5, m0
2030
2031 phaddw m4, m5
2032
2033 pmulhrsw m4, m1
2034 packuswb m2, m4
2035
2036 pextrw [r2], m2, 0
2037 pextrw [r2 + r3], m2, 2
2038 lea r2, [r2 + 2 * r3]
2039 pextrw [r2], m2, 4
2040 pextrw [r2 + r3], m2, 6
2041
2042 lea r2, [r2 + 2 * r3]
2043
2044 sub r4, 4
2045 jnz .loop
2046 RET
2047 %endmacro
2048
2049 FILTER_V4_W2_H4 2, 8
2050
2051 FILTER_V4_W2_H4 2, 16
2052
2053 ;-----------------------------------------------------------------------------
2054 ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2055 ;-----------------------------------------------------------------------------
2056 INIT_XMM sse4
2057 cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
2058
2059 mov r4d, r4m
2060 sub r0, r1
2061
2062 %ifdef PIC
2063 lea r5, [tab_ChromaCoeff]
2064 movd m0, [r5 + r4 * 4]
2065 %else
2066 movd m0, [tab_ChromaCoeff + r4 * 4]
2067 %endif
2068
2069 pshufb m0, [tab_Cm]
2070 lea r5, [r0 + 2 * r1]
2071
2072 movd m2, [r0]
2073 movd m3, [r0 + r1]
2074 movd m4, [r5]
2075 movd m5, [r5 + r1]
2076
2077 punpcklbw m2, m3
2078 punpcklbw m1, m4, m5
2079 punpcklbw m2, m1
2080
2081 pmaddubsw m2, m0
2082
2083 movd m1, [r0 + 4 * r1]
2084
2085 punpcklbw m3, m4
2086 punpcklbw m5, m1
2087 punpcklbw m3, m5
2088
2089 pmaddubsw m3, m0
2090
2091 phaddw m2, m3
2092
2093 pmulhrsw m2, [pw_512]
2094 packuswb m2, m2
2095 movd [r2], m2
2096 pextrd [r2 + r3], m2, 1
2097
2098 RET
2099
2100 ;-----------------------------------------------------------------------------
2101 ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2102 ;-----------------------------------------------------------------------------
2103 INIT_XMM sse4
2104 cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
2105
2106 mov r4d, r4m
2107 sub r0, r1
2108
2109 %ifdef PIC
2110 lea r5, [tab_ChromaCoeff]
2111 movd m0, [r5 + r4 * 4]
2112 %else
2113 movd m0, [tab_ChromaCoeff + r4 * 4]
2114 %endif
2115
2116 pshufb m0, [tab_Cm]
2117 mova m1, [pw_512]
2118 lea r5, [r0 + 4 * r1]
2119 lea r4, [r1 * 3]
2120
2121 movd m2, [r0]
2122 movd m3, [r0 + r1]
2123 movd m4, [r0 + 2 * r1]
2124 movd m5, [r0 + r4]
2125
2126 punpcklbw m2, m3
2127 punpcklbw m6, m4, m5
2128 punpcklbw m2, m6
2129
2130 pmaddubsw m2, m0
2131
2132 movd m6, [r5]
2133
2134 punpcklbw m3, m4
2135 punpcklbw m7, m5, m6
2136 punpcklbw m3, m7
2137
2138 pmaddubsw m3, m0
2139
2140 phaddw m2, m3
2141
2142 pmulhrsw m2, m1
2143
2144 movd m7, [r5 + r1]
2145
2146 punpcklbw m4, m5
2147 punpcklbw m3, m6, m7
2148 punpcklbw m4, m3
2149
2150 pmaddubsw m4, m0
2151
2152 movd m3, [r5 + 2 * r1]
2153
2154 punpcklbw m5, m6
2155 punpcklbw m7, m3
2156 punpcklbw m5, m7
2157
2158 pmaddubsw m5, m0
2159
2160 phaddw m4, m5
2161
2162 pmulhrsw m4, m1
2163
2164 packuswb m2, m4
2165 movd [r2], m2
2166 pextrd [r2 + r3], m2, 1
2167 lea r2, [r2 + 2 * r3]
2168 pextrd [r2], m2, 2
2169 pextrd [r2 + r3], m2, 3
2170
2171 RET
2172
2173 INIT_YMM avx2
2174 cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
2175 mov r4d, r4m
2176 shl r4d, 6
2177 sub r0, r1
2178
2179 %ifdef PIC
2180 lea r5, [tab_ChromaCoeffVer_32]
2181 add r5, r4
2182 %else
2183 lea r5, [tab_ChromaCoeffVer_32 + r4]
2184 %endif
2185
2186 lea r4, [r1 * 3]
2187
2188 movd xm1, [r0]
2189 pinsrd xm1, [r0 + r1], 1
2190 pinsrd xm1, [r0 + r1 * 2], 2
2191 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0]
2192 lea r0, [r0 + r1 * 4]
2193 movd xm2, [r0]
2194 pinsrd xm2, [r0 + r1], 1
2195 pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4]
2196 vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0]
2197 mova m2, [interp4_vpp_shuf1]
2198 vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0]
2199 mova m2, [interp4_vpp_shuf1 + mmsize]
2200 vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2]
2201
2202 mova m2, [interp4_vpp_shuf]
2203 pshufb m0, m0, m2
2204 pshufb m1, m1, m2
2205 pmaddubsw m0, [r5]
2206 pmaddubsw m1, [r5 + mmsize]
2207 paddw m0, m1 ; m0 = WORD ROW[3 2 1 0]
2208 pmulhrsw m0, [pw_512]
2209 vextracti128 xm1, m0, 1
2210 packuswb xm0, xm1
2211 lea r5, [r3 * 3]
2212 movd [r2], xm0
2213 pextrd [r2 + r3], xm0, 1
2214 pextrd [r2 + r3 * 2], xm0, 2
2215 pextrd [r2 + r5], xm0, 3
2216 RET
2217
2218 ;-----------------------------------------------------------------------------
2219 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2220 ;-----------------------------------------------------------------------------
2221 %macro FILTER_V4_W4_H4 2
2222 INIT_XMM sse4
2223 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
2224
2225 mov r4d, r4m
2226 sub r0, r1
2227
2228 %ifdef PIC
2229 lea r5, [tab_ChromaCoeff]
2230 movd m0, [r5 + r4 * 4]
2231 %else
2232 movd m0, [tab_ChromaCoeff + r4 * 4]
2233 %endif
2234
2235 pshufb m0, [tab_Cm]
2236
2237 mova m1, [pw_512]
2238
2239 mov r4d, %2
2240
2241 lea r5, [3 * r1]
2242
2243 .loop:
2244 movd m2, [r0]
2245 movd m3, [r0 + r1]
2246 movd m4, [r0 + 2 * r1]
2247 movd m5, [r0 + r5]
2248
2249 punpcklbw m2, m3
2250 punpcklbw m6, m4, m5
2251 punpcklbw m2, m6
2252
2253 pmaddubsw m2, m0
2254
2255 lea r0, [r0 + 4 * r1]
2256 movd m6, [r0]
2257
2258 punpcklbw m3, m4
2259 punpcklbw m7, m5, m6
2260 punpcklbw m3, m7
2261
2262 pmaddubsw m3, m0
2263
2264 phaddw m2, m3
2265
2266 pmulhrsw m2, m1
2267
2268 movd m7, [r0 + r1]
2269
2270 punpcklbw m4, m5
2271 punpcklbw m3, m6, m7
2272 punpcklbw m4, m3
2273
2274 pmaddubsw m4, m0
2275
2276 movd m3, [r0 + 2 * r1]
2277
2278 punpcklbw m5, m6
2279 punpcklbw m7, m3
2280 punpcklbw m5, m7
2281
2282 pmaddubsw m5, m0
2283
2284 phaddw m4, m5
2285
2286 pmulhrsw m4, m1
2287 packuswb m2, m4
2288 movd [r2], m2
2289 pextrd [r2 + r3], m2, 1
2290 lea r2, [r2 + 2 * r3]
2291 pextrd [r2], m2, 2
2292 pextrd [r2 + r3], m2, 3
2293
2294 lea r2, [r2 + 2 * r3]
2295
2296 sub r4, 4
2297 jnz .loop
2298 RET
2299 %endmacro
2300
2301 FILTER_V4_W4_H4 4, 8
2302 FILTER_V4_W4_H4 4, 16
2303
2304 FILTER_V4_W4_H4 4, 32
2305
2306 %macro FILTER_V4_W8_H2 0
2307 punpcklbw m1, m2
2308 punpcklbw m7, m3, m0
2309
2310 pmaddubsw m1, m6
2311 pmaddubsw m7, m5
2312
2313 paddw m1, m7
2314
2315 pmulhrsw m1, m4
2316 packuswb m1, m1
2317 %endmacro
2318
2319 %macro FILTER_V4_W8_H3 0
2320 punpcklbw m2, m3
2321 punpcklbw m7, m0, m1
2322
2323 pmaddubsw m2, m6
2324 pmaddubsw m7, m5
2325
2326 paddw m2, m7
2327
2328 pmulhrsw m2, m4
2329 packuswb m2, m2
2330 %endmacro
2331
2332 %macro FILTER_V4_W8_H4 0
2333 punpcklbw m3, m0
2334 punpcklbw m7, m1, m2
2335
2336 pmaddubsw m3, m6
2337 pmaddubsw m7, m5
2338
2339 paddw m3, m7
2340
2341 pmulhrsw m3, m4
2342 packuswb m3, m3
2343 %endmacro
2344
2345 %macro FILTER_V4_W8_H5 0
2346 punpcklbw m0, m1
2347 punpcklbw m7, m2, m3
2348
2349 pmaddubsw m0, m6
2350 pmaddubsw m7, m5
2351
2352 paddw m0, m7
2353
2354 pmulhrsw m0, m4
2355 packuswb m0, m0
2356 %endmacro
2357
2358 %macro FILTER_V4_W8_8x2 2
2359 FILTER_V4_W8 %1, %2
2360 movq m0, [r0 + 4 * r1]
2361
2362 FILTER_V4_W8_H2
2363
2364 movh [r2 + r3], m1
2365 %endmacro
2366
2367 %macro FILTER_V4_W8_8x4 2
2368 FILTER_V4_W8_8x2 %1, %2
2369 ;8x3
2370 lea r6, [r0 + 4 * r1]
2371 movq m1, [r6 + r1]
2372
2373 FILTER_V4_W8_H3
2374
2375 movh [r2 + 2 * r3], m2
2376
2377 ;8x4
2378 movq m2, [r6 + 2 * r1]
2379
2380 FILTER_V4_W8_H4
2381
2382 lea r5, [r2 + 2 * r3]
2383 movh [r5 + r3], m3
2384 %endmacro
2385
2386 %macro FILTER_V4_W8_8x6 2
2387 FILTER_V4_W8_8x4 %1, %2
2388 ;8x5
2389 lea r6, [r6 + 2 * r1]
2390 movq m3, [r6 + r1]
2391
2392 FILTER_V4_W8_H5
2393
2394 movh [r2 + 4 * r3], m0
2395
2396 ;8x6
2397 movq m0, [r0 + 8 * r1]
2398
2399 FILTER_V4_W8_H2
2400
2401 lea r5, [r2 + 4 * r3]
2402 movh [r5 + r3], m1
2403 %endmacro
2404
2405 ;-----------------------------------------------------------------------------
2406 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2407 ;-----------------------------------------------------------------------------
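; Note: FILTER_V4_W8 itself only produces the first 8-pixel output row; the
; 8x2/8x4/8x6 wrappers above chain it with the FILTER_V4_W8_H2..H5 helpers so
; the source rows already held in registers are reused for the remaining
; outputs, and each instantiation is closed by the RET that follows it.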
2408 %macro FILTER_V4_W8 2
2409 INIT_XMM sse4
2410 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
2411
2412 mov r4d, r4m
2413
2414 sub r0, r1
2415 movq m0, [r0]
2416 movq m1, [r0 + r1]
2417 movq m2, [r0 + 2 * r1]
2418 lea r5, [r0 + 2 * r1]
2419 movq m3, [r5 + r1]
2420
2421 punpcklbw m0, m1
2422 punpcklbw m4, m2, m3
2423
2424 %ifdef PIC
2425 lea r6, [tab_ChromaCoeff]
2426 movd m5, [r6 + r4 * 4]
2427 %else
2428 movd m5, [tab_ChromaCoeff + r4 * 4]
2429 %endif
2430
2431 pshufb m6, m5, [tab_Vm]
2432 pmaddubsw m0, m6
2433
2434 pshufb m5, [tab_Vm + 16]
2435 pmaddubsw m4, m5
2436
2437 paddw m0, m4
2438
2439 mova m4, [pw_512]
2440
2441 pmulhrsw m0, m4
2442 packuswb m0, m0
2443 movh [r2], m0
2444 %endmacro
2445
2446 ;-----------------------------------------------------------------------------
2447 ; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2448 ;-----------------------------------------------------------------------------
2449 FILTER_V4_W8_8x2 8, 2
2450
2451 RET
2452
2453 ;-----------------------------------------------------------------------------
2454 ; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2455 ;-----------------------------------------------------------------------------
2456 FILTER_V4_W8_8x4 8, 4
2457
2458 RET
2459
2460 ;-----------------------------------------------------------------------------
2461 ; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2462 ;-----------------------------------------------------------------------------
2463 FILTER_V4_W8_8x6 8, 6
2464
2465 RET
2466
2467 ;-------------------------------------------------------------------------------------------------------------
2468 ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2469 ;-------------------------------------------------------------------------------------------------------------
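; Note: the _ps variants below keep 16-bit intermediate precision: the filtered
; sum (still scaled by the coefficient sum, 64) is neither rounded nor clamped,
; but has pw_2000 (8192, presumably the internal offset of the two-stage
; filter) subtracted and is stored as int16_t, hence the doubled dstStride
; (add r3d, r3d).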
2470 INIT_XMM sse4
2471 cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
2472
2473 mov r4d, r4m
2474 sub r0, r1
2475 add r3d, r3d
2476
2477 %ifdef PIC
2478 lea r5, [tab_ChromaCoeff]
2479 movd m0, [r5 + r4 * 4]
2480 %else
2481 movd m0, [tab_ChromaCoeff + r4 * 4]
2482 %endif
2483
2484 pshufb m0, [tab_Cm]
2485
2486 movd m2, [r0]
2487 movd m3, [r0 + r1]
2488 lea r5, [r0 + 2 * r1]
2489 movd m4, [r5]
2490 movd m5, [r5 + r1]
2491
2492 punpcklbw m2, m3
2493 punpcklbw m1, m4, m5
2494 punpcklbw m2, m1
2495
2496 pmaddubsw m2, m0
2497
2498 movd m1, [r0 + 4 * r1]
2499
2500 punpcklbw m3, m4
2501 punpcklbw m5, m1
2502 punpcklbw m3, m5
2503
2504 pmaddubsw m3, m0
2505
2506 phaddw m2, m3
2507
2508 psubw m2, [pw_2000]
2509 movh [r2], m2
2510 movhps [r2 + r3], m2
2511
2512 RET
2513
2514 ;-------------------------------------------------------------------------------------------------------------
2515 ; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2516 ;-------------------------------------------------------------------------------------------------------------
2517 INIT_XMM sse4
2518 cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
2519
2520 mov r4d, r4m
2521 sub r0, r1
2522 add r3d, r3d
2523
2524 %ifdef PIC
2525 lea r5, [tab_ChromaCoeff]
2526 movd m0, [r5 + r4 * 4]
2527 %else
2528 movd m0, [tab_ChromaCoeff + r4 * 4]
2529 %endif
2530
2531 pshufb m0, [tab_Cm]
2532
2533 lea r4, [r1 * 3]
2534 lea r5, [r0 + 4 * r1]
2535
2536 movd m2, [r0]
2537 movd m3, [r0 + r1]
2538 movd m4, [r0 + 2 * r1]
2539 movd m5, [r0 + r4]
2540
2541 punpcklbw m2, m3
2542 punpcklbw m6, m4, m5
2543 punpcklbw m2, m6
2544
2545 pmaddubsw m2, m0
2546
2547 movd m6, [r5]
2548
2549 punpcklbw m3, m4
2550 punpcklbw m1, m5, m6
2551 punpcklbw m3, m1
2552
2553 pmaddubsw m3, m0
2554
2555 phaddw m2, m3
2556
2557 mova m1, [pw_2000]
2558
2559 psubw m2, m1
2560 movh [r2], m2
2561 movhps [r2 + r3], m2
2562
2563 movd m2, [r5 + r1]
2564
2565 punpcklbw m4, m5
2566 punpcklbw m3, m6, m2
2567 punpcklbw m4, m3
2568
2569 pmaddubsw m4, m0
2570
2571 movd m3, [r5 + 2 * r1]
2572
2573 punpcklbw m5, m6
2574 punpcklbw m2, m3
2575 punpcklbw m5, m2
2576
2577 pmaddubsw m5, m0
2578
2579 phaddw m4, m5
2580
2581 psubw m4, m1
2582 lea r2, [r2 + 2 * r3]
2583 movh [r2], m4
2584 movhps [r2 + r3], m4
2585
2586 RET
2587
2588 ;---------------------------------------------------------------------------------------------------------------
2589 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2590 ;---------------------------------------------------------------------------------------------------------------
2591 %macro FILTER_V_PS_W4_H4 2
2592 INIT_XMM sse4
2593 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
2594
2595 mov r4d, r4m
2596 sub r0, r1
2597 add r3d, r3d
2598
2599 %ifdef PIC
2600 lea r5, [tab_ChromaCoeff]
2601 movd m0, [r5 + r4 * 4]
2602 %else
2603 movd m0, [tab_ChromaCoeff + r4 * 4]
2604 %endif
2605
2606 pshufb m0, [tab_Cm]
2607
2608 mova m1, [pw_2000]
2609
2610 mov r4d, %2/4
2611 lea r5, [3 * r1]
2612
2613 .loop:
2614 movd m2, [r0]
2615 movd m3, [r0 + r1]
2616 movd m4, [r0 + 2 * r1]
2617 movd m5, [r0 + r5]
2618
2619 punpcklbw m2, m3
2620 punpcklbw m6, m4, m5
2621 punpcklbw m2, m6
2622
2623 pmaddubsw m2, m0
2624
2625 lea r0, [r0 + 4 * r1]
2626 movd m6, [r0]
2627
2628 punpcklbw m3, m4
2629 punpcklbw m7, m5, m6
2630 punpcklbw m3, m7
2631
2632 pmaddubsw m3, m0
2633
2634 phaddw m2, m3
2635
2636 psubw m2, m1
2637 movh [r2], m2
2638 movhps [r2 + r3], m2
2639
2640 movd m2, [r0 + r1]
2641
2642 punpcklbw m4, m5
2643 punpcklbw m3, m6, m2
2644 punpcklbw m4, m3
2645
2646 pmaddubsw m4, m0
2647
2648 movd m3, [r0 + 2 * r1]
2649
2650 punpcklbw m5, m6
2651 punpcklbw m2, m3
2652 punpcklbw m5, m2
2653
2654 pmaddubsw m5, m0
2655
2656 phaddw m4, m5
2657
2658 psubw m4, m1
2659 lea r2, [r2 + 2 * r3]
2660 movh [r2], m4
2661 movhps [r2 + r3], m4
2662
2663 lea r2, [r2 + 2 * r3]
2664
2665 dec r4d
2666 jnz .loop
2667 RET
2668 %endmacro
2669
2670 FILTER_V_PS_W4_H4 4, 8
2671 FILTER_V_PS_W4_H4 4, 16
2672
2673 FILTER_V_PS_W4_H4 4, 32
2674
2675 ;--------------------------------------------------------------------------------------------------------------
2676 ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2677 ;--------------------------------------------------------------------------------------------------------------
2678 %macro FILTER_V_PS_W8_H8_H16_H2 2
2679 INIT_XMM sse4
2680 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
2681
2682 mov r4d, r4m
2683 sub r0, r1
2684 add r3d, r3d
2685
2686 %ifdef PIC
2687 lea r5, [tab_ChromaCoeff]
2688 movd m5, [r5 + r4 * 4]
2689 %else
2690 movd m5, [tab_ChromaCoeff + r4 * 4]
2691 %endif
2692
2693 pshufb m6, m5, [tab_Vm]
2694 pshufb m5, [tab_Vm + 16]
2695 mova m4, [pw_2000]
2696
2697 mov r4d, %2/2
2698 lea r5, [3 * r1]
2699
2700 .loopH:
2701 movq m0, [r0]
2702 movq m1, [r0 + r1]
2703 movq m2, [r0 + 2 * r1]
2704 movq m3, [r0 + r5]
2705
2706 punpcklbw m0, m1
2707 punpcklbw m1, m2
2708 punpcklbw m2, m3
2709
2710 pmaddubsw m0, m6
2711 pmaddubsw m2, m5
2712
2713 paddw m0, m2
2714
2715 psubw m0, m4
2716 movu [r2], m0
2717
2718 movq m0, [r0 + 4 * r1]
2719
2720 punpcklbw m3, m0
2721
2722 pmaddubsw m1, m6
2723 pmaddubsw m3, m5
2724
2725 paddw m1, m3
2726 psubw m1, m4
2727
2728 movu [r2 + r3], m1
2729
2730 lea r0, [r0 + 2 * r1]
2731 lea r2, [r2 + 2 * r3]
2732
2733 dec r4d
2734 jnz .loopH
2735
2736 RET
2737 %endmacro
2738
2739 FILTER_V_PS_W8_H8_H16_H2 8, 2
2740 FILTER_V_PS_W8_H8_H16_H2 8, 4
2741 FILTER_V_PS_W8_H8_H16_H2 8, 6
2742
2743 FILTER_V_PS_W8_H8_H16_H2 8, 12
2744 FILTER_V_PS_W8_H8_H16_H2 8, 64
2745
2746 ;--------------------------------------------------------------------------------------------------------------
2747 ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2748 ;--------------------------------------------------------------------------------------------------------------
2749 %macro FILTER_V_PS_W8_H8_H16_H32 2
2750 INIT_XMM sse4
2751 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
2752
2753 mov r4d, r4m
2754 sub r0, r1
2755 add r3d, r3d
2756
2757 %ifdef PIC
2758 lea r5, [tab_ChromaCoeff]
2759 movd m5, [r5 + r4 * 4]
2760 %else
2761 movd m5, [tab_ChromaCoeff + r4 * 4]
2762 %endif
2763
2764 pshufb m6, m5, [tab_Vm]
2765 pshufb m5, [tab_Vm + 16]
2766 mova m4, [pw_2000]
2767
2768 mov r4d, %2/4
2769 lea r5, [3 * r1]
2770
2771 .loop:
2772 movq m0, [r0]
2773 movq m1, [r0 + r1]
2774 movq m2, [r0 + 2 * r1]
2775 movq m3, [r0 + r5]
2776
2777 punpcklbw m0, m1
2778 punpcklbw m1, m2
2779 punpcklbw m2, m3
2780
2781 pmaddubsw m0, m6
2782 pmaddubsw m7, m2, m5
2783
2784 paddw m0, m7
2785
2786 psubw m0, m4
2787 movu [r2], m0
2788
2789 lea r0, [r0 + 4 * r1]
2790 movq m0, [r0]
2791
2792 punpcklbw m3, m0
2793
2794 pmaddubsw m1, m6
2795 pmaddubsw m7, m3, m5
2796
2797 paddw m1, m7
2798
2799 psubw m1, m4
2800 movu [r2 + r3], m1
2801
2802 movq m1, [r0 + r1]
2803
2804 punpcklbw m0, m1
2805
2806 pmaddubsw m2, m6
2807 pmaddubsw m0, m5
2808
2809 paddw m2, m0
2810
2811 psubw m2, m4
2812 lea r2, [r2 + 2 * r3]
2813 movu [r2], m2
2814
2815 movq m2, [r0 + 2 * r1]
2816
2817 punpcklbw m1, m2
2818
2819 pmaddubsw m3, m6
2820 pmaddubsw m1, m5
2821
2822 paddw m3, m1
2823 psubw m3, m4
2824
2825 movu [r2 + r3], m3
2826
2827 lea r2, [r2 + 2 * r3]
2828
2829 dec r4d
2830 jnz .loop
2831 RET
2832 %endmacro
2833
2834 FILTER_V_PS_W8_H8_H16_H32 8, 8
2835 FILTER_V_PS_W8_H8_H16_H32 8, 16
2836 FILTER_V_PS_W8_H8_H16_H32 8, 32
2837
2838 ;------------------------------------------------------------------------------------------------------------
2839 ;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2840 ;------------------------------------------------------------------------------------------------------------
2841 %macro FILTER_V_PS_W6 2
2842 INIT_XMM sse4
2843 cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
2844
2845 mov r4d, r4m
2846 sub r0, r1
2847 add r3d, r3d
2848
2849 %ifdef PIC
2850 lea r5, [tab_ChromaCoeff]
2851 movd m5, [r5 + r4 * 4]
2852 %else
2853 movd m5, [tab_ChromaCoeff + r4 * 4]
2854 %endif
2855
2856 pshufb m6, m5, [tab_Vm]
2857 pshufb m5, [tab_Vm + 16]
2858 mova m4, [pw_2000]
2859 lea r5, [3 * r1]
2860 mov r4d, %2/4
2861
2862 .loop:
2863 movq m0, [r0]
2864 movq m1, [r0 + r1]
2865 movq m2, [r0 + 2 * r1]
2866 movq m3, [r0 + r5]
2867
2868 punpcklbw m0, m1
2869 punpcklbw m1, m2
2870 punpcklbw m2, m3
2871
2872 pmaddubsw m0, m6
2873 pmaddubsw m7, m2, m5
2874
2875 paddw m0, m7
2876 psubw m0, m4
2877
2878 movh [r2], m0
2879 pshufd m0, m0, 2
2880 movd [r2 + 8], m0
2881
2882 lea r0, [r0 + 4 * r1]
2883 movq m0, [r0]
2884 punpcklbw m3, m0
2885
2886 pmaddubsw m1, m6
2887 pmaddubsw m7, m3, m5
2888
2889 paddw m1, m7
2890 psubw m1, m4
2891
2892 movh [r2 + r3], m1
2893 pshufd m1, m1, 2
2894 movd [r2 + r3 + 8], m1
2895
2896 movq m1, [r0 + r1]
2897 punpcklbw m0, m1
2898
2899 pmaddubsw m2, m6
2900 pmaddubsw m0, m5
2901
2902 paddw m2, m0
2903 psubw m2, m4
2904
2905 lea r2,[r2 + 2 * r3]
2906 movh [r2], m2
2907 pshufd m2, m2, 2
2908 movd [r2 + 8], m2
2909
2910 movq m2,[r0 + 2 * r1]
2911 punpcklbw m1, m2
2912
2913 pmaddubsw m3, m6
2914 pmaddubsw m1, m5
2915
2916 paddw m3, m1
2917 psubw m3, m4
2918
2919 movh [r2 + r3], m3
2920 pshufd m3, m3, 2
2921 movd [r2 + r3 + 8], m3
2922
2923 lea r2, [r2 + 2 * r3]
2924
2925 dec r4d
2926 jnz .loop
2927 RET
2928 %endmacro
2929
2930 FILTER_V_PS_W6 6, 8
2931 FILTER_V_PS_W6 6, 16
2932
2933 ;---------------------------------------------------------------------------------------------------------------
2934 ; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2935 ;---------------------------------------------------------------------------------------------------------------
2936 %macro FILTER_V_PS_W12 2
2937 INIT_XMM sse4
2938 cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
2939
2940 mov r4d, r4m
2941 sub r0, r1
2942 add r3d, r3d
2943
2944 %ifdef PIC
2945 lea r5, [tab_ChromaCoeff]
2946 movd m0, [r5 + r4 * 4]
2947 %else
2948 movd m0, [tab_ChromaCoeff + r4 * 4]
2949 %endif
2950
2951 pshufb m1, m0, [tab_Vm]
2952 pshufb m0, [tab_Vm + 16]
2953
2954 mov r4d, %2/2
2955
2956 .loop:
2957 movu m2, [r0]
2958 movu m3, [r0 + r1]
2959
2960 punpcklbw m4, m2, m3
2961 punpckhbw m2, m3
2962
2963 pmaddubsw m4, m1
2964 pmaddubsw m2, m1
2965
2966 lea r0, [r0 + 2 * r1]
2967 movu m5, [r0]
2968 movu m7, [r0 + r1]
2969
2970 punpcklbw m6, m5, m7
2971 pmaddubsw m6, m0
2972 paddw m4, m6
2973
2974 punpckhbw m6, m5, m7
2975 pmaddubsw m6, m0
2976 paddw m2, m6
2977
2978 mova m6, [pw_2000]
2979
2980 psubw m4, m6
2981 psubw m2, m6
2982
2983 movu [r2], m4
2984 movh [r2 + 16], m2
2985
2986 punpcklbw m4, m3, m5
2987 punpckhbw m3, m5
2988
2989 pmaddubsw m4, m1
2990 pmaddubsw m3, m1
2991
2992 movu m2, [r0 + 2 * r1]
2993
2994 punpcklbw m5, m7, m2
2995 punpckhbw m7, m2
2996
2997 pmaddubsw m5, m0
2998 pmaddubsw m7, m0
2999
3000 paddw m4, m5
3001 paddw m3, m7
3002
3003 psubw m4, m6
3004 psubw m3, m6
3005
3006 movu [r2 + r3], m4
3007 movh [r2 + r3 + 16], m3
3008
3009 lea r2, [r2 + 2 * r3]
3010
3011 dec r4d
3012 jnz .loop
3013 RET
3014 %endmacro
3015
3016 FILTER_V_PS_W12 12, 16
3017 FILTER_V_PS_W12 12, 32
3018
3019 ;---------------------------------------------------------------------------------------------------------------
3020 ; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3021 ;---------------------------------------------------------------------------------------------------------------
3022 %macro FILTER_V_PS_W16 2
3023 INIT_XMM sse4
3024 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
3025
3026 mov r4d, r4m
3027 sub r0, r1
3028 add r3d, r3d
3029
3030 %ifdef PIC
3031 lea r5, [tab_ChromaCoeff]
3032 movd m0, [r5 + r4 * 4]
3033 %else
3034 movd m0, [tab_ChromaCoeff + r4 * 4]
3035 %endif
3036
3037 pshufb m1, m0, [tab_Vm]
3038 pshufb m0, [tab_Vm + 16]
3039 mov r4d, %2/2
3040
3041 .loop:
3042 movu m2, [r0]
3043 movu m3, [r0 + r1]
3044
3045 punpcklbw m4, m2, m3
3046 punpckhbw m2, m3
3047
3048 pmaddubsw m4, m1
3049 pmaddubsw m2, m1
3050
3051 lea r0, [r0 + 2 * r1]
3052 movu m5, [r0]
3053 movu m7, [r0 + r1]
3054
3055 punpcklbw m6, m5, m7
3056 pmaddubsw m6, m0
3057 paddw m4, m6
3058
3059 punpckhbw m6, m5, m7
3060 pmaddubsw m6, m0
3061 paddw m2, m6
3062
3063 mova m6, [pw_2000]
3064
3065 psubw m4, m6
3066 psubw m2, m6
3067
3068 movu [r2], m4
3069 movu [r2 + 16], m2
3070
3071 punpcklbw m4, m3, m5
3072 punpckhbw m3, m5
3073
3074 pmaddubsw m4, m1
3075 pmaddubsw m3, m1
3076
3077 movu m5, [r0 + 2 * r1]
3078
3079 punpcklbw m2, m7, m5
3080 punpckhbw m7, m5
3081
3082 pmaddubsw m2, m0
3083 pmaddubsw m7, m0
3084
3085 paddw m4, m2
3086 paddw m3, m7
3087
3088 psubw m4, m6
3089 psubw m3, m6
3090
3091 movu [r2 + r3], m4
3092 movu [r2 + r3 + 16], m3
3093
3094 lea r2, [r2 + 2 * r3]
3095
3096 dec r4d
3097 jnz .loop
3098 RET
3099 %endmacro
3100
3101 FILTER_V_PS_W16 16, 4
3102 FILTER_V_PS_W16 16, 8
3103 FILTER_V_PS_W16 16, 12
3104 FILTER_V_PS_W16 16, 16
3105 FILTER_V_PS_W16 16, 32
3106
3107 FILTER_V_PS_W16 16, 24
3108 FILTER_V_PS_W16 16, 64
3109
3110 ;--------------------------------------------------------------------------------------------------------------
3111 ;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3112 ;--------------------------------------------------------------------------------------------------------------
3113 %macro FILTER_V4_PS_W24 2
3114 INIT_XMM sse4
3115 cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
3116
3117 mov r4d, r4m
3118 sub r0, r1
3119 add r3d, r3d
3120
3121 %ifdef PIC
3122 lea r5, [tab_ChromaCoeff]
3123 movd m0, [r5 + r4 * 4]
3124 %else
3125 movd m0, [tab_ChromaCoeff + r4 * 4]
3126 %endif
3127
3128 pshufb m1, m0, [tab_Vm]
3129 pshufb m0, [tab_Vm + 16]
3130
3131 mov r4d, %2/2
3132
3133 .loop:
3134 movu m2, [r0]
3135 movu m3, [r0 + r1]
3136
3137 punpcklbw m4, m2, m3
3138 punpckhbw m2, m3
3139
3140 pmaddubsw m4, m1
3141 pmaddubsw m2, m1
3142
3143 lea r5, [r0 + 2 * r1]
3144
3145 movu m5, [r5]
3146 movu m7, [r5 + r1]
3147
3148 punpcklbw m6, m5, m7
3149 pmaddubsw m6, m0
3150 paddw m4, m6
3151
3152 punpckhbw m6, m5, m7
3153 pmaddubsw m6, m0
3154 paddw m2, m6
3155
3156 mova m6, [pw_2000]
3157
3158 psubw m4, m6
3159 psubw m2, m6
3160
3161 movu [r2], m4
3162 movu [r2 + 16], m2
3163
3164 punpcklbw m4, m3, m5
3165 punpckhbw m3, m5
3166
3167 pmaddubsw m4, m1
3168 pmaddubsw m3, m1
3169
3170 movu m2, [r5 + 2 * r1]
3171
3172 punpcklbw m5, m7, m2
3173 punpckhbw m7, m2
3174
3175 pmaddubsw m5, m0
3176 pmaddubsw m7, m0
3177
3178 paddw m4, m5
3179 paddw m3, m7
3180
3181 psubw m4, m6
3182 psubw m3, m6
3183
3184 movu [r2 + r3], m4
3185 movu [r2 + r3 + 16], m3
3186
3187 movq m2, [r0 + 16]
3188 movq m3, [r0 + r1 + 16]
3189 movq m4, [r5 + 16]
3190 movq m5, [r5 + r1 + 16]
3191
3192 punpcklbw m2, m3
3193 punpcklbw m7, m4, m5
3194
3195 pmaddubsw m2, m1
3196 pmaddubsw m7, m0
3197
3198 paddw m2, m7
3199 psubw m2, m6
3200
3201 movu [r2 + 32], m2
3202
3203 movq m2, [r5 + 2 * r1 + 16]
3204
3205 punpcklbw m3, m4
3206 punpcklbw m5, m2
3207
3208 pmaddubsw m3, m1
3209 pmaddubsw m5, m0
3210
3211 paddw m3, m5
3212 psubw m3, m6
3213
3214 movu [r2 + r3 + 32], m3
3215
3216 mov r0, r5
3217 lea r2, [r2 + 2 * r3]
3218
3219 dec r4d
3220 jnz .loop
3221 RET
3222 %endmacro
3223
3224 FILTER_V4_PS_W24 24, 32
3225
3226 FILTER_V4_PS_W24 24, 64
3227
3228 ;---------------------------------------------------------------------------------------------------------------
3229 ; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3230 ;---------------------------------------------------------------------------------------------------------------
3231 %macro FILTER_V_PS_W32 2
3232 INIT_XMM sse4
3233 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
3234
3235 mov r4d, r4m
3236 sub r0, r1
3237 add r3d, r3d
3238
3239 %ifdef PIC
3240 lea r5, [tab_ChromaCoeff]
3241 movd m0, [r5 + r4 * 4]
3242 %else
3243 movd m0, [tab_ChromaCoeff + r4 * 4]
3244 %endif
3245
3246 pshufb m1, m0, [tab_Vm]
3247 pshufb m0, [tab_Vm + 16]
3248
3249 mova m7, [pw_2000]
3250
3251 mov r4d, %2
3252
3253 .loop:
3254 movu m2, [r0]
3255 movu m3, [r0 + r1]
3256
3257 punpcklbw m4, m2, m3
3258 punpckhbw m2, m3
3259
3260 pmaddubsw m4, m1
3261 pmaddubsw m2, m1
3262
3263 lea r5, [r0 + 2 * r1]
3264 movu m3, [r5]
3265 movu m5, [r5 + r1]
3266
3267 punpcklbw m6, m3, m5
3268 punpckhbw m3, m5
3269
3270 pmaddubsw m6, m0
3271 pmaddubsw m3, m0
3272
3273 paddw m4, m6
3274 paddw m2, m3
3275
3276 psubw m4, m7
3277 psubw m2, m7
3278
3279 movu [r2], m4
3280 movu [r2 + 16], m2
3281
3282 movu m2, [r0 + 16]
3283 movu m3, [r0 + r1 + 16]
3284
3285 punpcklbw m4, m2, m3
3286 punpckhbw m2, m3
3287
3288 pmaddubsw m4, m1
3289 pmaddubsw m2, m1
3290
3291 movu m3, [r5 + 16]
3292 movu m5, [r5 + r1 + 16]
3293
3294 punpcklbw m6, m3, m5
3295 punpckhbw m3, m5
3296
3297 pmaddubsw m6, m0
3298 pmaddubsw m3, m0
3299
3300 paddw m4, m6
3301 paddw m2, m3
3302
3303 psubw m4, m7
3304 psubw m2, m7
3305
3306 movu [r2 + 32], m4
3307 movu [r2 + 48], m2
3308
3309 lea r0, [r0 + r1]
3310 lea r2, [r2 + r3]
3311
3312 dec r4d
3313 jnz .loop
3314 RET
3315 %endmacro
3316
3317 FILTER_V_PS_W32 32, 8
3318 FILTER_V_PS_W32 32, 16
3319 FILTER_V_PS_W32 32, 24
3320 FILTER_V_PS_W32 32, 32
3321
3322 FILTER_V_PS_W32 32, 48
3323 FILTER_V_PS_W32 32, 64
3324
3325 ;-----------------------------------------------------------------------------
3326 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3327 ;-----------------------------------------------------------------------------
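; Note: tab_Vm expands the selected 4-tap filter into two broadcast byte pairs,
; so each output row costs two pmaddubsw, one for taps c0/c1 against rows
; n-1/n and one for taps c2/c3 against rows n+1/n+2, followed by a paddw; the
; loop below emits four 8-pixel rows per iteration.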
3328 %macro FILTER_V4_W8_H8_H16_H32 2
3329 INIT_XMM sse4
3330 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
3331
3332 mov r4d, r4m
3333 sub r0, r1
3334
3335 %ifdef PIC
3336 lea r5, [tab_ChromaCoeff]
3337 movd m5, [r5 + r4 * 4]
3338 %else
3339 movd m5, [tab_ChromaCoeff + r4 * 4]
3340 %endif
3341
3342 pshufb m6, m5, [tab_Vm]
3343 pshufb m5, [tab_Vm + 16]
3344 mova m4, [pw_512]
3345 lea r5, [r1 * 3]
3346
3347 mov r4d, %2
3348
3349 .loop:
3350 movq m0, [r0]
3351 movq m1, [r0 + r1]
3352 movq m2, [r0 + 2 * r1]
3353 movq m3, [r0 + r5]
3354
3355 punpcklbw m0, m1
3356 punpcklbw m1, m2
3357 punpcklbw m2, m3
3358
3359 pmaddubsw m0, m6
3360 pmaddubsw m7, m2, m5
3361
3362 paddw m0, m7
3363
3364 pmulhrsw m0, m4
3365 packuswb m0, m0
3366 movh [r2], m0
3367
3368 lea r0, [r0 + 4 * r1]
3369 movq m0, [r0]
3370
3371 punpcklbw m3, m0
3372
3373 pmaddubsw m1, m6
3374 pmaddubsw m7, m3, m5
3375
3376 paddw m1, m7
3377
3378 pmulhrsw m1, m4
3379 packuswb m1, m1
3380 movh [r2 + r3], m1
3381
3382 movq m1, [r0 + r1]
3383
3384 punpcklbw m0, m1
3385
3386 pmaddubsw m2, m6
3387 pmaddubsw m0, m5
3388
3389 paddw m2, m0
3390
3391 pmulhrsw m2, m4
3392
3393 movq m7, [r0 + 2 * r1]
3394 punpcklbw m1, m7
3395
3396 pmaddubsw m3, m6
3397 pmaddubsw m1, m5
3398
3399 paddw m3, m1
3400
3401 pmulhrsw m3, m4
3402 packuswb m2, m3
3403
3404 lea r2, [r2 + 2 * r3]
3405 movh [r2], m2
3406 movhps [r2 + r3], m2
3407
3408 lea r2, [r2 + 2 * r3]
3409
3410 sub r4, 4
3411 jnz .loop
3412 RET
3413 %endmacro
3414
3415 FILTER_V4_W8_H8_H16_H32 8, 8
3416 FILTER_V4_W8_H8_H16_H32 8, 16
3417 FILTER_V4_W8_H8_H16_H32 8, 32
3418
3419 FILTER_V4_W8_H8_H16_H32 8, 12
3420 FILTER_V4_W8_H8_H16_H32 8, 64
3421
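; Note: the AVX2 helper below packs two interleaved source rows per 128-bit
; lane, with consecutive row pairs stacked across the two lanes of a ymm, so a
; single pmaddubsw against [r5] (taps c0/c1) or [r5 + mmsize] (taps c2/c3)
; advances two output rows at once. On exit m5, m2, m1 and m4 hold the word
; sums for rows 0-1, 2-3, 4-5 and 6-7 respectively.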
3422 %macro PROCESS_CHROMA_AVX2_W8_8R 0
3423 movq xm1, [r0] ; m1 = row 0
3424 movq xm2, [r0 + r1] ; m2 = row 1
3425 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
3426 movq xm3, [r0 + r1 * 2] ; m3 = row 2
3427 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
3428 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
3429 pmaddubsw m5, [r5]
3430 movq xm4, [r0 + r4] ; m4 = row 3
3431 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
3432 lea r0, [r0 + r1 * 4]
3433 movq xm1, [r0] ; m1 = row 4
3434 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
3435 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
3436 pmaddubsw m0, m2, [r5 + 1 * mmsize]
3437 paddw m5, m0
3438 pmaddubsw m2, [r5]
3439 movq xm3, [r0 + r1] ; m3 = row 5
3440 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
3441 movq xm4, [r0 + r1 * 2] ; m4 = row 6
3442 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
3443 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
3444 pmaddubsw m0, m1, [r5 + 1 * mmsize]
3445 paddw m2, m0
3446 pmaddubsw m1, [r5]
3447 movq xm3, [r0 + r4] ; m3 = row 7
3448 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
3449 lea r0, [r0 + r1 * 4]
3450 movq xm0, [r0] ; m0 = row 8
3451 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
3452 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
3453 pmaddubsw m3, m4, [r5 + 1 * mmsize]
3454 paddw m1, m3
3455 pmaddubsw m4, [r5]
3456 movq xm3, [r0 + r1] ; m3 = row 9
3457 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
3458 movq xm6, [r0 + r1 * 2] ; m6 = row 10
3459 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
3460 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
3461 pmaddubsw m0, [r5 + 1 * mmsize]
3462 paddw m4, m0
3463 %endmacro
3464
3465 INIT_YMM avx2
3466 cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
3467 mov r4d, r4m
3468 shl r4d, 6
3469
3470 %ifdef PIC
3471 lea r5, [tab_ChromaCoeffVer_32]
3472 add r5, r4
3473 %else
3474 lea r5, [tab_ChromaCoeffVer_32 + r4]
3475 %endif
3476
3477 lea r4, [r1 * 3]
3478 sub r0, r1
3479 PROCESS_CHROMA_AVX2_W8_8R
3480 lea r4, [r3 * 3]
3481 mova m3, [pw_512]
3482 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
3483 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
3484 pmulhrsw m1, m3 ; m1 = word: row 4, row 5
3485 pmulhrsw m4, m3 ; m4 = word: row 6, row 7
3486 packuswb m5, m2
3487 packuswb m1, m4
3488 vextracti128 xm2, m5, 1
3489 vextracti128 xm4, m1, 1
3490 movq [r2], xm5
3491 movq [r2 + r3], xm2
3492 movhps [r2 + r3 * 2], xm5
3493 movhps [r2 + r4], xm2
3494 lea r2, [r2 + r3 * 4]
3495 movq [r2], xm1
3496 movq [r2 + r3], xm4
3497 movhps [r2 + r3 * 2], xm1
3498 movhps [r2 + r4], xm4
3499 RET
3500
3501 ;-----------------------------------------------------------------------------
3502 ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3503 ;-----------------------------------------------------------------------------
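; Note: for 6-wide blocks each packed result row is stored as a 4-byte movd
; plus a 2-byte pextrw at offset 4, so exactly 6 pixels are written per row.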
3504 %macro FILTER_V4_W6_H4 2
3505 INIT_XMM sse4
3506 cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
3507
3508 mov r4d, r4m
3509 sub r0, r1
3510
3511 %ifdef PIC
3512 lea r5, [tab_ChromaCoeff]
3513 movd m5, [r5 + r4 * 4]
3514 %else
3515 movd m5, [tab_ChromaCoeff + r4 * 4]
3516 %endif
3517
3518 pshufb m6, m5, [tab_Vm]
3519 pshufb m5, [tab_Vm + 16]
3520 mova m4, [pw_512]
3521
3522 mov r4d, %2
3523 lea r5, [3 * r1]
3524
3525 .loop:
3526 movq m0, [r0]
3527 movq m1, [r0 + r1]
3528 movq m2, [r0 + 2 * r1]
3529 movq m3, [r0 + r5]
3530
3531 punpcklbw m0, m1
3532 punpcklbw m1, m2
3533 punpcklbw m2, m3
3534
3535 pmaddubsw m0, m6
3536 pmaddubsw m7, m2, m5
3537
3538 paddw m0, m7
3539
3540 pmulhrsw m0, m4
3541 packuswb m0, m0
3542 movd [r2], m0
3543 pextrw [r2 + 4], m0, 2
3544
3545 lea r0, [r0 + 4 * r1]
3546
3547 movq m0, [r0]
3548 punpcklbw m3, m0
3549
3550 pmaddubsw m1, m6
3551 pmaddubsw m7, m3, m5
3552
3553 paddw m1, m7
3554
3555 pmulhrsw m1, m4
3556 packuswb m1, m1
3557 movd [r2 + r3], m1
3558 pextrw [r2 + r3 + 4], m1, 2
3559
3560 movq m1, [r0 + r1]
3561 punpcklbw m7, m0, m1
3562
3563 pmaddubsw m2, m6
3564 pmaddubsw m7, m5
3565
3566 paddw m2, m7
3567
3568 pmulhrsw m2, m4
3569 packuswb m2, m2
3570 lea r2, [r2 + 2 * r3]
3571 movd [r2], m2
3572 pextrw [r2 + 4], m2, 2
3573
3574 movq m2, [r0 + 2 * r1]
3575 punpcklbw m1, m2
3576
3577 pmaddubsw m3, m6
3578 pmaddubsw m1, m5
3579
3580 paddw m3, m1
3581
3582 pmulhrsw m3, m4
3583 packuswb m3, m3
3584
3585 movd [r2 + r3], m3
3586 pextrw [r2 + r3 + 4], m3, 2
3587
3588 lea r2, [r2 + 2 * r3]
3589
3590 sub r4, 4
3591 jnz .loop
3592 RET
3593 %endmacro
3594
3595 FILTER_V4_W6_H4 6, 8
3596
3597 FILTER_V4_W6_H4 6, 16
3598
3599 ;-----------------------------------------------------------------------------
3600 ; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3601 ;-----------------------------------------------------------------------------
3602 %macro FILTER_V4_W12_H2 2
3603 INIT_XMM sse4
3604 cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
3605
3606 mov r4d, r4m
3607 sub r0, r1
3608
3609 %ifdef PIC
3610 lea r5, [tab_ChromaCoeff]
3611 movd m0, [r5 + r4 * 4]
3612 %else
3613 movd m0, [tab_ChromaCoeff + r4 * 4]
3614 %endif
3615
3616 pshufb m1, m0, [tab_Vm]
3617 pshufb m0, [tab_Vm + 16]
3618
3619 mov r4d, %2
3620
3621 .loop:
3622 movu m2, [r0]
3623 movu m3, [r0 + r1]
3624
3625 punpcklbw m4, m2, m3
3626 punpckhbw m2, m3
3627
3628 pmaddubsw m4, m1
3629 pmaddubsw m2, m1
3630
3631 lea r0, [r0 + 2 * r1]
3632 movu m5, [r0]
3633 movu m7, [r0 + r1]
3634
3635 punpcklbw m6, m5, m7
3636 pmaddubsw m6, m0
3637 paddw m4, m6
3638
3639 punpckhbw m6, m5, m7
3640 pmaddubsw m6, m0
3641 paddw m2, m6
3642
3643 mova m6, [pw_512]
3644
3645 pmulhrsw m4, m6
3646 pmulhrsw m2, m6
3647
3648 packuswb m4, m2
3649
3650 movh [r2], m4
3651 pextrd [r2 + 8], m4, 2
3652
3653 punpcklbw m4, m3, m5
3654 punpckhbw m3, m5
3655
3656 pmaddubsw m4, m1
3657 pmaddubsw m3, m1
3658
3659 movu m5, [r0 + 2 * r1]
3660
3661 punpcklbw m2, m7, m5
3662 punpckhbw m7, m5
3663
3664 pmaddubsw m2, m0
3665 pmaddubsw m7, m0
3666
3667 paddw m4, m2
3668 paddw m3, m7
3669
3670 pmulhrsw m4, m6
3671 pmulhrsw m3, m6
3672
3673 packuswb m4, m3
3674
3675 movh [r2 + r3], m4
3676 pextrd [r2 + r3 + 8], m4, 2
3677
3678 lea r2, [r2 + 2 * r3]
3679
3680 sub r4, 2
3681 jnz .loop
3682 RET
3683 %endmacro
3684
3685 FILTER_V4_W12_H2 12, 16
3686
3687 FILTER_V4_W12_H2 12, 32
3688
3689 ;-----------------------------------------------------------------------------
3690 ; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3691 ;-----------------------------------------------------------------------------
3692 %macro FILTER_V4_W16_H2 2
3693 INIT_XMM sse4
3694 cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
3695
3696 mov r4d, r4m
3697 sub r0, r1
3698
3699 %ifdef PIC
3700 lea r5, [tab_ChromaCoeff]
3701 movd m0, [r5 + r4 * 4]
3702 %else
3703 movd m0, [tab_ChromaCoeff + r4 * 4]
3704 %endif
3705
3706 pshufb m1, m0, [tab_Vm]
3707 pshufb m0, [tab_Vm + 16]
3708
3709 mov r4d, %2/2
3710
3711 .loop:
3712 movu m2, [r0]
3713 movu m3, [r0 + r1]
3714
3715 punpcklbw m4, m2, m3
3716 punpckhbw m2, m3
3717
3718 pmaddubsw m4, m1
3719 pmaddubsw m2, m1
3720
3721 lea r0, [r0 + 2 * r1]
3722 movu m5, [r0]
3723 movu m6, [r0 + r1]
3724
3725 punpckhbw m7, m5, m6
3726 pmaddubsw m7, m0
3727 paddw m2, m7
3728
3729 punpcklbw m7, m5, m6
3730 pmaddubsw m7, m0
3731 paddw m4, m7
3732
3733 mova m7, [pw_512]
3734
3735 pmulhrsw m4, m7
3736 pmulhrsw m2, m7
3737
3738 packuswb m4, m2
3739
3740 movu [r2], m4
3741
3742 punpcklbw m4, m3, m5
3743 punpckhbw m3, m5
3744
3745 pmaddubsw m4, m1
3746 pmaddubsw m3, m1
3747
3748 movu m5, [r0 + 2 * r1]
3749
3750 punpcklbw m2, m6, m5
3751 punpckhbw m6, m5
3752
3753 pmaddubsw m2, m0
3754 pmaddubsw m6, m0
3755
3756 paddw m4, m2
3757 paddw m3, m6
3758
3759 pmulhrsw m4, m7
3760 pmulhrsw m3, m7
3761
3762 packuswb m4, m3
3763
3764 movu [r2 + r3], m4
3765
3766 lea r2, [r2 + 2 * r3]
3767
3768 dec r4d
3769 jnz .loop
3770 RET
3771 %endmacro
3772
3773 FILTER_V4_W16_H2 16, 4
3774 FILTER_V4_W16_H2 16, 8
3775 FILTER_V4_W16_H2 16, 12
3776 FILTER_V4_W16_H2 16, 16
3777 FILTER_V4_W16_H2 16, 32
3778
3779 FILTER_V4_W16_H2 16, 24
3780 FILTER_V4_W16_H2 16, 64
3781
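; Note: the AVX2 16x16 kernel below is guarded by ARCH_X86_64 because it keeps
; the coefficient pairs, the rounding constant and a sliding window of
; interleaved rows in xmm8-xmm14, which do not exist in 32-bit mode.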
3782 INIT_YMM avx2
3783 %if ARCH_X86_64 == 1
3784 cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
3785 mov r4d, r4m
3786 shl r4d, 6
3787
3788 %ifdef PIC
3789 lea r5, [tab_ChromaCoeffVer_32]
3790 add r5, r4
3791 %else
3792 lea r5, [tab_ChromaCoeffVer_32 + r4]
3793 %endif
3794
3795 mova m12, [r5]
3796 mova m13, [r5 + mmsize]
3797 lea r4, [r1 * 3]
3798 sub r0, r1
3799 lea r5, [r3 * 3]
3800 mova m14, [pw_512]
3801
3802 movu xm0, [r0] ; m0 = row 0
3803 movu xm1, [r0 + r1] ; m1 = row 1
3804 punpckhbw xm2, xm0, xm1
3805 punpcklbw xm0, xm1
3806 vinserti128 m0, m0, xm2, 1
3807 pmaddubsw m0, m12
3808 movu xm2, [r0 + r1 * 2] ; m2 = row 2
3809 punpckhbw xm3, xm1, xm2
3810 punpcklbw xm1, xm2
3811 vinserti128 m1, m1, xm3, 1
3812 pmaddubsw m1, m12
3813 movu xm3, [r0 + r4] ; m3 = row 3
3814 punpckhbw xm4, xm2, xm3
3815 punpcklbw xm2, xm3
3816 vinserti128 m2, m2, xm4, 1
3817 pmaddubsw m4, m2, m13
3818 paddw m0, m4
3819 pmaddubsw m2, m12
3820 lea r0, [r0 + r1 * 4]
3821 movu xm4, [r0] ; m4 = row 4
3822 punpckhbw xm5, xm3, xm4
3823 punpcklbw xm3, xm4
3824 vinserti128 m3, m3, xm5, 1
3825 pmaddubsw m5, m3, m13
3826 paddw m1, m5
3827 pmaddubsw m3, m12
3828 movu xm5, [r0 + r1] ; m5 = row 5
3829 punpckhbw xm6, xm4, xm5
3830 punpcklbw xm4, xm5
3831 vinserti128 m4, m4, xm6, 1
3832 pmaddubsw m6, m4, m13
3833 paddw m2, m6
3834 pmaddubsw m4, m12
3835 movu xm6, [r0 + r1 * 2] ; m6 = row 6
3836 punpckhbw xm7, xm5, xm6
3837 punpcklbw xm5, xm6
3838 vinserti128 m5, m5, xm7, 1
3839 pmaddubsw m7, m5, m13
3840 paddw m3, m7
3841 pmaddubsw m5, m12
3842 movu xm7, [r0 + r4] ; m7 = row 7
3843 punpckhbw xm8, xm6, xm7
3844 punpcklbw xm6, xm7
3845 vinserti128 m6, m6, xm8, 1
3846 pmaddubsw m8, m6, m13
3847 paddw m4, m8
3848 pmaddubsw m6, m12
3849 lea r0, [r0 + r1 * 4]
3850 movu xm8, [r0] ; m8 = row 8
3851 punpckhbw xm9, xm7, xm8
3852 punpcklbw xm7, xm8
3853 vinserti128 m7, m7, xm9, 1
3854 pmaddubsw m9, m7, m13
3855 paddw m5, m9
3856 pmaddubsw m7, m12
3857 movu xm9, [r0 + r1] ; m9 = row 9
3858 punpckhbw xm10, xm8, xm9
3859 punpcklbw xm8, xm9
3860 vinserti128 m8, m8, xm10, 1
3861 pmaddubsw m10, m8, m13
3862 paddw m6, m10
3863 pmaddubsw m8, m12
3864 movu xm10, [r0 + r1 * 2] ; m10 = row 10
3865 punpckhbw xm11, xm9, xm10
3866 punpcklbw xm9, xm10
3867 vinserti128 m9, m9, xm11, 1
3868 pmaddubsw m11, m9, m13
3869 paddw m7, m11
3870 pmaddubsw m9, m12
3871
3872 pmulhrsw m0, m14 ; m0 = word: row 0
3873 pmulhrsw m1, m14 ; m1 = word: row 1
3874 pmulhrsw m2, m14 ; m2 = word: row 2
3875 pmulhrsw m3, m14 ; m3 = word: row 3
3876 pmulhrsw m4, m14 ; m4 = word: row 4
3877 pmulhrsw m5, m14 ; m5 = word: row 5
3878 pmulhrsw m6, m14 ; m6 = word: row 6
3879 pmulhrsw m7, m14 ; m7 = word: row 7
3880 packuswb m0, m1
3881 packuswb m2, m3
3882 packuswb m4, m5
3883 packuswb m6, m7
3884 vpermq m0, m0, 11011000b
3885 vpermq m2, m2, 11011000b
3886 vpermq m4, m4, 11011000b
3887 vpermq m6, m6, 11011000b
3888 vextracti128 xm1, m0, 1
3889 vextracti128 xm3, m2, 1
3890 vextracti128 xm5, m4, 1
3891 vextracti128 xm7, m6, 1
3892 movu [r2], xm0
3893 movu [r2 + r3], xm1
3894 movu [r2 + r3 * 2], xm2
3895 movu [r2 + r5], xm3
3896 lea r2, [r2 + r3 * 4]
3897 movu [r2], xm4
3898 movu [r2 + r3], xm5
3899 movu [r2 + r3 * 2], xm6
3900 movu [r2 + r5], xm7
3901 lea r2, [r2 + r3 * 4]
3902
3903 movu xm11, [r0 + r4] ; m11 = row 11
3904 punpckhbw xm6, xm10, xm11
3905 punpcklbw xm10, xm11
3906 vinserti128 m10, m10, xm6, 1
3907 pmaddubsw m6, m10, m13
3908 paddw m8, m6
3909 pmaddubsw m10, m12
3910 lea r0, [r0 + r1 * 4]
3911 movu xm6, [r0] ; m6 = row 12
3912 punpckhbw xm7, xm11, xm6
3913 punpcklbw xm11, xm6
3914 vinserti128 m11, m11, xm7, 1
3915 pmaddubsw m7, m11, m13
3916 paddw m9, m7
3917 pmaddubsw m11, m12
3918
3919 movu xm7, [r0 + r1] ; m7 = row 13
3920 punpckhbw xm0, xm6, xm7
3921 punpcklbw xm6, xm7
3922 vinserti128 m6, m6, xm0, 1
3923 pmaddubsw m0, m6, m13
3924 paddw m10, m0
3925 pmaddubsw m6, m12
3926 movu xm0, [r0 + r1 * 2] ; m0 = row 14
3927 punpckhbw xm1, xm7, xm0
3928 punpcklbw xm7, xm0
3929 vinserti128 m7, m7, xm1, 1
3930 pmaddubsw m1, m7, m13
3931 paddw m11, m1
3932 pmaddubsw m7, m12
3933 movu xm1, [r0 + r4] ; m1 = row 15
3934 punpckhbw xm2, xm0, xm1
3935 punpcklbw xm0, xm1
3936 vinserti128 m0, m0, xm2, 1
3937 pmaddubsw m2, m0, m13
3938 paddw m6, m2
3939 pmaddubsw m0, m12
3940 lea r0, [r0 + r1 * 4]
3941 movu xm2, [r0] ; m2 = row 16
3942 punpckhbw xm3, xm1, xm2
3943 punpcklbw xm1, xm2
3944 vinserti128 m1, m1, xm3, 1
3945 pmaddubsw m3, m1, m13
3946 paddw m7, m3
3947 pmaddubsw m1, m12
3948 movu xm3, [r0 + r1] ; m3 = row 17
3949 punpckhbw xm4, xm2, xm3
3950 punpcklbw xm2, xm3
3951 vinserti128 m2, m2, xm4, 1
3952 pmaddubsw m2, m13
3953 paddw m0, m2
3954 movu xm4, [r0 + r1 * 2] ; m4 = row 18
3955 punpckhbw xm5, xm3, xm4
3956 punpcklbw xm3, xm4
3957 vinserti128 m3, m3, xm5, 1
3958 pmaddubsw m3, m13
3959 paddw m1, m3
3960
3961 pmulhrsw m8, m14 ; m8 = word: row 8
3962 pmulhrsw m9, m14 ; m9 = word: row 9
3963 pmulhrsw m10, m14 ; m10 = word: row 10
3964 pmulhrsw m11, m14 ; m11 = word: row 11
3965 pmulhrsw m6, m14 ; m6 = word: row 12
3966 pmulhrsw m7, m14 ; m7 = word: row 13
3967 pmulhrsw m0, m14 ; m0 = word: row 14
3968 pmulhrsw m1, m14 ; m1 = word: row 15
3969 packuswb m8, m9
3970 packuswb m10, m11
3971 packuswb m6, m7
3972 packuswb m0, m1
3973 vpermq m8, m8, 11011000b
3974 vpermq m10, m10, 11011000b
3975 vpermq m6, m6, 11011000b
3976 vpermq m0, m0, 11011000b
3977 vextracti128 xm9, m8, 1
3978 vextracti128 xm11, m10, 1
3979 vextracti128 xm7, m6, 1
3980 vextracti128 xm1, m0, 1
3981 movu [r2], xm8
3982 movu [r2 + r3], xm9
3983 movu [r2 + r3 * 2], xm10
3984 movu [r2 + r5], xm11
3985 lea r2, [r2 + r3 * 4]
3986 movu [r2], xm6
3987 movu [r2 + r3], xm7
3988 movu [r2 + r3 * 2], xm0
3989 movu [r2 + r5], xm1
3990 RET
3991 %endif
3992
3993 ;-----------------------------------------------------------------------------
3994 ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3995 ;-----------------------------------------------------------------------------
3996 %macro FILTER_V4_W24 2
3997 INIT_XMM sse4
3998 cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
3999
4000 mov r4d, r4m
4001 sub r0, r1
4002
4003 %ifdef PIC
4004 lea r5, [tab_ChromaCoeff]
4005 movd m0, [r5 + r4 * 4]
4006 %else
4007 movd m0, [tab_ChromaCoeff + r4 * 4]
4008 %endif
4009
4010 pshufb m1, m0, [tab_Vm]
4011 pshufb m0, [tab_Vm + 16]
4012
4013 mov r4d, %2
4014
4015 .loop:
4016 movu m2, [r0]
4017 movu m3, [r0 + r1]
4018
4019 punpcklbw m4, m2, m3
4020 punpckhbw m2, m3
4021
4022 pmaddubsw m4, m1
4023 pmaddubsw m2, m1
4024
4025 lea r5, [r0 + 2 * r1]
4026 movu m5, [r5]
4027 movu m7, [r5 + r1]
4028
4029 punpcklbw m6, m5, m7
4030 pmaddubsw m6, m0
4031 paddw m4, m6
4032
4033 punpckhbw m6, m5, m7
4034 pmaddubsw m6, m0
4035 paddw m2, m6
4036
4037 mova m6, [pw_512]
4038
4039 pmulhrsw m4, m6
4040 pmulhrsw m2, m6
4041
4042 packuswb m4, m2
4043
4044 movu [r2], m4
4045
4046 punpcklbw m4, m3, m5
4047 punpckhbw m3, m5
4048
4049 pmaddubsw m4, m1
4050 pmaddubsw m3, m1
4051
4052 movu m2, [r5 + 2 * r1]
4053
4054 punpcklbw m5, m7, m2
4055 punpckhbw m7, m2
4056
4057 pmaddubsw m5, m0
4058 pmaddubsw m7, m0
4059
4060 paddw m4, m5
4061 paddw m3, m7
4062
4063 pmulhrsw m4, m6
4064 pmulhrsw m3, m6
4065
4066 packuswb m4, m3
4067
4068 movu [r2 + r3], m4
4069
4070 movq m2, [r0 + 16]
4071 movq m3, [r0 + r1 + 16]
4072 movq m4, [r5 + 16]
4073 movq m5, [r5 + r1 + 16]
4074
4075 punpcklbw m2, m3
4076 punpcklbw m4, m5
4077
4078 pmaddubsw m2, m1
4079 pmaddubsw m4, m0
4080
4081 paddw m2, m4
4082
4083 pmulhrsw m2, m6
4084
4085 movq m3, [r0 + r1 + 16]
4086 movq m4, [r5 + 16]
4087 movq m5, [r5 + r1 + 16]
4088 movq m7, [r5 + 2 * r1 + 16]
4089
4090 punpcklbw m3, m4
4091 punpcklbw m5, m7
4092
4093 pmaddubsw m3, m1
4094 pmaddubsw m5, m0
4095
4096 paddw m3, m5
4097
4098 pmulhrsw m3, m6
4099 packuswb m2, m3
4100
4101 movh [r2 + 16], m2
4102 movhps [r2 + r3 + 16], m2
4103
4104 mov r0, r5
4105 lea r2, [r2 + 2 * r3]
4106
4107 sub r4, 2
4108 jnz .loop
4109 RET
4110 %endmacro
4111
4112 FILTER_V4_W24 24, 32
4113
4114 FILTER_V4_W24 24, 64
4115
4116 ;-----------------------------------------------------------------------------
4117 ; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4118 ;-----------------------------------------------------------------------------
4119 %macro FILTER_V4_W32 2
4120 INIT_XMM sse4
4121 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
4122
4123 mov r4d, r4m
4124 sub r0, r1
4125
4126 %ifdef PIC
4127 lea r5, [tab_ChromaCoeff]
4128 movd m0, [r5 + r4 * 4]
4129 %else
4130 movd m0, [tab_ChromaCoeff + r4 * 4]
4131 %endif
4132
4133 pshufb m1, m0, [tab_Vm]
4134 pshufb m0, [tab_Vm + 16]
4135
4136 mova m7, [pw_512]
4137
4138 mov r4d, %2
4139
4140 .loop:
4141 movu m2, [r0]
4142 movu m3, [r0 + r1]
4143
4144 punpcklbw m4, m2, m3
4145 punpckhbw m2, m3
4146
4147 pmaddubsw m4, m1
4148 pmaddubsw m2, m1
4149
4150 lea r5, [r0 + 2 * r1]
4151 movu m3, [r5]
4152 movu m5, [r5 + r1]
4153
4154 punpcklbw m6, m3, m5
4155 punpckhbw m3, m5
4156
4157 pmaddubsw m6, m0
4158 pmaddubsw m3, m0
4159
4160 paddw m4, m6
4161 paddw m2, m3
4162
4163 pmulhrsw m4, m7
4164 pmulhrsw m2, m7
4165
4166 packuswb m4, m2
4167
4168 movu [r2], m4
4169
4170 movu m2, [r0 + 16]
4171 movu m3, [r0 + r1 + 16]
4172
4173 punpcklbw m4, m2, m3
4174 punpckhbw m2, m3
4175
4176 pmaddubsw m4, m1
4177 pmaddubsw m2, m1
4178
4179 movu m3, [r5 + 16]
4180 movu m5, [r5 + r1 + 16]
4181
4182 punpcklbw m6, m3, m5
4183 punpckhbw m3, m5
4184
4185 pmaddubsw m6, m0
4186 pmaddubsw m3, m0
4187
4188 paddw m4, m6
4189 paddw m2, m3
4190
4191 pmulhrsw m4, m7
4192 pmulhrsw m2, m7
4193
4194 packuswb m4, m2
4195
4196 movu [r2 + 16], m4
4197
4198 lea r0, [r0 + r1]
4199 lea r2, [r2 + r3]
4200
4201 dec r4
4202 jnz .loop
4203 RET
4204 %endmacro
4205
4206 FILTER_V4_W32 32, 8
4207 FILTER_V4_W32 32, 16
4208 FILTER_V4_W32 32, 24
4209 FILTER_V4_W32 32, 32
4210
4211 FILTER_V4_W32 32, 48
4212 FILTER_V4_W32 32, 64
4213
4214 INIT_YMM avx2
4215 %if ARCH_X86_64 == 1
4216 cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
4217 mov r4d, r4m
4218 shl r4d, 6
4219
4220 %ifdef PIC
4221 lea r5, [tab_ChromaCoeffVer_32]
4222 add r5, r4
4223 %else
4224 lea r5, [tab_ChromaCoeffVer_32 + r4]
4225 %endif
4226
4227 mova m10, [r5]
4228 mova m11, [r5 + mmsize]
4229 lea r4, [r1 * 3]
4230 sub r0, r1
4231 lea r5, [r3 * 3]
4232 mova m12, [pw_512]
4233 mov r6d, 8
4234 .loopW:
4235 movu m0, [r0] ; m0 = row 0
4236 movu m1, [r0 + r1] ; m1 = row 1
4237 punpcklbw m2, m0, m1
4238 punpckhbw m3, m0, m1
4239 pmaddubsw m2, m10
4240 pmaddubsw m3, m10
4241 movu m0, [r0 + r1 * 2] ; m0 = row 2
4242 punpcklbw m4, m1, m0
4243 punpckhbw m5, m1, m0
4244 pmaddubsw m4, m10
4245 pmaddubsw m5, m10
4246 movu m1, [r0 + r4] ; m1 = row 3
4247 punpcklbw m6, m0, m1
4248 punpckhbw m7, m0, m1
4249 pmaddubsw m8, m6, m11
4250 pmaddubsw m9, m7, m11
4251 pmaddubsw m6, m10
4252 pmaddubsw m7, m10
4253 paddw m2, m8
4254 paddw m3, m9
4255 pmulhrsw m2, m12
4256 pmulhrsw m3, m12
4257 packuswb m2, m3
4258 movu [r2], m2
4259
4260 lea r0, [r0 + r1 * 4]
4261 movu m0, [r0] ; m0 = row 4
4262 punpcklbw m2, m1, m0
4263 punpckhbw m3, m1, m0
4264 pmaddubsw m8, m2, m11
4265 pmaddubsw m9, m3, m11
4266 pmaddubsw m2, m10
4267 pmaddubsw m3, m10
4268 paddw m4, m8
4269 paddw m5, m9
4270 pmulhrsw m4, m12
4271 pmulhrsw m5, m12
4272 packuswb m4, m5
4273 movu [r2 + r3], m4
4274
4275 movu m1, [r0 + r1] ; m1 = row 5
4276 punpcklbw m4, m0, m1
4277 punpckhbw m5, m0, m1
4278 pmaddubsw m4, m11
4279 pmaddubsw m5, m11
4280 paddw m6, m4
4281 paddw m7, m5
4282 pmulhrsw m6, m12
4283 pmulhrsw m7, m12
4284 packuswb m6, m7
4285 movu [r2 + r3 * 2], m6
4286
4287 movu m0, [r0 + r1 * 2] ; m0 = row 6
4288 punpcklbw m6, m1, m0
4289 punpckhbw m7, m1, m0
4290 pmaddubsw m6, m11
4291 pmaddubsw m7, m11
4292 paddw m2, m6
4293 paddw m3, m7
4294 pmulhrsw m2, m12
4295 pmulhrsw m3, m12
4296 packuswb m2, m3
4297 movu [r2 + r5], m2
4298
4299 lea r2, [r2 + r3 * 4]
4300 dec r6d
4301 jnz .loopW
4302 RET
4303 %endif
4304
4305 ;-----------------------------------------------------------------------------
4306 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4307 ;-----------------------------------------------------------------------------
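; Note: this variant covers widths that are multiples of 16 (48 and 64): the
; inner .loopW filters one 16-pixel column for two output rows and steps r0/r2
; by 16 bytes %1/16 times, after which the outer .loop rewinds both pointers by
; %1 and advances to the next pair of rows.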
4308 %macro FILTER_V4_W16n_H2 2
4309 INIT_XMM sse4
4310 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
4311
4312 mov r4d, r4m
4313 sub r0, r1
4314
4315 %ifdef PIC
4316 lea r5, [tab_ChromaCoeff]
4317 movd m0, [r5 + r4 * 4]
4318 %else
4319 movd m0, [tab_ChromaCoeff + r4 * 4]
4320 %endif
4321
4322 pshufb m1, m0, [tab_Vm]
4323 pshufb m0, [tab_Vm + 16]
4324
4325 mov r4d, %2/2
4326
4327 .loop:
4328
4329 mov r6d, %1/16
4330
4331 .loopW:
4332
4333 movu m2, [r0]
4334 movu m3, [r0 + r1]
4335
4336 punpcklbw m4, m2, m3
4337 punpckhbw m2, m3
4338
4339 pmaddubsw m4, m1
4340 pmaddubsw m2, m1
4341
4342 lea r5, [r0 + 2 * r1]
4343 movu m5, [r5]
4344 movu m6, [r5 + r1]
4345
4346 punpckhbw m7, m5, m6
4347 pmaddubsw m7, m0
4348 paddw m2, m7
4349
4350 punpcklbw m7, m5, m6
4351 pmaddubsw m7, m0
4352 paddw m4, m7
4353
4354 mova m7, [pw_512]
4355
4356 pmulhrsw m4, m7
4357 pmulhrsw m2, m7
4358
4359 packuswb m4, m2
4360
4361 movu [r2], m4
4362
4363 punpcklbw m4, m3, m5
4364 punpckhbw m3, m5
4365
4366 pmaddubsw m4, m1
4367 pmaddubsw m3, m1
4368
4369 movu m5, [r5 + 2 * r1]
4370
4371 punpcklbw m2, m6, m5
4372 punpckhbw m6, m5
4373
4374 pmaddubsw m2, m0
4375 pmaddubsw m6, m0
4376
4377 paddw m4, m2
4378 paddw m3, m6
4379
4380 pmulhrsw m4, m7
4381 pmulhrsw m3, m7
4382
4383 packuswb m4, m3
4384
4385 movu [r2 + r3], m4
4386
4387 add r0, 16
4388 add r2, 16
4389 dec r6d
4390 jnz .loopW
4391
4392 lea r0, [r0 + r1 * 2 - %1]
4393 lea r2, [r2 + r3 * 2 - %1]
4394
4395 dec r4d
4396 jnz .loop
4397 RET
4398 %endmacro
4399
4400 FILTER_V4_W16n_H2 64, 64
4401 FILTER_V4_W16n_H2 64, 32
4402 FILTER_V4_W16n_H2 64, 48
4403 FILTER_V4_W16n_H2 48, 64
4404 FILTER_V4_W16n_H2 64, 16
4405
4406
4407 ;-----------------------------------------------------------------------------
4408 ; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
4409 ;-----------------------------------------------------------------------------
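; Note: luma_p2s converts pixels to the 16-bit intermediate format. Each byte
; is interleaved with 128 (pb_128) and multiplied by tab_c_64_n64 (presumably
; pairs of {64, -64}) with pmaddubsw, giving pel*64 - 8192 per word, i.e.
; (pel << 6) minus the internal offset. Output rows are FENC_STRIDE words
; apart, and the .width4 tail stores only 4 words for a trailing half group.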
4410 INIT_XMM ssse3
4411 cglobal luma_p2s, 3, 7, 6
4412
4413 ; load width and height
4414 mov r3d, r3m
4415 mov r4d, r4m
4416
4417 ; load constant
4418 mova m4, [pb_128]
4419 mova m5, [tab_c_64_n64]
4420
4421 .loopH:
4422
4423 xor r5d, r5d
4424 .loopW:
4425 lea r6, [r0 + r5]
4426
4427 movh m0, [r6]
4428 punpcklbw m0, m4
4429 pmaddubsw m0, m5
4430
4431 movh m1, [r6 + r1]
4432 punpcklbw m1, m4
4433 pmaddubsw m1, m5
4434
4435 movh m2, [r6 + r1 * 2]
4436 punpcklbw m2, m4
4437 pmaddubsw m2, m5
4438
4439 lea r6, [r6 + r1 * 2]
4440 movh m3, [r6 + r1]
4441 punpcklbw m3, m4
4442 pmaddubsw m3, m5
4443
4444 add r5, 8
4445 cmp r5, r3
4446 jg .width4
4447 movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
4448 movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
4449 movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
4450 movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
4451 je .nextH
4452 jmp .loopW
4453
4454 .width4:
4455 movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
4456 movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
4457 movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
4458 movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
4459
4460 .nextH:
4461 lea r0, [r0 + r1 * 4]
4462 add r2, FENC_STRIDE * 8
4463
4464 sub r4d, 4
4465 jnz .loopH
4466
4467 RET
4468
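; Note: the 8-tap luma helpers below keep two 4-pixel source rows per register
; (via punpcklqdq) so each pmaddubsw against one of the four coefficient pairs
; at [r6 + n * 16] adds two taps to two output rows at once; after the fourth
; pair, m4 holds the word sums for rows 1-2 and m5 for rows 3-4.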
4469 %macro PROCESS_LUMA_W4_4R 0
4470 movd m0, [r0]
4471 movd m1, [r0 + r1]
4472 punpcklbw m2, m0, m1 ; m2=[0 1]
4473
4474 lea r0, [r0 + 2 * r1]
4475 movd m0, [r0]
4476 punpcklbw m1, m0 ; m1=[1 2]
4477 punpcklqdq m2, m1 ; m2=[0 1 1 2]
4478 pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]
4479
4480 movd m1, [r0 + r1]
4481     punpcklbw   m5, m0, m1                     ; m5=[2 3]
4482 lea r0, [r0 + 2 * r1]
4483 movd m0, [r0]
4484 punpcklbw m1, m0 ; m1=[3 4]
4485 punpcklqdq m5, m1 ; m5=[2 3 3 4]
4486 pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]
4487 paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2
4488 pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4
4489
4490 movd m1, [r0 + r1]
4491 punpcklbw m2, m0, m1 ; m2=[4 5]
4492 lea r0, [r0 + 2 * r1]
4493 movd m0, [r0]
4494 punpcklbw m1, m0 ; m1=[5 6]
4495 punpcklqdq m2, m1 ; m2=[4 5 5 6]
4496 pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]
4497 paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
4498 pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]
4499 paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4
4500
4501 movd m1, [r0 + r1]
4502 punpcklbw m2, m0, m1 ; m2=[6 7]
4503 lea r0, [r0 + 2 * r1]
4504 movd m0, [r0]
4505 punpcklbw m1, m0 ; m1=[7 8]
4506 punpcklqdq m2, m1 ; m2=[6 7 7 8]
4507 pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]
4508 paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
4509 pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]
4510 paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
4511
4512 movd m1, [r0 + r1]
4513 punpcklbw m2, m0, m1 ; m2=[8 9]
4514 movd m0, [r0 + 2 * r1]
4515 punpcklbw m1, m0 ; m1=[9 10]
4516 punpcklqdq m2, m1 ; m2=[8 9 9 10]
4517 pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]
4518 paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
4519 %endmacro
4520
4521 %macro PROCESS_LUMA_W8_4R 0
4522 movq m0, [r0]
4523 movq m1, [r0 + r1]
4524 punpcklbw m0, m1
4525 pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1
4526
4527 lea r0, [r0 + 2 * r1]
4528 movq m0, [r0]
4529 punpcklbw m1, m0
4530 pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2
4531
4532 movq m1, [r0 + r1]
4533 punpcklbw m0, m1
4534 pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3
4535 pmaddubsw m0, [r6 + 1 * 16]
4536 paddw m7, m0 ;m7=[0+1+2+3] Row1
4537
4538 lea r0, [r0 + 2 * r1]
4539 movq m0, [r0]
4540 punpcklbw m1, m0
4541 pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4
4542 pmaddubsw m1, [r6 + 1 * 16]
4543 paddw m6, m1 ;m6 = [1+2+3+4] Row2
4544
4545 movq m1, [r0 + r1]
4546 punpcklbw m0, m1
4547 pmaddubsw m2, m0, [r6 + 1 * 16]
4548 pmaddubsw m0, [r6 + 2 * 16]
4549 paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1
4550 paddw m5, m2 ;m5=[2+3+4+5] Row3
4551
4552 lea r0, [r0 + 2 * r1]
4553 movq m0, [r0]
4554 punpcklbw m1, m0
4555 pmaddubsw m2, m1, [r6 + 1 * 16]
4556 pmaddubsw m1, [r6 + 2 * 16]
4557 paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2
4558 paddw m4, m2 ;m4=[3+4+5+6] Row4
4559
4560 movq m1, [r0 + r1]
4561 punpcklbw m0, m1
4562 pmaddubsw m2, m0, [r6 + 2 * 16]
4563 pmaddubsw m0, [r6 + 3 * 16]
4564 paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end
4565 paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3
4566
4567 lea r0, [r0 + 2 * r1]
4568 movq m0, [r0]
4569 punpcklbw m1, m0
4570 pmaddubsw m2, m1, [r6 + 2 * 16]
4571 pmaddubsw m1, [r6 + 3 * 16]
4572 paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end
4573 paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4
4574
4575 movq m1, [r0 + r1]
4576 punpcklbw m0, m1
4577 pmaddubsw m0, [r6 + 3 * 16]
4578 paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end
4579
4580 movq m0, [r0 + 2 * r1]
4581 punpcklbw m1, m0
4582 pmaddubsw m1, [r6 + 3 * 16]
4583 paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end
4584 %endmacro
4585
4586 ;-------------------------------------------------------------------------------------------------------------
4587 ; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4588 ;-------------------------------------------------------------------------------------------------------------
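; Note: %3 selects the output mode: pp rounds with pw_512 (a rounded shift by
; 6, as the 8 luma taps sum to 64) and clamps to pixels, while ps subtracts
; pw_2000 and stores 16-bit results. The filter index is scaled by 64
; (shl r4d, 6) because each tab_LumaCoeffVer entry spans four 16-byte rows.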
4589 %macro FILTER_VER_LUMA_4xN 3
4590 INIT_XMM sse4
4591 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
4592 lea r5, [3 * r1]
4593 sub r0, r5
4594 shl r4d, 6
4595 %ifidn %3,ps
4596 add r3d, r3d
4597 %endif
4598
4599 %ifdef PIC
4600 lea r5, [tab_LumaCoeffVer]
4601 lea r6, [r5 + r4]
4602 %else
4603 lea r6, [tab_LumaCoeffVer + r4]
4604 %endif
4605
4606 %ifidn %3,pp
4607 mova m3, [pw_512]
4608 %else
4609 mova m3, [pw_2000]
4610 %endif
4611
4612 mov r4d, %2/4
4613 lea r5, [4 * r1]
4614
4615 .loopH:
4616 PROCESS_LUMA_W4_4R
4617
4618 %ifidn %3,pp
4619 pmulhrsw m4, m3
4620 pmulhrsw m5, m3
4621
4622 packuswb m4, m5
4623
4624 movd [r2], m4
4625 pextrd [r2 + r3], m4, 1
4626 lea r2, [r2 + 2 * r3]
4627 pextrd [r2], m4, 2
4628 pextrd [r2 + r3], m4, 3
4629 %else
4630 psubw m4, m3
4631 psubw m5, m3
4632
4633 movlps [r2], m4
4634 movhps [r2 + r3], m4
4635 lea r2, [r2 + 2 * r3]
4636 movlps [r2], m5
4637 movhps [r2 + r3], m5
4638 %endif
4639
4640 sub r0, r5
4641 lea r2, [r2 + 2 * r3]
4642
4643 dec r4d
4644 jnz .loopH
4645
4646 RET
4647 %endmacro
4648
4649
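; Note: the AVX2 4x4 kernel below gathers four vertically adjacent pixels into
; each dword with punpcklbw/punpcklwd, so pmaddubsw applies four taps per pixel
; in one shot; pmaddwd against pw_1 folds the word pairs, the low and high
; coefficient halves (vpbroadcastd from tab_LumaCoeff + r4 * 8) are then added
; to complete the 8-tap sum, and pw_512 rounding plus packuswb finish the pp
; output.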
4650 INIT_YMM avx2
4651 cglobal interp_8tap_vert_pp_4x4, 4,6,8
4652 mov r4d, r4m
4653 lea r5, [r1 * 3]
4654 sub r0, r5
4655
4656 ; TODO: VPGATHERDD
4657 movd xm1, [r0] ; m1 = row0
4658 movd xm2, [r0 + r1] ; m2 = row1
4659 punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]
4660
4661 movd xm3, [r0 + r1 * 2] ; m3 = row2
4662 punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]
4663 movd xm4, [r0 + r5]
4664 punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]
4665 punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
4666
4667 lea r0, [r0 + r1 * 4]
4668 movd xm5, [r0] ; m5 = row4
4669 punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]
4670     punpcklwd   xm2, xm4                       ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
4671     vinserti128 m1, m1, xm2, 1                 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
4672 movd xm2, [r0 + r1] ; m2 = row5
4673 punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]
4674     punpcklwd   xm3, xm5                       ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
4675 movd xm6, [r0 + r1 * 2] ; m6 = row6
4676 punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]
4677 punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
4678     vinserti128 m3, m3, xm4, 1                 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
4679 movd xm4, [r0 + r5] ; m4 = row7
4680 punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]
4681 punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
4682
4683 lea r0, [r0 + r1 * 4]
4684 movd xm7, [r0] ; m7 = row8
4685 punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]
4686 punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
4687 vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
4688 movd xm2, [r0 + r1] ; m2 = row9
4689 punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]
4690 punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
4691 movd xm7, [r0 + r1 * 2] ; m7 = rowA
4692 punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]
4693 punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
4694 vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
4695
4696 ; load filter coeff
4697 %ifdef PIC
4698 lea r5, [tab_LumaCoeff]
4699 vpbroadcastd m0, [r5 + r4 * 8 + 0]
4700 vpbroadcastd m2, [r5 + r4 * 8 + 4]
4701 %else
4702 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
4703 vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
4704 %endif
4705
4706 pmaddubsw m1, m0
4707 pmaddubsw m3, m0
4708 pmaddubsw m5, m2
4709 pmaddubsw m6, m2
4710 vbroadcasti128 m0, [pw_1]
4711 pmaddwd m1, m0
4712 pmaddwd m3, m0
4713 pmaddwd m5, m0
4714 pmaddwd m6, m0
4715 paddd m1, m5 ; m1 = DQWORD ROW[1 0]
4716 paddd m3, m6 ; m3 = DQWORD ROW[3 2]
4717 packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]
4718
4719 ; TODO: does it overflow?
4720 pmulhrsw m1, [pw_512]
4721 vextracti128 xm2, m1, 1
4722 packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]
4723 movd [r2], xm1
4724 pextrd [r2 + r3], xm1, 2
4725 pextrd [r2 + r3 * 2], xm1, 1
4726 lea r4, [r3 * 3]
4727 pextrd [r2 + r4], xm1, 3
4728 RET
4729
4730 INIT_YMM avx2
4731 cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
4732 mov r4d, r4m
4733 shl r4d, 7
4734
4735 %ifdef PIC
4736 lea r5, [tab_LumaCoeffVer_32]
4737 add r5, r4
4738 %else
4739 lea r5, [tab_LumaCoeffVer_32 + r4]
4740 %endif
4741
4742 lea r4, [r1 * 3]
4743 sub r0, r4
4744
4745 add r3d, r3d
4746
4747 movd xm1, [r0]
4748 pinsrd xm1, [r0 + r1], 1
4749 pinsrd xm1, [r0 + r1 * 2], 2
4750 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0]
4751 lea r0, [r0 + r1 * 4]
4752 movd xm2, [r0]
4753 pinsrd xm2, [r0 + r1], 1
4754 pinsrd xm2, [r0 + r1 * 2], 2
4755 pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4]
4756 vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0]
4757 lea r0, [r0 + r1 * 4]
4758 movd xm3, [r0]
4759 pinsrd xm3, [r0 + r1], 1
4760 pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8]
4761 vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4]
4762 mova m3, [interp4_vpp_shuf1]
4763 vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0]
4764 vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4]
4765 mova m3, [interp4_vpp_shuf1 + mmsize]
4766 vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2]
4767 vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6]
4768
4769 mova m3, [interp4_vpp_shuf]
4770 pshufb m0, m0, m3
4771 pshufb m1, m1, m3
4772 pshufb m4, m4, m3
4773 pshufb m2, m2, m3
4774 pmaddubsw m0, [r5]
4775 pmaddubsw m1, [r5 + mmsize]
4776 pmaddubsw m4, [r5 + 2 * mmsize]
4777 pmaddubsw m2, [r5 + 3 * mmsize]
4778 paddw m0, m1
4779 paddw m0, m4
4780 paddw m0, m2 ; m0 = WORD ROW[3 2 1 0]
4781
4782 vbroadcasti128 m3, [pw_2000]
4783 psubw m0, m3
4784 vextracti128 xm2, m0, 1
4785 lea r5, [r3 * 3]
4786 movq [r2], xm0
4787 movhps [r2 + r3], xm0
4788 movq [r2 + r3 * 2], xm2
4789 movhps [r2 + r5], xm2
4790 RET
4791
4792 ;-------------------------------------------------------------------------------------------------------------
4793 ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4794 ;-------------------------------------------------------------------------------------------------------------
4795 FILTER_VER_LUMA_4xN 4, 4, pp
4796
4797 ;-------------------------------------------------------------------------------------------------------------
4798 ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4799 ;-------------------------------------------------------------------------------------------------------------
4800 FILTER_VER_LUMA_4xN 4, 8, pp
4801
4802 ;-------------------------------------------------------------------------------------------------------------
4803 ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4804 ;-------------------------------------------------------------------------------------------------------------
4805 FILTER_VER_LUMA_4xN 4, 16, pp
4806
4807 ;-------------------------------------------------------------------------------------------------------------
4808 ; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4809 ;-------------------------------------------------------------------------------------------------------------
4810 FILTER_VER_LUMA_4xN 4, 4, ps
4811
4812 ;-------------------------------------------------------------------------------------------------------------
4813 ; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4814 ;-------------------------------------------------------------------------------------------------------------
4815 FILTER_VER_LUMA_4xN 4, 8, ps
4816
4817 ;-------------------------------------------------------------------------------------------------------------
4818 ; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4819 ;-------------------------------------------------------------------------------------------------------------
4820 FILTER_VER_LUMA_4xN 4, 16, ps
4821
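; PROCESS_LUMA_AVX2_W8_8R filters one 8-pixel-wide, 8-row block.  On entry r0 points 3 rows
; above the first output row, r1 = srcStride, r4 = 3 * srcStride and r5 points at the four
; ymm coefficient-pair entries of the selected filter.  Adjacent source rows are
; byte-interleaved (punpcklbw) and two such interleaves are stacked per ymm (vinserti128),
; so each pmaddubsw applies one coefficient pair to two output rows at once; paddw
; accumulates the four pairs.  The unscaled word results are left in m5 (rows 0,1),
; m2 (rows 2,3), m1 (rows 4,5) and m4 (rows 6,7), and r0 ends up advanced by 12 source rows.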
4822 %macro PROCESS_LUMA_AVX2_W8_8R 0
4823 movq xm1, [r0] ; m1 = row 0
4824 movq xm2, [r0 + r1] ; m2 = row 1
4825 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4826 movq xm3, [r0 + r1 * 2] ; m3 = row 2
4827 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
4828 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4829 pmaddubsw m5, [r5]
4830 movq xm4, [r0 + r4] ; m4 = row 3
4831 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4832 lea r0, [r0 + r1 * 4]
4833 movq xm1, [r0] ; m1 = row 4
4834 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
4835 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4836 pmaddubsw m0, m2, [r5 + 1 * mmsize]
4837 paddw m5, m0
4838 pmaddubsw m2, [r5]
4839 movq xm3, [r0 + r1] ; m3 = row 5
4840 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4841 movq xm4, [r0 + r1 * 2] ; m4 = row 6
4842 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
4843 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4844 pmaddubsw m3, m1, [r5 + 2 * mmsize]
4845 paddw m5, m3
4846 pmaddubsw m0, m1, [r5 + 1 * mmsize]
4847 paddw m2, m0
4848 pmaddubsw m1, [r5]
4849 movq xm3, [r0 + r4] ; m3 = row 7
4850 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4851 lea r0, [r0 + r1 * 4]
4852 movq xm0, [r0] ; m0 = row 8
4853 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
4854 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4855 pmaddubsw m3, m4, [r5 + 3 * mmsize]
4856 paddw m5, m3
4857 pmaddubsw m3, m4, [r5 + 2 * mmsize]
4858 paddw m2, m3
4859 pmaddubsw m3, m4, [r5 + 1 * mmsize]
4860 paddw m1, m3
4861 pmaddubsw m4, [r5]
4862 movq xm3, [r0 + r1] ; m3 = row 9
4863 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4864 movq xm6, [r0 + r1 * 2] ; m6 = row 10
4865 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
4866 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4867 pmaddubsw m3, m0, [r5 + 3 * mmsize]
4868 paddw m2, m3
4869 pmaddubsw m3, m0, [r5 + 2 * mmsize]
4870 paddw m1, m3
4871 pmaddubsw m0, [r5 + 1 * mmsize]
4872 paddw m4, m0
4873
4874 movq xm3, [r0 + r4] ; m3 = row 11
4875 punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
4876 lea r0, [r0 + r1 * 4]
4877 movq xm0, [r0] ; m0 = row 12
4878 punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
4879 vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
4880 pmaddubsw m3, m6, [r5 + 3 * mmsize]
4881 paddw m1, m3
4882 pmaddubsw m6, [r5 + 2 * mmsize]
4883 paddw m4, m6
4884 movq xm3, [r0 + r1] ; m3 = row 13
4885 punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
4886 movq xm6, [r0 + r1 * 2] ; m6 = row 14
4887 punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
4888 vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
4889 pmaddubsw m0, [r5 + 3 * mmsize]
4890 paddw m4, m0
4891 %endmacro
4892
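; PROCESS_LUMA_AVX2_W8_4R is the 4-row variant of the block above: it reads source rows 0-10
; with the same interleave/multiply pattern but only produces output rows 0-3, left as words
; in m5 (rows 0,1) and m2 (rows 2,3).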
4893 %macro PROCESS_LUMA_AVX2_W8_4R 0
4894 movq xm1, [r0] ; m1 = row 0
4895 movq xm2, [r0 + r1] ; m2 = row 1
4896 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4897 movq xm3, [r0 + r1 * 2] ; m3 = row 2
4898 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
4899 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4900 pmaddubsw m5, [r5]
4901 movq xm4, [r0 + r4] ; m4 = row 3
4902 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4903 lea r0, [r0 + r1 * 4]
4904 movq xm1, [r0] ; m1 = row 4
4905 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
4906 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4907 pmaddubsw m0, m2, [r5 + 1 * mmsize]
4908 paddw m5, m0
4909 pmaddubsw m2, [r5]
4910 movq xm3, [r0 + r1] ; m3 = row 5
4911 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4912 movq xm4, [r0 + r1 * 2] ; m4 = row 6
4913 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
4914 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4915 pmaddubsw m3, m1, [r5 + 2 * mmsize]
4916 paddw m5, m3
4917 pmaddubsw m0, m1, [r5 + 1 * mmsize]
4918 paddw m2, m0
4919 movq xm3, [r0 + r4] ; m3 = row 7
4920 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4921 lea r0, [r0 + r1 * 4]
4922 movq xm0, [r0] ; m0 = row 8
4923 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
4924 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4925 pmaddubsw m3, m4, [r5 + 3 * mmsize]
4926 paddw m5, m3
4927 pmaddubsw m3, m4, [r5 + 2 * mmsize]
4928 paddw m2, m3
4929 movq xm3, [r0 + r1] ; m3 = row 9
4930 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4931 movq xm6, [r0 + r1 * 2] ; m6 = row 10
4932 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
4933 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4934 pmaddubsw m3, m0, [r5 + 3 * mmsize]
4935 paddw m2, m3
4936 %endmacro
4937
4938 ;-------------------------------------------------------------------------------------------------------------
4939 ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4940 ;-------------------------------------------------------------------------------------------------------------
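; FILTER_VER_LUMA_8xN drives the SSE4 path: each iteration PROCESS_LUMA_W8_4R (defined earlier
; in this file) leaves the unscaled word rows 0-3 in m7/m6/m5/m4, and the pp/ps branches only
; differ in the final scaling.  A hedged per-sample sketch of the two paths:
;
;     dst8 [x] = clip_uint8((sum + 32) >> 6);   // pp: pmulhrsw with pw_512
;     dst16[x] = (int16_t)(sum - 8192);         // ps: psubw with pw_2000, dstStride doubled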
4941 %macro FILTER_VER_LUMA_8xN 3
4942 INIT_XMM sse4
4943 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
4944 lea r5, [3 * r1]
4945 sub r0, r5
4946 shl r4d, 6
4947
4948 %ifidn %3,ps
4949 add r3d, r3d
4950 %endif
4951
4952 %ifdef PIC
4953 lea r5, [tab_LumaCoeffVer]
4954 lea r6, [r5 + r4]
4955 %else
4956 lea r6, [tab_LumaCoeffVer + r4]
4957 %endif
4958
4959 %ifidn %3,pp
4960 mova m3, [pw_512]
4961 %else
4962 mova m3, [pw_2000]
4963 %endif
4964
4965 mov r4d, %2/4
4966 lea r5, [4 * r1]
4967
4968 .loopH:
4969 PROCESS_LUMA_W8_4R
4970
4971 %ifidn %3,pp
4972 pmulhrsw m7, m3
4973 pmulhrsw m6, m3
4974 pmulhrsw m5, m3
4975 pmulhrsw m4, m3
4976
4977 packuswb m7, m6
4978 packuswb m5, m4
4979
4980 movlps [r2], m7
4981 movhps [r2 + r3], m7
4982 lea r2, [r2 + 2 * r3]
4983 movlps [r2], m5
4984 movhps [r2 + r3], m5
4985 %else
4986 psubw m7, m3
4987 psubw m6, m3
4988 psubw m5, m3
4989 psubw m4, m3
4990
4991 movu [r2], m7
4992 movu [r2 + r3], m6
4993 lea r2, [r2 + 2 * r3]
4994 movu [r2], m5
4995 movu [r2 + r3], m4
4996 %endif
4997
4998 sub r0, r5
4999 lea r2, [r2 + 2 * r3]
5000
5001 dec r4d
5002 jnz .loopH
5003
5004 RET
5005 %endmacro
5006
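; FILTER_VER_LUMA_AVX2_8xN loops the 8-row AVX2 block for the taller 8-wide sizes.  All general
; registers are in use, so the row-block counter (%2 / 8) lives in the gprsize of stack reserved
; by cglobal; after each block r0 is rewound by r6 = 4 * srcStride, turning the macro's 12-row
; advance into a net step of exactly 8 source rows.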
5007 %macro FILTER_VER_LUMA_AVX2_8xN 2
5008 INIT_YMM avx2
5009 cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
5010 mov r4d, r4m
5011 shl r4d, 7
5012
5013 %ifdef PIC
5014 lea r5, [tab_LumaCoeffVer_32]
5015 add r5, r4
5016 %else
5017 lea r5, [tab_LumaCoeffVer_32 + r4]
5018 %endif
5019 lea r4, [r1 * 3]
5020 sub r0, r4
5021 lea r6, [r1 * 4]
5022 mov word [rsp], %2 / 8
5023 mova m7, [pw_512]
5024
5025 .loop:
5026 PROCESS_LUMA_AVX2_W8_8R
5027 pmulhrsw m5, m7 ; m5 = word: row 0, row 1
5028 pmulhrsw m2, m7 ; m2 = word: row 2, row 3
5029 pmulhrsw m1, m7 ; m1 = word: row 4, row 5
5030 pmulhrsw m4, m7 ; m4 = word: row 6, row 7
5031 packuswb m5, m2
5032 packuswb m1, m4
5033 vextracti128 xm2, m5, 1
5034 vextracti128 xm4, m1, 1
5035 movq [r2], xm5
5036 movq [r2 + r3], xm2
5037 lea r2, [r2 + r3 * 2]
5038 movhps [r2], xm5
5039 movhps [r2 + r3], xm2
5040 lea r2, [r2 + r3 * 2]
5041 movq [r2], xm1
5042 movq [r2 + r3], xm4
5043 lea r2, [r2 + r3 * 2]
5044 movhps [r2], xm1
5045 movhps [r2 + r3], xm4
5046 lea r2, [r2 + r3 * 2]
5047 sub r0, r6
5048 dec word [rsp]
5049 jnz .loop
5050 RET
5051 %endmacro
5052
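; Stand-alone AVX2 kernels for 8x8 and 8x4: the 8-row (or 4-row) block macro is run once, the
; words are rounded back to pixels with pw_512, and the rows are scattered with movq/movhps,
; so no loop bookkeeping is needed.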
5053 INIT_YMM avx2
5054 cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
5055 mov r4d, r4m
5056 shl r4d, 7
5057
5058 %ifdef PIC
5059 lea r5, [tab_LumaCoeffVer_32]
5060 add r5, r4
5061 %else
5062 lea r5, [tab_LumaCoeffVer_32 + r4]
5063 %endif
5064
5065 lea r4, [r1 * 3]
5066 sub r0, r4
5067 PROCESS_LUMA_AVX2_W8_8R
5068 lea r4, [r3 * 3]
5069 mova m3, [pw_512]
5070 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
5071 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
5072 pmulhrsw m1, m3 ; m1 = word: row 4, row 5
5073 pmulhrsw m4, m3 ; m4 = word: row 6, row 7
5074 packuswb m5, m2
5075 packuswb m1, m4
5076 vextracti128 xm2, m5, 1
5077 vextracti128 xm4, m1, 1
5078 movq [r2], xm5
5079 movq [r2 + r3], xm2
5080 movhps [r2 + r3 * 2], xm5
5081 movhps [r2 + r4], xm2
5082 lea r2, [r2 + r3 * 4]
5083 movq [r2], xm1
5084 movq [r2 + r3], xm4
5085 movhps [r2 + r3 * 2], xm1
5086 movhps [r2 + r4], xm4
5087 RET
5088
5089 INIT_YMM avx2
5090 cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
5091 mov r4d, r4m
5092 shl r4d, 7
5093
5094 %ifdef PIC
5095 lea r5, [tab_LumaCoeffVer_32]
5096 add r5, r4
5097 %else
5098 lea r5, [tab_LumaCoeffVer_32 + r4]
5099 %endif
5100
5101 lea r4, [r1 * 3]
5102 sub r0, r4
5103 PROCESS_LUMA_AVX2_W8_4R
5104 lea r4, [r3 * 3]
5105 mova m3, [pw_512]
5106 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
5107 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
5108 packuswb m5, m2
5109 vextracti128 xm2, m5, 1
5110 movq [r2], xm5
5111 movq [r2 + r3], xm2
5112 movhps [r2 + r3 * 2], xm5
5113 movhps [r2 + r4], xm2
5114 RET
5115
5116 ;-------------------------------------------------------------------------------------------------------------
5117 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5118 ;-------------------------------------------------------------------------------------------------------------
5119 FILTER_VER_LUMA_8xN 8, 4, pp
5120
5121 ;-------------------------------------------------------------------------------------------------------------
5122 ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5123 ;-------------------------------------------------------------------------------------------------------------
5124 FILTER_VER_LUMA_8xN 8, 8, pp
5125
5126 ;-------------------------------------------------------------------------------------------------------------
5127 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5128 ;-------------------------------------------------------------------------------------------------------------
5129 FILTER_VER_LUMA_8xN 8, 16, pp
5130 FILTER_VER_LUMA_AVX2_8xN 8, 16
5131
5132 ;-------------------------------------------------------------------------------------------------------------
5133 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5134 ;-------------------------------------------------------------------------------------------------------------
5135 FILTER_VER_LUMA_8xN 8, 32, pp
5136 FILTER_VER_LUMA_AVX2_8xN 8, 32
5137
5138 ;-------------------------------------------------------------------------------------------------------------
5139 ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5140 ;-------------------------------------------------------------------------------------------------------------
5141 FILTER_VER_LUMA_8xN 8, 4, ps
5142
5143 ;-------------------------------------------------------------------------------------------------------------
5144 ; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5145 ;-------------------------------------------------------------------------------------------------------------
5146 FILTER_VER_LUMA_8xN 8, 8, ps
5147
5148 ;-------------------------------------------------------------------------------------------------------------
5149 ; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5150 ;-------------------------------------------------------------------------------------------------------------
5151 FILTER_VER_LUMA_8xN 8, 16, ps
5152
5153 ;-------------------------------------------------------------------------------------------------------------
5154 ; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5155 ;-------------------------------------------------------------------------------------------------------------
5156 FILTER_VER_LUMA_8xN 8, 32, ps
5157
5158 ;-------------------------------------------------------------------------------------------------------------
5159 ; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5160 ;-------------------------------------------------------------------------------------------------------------
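; FILTER_VER_LUMA_12xN splits each 4-row strip of the 12-wide block into an 8-wide column
; (PROCESS_LUMA_W8_4R) and a 4-wide column (PROCESS_LUMA_W4_4R).  Between the two columns r0 is
; stepped back and shifted 8 pixels right, and r2 advances by 8 bytes (pp) or 16 bytes (ps,
; 16-bit output); afterwards both pointers are rewound to the start of the next strip.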
5161 %macro FILTER_VER_LUMA_12xN 3
5162 INIT_XMM sse4
5163 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
5164 lea r5, [3 * r1]
5165 sub r0, r5
5166 shl r4d, 6
5167 %ifidn %3,ps
5168 add r3d, r3d
5169 %endif
5170
5171 %ifdef PIC
5172 lea r5, [tab_LumaCoeffVer]
5173 lea r6, [r5 + r4]
5174 %else
5175 lea r6, [tab_LumaCoeffVer + r4]
5176 %endif
5177
5178 %ifidn %3,pp
5179 mova m3, [pw_512]
5180 %else
5181 mova m3, [pw_2000]
5182 %endif
5183
5184 mov r4d, %2/4
5185
5186 .loopH:
5187 PROCESS_LUMA_W8_4R
5188
5189 %ifidn %3,pp
5190 pmulhrsw m7, m3
5191 pmulhrsw m6, m3
5192 pmulhrsw m5, m3
5193 pmulhrsw m4, m3
5194
5195 packuswb m7, m6
5196 packuswb m5, m4
5197
5198 movlps [r2], m7
5199 movhps [r2 + r3], m7
5200 lea r5, [r2 + 2 * r3]
5201 movlps [r5], m5
5202 movhps [r5 + r3], m5
5203 %else
5204 psubw m7, m3
5205 psubw m6, m3
5206 psubw m5, m3
5207 psubw m4, m3
5208
5209 movu [r2], m7
5210 movu [r2 + r3], m6
5211 lea r5, [r2 + 2 * r3]
5212 movu [r5], m5
5213 movu [r5 + r3], m4
5214 %endif
5215
5216 lea r5, [8 * r1 - 8]
5217 sub r0, r5
5218 %ifidn %3,pp
5219 add r2, 8
5220 %else
5221 add r2, 16
5222 %endif
5223
5224 PROCESS_LUMA_W4_4R
5225
5226 %ifidn %3,pp
5227 pmulhrsw m4, m3
5228 pmulhrsw m5, m3
5229
5230 packuswb m4, m5
5231
5232 movd [r2], m4
5233 pextrd [r2 + r3], m4, 1
5234 lea r5, [r2 + 2 * r3]
5235 pextrd [r5], m4, 2
5236 pextrd [r5 + r3], m4, 3
5237 %else
5238 psubw m4, m3
5239 psubw m5, m3
5240
5241 movlps [r2], m4
5242 movhps [r2 + r3], m4
5243 lea r5, [r2 + 2 * r3]
5244 movlps [r5], m5
5245 movhps [r5 + r3], m5
5246 %endif
5247
5248 lea r5, [4 * r1 + 8]
5249 sub r0, r5
5250 %ifidn %3,pp
5251 lea r2, [r2 + 4 * r3 - 8]
5252 %else
5253 lea r2, [r2 + 4 * r3 - 16]
5254 %endif
5255
5256 dec r4d
5257 jnz .loopH
5258
5259 RET
5260 %endmacro
5261
5262 ;-------------------------------------------------------------------------------------------------------------
5263 ; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5264 ;-------------------------------------------------------------------------------------------------------------
5265 FILTER_VER_LUMA_12xN 12, 16, pp
5266
5267 ;-------------------------------------------------------------------------------------------------------------
5268 ; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5269 ;-------------------------------------------------------------------------------------------------------------
5270 FILTER_VER_LUMA_12xN 12, 16, ps
5271
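; Fully unrolled AVX2 kernel for pp 12x16.  It keeps the word accumulators for all in-flight
; rows live in m0-m13 plus pw_512 in m14, which needs 15 ymm registers and is therefore only
; built for x86-64.  Each row of the 12-wide result is stored as a qword plus one dword
; (movq + pextrd ..., 2).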
5272 INIT_YMM avx2
5273 %if ARCH_X86_64 == 1
5274 cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
5275 mov r4d, r4m
5276 shl r4d, 7
5277
5278 %ifdef PIC
5279 lea r5, [tab_LumaCoeffVer_32]
5280 add r5, r4
5281 %else
5282 lea r5, [tab_LumaCoeffVer_32 + r4]
5283 %endif
5284
5285 lea r4, [r1 * 3]
5286 sub r0, r4
5287 lea r6, [r3 * 3]
5288 mova m14, [pw_512]
5289
5290 movu xm0, [r0] ; m0 = row 0
5291 movu xm1, [r0 + r1] ; m1 = row 1
5292 punpckhbw xm2, xm0, xm1
5293 punpcklbw xm0, xm1
5294 vinserti128 m0, m0, xm2, 1
5295 pmaddubsw m0, [r5]
5296 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5297 punpckhbw xm3, xm1, xm2
5298 punpcklbw xm1, xm2
5299 vinserti128 m1, m1, xm3, 1
5300 pmaddubsw m1, [r5]
5301 movu xm3, [r0 + r4] ; m3 = row 3
5302 punpckhbw xm4, xm2, xm3
5303 punpcklbw xm2, xm3
5304 vinserti128 m2, m2, xm4, 1
5305 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5306 paddw m0, m4
5307 pmaddubsw m2, [r5]
5308 lea r0, [r0 + r1 * 4]
5309 movu xm4, [r0] ; m4 = row 4
5310 punpckhbw xm5, xm3, xm4
5311 punpcklbw xm3, xm4
5312 vinserti128 m3, m3, xm5, 1
5313 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5314 paddw m1, m5
5315 pmaddubsw m3, [r5]
5316 movu xm5, [r0 + r1] ; m5 = row 5
5317 punpckhbw xm6, xm4, xm5
5318 punpcklbw xm4, xm5
5319 vinserti128 m4, m4, xm6, 1
5320 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5321 paddw m0, m6
5322 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5323 paddw m2, m6
5324 pmaddubsw m4, [r5]
5325 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5326 punpckhbw xm7, xm5, xm6
5327 punpcklbw xm5, xm6
5328 vinserti128 m5, m5, xm7, 1
5329 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5330 paddw m1, m7
5331 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5332 paddw m3, m7
5333 pmaddubsw m5, [r5]
5334 movu xm7, [r0 + r4] ; m7 = row 7
5335 punpckhbw xm8, xm6, xm7
5336 punpcklbw xm6, xm7
5337 vinserti128 m6, m6, xm8, 1
5338 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5339 paddw m0, m8
5340 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5341 paddw m2, m8
5342 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5343 paddw m4, m8
5344 pmaddubsw m6, [r5]
5345 lea r0, [r0 + r1 * 4]
5346 movu xm8, [r0] ; m8 = row 8
5347 punpckhbw xm9, xm7, xm8
5348 punpcklbw xm7, xm8
5349 vinserti128 m7, m7, xm9, 1
5350 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5351 paddw m1, m9
5352 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5353 paddw m3, m9
5354 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5355 paddw m5, m9
5356 pmaddubsw m7, [r5]
5357 movu xm9, [r0 + r1] ; m9 = row 9
5358 punpckhbw xm10, xm8, xm9
5359 punpcklbw xm8, xm9
5360 vinserti128 m8, m8, xm10, 1
5361 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5362 paddw m2, m10
5363 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5364 paddw m4, m10
5365 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5366 paddw m6, m10
5367 pmaddubsw m8, [r5]
5368 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5369 punpckhbw xm11, xm9, xm10
5370 punpcklbw xm9, xm10
5371 vinserti128 m9, m9, xm11, 1
5372 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5373 paddw m3, m11
5374 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5375 paddw m5, m11
5376 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5377 paddw m7, m11
5378 pmaddubsw m9, [r5]
5379 movu xm11, [r0 + r4] ; m11 = row 11
5380 punpckhbw xm12, xm10, xm11
5381 punpcklbw xm10, xm11
5382 vinserti128 m10, m10, xm12, 1
5383 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5384 paddw m4, m12
5385 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5386 paddw m6, m12
5387 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5388 paddw m8, m12
5389 pmaddubsw m10, [r5]
5390 lea r0, [r0 + r1 * 4]
5391 movu xm12, [r0] ; m12 = row 12
5392 punpckhbw xm13, xm11, xm12
5393 punpcklbw xm11, xm12
5394 vinserti128 m11, m11, xm13, 1
5395 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5396 paddw m5, m13
5397 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5398 paddw m7, m13
5399 pmaddubsw m13, m11, [r5 + 1 * mmsize]
5400 paddw m9, m13
5401 pmaddubsw m11, [r5]
5402
5403 pmulhrsw m0, m14 ; m0 = word: row 0
5404 pmulhrsw m1, m14 ; m1 = word: row 1
5405 pmulhrsw m2, m14 ; m2 = word: row 2
5406 pmulhrsw m3, m14 ; m3 = word: row 3
5407 pmulhrsw m4, m14 ; m4 = word: row 4
5408 pmulhrsw m5, m14 ; m5 = word: row 5
5409 packuswb m0, m1
5410 packuswb m2, m3
5411 packuswb m4, m5
5412 vpermq m0, m0, 11011000b
5413 vpermq m2, m2, 11011000b
5414 vpermq m4, m4, 11011000b
5415 vextracti128 xm1, m0, 1
5416 vextracti128 xm3, m2, 1
5417 vextracti128 xm5, m4, 1
5418 movq [r2], xm0
5419 pextrd [r2 + 8], xm0, 2
5420 movq [r2 + r3], xm1
5421 pextrd [r2 + r3 + 8], xm1, 2
5422 movq [r2 + r3 * 2], xm2
5423 pextrd [r2 + r3 * 2 + 8], xm2, 2
5424 movq [r2 + r6], xm3
5425 pextrd [r2 + r6 + 8], xm3, 2
5426 lea r2, [r2 + r3 * 4]
5427 movq [r2], xm4
5428 pextrd [r2 + 8], xm4, 2
5429 movq [r2 + r3], xm5
5430 pextrd [r2 + r3 + 8], xm5, 2
5431
5432 movu xm13, [r0 + r1] ; m13 = row 13
5433 punpckhbw xm0, xm12, xm13
5434 punpcklbw xm12, xm13
5435 vinserti128 m12, m12, xm0, 1
5436 pmaddubsw m0, m12, [r5 + 3 * mmsize]
5437 paddw m6, m0
5438 pmaddubsw m0, m12, [r5 + 2 * mmsize]
5439 paddw m8, m0
5440 pmaddubsw m0, m12, [r5 + 1 * mmsize]
5441 paddw m10, m0
5442 pmaddubsw m12, [r5]
5443 movu xm0, [r0 + r1 * 2] ; m0 = row 14
5444 punpckhbw xm1, xm13, xm0
5445 punpcklbw xm13, xm0
5446 vinserti128 m13, m13, xm1, 1
5447 pmaddubsw m1, m13, [r5 + 3 * mmsize]
5448 paddw m7, m1
5449 pmaddubsw m1, m13, [r5 + 2 * mmsize]
5450 paddw m9, m1
5451 pmaddubsw m1, m13, [r5 + 1 * mmsize]
5452 paddw m11, m1
5453 pmaddubsw m13, [r5]
5454
5455 pmulhrsw m6, m14 ; m6 = word: row 6
5456 pmulhrsw m7, m14 ; m7 = word: row 7
5457 packuswb m6, m7
5458 vpermq m6, m6, 11011000b
5459 vextracti128 xm7, m6, 1
5460 movq [r2 + r3 * 2], xm6
5461 pextrd [r2 + r3 * 2 + 8], xm6, 2
5462 movq [r2 + r6], xm7
5463 pextrd [r2 + r6 + 8], xm7, 2
5464 lea r2, [r2 + r3 * 4]
5465
5466 movu xm1, [r0 + r4] ; m1 = row 15
5467 punpckhbw xm2, xm0, xm1
5468 punpcklbw xm0, xm1
5469 vinserti128 m0, m0, xm2, 1
5470 pmaddubsw m2, m0, [r5 + 3 * mmsize]
5471 paddw m8, m2
5472 pmaddubsw m2, m0, [r5 + 2 * mmsize]
5473 paddw m10, m2
5474 pmaddubsw m2, m0, [r5 + 1 * mmsize]
5475 paddw m12, m2
5476 pmaddubsw m0, [r5]
5477 lea r0, [r0 + r1 * 4]
5478 movu xm2, [r0] ; m2 = row 16
5479 punpckhbw xm3, xm1, xm2
5480 punpcklbw xm1, xm2
5481 vinserti128 m1, m1, xm3, 1
5482 pmaddubsw m3, m1, [r5 + 3 * mmsize]
5483 paddw m9, m3
5484 pmaddubsw m3, m1, [r5 + 2 * mmsize]
5485 paddw m11, m3
5486 pmaddubsw m3, m1, [r5 + 1 * mmsize]
5487 paddw m13, m3
5488 pmaddubsw m1, [r5]
5489 movu xm3, [r0 + r1] ; m3 = row 17
5490 punpckhbw xm4, xm2, xm3
5491 punpcklbw xm2, xm3
5492 vinserti128 m2, m2, xm4, 1
5493 pmaddubsw m4, m2, [r5 + 3 * mmsize]
5494 paddw m10, m4
5495 pmaddubsw m4, m2, [r5 + 2 * mmsize]
5496 paddw m12, m4
5497 pmaddubsw m2, [r5 + 1 * mmsize]
5498 paddw m0, m2
5499 movu xm4, [r0 + r1 * 2] ; m4 = row 18
5500 punpckhbw xm5, xm3, xm4
5501 punpcklbw xm3, xm4
5502 vinserti128 m3, m3, xm5, 1
5503 pmaddubsw m5, m3, [r5 + 3 * mmsize]
5504 paddw m11, m5
5505 pmaddubsw m5, m3, [r5 + 2 * mmsize]
5506 paddw m13, m5
5507 pmaddubsw m3, [r5 + 1 * mmsize]
5508 paddw m1, m3
5509 movu xm5, [r0 + r4] ; m5 = row 19
5510 punpckhbw xm6, xm4, xm5
5511 punpcklbw xm4, xm5
5512 vinserti128 m4, m4, xm6, 1
5513 pmaddubsw m6, m4, [r5 + 3 * mmsize]
5514 paddw m12, m6
5515 pmaddubsw m4, [r5 + 2 * mmsize]
5516 paddw m0, m4
5517 lea r0, [r0 + r1 * 4]
5518 movu xm6, [r0] ; m6 = row 20
5519 punpckhbw xm7, xm5, xm6
5520 punpcklbw xm5, xm6
5521 vinserti128 m5, m5, xm7, 1
5522 pmaddubsw m7, m5, [r5 + 3 * mmsize]
5523 paddw m13, m7
5524 pmaddubsw m5, [r5 + 2 * mmsize]
5525 paddw m1, m5
5526 movu xm7, [r0 + r1] ; m7 = row 21
5527 punpckhbw xm2, xm6, xm7
5528 punpcklbw xm6, xm7
5529 vinserti128 m6, m6, xm2, 1
5530 pmaddubsw m6, [r5 + 3 * mmsize]
5531 paddw m0, m6
5532 movu xm2, [r0 + r1 * 2] ; m2 = row 22
5533 punpckhbw xm3, xm7, xm2
5534 punpcklbw xm7, xm2
5535 vinserti128 m7, m7, xm3, 1
5536 pmaddubsw m7, [r5 + 3 * mmsize]
5537 paddw m1, m7
5538
5539 pmulhrsw m8, m14 ; m8 = word: row 8
5540 pmulhrsw m9, m14 ; m9 = word: row 9
5541 pmulhrsw m10, m14 ; m10 = word: row 10
5542 pmulhrsw m11, m14 ; m11 = word: row 11
5543 pmulhrsw m12, m14 ; m12 = word: row 12
5544 pmulhrsw m13, m14 ; m13 = word: row 13
5545 pmulhrsw m0, m14 ; m0 = word: row 14
5546 pmulhrsw m1, m14 ; m1 = word: row 15
5547 packuswb m8, m9
5548 packuswb m10, m11
5549 packuswb m12, m13
5550 packuswb m0, m1
5551 vpermq m8, m8, 11011000b
5552 vpermq m10, m10, 11011000b
5553 vpermq m12, m12, 11011000b
5554 vpermq m0, m0, 11011000b
5555 vextracti128 xm9, m8, 1
5556 vextracti128 xm11, m10, 1
5557 vextracti128 xm13, m12, 1
5558 vextracti128 xm1, m0, 1
5559 movq [r2], xm8
5560 pextrd [r2 + 8], xm8, 2
5561 movq [r2 + r3], xm9
5562 pextrd [r2 + r3 + 8], xm9, 2
5563 movq [r2 + r3 * 2], xm10
5564 pextrd [r2 + r3 * 2 + 8], xm10, 2
5565 movq [r2 + r6], xm11
5566 pextrd [r2 + r6 + 8], xm11, 2
5567 lea r2, [r2 + r3 * 4]
5568 movq [r2], xm12
5569 pextrd [r2 + 8], xm12, 2
5570 movq [r2 + r3], xm13
5571 pextrd [r2 + r3 + 8], xm13, 2
5572 movq [r2 + r3 * 2], xm0
5573 pextrd [r2 + r3 * 2 + 8], xm0, 2
5574 movq [r2 + r6], xm1
5575 pextrd [r2 + r6 + 8], xm1, 2
5576 RET
5577 %endif
5578
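; The 16-wide pp kernels below (16x16, 16x12, 16x8, 16x4) follow the same plan: the low and
; high byte-interleaves of each source row pair share one ymm (vinserti128), so a single
; pmaddubsw covers all 16 columns, and the finished word rows are rounded with pw_512, packed,
; and re-ordered with vpermq before the 16-byte stores.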
5579 INIT_YMM avx2
5580 %if ARCH_X86_64 == 1
5581 cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
5582 mov r4d, r4m
5583 shl r4d, 7
5584
5585 %ifdef PIC
5586 lea r5, [tab_LumaCoeffVer_32]
5587 add r5, r4
5588 %else
5589 lea r5, [tab_LumaCoeffVer_32 + r4]
5590 %endif
5591
5592 lea r4, [r1 * 3]
5593 sub r0, r4
5594 lea r6, [r3 * 3]
5595 mova m14, [pw_512]
5596
5597 movu xm0, [r0] ; m0 = row 0
5598 movu xm1, [r0 + r1] ; m1 = row 1
5599 punpckhbw xm2, xm0, xm1
5600 punpcklbw xm0, xm1
5601 vinserti128 m0, m0, xm2, 1
5602 pmaddubsw m0, [r5]
5603 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5604 punpckhbw xm3, xm1, xm2
5605 punpcklbw xm1, xm2
5606 vinserti128 m1, m1, xm3, 1
5607 pmaddubsw m1, [r5]
5608 movu xm3, [r0 + r4] ; m3 = row 3
5609 punpckhbw xm4, xm2, xm3
5610 punpcklbw xm2, xm3
5611 vinserti128 m2, m2, xm4, 1
5612 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5613 paddw m0, m4
5614 pmaddubsw m2, [r5]
5615 lea r0, [r0 + r1 * 4]
5616 movu xm4, [r0] ; m4 = row 4
5617 punpckhbw xm5, xm3, xm4
5618 punpcklbw xm3, xm4
5619 vinserti128 m3, m3, xm5, 1
5620 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5621 paddw m1, m5
5622 pmaddubsw m3, [r5]
5623 movu xm5, [r0 + r1] ; m5 = row 5
5624 punpckhbw xm6, xm4, xm5
5625 punpcklbw xm4, xm5
5626 vinserti128 m4, m4, xm6, 1
5627 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5628 paddw m0, m6
5629 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5630 paddw m2, m6
5631 pmaddubsw m4, [r5]
5632 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5633 punpckhbw xm7, xm5, xm6
5634 punpcklbw xm5, xm6
5635 vinserti128 m5, m5, xm7, 1
5636 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5637 paddw m1, m7
5638 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5639 paddw m3, m7
5640 pmaddubsw m5, [r5]
5641 movu xm7, [r0 + r4] ; m7 = row 7
5642 punpckhbw xm8, xm6, xm7
5643 punpcklbw xm6, xm7
5644 vinserti128 m6, m6, xm8, 1
5645 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5646 paddw m0, m8
5647 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5648 paddw m2, m8
5649 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5650 paddw m4, m8
5651 pmaddubsw m6, [r5]
5652 lea r0, [r0 + r1 * 4]
5653 movu xm8, [r0] ; m8 = row 8
5654 punpckhbw xm9, xm7, xm8
5655 punpcklbw xm7, xm8
5656 vinserti128 m7, m7, xm9, 1
5657 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5658 paddw m1, m9
5659 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5660 paddw m3, m9
5661 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5662 paddw m5, m9
5663 pmaddubsw m7, [r5]
5664 movu xm9, [r0 + r1] ; m9 = row 9
5665 punpckhbw xm10, xm8, xm9
5666 punpcklbw xm8, xm9
5667 vinserti128 m8, m8, xm10, 1
5668 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5669 paddw m2, m10
5670 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5671 paddw m4, m10
5672 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5673 paddw m6, m10
5674 pmaddubsw m8, [r5]
5675 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5676 punpckhbw xm11, xm9, xm10
5677 punpcklbw xm9, xm10
5678 vinserti128 m9, m9, xm11, 1
5679 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5680 paddw m3, m11
5681 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5682 paddw m5, m11
5683 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5684 paddw m7, m11
5685 pmaddubsw m9, [r5]
5686 movu xm11, [r0 + r4] ; m11 = row 11
5687 punpckhbw xm12, xm10, xm11
5688 punpcklbw xm10, xm11
5689 vinserti128 m10, m10, xm12, 1
5690 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5691 paddw m4, m12
5692 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5693 paddw m6, m12
5694 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5695 paddw m8, m12
5696 pmaddubsw m10, [r5]
5697 lea r0, [r0 + r1 * 4]
5698 movu xm12, [r0] ; m12 = row 12
5699 punpckhbw xm13, xm11, xm12
5700 punpcklbw xm11, xm12
5701 vinserti128 m11, m11, xm13, 1
5702 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5703 paddw m5, m13
5704 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5705 paddw m7, m13
5706 pmaddubsw m13, m11, [r5 + 1 * mmsize]
5707 paddw m9, m13
5708 pmaddubsw m11, [r5]
5709
5710 pmulhrsw m0, m14 ; m0 = word: row 0
5711 pmulhrsw m1, m14 ; m1 = word: row 1
5712 pmulhrsw m2, m14 ; m2 = word: row 2
5713 pmulhrsw m3, m14 ; m3 = word: row 3
5714 pmulhrsw m4, m14 ; m4 = word: row 4
5715 pmulhrsw m5, m14 ; m5 = word: row 5
5716 packuswb m0, m1
5717 packuswb m2, m3
5718 packuswb m4, m5
5719 vpermq m0, m0, 11011000b
5720 vpermq m2, m2, 11011000b
5721 vpermq m4, m4, 11011000b
5722 vextracti128 xm1, m0, 1
5723 vextracti128 xm3, m2, 1
5724 vextracti128 xm5, m4, 1
5725 movu [r2], xm0
5726 movu [r2 + r3], xm1
5727 movu [r2 + r3 * 2], xm2
5728 movu [r2 + r6], xm3
5729 lea r2, [r2 + r3 * 4]
5730 movu [r2], xm4
5731 movu [r2 + r3], xm5
5732
5733 movu xm13, [r0 + r1] ; m13 = row 13
5734 punpckhbw xm0, xm12, xm13
5735 punpcklbw xm12, xm13
5736 vinserti128 m12, m12, xm0, 1
5737 pmaddubsw m0, m12, [r5 + 3 * mmsize]
5738 paddw m6, m0
5739 pmaddubsw m0, m12, [r5 + 2 * mmsize]
5740 paddw m8, m0
5741 pmaddubsw m0, m12, [r5 + 1 * mmsize]
5742 paddw m10, m0
5743 pmaddubsw m12, [r5]
5744 movu xm0, [r0 + r1 * 2] ; m0 = row 14
5745 punpckhbw xm1, xm13, xm0
5746 punpcklbw xm13, xm0
5747 vinserti128 m13, m13, xm1, 1
5748 pmaddubsw m1, m13, [r5 + 3 * mmsize]
5749 paddw m7, m1
5750 pmaddubsw m1, m13, [r5 + 2 * mmsize]
5751 paddw m9, m1
5752 pmaddubsw m1, m13, [r5 + 1 * mmsize]
5753 paddw m11, m1
5754 pmaddubsw m13, [r5]
5755
5756 pmulhrsw m6, m14 ; m6 = word: row 6
5757 pmulhrsw m7, m14 ; m7 = word: row 7
5758 packuswb m6, m7
5759 vpermq m6, m6, 11011000b
5760 vextracti128 xm7, m6, 1
5761 movu [r2 + r3 * 2], xm6
5762 movu [r2 + r6], xm7
5763 lea r2, [r2 + r3 * 4]
5764
5765 movu xm1, [r0 + r4] ; m1 = row 15
5766 punpckhbw xm2, xm0, xm1
5767 punpcklbw xm0, xm1
5768 vinserti128 m0, m0, xm2, 1
5769 pmaddubsw m2, m0, [r5 + 3 * mmsize]
5770 paddw m8, m2
5771 pmaddubsw m2, m0, [r5 + 2 * mmsize]
5772 paddw m10, m2
5773 pmaddubsw m2, m0, [r5 + 1 * mmsize]
5774 paddw m12, m2
5775 pmaddubsw m0, [r5]
5776 lea r0, [r0 + r1 * 4]
5777 movu xm2, [r0] ; m2 = row 16
5778 punpckhbw xm3, xm1, xm2
5779 punpcklbw xm1, xm2
5780 vinserti128 m1, m1, xm3, 1
5781 pmaddubsw m3, m1, [r5 + 3 * mmsize]
5782 paddw m9, m3
5783 pmaddubsw m3, m1, [r5 + 2 * mmsize]
5784 paddw m11, m3
5785 pmaddubsw m3, m1, [r5 + 1 * mmsize]
5786 paddw m13, m3
5787 pmaddubsw m1, [r5]
5788 movu xm3, [r0 + r1] ; m3 = row 17
5789 punpckhbw xm4, xm2, xm3
5790 punpcklbw xm2, xm3
5791 vinserti128 m2, m2, xm4, 1
5792 pmaddubsw m4, m2, [r5 + 3 * mmsize]
5793 paddw m10, m4
5794 pmaddubsw m4, m2, [r5 + 2 * mmsize]
5795 paddw m12, m4
5796 pmaddubsw m2, [r5 + 1 * mmsize]
5797 paddw m0, m2
5798 movu xm4, [r0 + r1 * 2] ; m4 = row 18
5799 punpckhbw xm5, xm3, xm4
5800 punpcklbw xm3, xm4
5801 vinserti128 m3, m3, xm5, 1
5802 pmaddubsw m5, m3, [r5 + 3 * mmsize]
5803 paddw m11, m5
5804 pmaddubsw m5, m3, [r5 + 2 * mmsize]
5805 paddw m13, m5
5806 pmaddubsw m3, [r5 + 1 * mmsize]
5807 paddw m1, m3
5808 movu xm5, [r0 + r4] ; m5 = row 19
5809 punpckhbw xm6, xm4, xm5
5810 punpcklbw xm4, xm5
5811 vinserti128 m4, m4, xm6, 1
5812 pmaddubsw m6, m4, [r5 + 3 * mmsize]
5813 paddw m12, m6
5814 pmaddubsw m4, [r5 + 2 * mmsize]
5815 paddw m0, m4
5816 lea r0, [r0 + r1 * 4]
5817 movu xm6, [r0] ; m6 = row 20
5818 punpckhbw xm7, xm5, xm6
5819 punpcklbw xm5, xm6
5820 vinserti128 m5, m5, xm7, 1
5821 pmaddubsw m7, m5, [r5 + 3 * mmsize]
5822 paddw m13, m7
5823 pmaddubsw m5, [r5 + 2 * mmsize]
5824 paddw m1, m5
5825 movu xm7, [r0 + r1] ; m7 = row 21
5826 punpckhbw xm2, xm6, xm7
5827 punpcklbw xm6, xm7
5828 vinserti128 m6, m6, xm2, 1
5829 pmaddubsw m6, [r5 + 3 * mmsize]
5830 paddw m0, m6
5831 movu xm2, [r0 + r1 * 2] ; m2 = row 22
5832 punpckhbw xm3, xm7, xm2
5833 punpcklbw xm7, xm2
5834 vinserti128 m7, m7, xm3, 1
5835 pmaddubsw m7, [r5 + 3 * mmsize]
5836 paddw m1, m7
5837
5838 pmulhrsw m8, m14 ; m8 = word: row 8
5839 pmulhrsw m9, m14 ; m9 = word: row 9
5840 pmulhrsw m10, m14 ; m10 = word: row 10
5841 pmulhrsw m11, m14 ; m11 = word: row 11
5842 pmulhrsw m12, m14 ; m12 = word: row 12
5843 pmulhrsw m13, m14 ; m13 = word: row 13
5844 pmulhrsw m0, m14 ; m0 = word: row 14
5845 pmulhrsw m1, m14 ; m1 = word: row 15
5846 packuswb m8, m9
5847 packuswb m10, m11
5848 packuswb m12, m13
5849 packuswb m0, m1
5850 vpermq m8, m8, 11011000b
5851 vpermq m10, m10, 11011000b
5852 vpermq m12, m12, 11011000b
5853 vpermq m0, m0, 11011000b
5854 vextracti128 xm9, m8, 1
5855 vextracti128 xm11, m10, 1
5856 vextracti128 xm13, m12, 1
5857 vextracti128 xm1, m0, 1
5858 movu [r2], xm8
5859 movu [r2 + r3], xm9
5860 movu [r2 + r3 * 2], xm10
5861 movu [r2 + r6], xm11
5862 lea r2, [r2 + r3 * 4]
5863 movu [r2], xm12
5864 movu [r2 + r3], xm13
5865 movu [r2 + r3 * 2], xm0
5866 movu [r2 + r6], xm1
5867 RET
5868 %endif
5869
5870 INIT_YMM avx2
5871 %if ARCH_X86_64 == 1
5872 cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
5873 mov r4d, r4m
5874 shl r4d, 7
5875
5876 %ifdef PIC
5877 lea r5, [tab_LumaCoeffVer_32]
5878 add r5, r4
5879 %else
5880 lea r5, [tab_LumaCoeffVer_32 + r4]
5881 %endif
5882
5883 lea r4, [r1 * 3]
5884 sub r0, r4
5885 lea r6, [r3 * 3]
5886 mova m14, [pw_512]
5887
5888 movu xm0, [r0] ; m0 = row 0
5889 movu xm1, [r0 + r1] ; m1 = row 1
5890 punpckhbw xm2, xm0, xm1
5891 punpcklbw xm0, xm1
5892 vinserti128 m0, m0, xm2, 1
5893 pmaddubsw m0, [r5]
5894 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5895 punpckhbw xm3, xm1, xm2
5896 punpcklbw xm1, xm2
5897 vinserti128 m1, m1, xm3, 1
5898 pmaddubsw m1, [r5]
5899 movu xm3, [r0 + r4] ; m3 = row 3
5900 punpckhbw xm4, xm2, xm3
5901 punpcklbw xm2, xm3
5902 vinserti128 m2, m2, xm4, 1
5903 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5904 paddw m0, m4
5905 pmaddubsw m2, [r5]
5906 lea r0, [r0 + r1 * 4]
5907 movu xm4, [r0] ; m4 = row 4
5908 punpckhbw xm5, xm3, xm4
5909 punpcklbw xm3, xm4
5910 vinserti128 m3, m3, xm5, 1
5911 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5912 paddw m1, m5
5913 pmaddubsw m3, [r5]
5914 movu xm5, [r0 + r1] ; m5 = row 5
5915 punpckhbw xm6, xm4, xm5
5916 punpcklbw xm4, xm5
5917 vinserti128 m4, m4, xm6, 1
5918 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5919 paddw m0, m6
5920 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5921 paddw m2, m6
5922 pmaddubsw m4, [r5]
5923 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5924 punpckhbw xm7, xm5, xm6
5925 punpcklbw xm5, xm6
5926 vinserti128 m5, m5, xm7, 1
5927 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5928 paddw m1, m7
5929 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5930 paddw m3, m7
5931 pmaddubsw m5, [r5]
5932 movu xm7, [r0 + r4] ; m7 = row 7
5933 punpckhbw xm8, xm6, xm7
5934 punpcklbw xm6, xm7
5935 vinserti128 m6, m6, xm8, 1
5936 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5937 paddw m0, m8
5938 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5939 paddw m2, m8
5940 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5941 paddw m4, m8
5942 pmaddubsw m6, [r5]
5943 lea r0, [r0 + r1 * 4]
5944 movu xm8, [r0] ; m8 = row 8
5945 punpckhbw xm9, xm7, xm8
5946 punpcklbw xm7, xm8
5947 vinserti128 m7, m7, xm9, 1
5948 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5949 paddw m1, m9
5950 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5951 paddw m3, m9
5952 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5953 paddw m5, m9
5954 pmaddubsw m7, [r5]
5955 movu xm9, [r0 + r1] ; m9 = row 9
5956 punpckhbw xm10, xm8, xm9
5957 punpcklbw xm8, xm9
5958 vinserti128 m8, m8, xm10, 1
5959 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5960 paddw m2, m10
5961 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5962 paddw m4, m10
5963 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5964 paddw m6, m10
5965 pmaddubsw m8, [r5]
5966 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5967 punpckhbw xm11, xm9, xm10
5968 punpcklbw xm9, xm10
5969 vinserti128 m9, m9, xm11, 1
5970 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5971 paddw m3, m11
5972 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5973 paddw m5, m11
5974 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5975 paddw m7, m11
5976 pmaddubsw m9, [r5]
5977 movu xm11, [r0 + r4] ; m11 = row 11
5978 punpckhbw xm12, xm10, xm11
5979 punpcklbw xm10, xm11
5980 vinserti128 m10, m10, xm12, 1
5981 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5982 paddw m4, m12
5983 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5984 paddw m6, m12
5985 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5986 paddw m8, m12
5987 pmaddubsw m10, [r5]
5988 lea r0, [r0 + r1 * 4]
5989 movu xm12, [r0] ; m12 = row 12
5990 punpckhbw xm13, xm11, xm12
5991 punpcklbw xm11, xm12
5992 vinserti128 m11, m11, xm13, 1
5993 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5994 paddw m5, m13
5995 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5996 paddw m7, m13
5997 pmaddubsw m13, m11, [r5 + 1 * mmsize]
5998 paddw m9, m13
5999 pmaddubsw m11, [r5]
6000
6001 pmulhrsw m0, m14 ; m0 = word: row 0
6002 pmulhrsw m1, m14 ; m1 = word: row 1
6003 pmulhrsw m2, m14 ; m2 = word: row 2
6004 pmulhrsw m3, m14 ; m3 = word: row 3
6005 pmulhrsw m4, m14 ; m4 = word: row 4
6006 pmulhrsw m5, m14 ; m5 = word: row 5
6007 packuswb m0, m1
6008 packuswb m2, m3
6009 packuswb m4, m5
6010 vpermq m0, m0, 11011000b
6011 vpermq m2, m2, 11011000b
6012 vpermq m4, m4, 11011000b
6013 vextracti128 xm1, m0, 1
6014 vextracti128 xm3, m2, 1
6015 vextracti128 xm5, m4, 1
6016 movu [r2], xm0
6017 movu [r2 + r3], xm1
6018 movu [r2 + r3 * 2], xm2
6019 movu [r2 + r6], xm3
6020 lea r2, [r2 + r3 * 4]
6021 movu [r2], xm4
6022 movu [r2 + r3], xm5
6023
6024 movu xm13, [r0 + r1] ; m13 = row 13
6025 punpckhbw xm0, xm12, xm13
6026 punpcklbw xm12, xm13
6027 vinserti128 m12, m12, xm0, 1
6028 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6029 paddw m6, m0
6030 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6031 paddw m8, m0
6032 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6033 paddw m10, m0
6034 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6035 punpckhbw xm1, xm13, xm0
6036 punpcklbw xm13, xm0
6037 vinserti128 m13, m13, xm1, 1
6038 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6039 paddw m7, m1
6040 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6041 paddw m9, m1
6042 pmaddubsw m1, m13, [r5 + 1 * mmsize]
6043 paddw m11, m1
6044
6045 pmulhrsw m6, m14 ; m6 = word: row 6
6046 pmulhrsw m7, m14 ; m7 = word: row 7
6047 packuswb m6, m7
6048 vpermq m6, m6, 11011000b
6049 vextracti128 xm7, m6, 1
6050 movu [r2 + r3 * 2], xm6
6051 movu [r2 + r6], xm7
6052 lea r2, [r2 + r3 * 4]
6053
6054 movu xm1, [r0 + r4] ; m1 = row 15
6055 punpckhbw xm2, xm0, xm1
6056 punpcklbw xm0, xm1
6057 vinserti128 m0, m0, xm2, 1
6058 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6059 paddw m8, m2
6060 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6061 paddw m10, m2
6062 lea r0, [r0 + r1 * 4]
6063 movu xm2, [r0] ; m2 = row 16
6064 punpckhbw xm3, xm1, xm2
6065 punpcklbw xm1, xm2
6066 vinserti128 m1, m1, xm3, 1
6067 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6068 paddw m9, m3
6069 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6070 paddw m11, m3
6071 movu xm3, [r0 + r1] ; m3 = row 17
6072 punpckhbw xm4, xm2, xm3
6073 punpcklbw xm2, xm3
6074 vinserti128 m2, m2, xm4, 1
6075 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6076 paddw m10, m4
6077 movu xm4, [r0 + r1 * 2] ; m4 = row 18
6078 punpckhbw xm5, xm3, xm4
6079 punpcklbw xm3, xm4
6080 vinserti128 m3, m3, xm5, 1
6081 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6082 paddw m11, m5
6083
6084 pmulhrsw m8, m14 ; m8 = word: row 8
6085 pmulhrsw m9, m14 ; m9 = word: row 9
6086 pmulhrsw m10, m14 ; m10 = word: row 10
6087 pmulhrsw m11, m14 ; m11 = word: row 11
6088 packuswb m8, m9
6089 packuswb m10, m11
6090 vpermq m8, m8, 11011000b
6091 vpermq m10, m10, 11011000b
6092 vextracti128 xm9, m8, 1
6093 vextracti128 xm11, m10, 1
6094 movu [r2], xm8
6095 movu [r2 + r3], xm9
6096 movu [r2 + r3 * 2], xm10
6097 movu [r2 + r6], xm11
6098 RET
6099 %endif
6100
6101 INIT_YMM avx2
6102 %if ARCH_X86_64 == 1
6103 cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
6104 mov r4d, r4m
6105 shl r4d, 7
6106
6107 %ifdef PIC
6108 lea r5, [tab_LumaCoeffVer_32]
6109 add r5, r4
6110 %else
6111 lea r5, [tab_LumaCoeffVer_32 + r4]
6112 %endif
6113
6114 lea r4, [r1 * 3]
6115 sub r0, r4
6116 lea r6, [r3 * 3]
6117 mova m14, [pw_512]
6118
6119 movu xm0, [r0] ; m0 = row 0
6120 movu xm1, [r0 + r1] ; m1 = row 1
6121 punpckhbw xm2, xm0, xm1
6122 punpcklbw xm0, xm1
6123 vinserti128 m0, m0, xm2, 1
6124 pmaddubsw m0, [r5]
6125 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6126 punpckhbw xm3, xm1, xm2
6127 punpcklbw xm1, xm2
6128 vinserti128 m1, m1, xm3, 1
6129 pmaddubsw m1, [r5]
6130 movu xm3, [r0 + r4] ; m3 = row 3
6131 punpckhbw xm4, xm2, xm3
6132 punpcklbw xm2, xm3
6133 vinserti128 m2, m2, xm4, 1
6134 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6135 paddw m0, m4
6136 pmaddubsw m2, [r5]
6137 lea r0, [r0 + r1 * 4]
6138 movu xm4, [r0] ; m4 = row 4
6139 punpckhbw xm5, xm3, xm4
6140 punpcklbw xm3, xm4
6141 vinserti128 m3, m3, xm5, 1
6142 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6143 paddw m1, m5
6144 pmaddubsw m3, [r5]
6145 movu xm5, [r0 + r1] ; m5 = row 5
6146 punpckhbw xm6, xm4, xm5
6147 punpcklbw xm4, xm5
6148 vinserti128 m4, m4, xm6, 1
6149 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6150 paddw m0, m6
6151 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6152 paddw m2, m6
6153 pmaddubsw m4, [r5]
6154 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6155 punpckhbw xm7, xm5, xm6
6156 punpcklbw xm5, xm6
6157 vinserti128 m5, m5, xm7, 1
6158 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6159 paddw m1, m7
6160 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6161 paddw m3, m7
6162 pmaddubsw m5, [r5]
6163 movu xm7, [r0 + r4] ; m7 = row 7
6164 punpckhbw xm8, xm6, xm7
6165 punpcklbw xm6, xm7
6166 vinserti128 m6, m6, xm8, 1
6167 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6168 paddw m0, m8
6169 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6170 paddw m2, m8
6171 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6172 paddw m4, m8
6173 pmaddubsw m6, [r5]
6174 lea r0, [r0 + r1 * 4]
6175 movu xm8, [r0] ; m8 = row 8
6176 punpckhbw xm9, xm7, xm8
6177 punpcklbw xm7, xm8
6178 vinserti128 m7, m7, xm9, 1
6179 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6180 paddw m1, m9
6181 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6182 paddw m3, m9
6183 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6184 paddw m5, m9
6185 pmaddubsw m7, [r5]
6186 movu xm9, [r0 + r1] ; m9 = row 9
6187 punpckhbw xm10, xm8, xm9
6188 punpcklbw xm8, xm9
6189 vinserti128 m8, m8, xm10, 1
6190 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6191 paddw m2, m10
6192 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6193 paddw m4, m10
6194 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6195 paddw m6, m10
6196 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6197 punpckhbw xm11, xm9, xm10
6198 punpcklbw xm9, xm10
6199 vinserti128 m9, m9, xm11, 1
6200 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6201 paddw m3, m11
6202 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6203 paddw m5, m11
6204 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6205 paddw m7, m11
6206 movu xm11, [r0 + r4] ; m11 = row 11
6207 punpckhbw xm12, xm10, xm11
6208 punpcklbw xm10, xm11
6209 vinserti128 m10, m10, xm12, 1
6210 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6211 paddw m4, m12
6212 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6213 paddw m6, m12
6214 lea r0, [r0 + r1 * 4]
6215 movu xm12, [r0] ; m12 = row 12
6216 punpckhbw xm13, xm11, xm12
6217 punpcklbw xm11, xm12
6218 vinserti128 m11, m11, xm13, 1
6219 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6220 paddw m5, m13
6221 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6222 paddw m7, m13
6223
6224 pmulhrsw m0, m14 ; m0 = word: row 0
6225 pmulhrsw m1, m14 ; m1 = word: row 1
6226 pmulhrsw m2, m14 ; m2 = word: row 2
6227 pmulhrsw m3, m14 ; m3 = word: row 3
6228 pmulhrsw m4, m14 ; m4 = word: row 4
6229 pmulhrsw m5, m14 ; m5 = word: row 5
6230 packuswb m0, m1
6231 packuswb m2, m3
6232 packuswb m4, m5
6233 vpermq m0, m0, 11011000b
6234 vpermq m2, m2, 11011000b
6235 vpermq m4, m4, 11011000b
6236 vextracti128 xm1, m0, 1
6237 vextracti128 xm3, m2, 1
6238 vextracti128 xm5, m4, 1
6239 movu [r2], xm0
6240 movu [r2 + r3], xm1
6241 movu [r2 + r3 * 2], xm2
6242 movu [r2 + r6], xm3
6243 lea r2, [r2 + r3 * 4]
6244 movu [r2], xm4
6245 movu [r2 + r3], xm5
6246
6247 movu xm13, [r0 + r1] ; m13 = row 13
6248 punpckhbw xm0, xm12, xm13
6249 punpcklbw xm12, xm13
6250 vinserti128 m12, m12, xm0, 1
6251 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6252 paddw m6, m0
6253 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6254 punpckhbw xm1, xm13, xm0
6255 punpcklbw xm13, xm0
6256 vinserti128 m13, m13, xm1, 1
6257 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6258 paddw m7, m1
6259
6260 pmulhrsw m6, m14 ; m6 = word: row 6
6261 pmulhrsw m7, m14 ; m7 = word: row 7
6262 packuswb m6, m7
6263 vpermq m6, m6, 11011000b
6264 vextracti128 xm7, m6, 1
6265 movu [r2 + r3 * 2], xm6
6266 movu [r2 + r6], xm7
6267 RET
6268 %endif
6269
6270 INIT_YMM avx2
6271 %if ARCH_X86_64 == 1
6272 cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
6273 mov r4d, r4m
6274 shl r4d, 7
6275
6276 %ifdef PIC
6277 lea r5, [tab_LumaCoeffVer_32]
6278 add r5, r4
6279 %else
6280 lea r5, [tab_LumaCoeffVer_32 + r4]
6281 %endif
6282
6283 lea r4, [r1 * 3]
6284 sub r0, r4
6285 lea r6, [r3 * 3]
6286 mova m12, [pw_512]
6287
6288 movu xm0, [r0] ; m0 = row 0
6289 movu xm1, [r0 + r1] ; m1 = row 1
6290 punpckhbw xm2, xm0, xm1
6291 punpcklbw xm0, xm1
6292 vinserti128 m0, m0, xm2, 1
6293 pmaddubsw m0, [r5]
6294 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6295 punpckhbw xm3, xm1, xm2
6296 punpcklbw xm1, xm2
6297 vinserti128 m1, m1, xm3, 1
6298 pmaddubsw m1, [r5]
6299 movu xm3, [r0 + r4] ; m3 = row 3
6300 punpckhbw xm4, xm2, xm3
6301 punpcklbw xm2, xm3
6302 vinserti128 m2, m2, xm4, 1
6303 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6304 paddw m0, m4
6305 pmaddubsw m2, [r5]
6306 lea r0, [r0 + r1 * 4]
6307 movu xm4, [r0] ; m4 = row 4
6308 punpckhbw xm5, xm3, xm4
6309 punpcklbw xm3, xm4
6310 vinserti128 m3, m3, xm5, 1
6311 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6312 paddw m1, m5
6313 pmaddubsw m3, [r5]
6314 movu xm5, [r0 + r1] ; m5 = row 5
6315 punpckhbw xm6, xm4, xm5
6316 punpcklbw xm4, xm5
6317 vinserti128 m4, m4, xm6, 1
6318 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6319 paddw m0, m6
6320 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6321 paddw m2, m6
6322 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6323 punpckhbw xm7, xm5, xm6
6324 punpcklbw xm5, xm6
6325 vinserti128 m5, m5, xm7, 1
6326 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6327 paddw m1, m7
6328 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6329 paddw m3, m7
6330 movu xm7, [r0 + r4] ; m7 = row 7
6331 punpckhbw xm8, xm6, xm7
6332 punpcklbw xm6, xm7
6333 vinserti128 m6, m6, xm8, 1
6334 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6335 paddw m0, m8
6336 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6337 paddw m2, m8
6338 lea r0, [r0 + r1 * 4]
6339 movu xm8, [r0] ; m8 = row 8
6340 punpckhbw xm9, xm7, xm8
6341 punpcklbw xm7, xm8
6342 vinserti128 m7, m7, xm9, 1
6343 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6344 paddw m1, m9
6345 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6346 paddw m3, m9
6347 movu xm9, [r0 + r1] ; m9 = row 9
6348 punpckhbw xm10, xm8, xm9
6349 punpcklbw xm8, xm9
6350 vinserti128 m8, m8, xm10, 1
6351 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6352 paddw m2, m10
6353 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6354 punpckhbw xm11, xm9, xm10
6355 punpcklbw xm9, xm10
6356 vinserti128 m9, m9, xm11, 1
6357 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6358 paddw m3, m11
6359
6360 pmulhrsw m0, m12 ; m0 = word: row 0
6361 pmulhrsw m1, m12 ; m1 = word: row 1
6362 pmulhrsw m2, m12 ; m2 = word: row 2
6363 pmulhrsw m3, m12 ; m3 = word: row 3
6364 packuswb m0, m1
6365 packuswb m2, m3
6366 vpermq m0, m0, 11011000b
6367 vpermq m2, m2, 11011000b
6368 vextracti128 xm1, m0, 1
6369 vextracti128 xm3, m2, 1
6370 movu [r2], xm0
6371 movu [r2 + r3], xm1
6372 movu [r2 + r3 * 2], xm2
6373 movu [r2 + r6], xm3
6374 RET
6375 %endif
6376
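; FILTER_VER_LUMA_AVX2_16xN wraps the unrolled 16-row body in a loop for the taller 16-wide
; sizes (instantiated below for 16x32 and 16x64).  The body advances r0 by 20 source rows, so
; subtracting r7 = 4 * srcStride at the bottom of the loop gives a net step of exactly 16 rows
; per iteration.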
6377 %macro FILTER_VER_LUMA_AVX2_16xN 2
6378 INIT_YMM avx2
6379 %if ARCH_X86_64 == 1
6380 cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
6381 mov r4d, r4m
6382 shl r4d, 7
6383
6384 %ifdef PIC
6385 lea r5, [tab_LumaCoeffVer_32]
6386 add r5, r4
6387 %else
6388 lea r5, [tab_LumaCoeffVer_32 + r4]
6389 %endif
6390
6391 lea r4, [r1 * 3]
6392 sub r0, r4
6393 lea r6, [r3 * 3]
6394 lea r7, [r1 * 4]
6395 mova m14, [pw_512]
6396 mov r8d, %2 / 16
6397
6398 .loop:
6399 movu xm0, [r0] ; m0 = row 0
6400 movu xm1, [r0 + r1] ; m1 = row 1
6401 punpckhbw xm2, xm0, xm1
6402 punpcklbw xm0, xm1
6403 vinserti128 m0, m0, xm2, 1
6404 pmaddubsw m0, [r5]
6405 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6406 punpckhbw xm3, xm1, xm2
6407 punpcklbw xm1, xm2
6408 vinserti128 m1, m1, xm3, 1
6409 pmaddubsw m1, [r5]
6410 movu xm3, [r0 + r4] ; m3 = row 3
6411 punpckhbw xm4, xm2, xm3
6412 punpcklbw xm2, xm3
6413 vinserti128 m2, m2, xm4, 1
6414 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6415 paddw m0, m4
6416 pmaddubsw m2, [r5]
6417 lea r0, [r0 + r1 * 4]
6418 movu xm4, [r0] ; m4 = row 4
6419 punpckhbw xm5, xm3, xm4
6420 punpcklbw xm3, xm4
6421 vinserti128 m3, m3, xm5, 1
6422 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6423 paddw m1, m5
6424 pmaddubsw m3, [r5]
6425 movu xm5, [r0 + r1] ; m5 = row 5
6426 punpckhbw xm6, xm4, xm5
6427 punpcklbw xm4, xm5
6428 vinserti128 m4, m4, xm6, 1
6429 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6430 paddw m0, m6
6431 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6432 paddw m2, m6
6433 pmaddubsw m4, [r5]
6434 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6435 punpckhbw xm7, xm5, xm6
6436 punpcklbw xm5, xm6
6437 vinserti128 m5, m5, xm7, 1
6438 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6439 paddw m1, m7
6440 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6441 paddw m3, m7
6442 pmaddubsw m5, [r5]
6443 movu xm7, [r0 + r4] ; m7 = row 7
6444 punpckhbw xm8, xm6, xm7
6445 punpcklbw xm6, xm7
6446 vinserti128 m6, m6, xm8, 1
6447 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6448 paddw m0, m8
6449 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6450 paddw m2, m8
6451 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6452 paddw m4, m8
6453 pmaddubsw m6, [r5]
6454 lea r0, [r0 + r1 * 4]
6455 movu xm8, [r0] ; m8 = row 8
6456 punpckhbw xm9, xm7, xm8
6457 punpcklbw xm7, xm8
6458 vinserti128 m7, m7, xm9, 1
6459 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6460 paddw m1, m9
6461 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6462 paddw m3, m9
6463 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6464 paddw m5, m9
6465 pmaddubsw m7, [r5]
6466 movu xm9, [r0 + r1] ; m9 = row 9
6467 punpckhbw xm10, xm8, xm9
6468 punpcklbw xm8, xm9
6469 vinserti128 m8, m8, xm10, 1
6470 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6471 paddw m2, m10
6472 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6473 paddw m4, m10
6474 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6475 paddw m6, m10
6476 pmaddubsw m8, [r5]
6477 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6478 punpckhbw xm11, xm9, xm10
6479 punpcklbw xm9, xm10
6480 vinserti128 m9, m9, xm11, 1
6481 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6482 paddw m3, m11
6483 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6484 paddw m5, m11
6485 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6486 paddw m7, m11
6487 pmaddubsw m9, [r5]
6488 movu xm11, [r0 + r4] ; m11 = row 11
6489 punpckhbw xm12, xm10, xm11
6490 punpcklbw xm10, xm11
6491 vinserti128 m10, m10, xm12, 1
6492 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6493 paddw m4, m12
6494 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6495 paddw m6, m12
6496 pmaddubsw m12, m10, [r5 + 1 * mmsize]
6497 paddw m8, m12
6498 pmaddubsw m10, [r5]
6499 lea r0, [r0 + r1 * 4]
6500 movu xm12, [r0] ; m12 = row 12
6501 punpckhbw xm13, xm11, xm12
6502 punpcklbw xm11, xm12
6503 vinserti128 m11, m11, xm13, 1
6504 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6505 paddw m5, m13
6506 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6507 paddw m7, m13
6508 pmaddubsw m13, m11, [r5 + 1 * mmsize]
6509 paddw m9, m13
6510 pmaddubsw m11, [r5]
6511
6512 pmulhrsw m0, m14 ; m0 = word: row 0
6513 pmulhrsw m1, m14 ; m1 = word: row 1
6514 pmulhrsw m2, m14 ; m2 = word: row 2
6515 pmulhrsw m3, m14 ; m3 = word: row 3
6516 pmulhrsw m4, m14 ; m4 = word: row 4
6517 pmulhrsw m5, m14 ; m5 = word: row 5
6518 packuswb m0, m1
6519 packuswb m2, m3
6520 packuswb m4, m5
6521 vpermq m0, m0, 11011000b
6522 vpermq m2, m2, 11011000b
6523 vpermq m4, m4, 11011000b
6524 vextracti128 xm1, m0, 1
6525 vextracti128 xm3, m2, 1
6526 vextracti128 xm5, m4, 1
6527 movu [r2], xm0
6528 movu [r2 + r3], xm1
6529 movu [r2 + r3 * 2], xm2
6530 movu [r2 + r6], xm3
6531 lea r2, [r2 + r3 * 4]
6532 movu [r2], xm4
6533 movu [r2 + r3], xm5
6534
6535 movu xm13, [r0 + r1] ; m13 = row 13
6536 punpckhbw xm0, xm12, xm13
6537 punpcklbw xm12, xm13
6538 vinserti128 m12, m12, xm0, 1
6539 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6540 paddw m6, m0
6541 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6542 paddw m8, m0
6543 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6544 paddw m10, m0
6545 pmaddubsw m12, [r5]
6546 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6547 punpckhbw xm1, xm13, xm0
6548 punpcklbw xm13, xm0
6549 vinserti128 m13, m13, xm1, 1
6550 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6551 paddw m7, m1
6552 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6553 paddw m9, m1
6554 pmaddubsw m1, m13, [r5 + 1 * mmsize]
6555 paddw m11, m1
6556 pmaddubsw m13, [r5]
6557
6558 pmulhrsw m6, m14 ; m6 = word: row 6
6559 pmulhrsw m7, m14 ; m7 = word: row 7
6560 packuswb m6, m7
6561 vpermq m6, m6, 11011000b
6562 vextracti128 xm7, m6, 1
6563 movu [r2 + r3 * 2], xm6
6564 movu [r2 + r6], xm7
6565 lea r2, [r2 + r3 * 4]
6566
6567 movu xm1, [r0 + r4] ; m1 = row 15
6568 punpckhbw xm2, xm0, xm1
6569 punpcklbw xm0, xm1
6570 vinserti128 m0, m0, xm2, 1
6571 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6572 paddw m8, m2
6573 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6574 paddw m10, m2
6575 pmaddubsw m2, m0, [r5 + 1 * mmsize]
6576 paddw m12, m2
6577 pmaddubsw m0, [r5]
6578 lea r0, [r0 + r1 * 4]
6579 movu xm2, [r0] ; m2 = row 16
6580 punpckhbw xm3, xm1, xm2
6581 punpcklbw xm1, xm2
6582 vinserti128 m1, m1, xm3, 1
6583 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6584 paddw m9, m3
6585 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6586 paddw m11, m3
6587 pmaddubsw m3, m1, [r5 + 1 * mmsize]
6588 paddw m13, m3
6589 pmaddubsw m1, [r5]
6590 movu xm3, [r0 + r1] ; m3 = row 17
6591 punpckhbw xm4, xm2, xm3
6592 punpcklbw xm2, xm3
6593 vinserti128 m2, m2, xm4, 1
6594 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6595 paddw m10, m4
6596 pmaddubsw m4, m2, [r5 + 2 * mmsize]
6597 paddw m12, m4
6598 pmaddubsw m2, [r5 + 1 * mmsize]
6599 paddw m0, m2
6600 movu xm4, [r0 + r1 * 2] ; m4 = row 18
6601 punpckhbw xm5, xm3, xm4
6602 punpcklbw xm3, xm4
6603 vinserti128 m3, m3, xm5, 1
6604 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6605 paddw m11, m5
6606 pmaddubsw m5, m3, [r5 + 2 * mmsize]
6607 paddw m13, m5
6608 pmaddubsw m3, [r5 + 1 * mmsize]
6609 paddw m1, m3
6610 movu xm5, [r0 + r4] ; m5 = row 19
6611 punpckhbw xm6, xm4, xm5
6612 punpcklbw xm4, xm5
6613 vinserti128 m4, m4, xm6, 1
6614 pmaddubsw m6, m4, [r5 + 3 * mmsize]
6615 paddw m12, m6
6616 pmaddubsw m4, [r5 + 2 * mmsize]
6617 paddw m0, m4
6618 lea r0, [r0 + r1 * 4]
6619 movu xm6, [r0] ; m6 = row 20
6620 punpckhbw xm7, xm5, xm6
6621 punpcklbw xm5, xm6
6622 vinserti128 m5, m5, xm7, 1
6623 pmaddubsw m7, m5, [r5 + 3 * mmsize]
6624 paddw m13, m7
6625 pmaddubsw m5, [r5 + 2 * mmsize]
6626 paddw m1, m5
6627 movu xm7, [r0 + r1] ; m7 = row 21
6628 punpckhbw xm2, xm6, xm7
6629 punpcklbw xm6, xm7
6630 vinserti128 m6, m6, xm2, 1
6631 pmaddubsw m6, [r5 + 3 * mmsize]
6632 paddw m0, m6
6633 movu xm2, [r0 + r1 * 2] ; m2 = row 22
6634 punpckhbw xm3, xm7, xm2
6635 punpcklbw xm7, xm2
6636 vinserti128 m7, m7, xm3, 1
6637 pmaddubsw m7, [r5 + 3 * mmsize]
6638 paddw m1, m7
6639
6640 pmulhrsw m8, m14 ; m8 = word: row 8
6641 pmulhrsw m9, m14 ; m9 = word: row 9
6642 pmulhrsw m10, m14 ; m10 = word: row 10
6643 pmulhrsw m11, m14 ; m11 = word: row 11
6644 pmulhrsw m12, m14 ; m12 = word: row 12
6645 pmulhrsw m13, m14 ; m13 = word: row 13
6646 pmulhrsw m0, m14 ; m0 = word: row 14
6647 pmulhrsw m1, m14 ; m1 = word: row 15
6648 packuswb m8, m9
6649 packuswb m10, m11
6650 packuswb m12, m13
6651 packuswb m0, m1
6652 vpermq m8, m8, 11011000b
6653 vpermq m10, m10, 11011000b
6654 vpermq m12, m12, 11011000b
6655 vpermq m0, m0, 11011000b
6656 vextracti128 xm9, m8, 1
6657 vextracti128 xm11, m10, 1
6658 vextracti128 xm13, m12, 1
6659 vextracti128 xm1, m0, 1
6660 movu [r2], xm8
6661 movu [r2 + r3], xm9
6662 movu [r2 + r3 * 2], xm10
6663 movu [r2 + r6], xm11
6664 lea r2, [r2 + r3 * 4]
6665 movu [r2], xm12
6666 movu [r2 + r3], xm13
6667 movu [r2 + r3 * 2], xm0
6668 movu [r2 + r6], xm1
6669 lea r2, [r2 + r3 * 4]
6670 sub r0, r7
6671 dec r8d
6672 jnz .loop
6673 RET
6674 %endif
6675 %endmacro
6676
6677 FILTER_VER_LUMA_AVX2_16xN 16, 32
6678 FILTER_VER_LUMA_AVX2_16xN 16, 64
6679
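; Note: PROCESS_LUMA_AVX2_W16_16R produces 16 output rows of the 16-wide 8-tap
; vertical luma pp filter. Each pair of adjacent source rows is byte-interleaved
; (punpcklbw/punpckhbw + vinserti128) so a single pmaddubsw against the paired
; coefficients at [r5 + k * mmsize] accumulates two taps at once; the word sums
; are rounded with pmulhrsw by m14 (pw_512) and packed back to bytes.
; It expects r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = 3 * srcStride,
; r5 = coefficient table, r6 = 3 * dstStride, and clobbers r7/r8 as row pointers.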
6680 %macro PROCESS_LUMA_AVX2_W16_16R 0
6681 movu xm0, [r0] ; m0 = row 0
6682 movu xm1, [r0 + r1] ; m1 = row 1
6683 punpckhbw xm2, xm0, xm1
6684 punpcklbw xm0, xm1
6685 vinserti128 m0, m0, xm2, 1
6686 pmaddubsw m0, [r5]
6687 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6688 punpckhbw xm3, xm1, xm2
6689 punpcklbw xm1, xm2
6690 vinserti128 m1, m1, xm3, 1
6691 pmaddubsw m1, [r5]
6692 movu xm3, [r0 + r4] ; m3 = row 3
6693 punpckhbw xm4, xm2, xm3
6694 punpcklbw xm2, xm3
6695 vinserti128 m2, m2, xm4, 1
6696 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6697 paddw m0, m4
6698 pmaddubsw m2, [r5]
6699 lea r7, [r0 + r1 * 4]
6700 movu xm4, [r7] ; m4 = row 4
6701 punpckhbw xm5, xm3, xm4
6702 punpcklbw xm3, xm4
6703 vinserti128 m3, m3, xm5, 1
6704 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6705 paddw m1, m5
6706 pmaddubsw m3, [r5]
6707 movu xm5, [r7 + r1] ; m5 = row 5
6708 punpckhbw xm6, xm4, xm5
6709 punpcklbw xm4, xm5
6710 vinserti128 m4, m4, xm6, 1
6711 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6712 paddw m0, m6
6713 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6714 paddw m2, m6
6715 pmaddubsw m4, [r5]
6716 movu xm6, [r7 + r1 * 2] ; m6 = row 6
6717 punpckhbw xm7, xm5, xm6
6718 punpcklbw xm5, xm6
6719 vinserti128 m5, m5, xm7, 1
6720 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6721 paddw m1, m7
6722 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6723 paddw m3, m7
6724 pmaddubsw m5, [r5]
6725 movu xm7, [r7 + r4] ; m7 = row 7
6726 punpckhbw xm8, xm6, xm7
6727 punpcklbw xm6, xm7
6728 vinserti128 m6, m6, xm8, 1
6729 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6730 paddw m0, m8
6731 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6732 paddw m2, m8
6733 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6734 paddw m4, m8
6735 pmaddubsw m6, [r5]
6736 lea r7, [r7 + r1 * 4]
6737 movu xm8, [r7] ; m8 = row 8
6738 punpckhbw xm9, xm7, xm8
6739 punpcklbw xm7, xm8
6740 vinserti128 m7, m7, xm9, 1
6741 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6742 paddw m1, m9
6743 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6744 paddw m3, m9
6745 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6746 paddw m5, m9
6747 pmaddubsw m7, [r5]
6748 movu xm9, [r7 + r1] ; m9 = row 9
6749 punpckhbw xm10, xm8, xm9
6750 punpcklbw xm8, xm9
6751 vinserti128 m8, m8, xm10, 1
6752 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6753 paddw m2, m10
6754 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6755 paddw m4, m10
6756 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6757 paddw m6, m10
6758 pmaddubsw m8, [r5]
6759 movu xm10, [r7 + r1 * 2] ; m10 = row 10
6760 punpckhbw xm11, xm9, xm10
6761 punpcklbw xm9, xm10
6762 vinserti128 m9, m9, xm11, 1
6763 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6764 paddw m3, m11
6765 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6766 paddw m5, m11
6767 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6768 paddw m7, m11
6769 pmaddubsw m9, [r5]
6770 movu xm11, [r7 + r4] ; m11 = row 11
6771 punpckhbw xm12, xm10, xm11
6772 punpcklbw xm10, xm11
6773 vinserti128 m10, m10, xm12, 1
6774 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6775 paddw m4, m12
6776 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6777 paddw m6, m12
6778 pmaddubsw m12, m10, [r5 + 1 * mmsize]
6779 paddw m8, m12
6780 pmaddubsw m10, [r5]
6781 lea r7, [r7 + r1 * 4]
6782 movu xm12, [r7] ; m12 = row 12
6783 punpckhbw xm13, xm11, xm12
6784 punpcklbw xm11, xm12
6785 vinserti128 m11, m11, xm13, 1
6786 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6787 paddw m5, m13
6788 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6789 paddw m7, m13
6790 pmaddubsw m13, m11, [r5 + 1 * mmsize]
6791 paddw m9, m13
6792 pmaddubsw m11, [r5]
6793
6794 pmulhrsw m0, m14 ; m0 = word: row 0
6795 pmulhrsw m1, m14 ; m1 = word: row 1
6796 pmulhrsw m2, m14 ; m2 = word: row 2
6797 pmulhrsw m3, m14 ; m3 = word: row 3
6798 pmulhrsw m4, m14 ; m4 = word: row 4
6799 pmulhrsw m5, m14 ; m5 = word: row 5
6800 packuswb m0, m1
6801 packuswb m2, m3
6802 packuswb m4, m5
6803 vpermq m0, m0, 11011000b
6804 vpermq m2, m2, 11011000b
6805 vpermq m4, m4, 11011000b
6806 vextracti128 xm1, m0, 1
6807 vextracti128 xm3, m2, 1
6808 vextracti128 xm5, m4, 1
6809 movu [r2], xm0
6810 movu [r2 + r3], xm1
6811 movu [r2 + r3 * 2], xm2
6812 movu [r2 + r6], xm3
6813 lea r8, [r2 + r3 * 4]
6814 movu [r8], xm4
6815 movu [r8 + r3], xm5
6816
6817 movu xm13, [r7 + r1] ; m13 = row 13
6818 punpckhbw xm0, xm12, xm13
6819 punpcklbw xm12, xm13
6820 vinserti128 m12, m12, xm0, 1
6821 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6822 paddw m6, m0
6823 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6824 paddw m8, m0
6825 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6826 paddw m10, m0
6827 pmaddubsw m12, [r5]
6828 movu xm0, [r7 + r1 * 2] ; m0 = row 14
6829 punpckhbw xm1, xm13, xm0
6830 punpcklbw xm13, xm0
6831 vinserti128 m13, m13, xm1, 1
6832 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6833 paddw m7, m1
6834 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6835 paddw m9, m1
6836 pmaddubsw m1, m13, [r5 + 1 * mmsize]
6837 paddw m11, m1
6838 pmaddubsw m13, [r5]
6839
6840 pmulhrsw m6, m14 ; m6 = word: row 6
6841 pmulhrsw m7, m14 ; m7 = word: row 7
6842 packuswb m6, m7
6843 vpermq m6, m6, 11011000b
6844 vextracti128 xm7, m6, 1
6845 movu [r8 + r3 * 2], xm6
6846 movu [r8 + r6], xm7
6847 lea r8, [r8 + r3 * 4]
6848
6849 movu xm1, [r7 + r4] ; m1 = row 15
6850 punpckhbw xm2, xm0, xm1
6851 punpcklbw xm0, xm1
6852 vinserti128 m0, m0, xm2, 1
6853 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6854 paddw m8, m2
6855 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6856 paddw m10, m2
6857 pmaddubsw m2, m0, [r5 + 1 * mmsize]
6858 paddw m12, m2
6859 pmaddubsw m0, [r5]
6860 lea r7, [r7 + r1 * 4]
6861 movu xm2, [r7] ; m2 = row 16
6862 punpckhbw xm3, xm1, xm2
6863 punpcklbw xm1, xm2
6864 vinserti128 m1, m1, xm3, 1
6865 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6866 paddw m9, m3
6867 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6868 paddw m11, m3
6869 pmaddubsw m3, m1, [r5 + 1 * mmsize]
6870 paddw m13, m3
6871 pmaddubsw m1, [r5]
6872 movu xm3, [r7 + r1] ; m3 = row 17
6873 punpckhbw xm4, xm2, xm3
6874 punpcklbw xm2, xm3
6875 vinserti128 m2, m2, xm4, 1
6876 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6877 paddw m10, m4
6878 pmaddubsw m4, m2, [r5 + 2 * mmsize]
6879 paddw m12, m4
6880 pmaddubsw m2, [r5 + 1 * mmsize]
6881 paddw m0, m2
6882 movu xm4, [r7 + r1 * 2] ; m4 = row 18
6883 punpckhbw xm5, xm3, xm4
6884 punpcklbw xm3, xm4
6885 vinserti128 m3, m3, xm5, 1
6886 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6887 paddw m11, m5
6888 pmaddubsw m5, m3, [r5 + 2 * mmsize]
6889 paddw m13, m5
6890 pmaddubsw m3, [r5 + 1 * mmsize]
6891 paddw m1, m3
6892 movu xm5, [r7 + r4] ; m5 = row 19
6893 punpckhbw xm6, xm4, xm5
6894 punpcklbw xm4, xm5
6895 vinserti128 m4, m4, xm6, 1
6896 pmaddubsw m6, m4, [r5 + 3 * mmsize]
6897 paddw m12, m6
6898 pmaddubsw m4, [r5 + 2 * mmsize]
6899 paddw m0, m4
6900 lea r7, [r7 + r1 * 4]
6901 movu xm6, [r7] ; m6 = row 20
6902 punpckhbw xm7, xm5, xm6
6903 punpcklbw xm5, xm6
6904 vinserti128 m5, m5, xm7, 1
6905 pmaddubsw m7, m5, [r5 + 3 * mmsize]
6906 paddw m13, m7
6907 pmaddubsw m5, [r5 + 2 * mmsize]
6908 paddw m1, m5
6909 movu xm7, [r7 + r1] ; m7 = row 21
6910 punpckhbw xm2, xm6, xm7
6911 punpcklbw xm6, xm7
6912 vinserti128 m6, m6, xm2, 1
6913 pmaddubsw m6, [r5 + 3 * mmsize]
6914 paddw m0, m6
6915 movu xm2, [r7 + r1 * 2] ; m2 = row 22
6916 punpckhbw xm3, xm7, xm2
6917 punpcklbw xm7, xm2
6918 vinserti128 m7, m7, xm3, 1
6919 pmaddubsw m7, [r5 + 3 * mmsize]
6920 paddw m1, m7
6921
6922 pmulhrsw m8, m14 ; m8 = word: row 8
6923 pmulhrsw m9, m14 ; m9 = word: row 9
6924 pmulhrsw m10, m14 ; m10 = word: row 10
6925 pmulhrsw m11, m14 ; m11 = word: row 11
6926 pmulhrsw m12, m14 ; m12 = word: row 12
6927 pmulhrsw m13, m14 ; m13 = word: row 13
6928 pmulhrsw m0, m14 ; m0 = word: row 14
6929 pmulhrsw m1, m14 ; m1 = word: row 15
6930 packuswb m8, m9
6931 packuswb m10, m11
6932 packuswb m12, m13
6933 packuswb m0, m1
6934 vpermq m8, m8, 11011000b
6935 vpermq m10, m10, 11011000b
6936 vpermq m12, m12, 11011000b
6937 vpermq m0, m0, 11011000b
6938 vextracti128 xm9, m8, 1
6939 vextracti128 xm11, m10, 1
6940 vextracti128 xm13, m12, 1
6941 vextracti128 xm1, m0, 1
6942 movu [r8], xm8
6943 movu [r8 + r3], xm9
6944 movu [r8 + r3 * 2], xm10
6945 movu [r8 + r6], xm11
6946 lea r8, [r8 + r3 * 4]
6947 movu [r8], xm12
6948 movu [r8 + r3], xm13
6949 movu [r8 + r3 * 2], xm0
6950 movu [r8 + r6], xm1
6951 %endmacro
6952
6953 %macro PROCESS_LUMA_AVX2_W16_8R 0
6954 movu xm0, [r0] ; m0 = row 0
6955 movu xm1, [r0 + r1] ; m1 = row 1
6956 punpckhbw xm2, xm0, xm1
6957 punpcklbw xm0, xm1
6958 vinserti128 m0, m0, xm2, 1
6959 pmaddubsw m0, [r5]
6960 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6961 punpckhbw xm3, xm1, xm2
6962 punpcklbw xm1, xm2
6963 vinserti128 m1, m1, xm3, 1
6964 pmaddubsw m1, [r5]
6965 movu xm3, [r0 + r4] ; m3 = row 3
6966 punpckhbw xm4, xm2, xm3
6967 punpcklbw xm2, xm3
6968 vinserti128 m2, m2, xm4, 1
6969 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6970 paddw m0, m4
6971 pmaddubsw m2, [r5]
6972 lea r7, [r0 + r1 * 4]
6973 movu xm4, [r7] ; m4 = row 4
6974 punpckhbw xm5, xm3, xm4
6975 punpcklbw xm3, xm4
6976 vinserti128 m3, m3, xm5, 1
6977 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6978 paddw m1, m5
6979 pmaddubsw m3, [r5]
6980 movu xm5, [r7 + r1] ; m5 = row 5
6981 punpckhbw xm6, xm4, xm5
6982 punpcklbw xm4, xm5
6983 vinserti128 m4, m4, xm6, 1
6984 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6985 paddw m0, m6
6986 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6987 paddw m2, m6
6988 pmaddubsw m4, [r5]
6989 movu xm6, [r7 + r1 * 2] ; m6 = row 6
6990 punpckhbw xm7, xm5, xm6
6991 punpcklbw xm5, xm6
6992 vinserti128 m5, m5, xm7, 1
6993 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6994 paddw m1, m7
6995 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6996 paddw m3, m7
6997 pmaddubsw m5, [r5]
6998 movu xm7, [r7 + r4] ; m7 = row 7
6999 punpckhbw xm8, xm6, xm7
7000 punpcklbw xm6, xm7
7001 vinserti128 m6, m6, xm8, 1
7002 pmaddubsw m8, m6, [r5 + 3 * mmsize]
7003 paddw m0, m8
7004 pmaddubsw m8, m6, [r5 + 2 * mmsize]
7005 paddw m2, m8
7006 pmaddubsw m8, m6, [r5 + 1 * mmsize]
7007 paddw m4, m8
7008 pmaddubsw m6, [r5]
7009 lea r7, [r7 + r1 * 4]
7010 movu xm8, [r7] ; m8 = row 8
7011 punpckhbw xm9, xm7, xm8
7012 punpcklbw xm7, xm8
7013 vinserti128 m7, m7, xm9, 1
7014 pmaddubsw m9, m7, [r5 + 3 * mmsize]
7015 paddw m1, m9
7016 pmaddubsw m9, m7, [r5 + 2 * mmsize]
7017 paddw m3, m9
7018 pmaddubsw m9, m7, [r5 + 1 * mmsize]
7019 paddw m5, m9
7020 pmaddubsw m7, [r5]
7021 movu xm9, [r7 + r1] ; m9 = row 9
7022 punpckhbw xm10, xm8, xm9
7023 punpcklbw xm8, xm9
7024 vinserti128 m8, m8, xm10, 1
7025 pmaddubsw m10, m8, [r5 + 3 * mmsize]
7026 paddw m2, m10
7027 pmaddubsw m10, m8, [r5 + 2 * mmsize]
7028 paddw m4, m10
7029 pmaddubsw m10, m8, [r5 + 1 * mmsize]
7030 paddw m6, m10
7031 movu xm10, [r7 + r1 * 2] ; m10 = row 10
7032 punpckhbw xm11, xm9, xm10
7033 punpcklbw xm9, xm10
7034 vinserti128 m9, m9, xm11, 1
7035 pmaddubsw m11, m9, [r5 + 3 * mmsize]
7036 paddw m3, m11
7037 pmaddubsw m11, m9, [r5 + 2 * mmsize]
7038 paddw m5, m11
7039 pmaddubsw m11, m9, [r5 + 1 * mmsize]
7040 paddw m7, m11
7041 movu xm11, [r7 + r4] ; m11 = row 11
7042 punpckhbw xm12, xm10, xm11
7043 punpcklbw xm10, xm11
7044 vinserti128 m10, m10, xm12, 1
7045 pmaddubsw m12, m10, [r5 + 3 * mmsize]
7046 paddw m4, m12
7047 pmaddubsw m12, m10, [r5 + 2 * mmsize]
7048 paddw m6, m12
7049 lea r7, [r7 + r1 * 4]
7050 movu xm12, [r7] ; m12 = row 12
7051 punpckhbw xm13, xm11, xm12
7052 punpcklbw xm11, xm12
7053 vinserti128 m11, m11, xm13, 1
7054 pmaddubsw m13, m11, [r5 + 3 * mmsize]
7055 paddw m5, m13
7056 pmaddubsw m13, m11, [r5 + 2 * mmsize]
7057 paddw m7, m13
7058
7059 pmulhrsw m0, m14 ; m0 = word: row 0
7060 pmulhrsw m1, m14 ; m1 = word: row 1
7061 pmulhrsw m2, m14 ; m2 = word: row 2
7062 pmulhrsw m3, m14 ; m3 = word: row 3
7063 pmulhrsw m4, m14 ; m4 = word: row 4
7064 pmulhrsw m5, m14 ; m5 = word: row 5
7065 packuswb m0, m1
7066 packuswb m2, m3
7067 packuswb m4, m5
7068 vpermq m0, m0, 11011000b
7069 vpermq m2, m2, 11011000b
7070 vpermq m4, m4, 11011000b
7071 vextracti128 xm1, m0, 1
7072 vextracti128 xm3, m2, 1
7073 vextracti128 xm5, m4, 1
7074 movu [r2], xm0
7075 movu [r2 + r3], xm1
7076 movu [r2 + r3 * 2], xm2
7077 movu [r2 + r6], xm3
7078 lea r8, [r2 + r3 * 4]
7079 movu [r8], xm4
7080 movu [r8 + r3], xm5
7081
7082 movu xm13, [r7 + r1] ; m13 = row 13
7083 punpckhbw xm0, xm12, xm13
7084 punpcklbw xm12, xm13
7085 vinserti128 m12, m12, xm0, 1
7086 pmaddubsw m0, m12, [r5 + 3 * mmsize]
7087 paddw m6, m0
7088 movu xm0, [r7 + r1 * 2] ; m0 = row 14
7089 punpckhbw xm1, xm13, xm0
7090 punpcklbw xm13, xm0
7091 vinserti128 m13, m13, xm1, 1
7092 pmaddubsw m1, m13, [r5 + 3 * mmsize]
7093 paddw m7, m1
7094
7095 pmulhrsw m6, m14 ; m6 = word: row 6
7096 pmulhrsw m7, m14 ; m7 = word: row 7
7097 packuswb m6, m7
7098 vpermq m6, m6, 11011000b
7099 vextracti128 xm7, m6, 1
7100 movu [r8 + r3 * 2], xm6
7101 movu [r8 + r6], xm7
7102 %endmacro
7103
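; Rounding note (pp path): pmulhrsw with pw_512 computes (x * 512 + 16384) >> 15,
; which equals (x + 32) >> 6, i.e. the rounded 6-bit downshift that maps the 8-tap
; word sums back to 8-bit pixels before packuswb.
;
; A minimal C sketch of one output pixel (illustrative only, not part of the build;
; clamp() and the variable names are placeholders):
;     int sum = 0;
;     for (int k = 0; k < 8; k++)
;         sum += coeff[k] * src[x + (k - 3) * srcStride];
;     dst[x] = (uint8_t)clamp((sum + 32) >> 6, 0, 255);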
7104 INIT_YMM avx2
7105 %if ARCH_X86_64 == 1
7106 cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
7107 mov r4d, r4m
7108 shl r4d, 7
7109
7110 %ifdef PIC
7111 lea r5, [tab_LumaCoeffVer_32]
7112 add r5, r4
7113 %else
7114 lea r5, [tab_LumaCoeffVer_32 + r4]
7115 %endif
7116
7117 lea r4, [r1 * 3]
7118 sub r0, r4
7119 lea r6, [r3 * 3]
7120 lea r10, [r1 * 4]
7121 mova m14, [pw_512]
7122 mov r9d, 2
7123 .loopH:
7124 PROCESS_LUMA_AVX2_W16_16R
7125 add r2, 16
7126 add r0, 16
7127
7128 movq xm1, [r0] ; m1 = row 0
7129 movq xm2, [r0 + r1] ; m2 = row 1
7130 punpcklbw xm1, xm2
7131 movq xm3, [r0 + r1 * 2] ; m3 = row 2
7132 punpcklbw xm2, xm3
7133 vinserti128 m5, m1, xm2, 1
7134 pmaddubsw m5, [r5]
7135 movq xm4, [r0 + r4] ; m4 = row 3
7136 punpcklbw xm3, xm4
7137 lea r7, [r0 + r1 * 4]
7138 movq xm1, [r7] ; m1 = row 4
7139 punpcklbw xm4, xm1
7140 vinserti128 m2, m3, xm4, 1
7141 pmaddubsw m0, m2, [r5 + 1 * mmsize]
7142 paddw m5, m0
7143 pmaddubsw m2, [r5]
7144 movq xm3, [r7 + r1] ; m3 = row 5
7145 punpcklbw xm1, xm3
7146 movq xm4, [r7 + r1 * 2] ; m4 = row 6
7147 punpcklbw xm3, xm4
7148 vinserti128 m1, m1, xm3, 1
7149 pmaddubsw m3, m1, [r5 + 2 * mmsize]
7150 paddw m5, m3
7151 pmaddubsw m0, m1, [r5 + 1 * mmsize]
7152 paddw m2, m0
7153 pmaddubsw m1, [r5]
7154 movq xm3, [r7 + r4] ; m3 = row 7
7155 punpcklbw xm4, xm3
7156 lea r7, [r7 + r1 * 4]
7157 movq xm0, [r7] ; m0 = row 8
7158 punpcklbw xm3, xm0
7159 vinserti128 m4, m4, xm3, 1
7160 pmaddubsw m3, m4, [r5 + 3 * mmsize]
7161 paddw m5, m3
7162 pmaddubsw m3, m4, [r5 + 2 * mmsize]
7163 paddw m2, m3
7164 pmaddubsw m3, m4, [r5 + 1 * mmsize]
7165 paddw m1, m3
7166 pmaddubsw m4, [r5]
7167 movq xm3, [r7 + r1] ; m3 = row 9
7168 punpcklbw xm0, xm3
7169 movq xm6, [r7 + r1 * 2] ; m6 = row 10
7170 punpcklbw xm3, xm6
7171 vinserti128 m0, m0, xm3, 1
7172 pmaddubsw m3, m0, [r5 + 3 * mmsize]
7173 paddw m2, m3
7174 pmaddubsw m3, m0, [r5 + 2 * mmsize]
7175 paddw m1, m3
7176 pmaddubsw m3, m0, [r5 + 1 * mmsize]
7177 paddw m4, m3
7178 pmaddubsw m0, [r5]
7179
7180 movq xm3, [r7 + r4] ; m3 = row 11
7181 punpcklbw xm6, xm3
7182 lea r7, [r7 + r1 * 4]
7183 movq xm7, [r7] ; m7 = row 12
7184 punpcklbw xm3, xm7
7185 vinserti128 m6, m6, xm3, 1
7186 pmaddubsw m3, m6, [r5 + 3 * mmsize]
7187 paddw m1, m3
7188 pmaddubsw m3, m6, [r5 + 2 * mmsize]
7189 paddw m4, m3
7190 pmaddubsw m3, m6, [r5 + 1 * mmsize]
7191 paddw m0, m3
7192 pmaddubsw m6, [r5]
7193 movq xm3, [r7 + r1] ; m3 = row 13
7194 punpcklbw xm7, xm3
7195 movq xm8, [r7 + r1 * 2] ; m8 = row 14
7196 punpcklbw xm3, xm8
7197 vinserti128 m7, m7, xm3, 1
7198 pmaddubsw m3, m7, [r5 + 3 * mmsize]
7199 paddw m4, m3
7200 pmaddubsw m3, m7, [r5 + 2 * mmsize]
7201 paddw m0, m3
7202 pmaddubsw m3, m7, [r5 + 1 * mmsize]
7203 paddw m6, m3
7204 pmaddubsw m7, [r5]
7205 movq xm3, [r7 + r4] ; m3 = row 15
7206 punpcklbw xm8, xm3
7207 lea r7, [r7 + r1 * 4]
7208 movq xm9, [r7] ; m9 = row 16
7209 punpcklbw xm3, xm9
7210 vinserti128 m8, m8, xm3, 1
7211 pmaddubsw m3, m8, [r5 + 3 * mmsize]
7212 paddw m0, m3
7213 pmaddubsw m3, m8, [r5 + 2 * mmsize]
7214 paddw m6, m3
7215 pmaddubsw m3, m8, [r5 + 1 * mmsize]
7216 paddw m7, m3
7217 pmaddubsw m8, [r5]
7218 movq xm3, [r7 + r1] ; m3 = row 17
7219 punpcklbw xm9, xm3
7220 movq xm10, [r7 + r1 * 2] ; m10 = row 18
7221 punpcklbw xm3, xm10
7222 vinserti128 m9, m9, xm3, 1
7223 pmaddubsw m3, m9, [r5 + 3 * mmsize]
7224 paddw m6, m3
7225 pmaddubsw m3, m9, [r5 + 2 * mmsize]
7226 paddw m7, m3
7227 pmaddubsw m3, m9, [r5 + 1 * mmsize]
7228 paddw m8, m3
7229 movq xm3, [r7 + r4] ; m3 = row 19
7230 punpcklbw xm10, xm3
7231 lea r7, [r7 + r1 * 4]
7232 movq xm9, [r7] ; m9 = row 20
7233 punpcklbw xm3, xm9
7234 vinserti128 m10, m10, xm3, 1
7235 pmaddubsw m3, m10, [r5 + 3 * mmsize]
7236 paddw m7, m3
7237 pmaddubsw m3, m10, [r5 + 2 * mmsize]
7238 paddw m8, m3
7239 movq xm3, [r7 + r1] ; m3 = row 21
7240 punpcklbw xm9, xm3
7241 movq xm10, [r7 + r1 * 2] ; m10 = row 22
7242 punpcklbw xm3, xm10
7243 vinserti128 m9, m9, xm3, 1
7244 pmaddubsw m3, m9, [r5 + 3 * mmsize]
7245 paddw m8, m3
7246
7247 pmulhrsw m5, m14 ; m5 = word: row 0, row 1
7248 pmulhrsw m2, m14 ; m2 = word: row 2, row 3
7249 pmulhrsw m1, m14 ; m1 = word: row 4, row 5
7250 pmulhrsw m4, m14 ; m4 = word: row 6, row 7
7251 pmulhrsw m0, m14 ; m0 = word: row 8, row 9
7252 pmulhrsw m6, m14 ; m6 = word: row 10, row 11
7253 pmulhrsw m7, m14 ; m7 = word: row 12, row 13
7254 pmulhrsw m8, m14 ; m8 = word: row 14, row 15
7255 packuswb m5, m2
7256 packuswb m1, m4
7257 packuswb m0, m6
7258 packuswb m7, m8
7259 vextracti128 xm2, m5, 1
7260 vextracti128 xm4, m1, 1
7261 vextracti128 xm6, m0, 1
7262 vextracti128 xm8, m7, 1
7263 movq [r2], xm5
7264 movq [r2 + r3], xm2
7265 movhps [r2 + r3 * 2], xm5
7266 movhps [r2 + r6], xm2
7267 lea r8, [r2 + r3 * 4]
7268 movq [r8], xm1
7269 movq [r8 + r3], xm4
7270 movhps [r8 + r3 * 2], xm1
7271 movhps [r8 + r6], xm4
7272 lea r8, [r8 + r3 * 4]
7273 movq [r8], xm0
7274 movq [r8 + r3], xm6
7275 movhps [r8 + r3 * 2], xm0
7276 movhps [r8 + r6], xm6
7277 lea r8, [r8 + r3 * 4]
7278 movq [r8], xm7
7279 movq [r8 + r3], xm8
7280 movhps [r8 + r3 * 2], xm7
7281 movhps [r8 + r6], xm8
7282
7283 sub r7, r10
7284 lea r0, [r7 - 16]
7285 lea r2, [r8 + r3 * 4 - 16]
7286 dec r9d
7287 jnz .loopH
7288 RET
7289 %endif
7290
7291 %macro FILTER_VER_LUMA_AVX2_32xN 2
7292 INIT_YMM avx2
7293 %if ARCH_X86_64 == 1
7294 cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
7295 mov r4d, r4m
7296 shl r4d, 7
7297
7298 %ifdef PIC
7299 lea r5, [tab_LumaCoeffVer_32]
7300 add r5, r4
7301 %else
7302 lea r5, [tab_LumaCoeffVer_32 + r4]
7303 %endif
7304
7305 lea r4, [r1 * 3]
7306 sub r0, r4
7307 lea r6, [r3 * 3]
7308 lea r11, [r1 * 4]
7309 mova m14, [pw_512]
7310 mov r9d, %2 / 16
7311 .loopH:
7312 mov r10d, %1 / 16
7313 .loopW:
7314 PROCESS_LUMA_AVX2_W16_16R
7315 add r2, 16
7316 add r0, 16
7317 dec r10d
7318 jnz .loopW
7319 sub r7, r11
7320 lea r0, [r7 - 16]
7321 lea r2, [r8 + r3 * 4 - 16]
7322 dec r9d
7323 jnz .loopH
7324 RET
7325 %endif
7326 %endmacro
7327
7328 FILTER_VER_LUMA_AVX2_32xN 32, 32
7329 FILTER_VER_LUMA_AVX2_32xN 32, 64
7330
7331 INIT_YMM avx2
7332 %if ARCH_X86_64 == 1
7333 cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
7334 mov r4d, r4m
7335 shl r4d, 7
7336
7337 %ifdef PIC
7338 lea r5, [tab_LumaCoeffVer_32]
7339 add r5, r4
7340 %else
7341 lea r5, [tab_LumaCoeffVer_32 + r4]
7342 %endif
7343
7344 lea r4, [r1 * 3]
7345 sub r0, r4
7346 lea r6, [r3 * 3]
7347 mova m14, [pw_512]
7348 mov r9d, 2
7349 .loopW:
7350 PROCESS_LUMA_AVX2_W16_16R
7351 add r2, 16
7352 add r0, 16
7353 dec r9d
7354 jnz .loopW
7355 RET
7356 %endif
7357
7358 INIT_YMM avx2
7359 %if ARCH_X86_64 == 1
7360 cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
7361 mov r4d, r4m
7362 shl r4d, 7
7363
7364 %ifdef PIC
7365 lea r5, [tab_LumaCoeffVer_32]
7366 add r5, r4
7367 %else
7368 lea r5, [tab_LumaCoeffVer_32 + r4]
7369 %endif
7370
7371 lea r4, [r1 * 3]
7372 sub r0, r4
7373 lea r6, [r3 * 3]
7374 mova m14, [pw_512]
7375 mov r9d, 2
7376 .loopW:
7377 PROCESS_LUMA_AVX2_W16_16R
7378 add r2, 16
7379 add r0, 16
7380 dec r9d
7381 jnz .loopW
7382 lea r9, [r1 * 4]
7383 sub r7, r9
7384 lea r0, [r7 - 16]
7385 lea r2, [r8 + r3 * 4 - 16]
7386 mov r9d, 2
7387 .loop:
7388 PROCESS_LUMA_AVX2_W16_8R
7389 add r2, 16
7390 add r0, 16
7391 dec r9d
7392 jnz .loop
7393 RET
7394 %endif
7395
7396 INIT_YMM avx2
7397 %if ARCH_X86_64 == 1
7398 cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
7399 mov r4d, r4m
7400 shl r4d, 7
7401
7402 %ifdef PIC
7403 lea r5, [tab_LumaCoeffVer_32]
7404 add r5, r4
7405 %else
7406 lea r5, [tab_LumaCoeffVer_32 + r4]
7407 %endif
7408
7409 lea r4, [r1 * 3]
7410 sub r0, r4
7411 lea r6, [r3 * 3]
7412 mova m14, [pw_512]
7413 mov r9d, 2
7414 .loopW:
7415 PROCESS_LUMA_AVX2_W16_8R
7416 add r2, 16
7417 add r0, 16
7418 dec r9d
7419 jnz .loopW
7420 RET
7421 %endif
7422
7423 INIT_YMM avx2
7424 %if ARCH_X86_64 == 1
7425 cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
7426 mov r4d, r4m
7427 shl r4d, 7
7428
7429 %ifdef PIC
7430 lea r5, [tab_LumaCoeffVer_32]
7431 add r5, r4
7432 %else
7433 lea r5, [tab_LumaCoeffVer_32 + r4]
7434 %endif
7435
7436 lea r4, [r1 * 3]
7437 sub r0, r4
7438 lea r6, [r3 * 3]
7439 lea r11, [r1 * 4]
7440 mova m14, [pw_512]
7441 mov r9d, 4
7442 .loopH:
7443 mov r10d, 3
7444 .loopW:
7445 PROCESS_LUMA_AVX2_W16_16R
7446 add r2, 16
7447 add r0, 16
7448 dec r10d
7449 jnz .loopW
7450 sub r7, r11
7451 lea r0, [r7 - 32]
7452 lea r2, [r8 + r3 * 4 - 32]
7453 dec r9d
7454 jnz .loopH
7455 RET
7456 %endif
7457
7458 %macro FILTER_VER_LUMA_AVX2_64xN 2
7459 INIT_YMM avx2
7460 %if ARCH_X86_64 == 1
7461 cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
7462 mov r4d, r4m
7463 shl r4d, 7
7464
7465 %ifdef PIC
7466 lea r5, [tab_LumaCoeffVer_32]
7467 add r5, r4
7468 %else
7469 lea r5, [tab_LumaCoeffVer_32 + r4]
7470 %endif
7471
7472 lea r4, [r1 * 3]
7473 sub r0, r4
7474 lea r6, [r3 * 3]
7475 lea r11, [r1 * 4]
7476 mova m14, [pw_512]
7477 mov r9d, %2 / 16
7478 .loopH:
7479 mov r10d, %1 / 16
7480 .loopW:
7481 PROCESS_LUMA_AVX2_W16_16R
7482 add r2, 16
7483 add r0, 16
7484 dec r10d
7485 jnz .loopW
7486 sub r7, r11
7487 lea r0, [r7 - 48]
7488 lea r2, [r8 + r3 * 4 - 48]
7489 dec r9d
7490 jnz .loopH
7491 RET
7492 %endif
7493 %endmacro
7494
7495 FILTER_VER_LUMA_AVX2_64xN 64, 32
7496 FILTER_VER_LUMA_AVX2_64xN 64, 48
7497 FILTER_VER_LUMA_AVX2_64xN 64, 64
7498
7499 INIT_YMM avx2
7500 %if ARCH_X86_64 == 1
7501 cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
7502 mov r4d, r4m
7503 shl r4d, 7
7504
7505 %ifdef PIC
7506 lea r5, [tab_LumaCoeffVer_32]
7507 add r5, r4
7508 %else
7509 lea r5, [tab_LumaCoeffVer_32 + r4]
7510 %endif
7511
7512 lea r4, [r1 * 3]
7513 sub r0, r4
7514 lea r6, [r3 * 3]
7515 mova m14, [pw_512]
7516 mov r9d, 4
7517 .loopW:
7518 PROCESS_LUMA_AVX2_W16_16R
7519 add r2, 16
7520 add r0, 16
7521 dec r9d
7522 jnz .loopW
7523 RET
7524 %endif
7525
7526 ;-------------------------------------------------------------------------------------------------------------
7527 ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7528 ;-------------------------------------------------------------------------------------------------------------
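; The %3 parameter selects the output form: 'pp' rounds the word sums with pw_512
; ((x + 32) >> 6, see the note above) and packs them to pixels, while 'ps' keeps the
; 16-bit intermediates, doubles the destination stride and subtracts pw_2000 (8192)
; as the signed offset of the intermediate format.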
7529 %macro FILTER_VER_LUMA 3
7530 INIT_XMM sse4
7531 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
7532 lea r5, [3 * r1]
7533 sub r0, r5
7534 shl r4d, 6
7535 %ifidn %3,ps
7536 add r3d, r3d
7537 %endif
7538
7539 %ifdef PIC
7540 lea r5, [tab_LumaCoeffVer]
7541 lea r6, [r5 + r4]
7542 %else
7543 lea r6, [tab_LumaCoeffVer + r4]
7544 %endif
7545
7546 %ifidn %3,pp
7547 mova m3, [pw_512]
7548 %else
7549 mova m3, [pw_2000]
7550 %endif
7551 mov dword [rsp], %2/4
7552
7553 .loopH:
7554 mov r4d, (%1/8)
7555 .loopW:
7556 PROCESS_LUMA_W8_4R
7557 %ifidn %3,pp
7558 pmulhrsw m7, m3
7559 pmulhrsw m6, m3
7560 pmulhrsw m5, m3
7561 pmulhrsw m4, m3
7562
7563 packuswb m7, m6
7564 packuswb m5, m4
7565
7566 movlps [r2], m7
7567 movhps [r2 + r3], m7
7568 lea r5, [r2 + 2 * r3]
7569 movlps [r5], m5
7570 movhps [r5 + r3], m5
7571 %else
7572 psubw m7, m3
7573 psubw m6, m3
7574 psubw m5, m3
7575 psubw m4, m3
7576
7577 movu [r2], m7
7578 movu [r2 + r3], m6
7579 lea r5, [r2 + 2 * r3]
7580 movu [r5], m5
7581 movu [r5 + r3], m4
7582 %endif
7583
7584 lea r5, [8 * r1 - 8]
7585 sub r0, r5
7586 %ifidn %3,pp
7587 add r2, 8
7588 %else
7589 add r2, 16
7590 %endif
7591 dec r4d
7592 jnz .loopW
7593
7594 lea r0, [r0 + 4 * r1 - %1]
7595 %ifidn %3,pp
7596 lea r2, [r2 + 4 * r3 - %1]
7597 %else
7598 lea r2, [r2 + 4 * r3 - 2 * %1]
7599 %endif
7600
7601 dec dword [rsp]
7602 jnz .loopH
7603
7604 RET
7605 %endmacro
7606
7607 FILTER_VER_LUMA 16, 4, pp
7608 FILTER_VER_LUMA 16, 8, pp
7609 FILTER_VER_LUMA 16, 12, pp
7610 FILTER_VER_LUMA 16, 16, pp
7611 FILTER_VER_LUMA 16, 32, pp
7612 FILTER_VER_LUMA 16, 64, pp
7613 FILTER_VER_LUMA 24, 32, pp
7614 FILTER_VER_LUMA 32, 8, pp
7615 FILTER_VER_LUMA 32, 16, pp
7616 FILTER_VER_LUMA 32, 24, pp
7617 FILTER_VER_LUMA 32, 32, pp
7618 FILTER_VER_LUMA 32, 64, pp
7619 FILTER_VER_LUMA 48, 64, pp
7620 FILTER_VER_LUMA 64, 16, pp
7621 FILTER_VER_LUMA 64, 32, pp
7622 FILTER_VER_LUMA 64, 48, pp
7623 FILTER_VER_LUMA 64, 64, pp
7624
7625 FILTER_VER_LUMA 16, 4, ps
7626 FILTER_VER_LUMA 16, 8, ps
7627 FILTER_VER_LUMA 16, 12, ps
7628 FILTER_VER_LUMA 16, 16, ps
7629 FILTER_VER_LUMA 16, 32, ps
7630 FILTER_VER_LUMA 16, 64, ps
7631 FILTER_VER_LUMA 24, 32, ps
7632 FILTER_VER_LUMA 32, 8, ps
7633 FILTER_VER_LUMA 32, 16, ps
7634 FILTER_VER_LUMA 32, 24, ps
7635 FILTER_VER_LUMA 32, 32, ps
7636 FILTER_VER_LUMA 32, 64, ps
7637 FILTER_VER_LUMA 48, 64, ps
7638 FILTER_VER_LUMA 64, 16, ps
7639 FILTER_VER_LUMA 64, 32, ps
7640 FILTER_VER_LUMA 64, 48, ps
7641 FILTER_VER_LUMA 64, 64, ps
7642
7643 %macro PROCESS_LUMA_SP_W4_4R 0
7644 movq m0, [r0]
7645 movq m1, [r0 + r1]
7646 punpcklwd m0, m1 ;m0=[0 1]
7647 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
7648
7649 lea r0, [r0 + 2 * r1]
7650 movq m4, [r0]
7651 punpcklwd m1, m4 ;m1=[1 2]
7652 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
7653
7654 movq m5, [r0 + r1]
7655 punpcklwd m4, m5 ;m4=[2 3]
7656 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
7657 pmaddwd m4, [r6 + 1 * 16]
7658 paddd m0, m4 ;m0=[0+1+2+3] Row1
7659
7660 lea r0, [r0 + 2 * r1]
7661 movq m4, [r0]
7662 punpcklwd m5, m4 ;m5=[3 4]
7663 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
7664 pmaddwd m5, [r6 + 1 * 16]
7665 paddd m1, m5 ;m1 = [1+2+3+4] Row2
7666
7667 movq m5, [r0 + r1]
7668 punpcklwd m4, m5 ;m4=[4 5]
7669 pmaddwd m6, m4, [r6 + 1 * 16]
7670 paddd m2, m6 ;m2=[2+3+4+5] Row3
7671 pmaddwd m4, [r6 + 2 * 16]
7672 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
7673
7674 lea r0, [r0 + 2 * r1]
7675 movq m4, [r0]
7676 punpcklwd m5, m4 ;m5=[5 6]
7677 pmaddwd m6, m5, [r6 + 1 * 16]
7678 paddd m3, m6 ;m3=[3+4+5+6] Row4
7679 pmaddwd m5, [r6 + 2 * 16]
7680 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
7681
7682 movq m5, [r0 + r1]
7683 punpcklwd m4, m5 ;m4=[6 7]
7684 pmaddwd m6, m4, [r6 + 2 * 16]
7685 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
7686 pmaddwd m4, [r6 + 3 * 16]
7687 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
7688
7689 lea r0, [r0 + 2 * r1]
7690 movq m4, [r0]
7691 punpcklwd m5, m4 ;m5=[7 8]
7692 pmaddwd m6, m5, [r6 + 2 * 16]
7693 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
7694 pmaddwd m5, [r6 + 3 * 16]
7695 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
7696
7697 movq m5, [r0 + r1]
7698 punpcklwd m4, m5 ;m4=[8 9]
7699 pmaddwd m4, [r6 + 3 * 16]
7700 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
7701
7702 movq m4, [r0 + 2 * r1]
7703 punpcklwd m5, m4 ;m5=[9 10]
7704 pmaddwd m5, [r6 + 3 * 16]
7705 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
7706 %endmacro
7707
7708 ;--------------------------------------------------------------------------------------------------------------
7709 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7710 ;--------------------------------------------------------------------------------------------------------------
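; The sp path takes 16-bit intermediates (already carrying the -8192 offset of the
; ps output) and produces pixels. tab_c_526336 = 8192 * 64 + 2048, so adding it and
; shifting right by 12 cancels that offset scaled by the coefficient sum (64) and
; applies the final rounded 12-bit downshift before packssdw/packuswb.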
7711 %macro FILTER_VER_LUMA_SP 2
7712 INIT_XMM sse4
7713 cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
7714
7715 add r1d, r1d
7716 lea r5, [r1 + 2 * r1]
7717 sub r0, r5
7718 shl r4d, 6
7719
7720 %ifdef PIC
7721 lea r5, [tab_LumaCoeffV]
7722 lea r6, [r5 + r4]
7723 %else
7724 lea r6, [tab_LumaCoeffV + r4]
7725 %endif
7726
7727 mova m7, [tab_c_526336]
7728
7729 mov dword [rsp], %2/4
7730 .loopH:
7731 mov r4d, (%1/4)
7732 .loopW:
7733 PROCESS_LUMA_SP_W4_4R
7734
7735 paddd m0, m7
7736 paddd m1, m7
7737 paddd m2, m7
7738 paddd m3, m7
7739
7740 psrad m0, 12
7741 psrad m1, 12
7742 psrad m2, 12
7743 psrad m3, 12
7744
7745 packssdw m0, m1
7746 packssdw m2, m3
7747
7748 packuswb m0, m2
7749
7750 movd [r2], m0
7751 pextrd [r2 + r3], m0, 1
7752 lea r5, [r2 + 2 * r3]
7753 pextrd [r5], m0, 2
7754 pextrd [r5 + r3], m0, 3
7755
7756 lea r5, [8 * r1 - 2 * 4]
7757 sub r0, r5
7758 add r2, 4
7759
7760 dec r4d
7761 jnz .loopW
7762
7763 lea r0, [r0 + 4 * r1 - 2 * %1]
7764 lea r2, [r2 + 4 * r3 - %1]
7765
7766 dec dword [rsp]
7767 jnz .loopH
7768
7769 RET
7770 %endmacro
7771
7772 ;--------------------------------------------------------------------------------------------------------------
7773 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7774 ;--------------------------------------------------------------------------------------------------------------
7775 FILTER_VER_LUMA_SP 4, 4
7776 FILTER_VER_LUMA_SP 8, 8
7777 FILTER_VER_LUMA_SP 8, 4
7778 FILTER_VER_LUMA_SP 4, 8
7779 FILTER_VER_LUMA_SP 16, 16
7780 FILTER_VER_LUMA_SP 16, 8
7781 FILTER_VER_LUMA_SP 8, 16
7782 FILTER_VER_LUMA_SP 16, 12
7783 FILTER_VER_LUMA_SP 12, 16
7784 FILTER_VER_LUMA_SP 16, 4
7785 FILTER_VER_LUMA_SP 4, 16
7786 FILTER_VER_LUMA_SP 32, 32
7787 FILTER_VER_LUMA_SP 32, 16
7788 FILTER_VER_LUMA_SP 16, 32
7789 FILTER_VER_LUMA_SP 32, 24
7790 FILTER_VER_LUMA_SP 24, 32
7791 FILTER_VER_LUMA_SP 32, 8
7792 FILTER_VER_LUMA_SP 8, 32
7793 FILTER_VER_LUMA_SP 64, 64
7794 FILTER_VER_LUMA_SP 64, 32
7795 FILTER_VER_LUMA_SP 32, 64
7796 FILTER_VER_LUMA_SP 64, 48
7797 FILTER_VER_LUMA_SP 48, 64
7798 FILTER_VER_LUMA_SP 64, 16
7799 FILTER_VER_LUMA_SP 16, 64
7800
7801 ; TODO: combining U and V in one pass would give better performance, but it needs more registers
7802 ; TODO: using two code paths (height aligned to 4 vs. otherwise) may improve performance by about 10%, but the code becomes more complex, so it is disabled
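; chroma_p2s converts an 8-bit chroma block to the 16-bit intermediate layout
; (rows written FENC_STRIDE / 2 words apart). Assuming tab_c_64_n64 holds the byte
; pairs {64, -64}, interleaving each source byte with 128 (pb_128) and multiplying
; with pmaddubsw yields roughly dst = (src << 6) - 8192, i.e. the same offset
; format as the ps filters above.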
7803 INIT_XMM ssse3
7804 cglobal chroma_p2s, 3, 7, 4
7805
7806 ; load width and height
7807 mov r3d, r3m
7808 mov r4d, r4m
7809
7810 ; load constant
7811 mova m2, [pb_128]
7812 mova m3, [tab_c_64_n64]
7813
7814 .loopH:
7815
7816 xor r5d, r5d
7817 .loopW:
7818 lea r6, [r0 + r5]
7819
7820 movh m0, [r6]
7821 punpcklbw m0, m2
7822 pmaddubsw m0, m3
7823
7824 movh m1, [r6 + r1]
7825 punpcklbw m1, m2
7826 pmaddubsw m1, m3
7827
7828 add r5d, 8
7829 cmp r5d, r3d
7830 lea r6, [r2 + r5 * 2]
7831 jg .width4
7832 movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7833 movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7834 je .nextH
7835 jmp .loopW
7836
7837 .width4:
7838 test r3d, 4
7839 jz .width2
7840 test r3d, 2
7841 movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7842 movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7843 lea r6, [r6 + 8]
7844 pshufd m0, m0, 2
7845 pshufd m1, m1, 2
7846 jz .nextH
7847
7848 .width2:
7849 movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7850 movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7851
7852 .nextH:
7853 lea r0, [r0 + r1 * 2]
7854 add r2, FENC_STRIDE / 2 * 4
7855
7856 sub r4d, 2
7857 jnz .loopH
7858
7859 RET
7860
7861 %macro PROCESS_CHROMA_SP_W4_4R 0
7862 movq m0, [r0]
7863 movq m1, [r0 + r1]
7864 punpcklwd m0, m1 ;m0=[0 1]
7865 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
7866
7867 lea r0, [r0 + 2 * r1]
7868 movq m4, [r0]
7869 punpcklwd m1, m4 ;m1=[1 2]
7870 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
7871
7872 movq m5, [r0 + r1]
7873 punpcklwd m4, m5 ;m4=[2 3]
7874 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
7875 pmaddwd m4, [r6 + 1 * 16]
7876 paddd m0, m4 ;m0=[0+1+2+3] Row1 done
7877
7878 lea r0, [r0 + 2 * r1]
7879 movq m4, [r0]
7880 punpcklwd m5, m4 ;m5=[3 4]
7881 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
7882 pmaddwd m5, [r6 + 1 * 16]
7883 paddd m1, m5 ;m1 = [1+2+3+4] Row2
7884
7885 movq m5, [r0 + r1]
7886 punpcklwd m4, m5 ;m4=[4 5]
7887 pmaddwd m4, [r6 + 1 * 16]
7888 paddd m2, m4 ;m2=[2+3+4+5] Row3
7889
7890 movq m4, [r0 + 2 * r1]
7891 punpcklwd m5, m4 ;m5=[5 6]
7892 pmaddwd m5, [r6 + 1 * 16]
7893 paddd m3, m5 ;m3=[3+4+5+6] Row4
7894 %endmacro
7895
7896 ;--------------------------------------------------------------------------------------------------------------
7897 ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7898 ;--------------------------------------------------------------------------------------------------------------
7899 %macro FILTER_VER_CHROMA_SP 2
7900 INIT_XMM sse4
7901 cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
7902
7903 add r1d, r1d
7904 sub r0, r1
7905 shl r4d, 5
7906
7907 %ifdef PIC
7908 lea r5, [tab_ChromaCoeffV]
7909 lea r6, [r5 + r4]
7910 %else
7911 lea r6, [tab_ChromaCoeffV + r4]
7912 %endif
7913
7914 mova m6, [tab_c_526336]
7915
7916 mov dword [rsp], %2/4
7917
7918 .loopH:
7919 mov r4d, (%1/4)
7920 .loopW:
7921 PROCESS_CHROMA_SP_W4_4R
7922
7923 paddd m0, m6
7924 paddd m1, m6
7925 paddd m2, m6
7926 paddd m3, m6
7927
7928 psrad m0, 12
7929 psrad m1, 12
7930 psrad m2, 12
7931 psrad m3, 12
7932
7933 packssdw m0, m1
7934 packssdw m2, m3
7935
7936 packuswb m0, m2
7937
7938 movd [r2], m0
7939 pextrd [r2 + r3], m0, 1
7940 lea r5, [r2 + 2 * r3]
7941 pextrd [r5], m0, 2
7942 pextrd [r5 + r3], m0, 3
7943
7944 lea r5, [4 * r1 - 2 * 4]
7945 sub r0, r5
7946 add r2, 4
7947
7948 dec r4d
7949 jnz .loopW
7950
7951 lea r0, [r0 + 4 * r1 - 2 * %1]
7952 lea r2, [r2 + 4 * r3 - %1]
7953
7954 dec dword [rsp]
7955 jnz .loopH
7956
7957 RET
7958 %endmacro
7959
7960 FILTER_VER_CHROMA_SP 4, 4
7961 FILTER_VER_CHROMA_SP 4, 8
7962 FILTER_VER_CHROMA_SP 16, 16
7963 FILTER_VER_CHROMA_SP 16, 8
7964 FILTER_VER_CHROMA_SP 16, 12
7965 FILTER_VER_CHROMA_SP 12, 16
7966 FILTER_VER_CHROMA_SP 16, 4
7967 FILTER_VER_CHROMA_SP 4, 16
7968 FILTER_VER_CHROMA_SP 32, 32
7969 FILTER_VER_CHROMA_SP 32, 16
7970 FILTER_VER_CHROMA_SP 16, 32
7971 FILTER_VER_CHROMA_SP 32, 24
7972 FILTER_VER_CHROMA_SP 24, 32
7973 FILTER_VER_CHROMA_SP 32, 8
7974
7975 FILTER_VER_CHROMA_SP 16, 24
7976 FILTER_VER_CHROMA_SP 16, 64
7977 FILTER_VER_CHROMA_SP 12, 32
7978 FILTER_VER_CHROMA_SP 4, 32
7979 FILTER_VER_CHROMA_SP 32, 64
7980 FILTER_VER_CHROMA_SP 32, 48
7981 FILTER_VER_CHROMA_SP 24, 64
7982
7983 FILTER_VER_CHROMA_SP 64, 64
7984 FILTER_VER_CHROMA_SP 64, 32
7985 FILTER_VER_CHROMA_SP 64, 48
7986 FILTER_VER_CHROMA_SP 48, 64
7987 FILTER_VER_CHROMA_SP 64, 16
7988
7989
7990 %macro PROCESS_CHROMA_SP_W2_4R 1
7991 movd m0, [r0]
7992 movd m1, [r0 + r1]
7993 punpcklwd m0, m1 ;m0=[0 1]
7994
7995 lea r0, [r0 + 2 * r1]
7996 movd m2, [r0]
7997 punpcklwd m1, m2 ;m1=[1 2]
7998 punpcklqdq m0, m1 ;m0=[0 1 1 2]
7999 pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
8000
8001 movd m1, [r0 + r1]
8002 punpcklwd m2, m1 ;m2=[2 3]
8003
8004 lea r0, [r0 + 2 * r1]
8005 movd m3, [r0]
8006                     punpcklwd   m1, m3                     ;m1=[3 4]
8007 punpcklqdq m2, m1 ;m2=[2 3 3 4]
8008
8009 pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
8010 pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
8011 paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
8012
8013 movd m1, [r0 + r1]
8014 punpcklwd m3, m1 ;m3=[4 5]
8015
8016 movd m4, [r0 + 2 * r1]
8017 punpcklwd m1, m4 ;m1=[5 6]
8018                     punpcklqdq  m3, m1                     ;m3=[4 5 5 6]
8019 pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
8020 paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
8021 %endmacro
8022
8023 ;-------------------------------------------------------------------------------------------------------------------
8024 ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8025 ;-------------------------------------------------------------------------------------------------------------------
8026 %macro FILTER_VER_CHROMA_SP_W2_4R 2
8027 INIT_XMM sse4
8028 cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
8029
8030 add r1d, r1d
8031 sub r0, r1
8032 shl r4d, 5
8033
8034 %ifdef PIC
8035 lea r5, [tab_ChromaCoeffV]
8036 lea r5, [r5 + r4]
8037 %else
8038 lea r5, [tab_ChromaCoeffV + r4]
8039 %endif
8040
8041 mova m5, [tab_c_526336]
8042
8043 mov r4d, (%2/4)
8044
8045 .loopH:
8046 PROCESS_CHROMA_SP_W2_4R r5
8047
8048 paddd m0, m5
8049 paddd m2, m5
8050
8051 psrad m0, 12
8052 psrad m2, 12
8053
8054 packssdw m0, m2
8055 packuswb m0, m0
8056
8057 pextrw [r2], m0, 0
8058 pextrw [r2 + r3], m0, 1
8059 lea r2, [r2 + 2 * r3]
8060 pextrw [r2], m0, 2
8061 pextrw [r2 + r3], m0, 3
8062
8063 lea r2, [r2 + 2 * r3]
8064
8065 dec r4d
8066 jnz .loopH
8067
8068 RET
8069 %endmacro
8070
8071 FILTER_VER_CHROMA_SP_W2_4R 2, 4
8072 FILTER_VER_CHROMA_SP_W2_4R 2, 8
8073
8074 FILTER_VER_CHROMA_SP_W2_4R 2, 16
8075
8076 ;--------------------------------------------------------------------------------------------------------------
8077 ; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8078 ;--------------------------------------------------------------------------------------------------------------
8079 INIT_XMM sse4
8080 cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
8081
8082 add r1d, r1d
8083 sub r0, r1
8084 shl r4d, 5
8085
8086 %ifdef PIC
8087 lea r5, [tab_ChromaCoeffV]
8088 lea r5, [r5 + r4]
8089 %else
8090 lea r5, [tab_ChromaCoeffV + r4]
8091 %endif
8092
8093 mova m4, [tab_c_526336]
8094
8095 movq m0, [r0]
8096 movq m1, [r0 + r1]
8097 punpcklwd m0, m1 ;m0=[0 1]
8098 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
8099
8100 lea r0, [r0 + 2 * r1]
8101 movq m2, [r0]
8102 punpcklwd m1, m2 ;m1=[1 2]
8103 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
8104
8105 movq m3, [r0 + r1]
8106                     punpcklwd   m2, m3                     ;m2=[2 3]
8107 pmaddwd m2, [r5 + 1 * 16]
8108 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
8109 paddd m0, m4
8110 psrad m0, 12
8111
8112 movq m2, [r0 + 2 * r1]
8113                     punpcklwd   m3, m2                     ;m3=[3 4]
8114 pmaddwd m3, [r5 + 1 * 16]
8115 paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
8116 paddd m1, m4
8117 psrad m1, 12
8118
8119 packssdw m0, m1
8120 packuswb m0, m0
8121
8122 movd [r2], m0
8123 pextrd [r2 + r3], m0, 1
8124
8125 RET
8126
8127 ;-------------------------------------------------------------------------------------------------------------------
8128 ; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8129 ;-------------------------------------------------------------------------------------------------------------------
8130 %macro FILTER_VER_CHROMA_SP_W6_H4 2
8131 INIT_XMM sse4
8132 cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
8133
8134 add r1d, r1d
8135 sub r0, r1
8136 shl r4d, 5
8137
8138 %ifdef PIC
8139 lea r5, [tab_ChromaCoeffV]
8140 lea r6, [r5 + r4]
8141 %else
8142 lea r6, [tab_ChromaCoeffV + r4]
8143 %endif
8144
8145 mova m6, [tab_c_526336]
8146
8147 mov r4d, %2/4
8148
8149 .loopH:
8150 PROCESS_CHROMA_SP_W4_4R
8151
8152 paddd m0, m6
8153 paddd m1, m6
8154 paddd m2, m6
8155 paddd m3, m6
8156
8157 psrad m0, 12
8158 psrad m1, 12
8159 psrad m2, 12
8160 psrad m3, 12
8161
8162 packssdw m0, m1
8163 packssdw m2, m3
8164
8165 packuswb m0, m2
8166
8167 movd [r2], m0
8168 pextrd [r2 + r3], m0, 1
8169 lea r5, [r2 + 2 * r3]
8170 pextrd [r5], m0, 2
8171 pextrd [r5 + r3], m0, 3
8172
8173 lea r5, [4 * r1 - 2 * 4]
8174 sub r0, r5
8175 add r2, 4
8176
8177 PROCESS_CHROMA_SP_W2_4R r6
8178
8179 paddd m0, m6
8180 paddd m2, m6
8181
8182 psrad m0, 12
8183 psrad m2, 12
8184
8185 packssdw m0, m2
8186 packuswb m0, m0
8187
8188 pextrw [r2], m0, 0
8189 pextrw [r2 + r3], m0, 1
8190 lea r2, [r2 + 2 * r3]
8191 pextrw [r2], m0, 2
8192 pextrw [r2 + r3], m0, 3
8193
8194 sub r0, 2 * 4
8195 lea r2, [r2 + 2 * r3 - 4]
8196
8197 dec r4d
8198 jnz .loopH
8199
8200 RET
8201 %endmacro
8202
8203 FILTER_VER_CHROMA_SP_W6_H4 6, 8
8204
8205 FILTER_VER_CHROMA_SP_W6_H4 6, 16
8206
8207 %macro PROCESS_CHROMA_SP_W8_2R 0
8208 movu m1, [r0]
8209 movu m3, [r0 + r1]
8210 punpcklwd m0, m1, m3
8211 pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
8212 punpckhwd m1, m3
8213 pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
8214
8215 movu m4, [r0 + 2 * r1]
8216 punpcklwd m2, m3, m4
8217 pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
8218 punpckhwd m3, m4
8219 pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
8220
8221 lea r0, [r0 + 2 * r1]
8222 movu m5, [r0 + r1]
8223 punpcklwd m6, m4, m5
8224 pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
8225 paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
8226 punpckhwd m4, m5
8227                     pmaddwd     m4, [r5 + 1 * 16]          ;m4 = [2h+3h] Row1h
8228 paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
8229
8230 movu m4, [r0 + 2 * r1]
8231 punpcklwd m6, m5, m4
8232 pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
8233 paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
8234 punpckhwd m5, m4
8235                     pmaddwd     m5, [r5 + 1 * 16]          ;m5 = [3h+4h] Row2h
8236 paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
8237 %endmacro
8238
8239 ;--------------------------------------------------------------------------------------------------------------
8240 ; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8241 ;--------------------------------------------------------------------------------------------------------------
8242 %macro FILTER_VER_CHROMA_SP_W8_H2 2
8243 INIT_XMM sse2
8244 cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
8245
8246 add r1d, r1d
8247 sub r0, r1
8248 shl r4d, 5
8249
8250 %ifdef PIC
8251 lea r5, [tab_ChromaCoeffV]
8252 lea r5, [r5 + r4]
8253 %else
8254 lea r5, [tab_ChromaCoeffV + r4]
8255 %endif
8256
8257 mova m7, [tab_c_526336]
8258
8259 mov r4d, %2/2
8260 .loopH:
8261 PROCESS_CHROMA_SP_W8_2R
8262
8263 paddd m0, m7
8264 paddd m1, m7
8265 paddd m2, m7
8266 paddd m3, m7
8267
8268 psrad m0, 12
8269 psrad m1, 12
8270 psrad m2, 12
8271 psrad m3, 12
8272
8273 packssdw m0, m1
8274 packssdw m2, m3
8275
8276 packuswb m0, m2
8277
8278 movlps [r2], m0
8279 movhps [r2 + r3], m0
8280
8281 lea r2, [r2 + 2 * r3]
8282
8283 dec r4d
8284 jnz .loopH
8285
8286 RET
8287 %endmacro
8288
8289 FILTER_VER_CHROMA_SP_W8_H2 8, 2
8290 FILTER_VER_CHROMA_SP_W8_H2 8, 4
8291 FILTER_VER_CHROMA_SP_W8_H2 8, 6
8292 FILTER_VER_CHROMA_SP_W8_H2 8, 8
8293 FILTER_VER_CHROMA_SP_W8_H2 8, 16
8294 FILTER_VER_CHROMA_SP_W8_H2 8, 32
8295
8296 FILTER_VER_CHROMA_SP_W8_H2 8, 12
8297 FILTER_VER_CHROMA_SP_W8_H2 8, 64
8298
8299
8300 ;-----------------------------------------------------------------------------------------------------------------------------
8301 ; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8302 ;-----------------------------------------------------------------------------------------------------------------------------
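; Horizontal ps layout: tab_Tm gathers the four taps for neighbouring output pixels
; into one register, pmaddubsw + phaddw form the 4-tap sums, and psubw pw_2000
; applies the -8192 offset of the 16-bit intermediate. When isRowExt (r5m) is
; non-zero, the source pointer is moved up one row and 3 extra rows are produced,
; providing the margin needed by a following vertical pass.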
8303 %macro FILTER_HORIZ_CHROMA_2xN 2
8304 INIT_XMM sse4
8305 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
8306 %define coef2 m3
8307 %define Tm0 m2
8308 %define t1 m1
8309 %define t0 m0
8310
8311 dec srcq
8312 mov r4d, r4m
8313 add dststrided, dststrided
8314
8315 %ifdef PIC
8316 lea r6, [tab_ChromaCoeff]
8317 movd coef2, [r6 + r4 * 4]
8318 %else
8319 movd coef2, [tab_ChromaCoeff + r4 * 4]
8320 %endif
8321
8322 pshufd coef2, coef2, 0
8323 mova t1, [pw_2000]
8324 mova Tm0, [tab_Tm]
8325
8326 mov r4d, %2
8327 cmp r5m, byte 0
8328 je .loopH
8329 sub srcq, srcstrideq
8330 add r4d, 3
8331
8332 .loopH:
8333 movh t0, [srcq]
8334 pshufb t0, t0, Tm0
8335 pmaddubsw t0, coef2
8336 phaddw t0, t0
8337 psubw t0, t1
8338 movd [dstq], t0
8339
8340 lea srcq, [srcq + srcstrideq]
8341 lea dstq, [dstq + dststrideq]
8342
8343 dec r4d
8344 jnz .loopH
8345
8346 RET
8347 %endmacro
8348
8349 FILTER_HORIZ_CHROMA_2xN 2, 4
8350 FILTER_HORIZ_CHROMA_2xN 2, 8
8351
8352 FILTER_HORIZ_CHROMA_2xN 2, 16
8353
8354 ;-----------------------------------------------------------------------------------------------------------------------------
8355 ; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8356 ;-----------------------------------------------------------------------------------------------------------------------------
8357 %macro FILTER_HORIZ_CHROMA_4xN 2
8358 INIT_XMM sse4
8359 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
8360 %define coef2 m3
8361 %define Tm0 m2
8362 %define t1 m1
8363 %define t0 m0
8364
8365 dec srcq
8366 mov r4d, r4m
8367 add dststrided, dststrided
8368
8369 %ifdef PIC
8370 lea r6, [tab_ChromaCoeff]
8371 movd coef2, [r6 + r4 * 4]
8372 %else
8373 movd coef2, [tab_ChromaCoeff + r4 * 4]
8374 %endif
8375
8376 pshufd coef2, coef2, 0
8377 mova t1, [pw_2000]
8378 mova Tm0, [tab_Tm]
8379
8380 mov r4d, %2
8381 cmp r5m, byte 0
8382 je .loopH
8383 sub srcq, srcstrideq
8384 add r4d, 3
8385
8386 .loopH:
8387 movh t0, [srcq]
8388 pshufb t0, t0, Tm0
8389 pmaddubsw t0, coef2
8390 phaddw t0, t0
8391 psubw t0, t1
8392 movlps [dstq], t0
8393
8394 lea srcq, [srcq + srcstrideq]
8395 lea dstq, [dstq + dststrideq]
8396
8397 dec r4d
8398 jnz .loopH
8399 RET
8400 %endmacro
8401
8402 FILTER_HORIZ_CHROMA_4xN 4, 2
8403 FILTER_HORIZ_CHROMA_4xN 4, 4
8404 FILTER_HORIZ_CHROMA_4xN 4, 8
8405 FILTER_HORIZ_CHROMA_4xN 4, 16
8406
8407 FILTER_HORIZ_CHROMA_4xN 4, 32
8408
8409 %macro PROCESS_CHROMA_W6 3
8410 movu %1, [srcq]
8411 pshufb %2, %1, Tm0
8412 pmaddubsw %2, coef2
8413 pshufb %1, %1, Tm1
8414 pmaddubsw %1, coef2
8415 phaddw %2, %1
8416 psubw %2, %3
8417 movh [dstq], %2
8418 pshufd %2, %2, 2
8419 movd [dstq + 8], %2
8420 %endmacro
8421
8422 %macro PROCESS_CHROMA_W12 3
8423 movu %1, [srcq]
8424 pshufb %2, %1, Tm0
8425 pmaddubsw %2, coef2
8426 pshufb %1, %1, Tm1
8427 pmaddubsw %1, coef2
8428 phaddw %2, %1
8429 psubw %2, %3
8430 movu [dstq], %2
8431 movu %1, [srcq + 8]
8432 pshufb %1, %1, Tm0
8433 pmaddubsw %1, coef2
8434 phaddw %1, %1
8435 psubw %1, %3
8436 movh [dstq + 16], %1
8437 %endmacro
8438
8439 ;-----------------------------------------------------------------------------------------------------------------------------
8440 ; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8441 ;-----------------------------------------------------------------------------------------------------------------------------
8442 %macro FILTER_HORIZ_CHROMA 2
8443 INIT_XMM sse4
8444 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
8445 %define coef2 m5
8446 %define Tm0 m4
8447 %define Tm1 m3
8448 %define t2 m2
8449 %define t1 m1
8450 %define t0 m0
8451
8452 dec srcq
8453 mov r4d, r4m
8454 add dststrided, dststrided
8455
8456 %ifdef PIC
8457 lea r6, [tab_ChromaCoeff]
8458 movd coef2, [r6 + r4 * 4]
8459 %else
8460 movd coef2, [tab_ChromaCoeff + r4 * 4]
8461 %endif
8462
8463 pshufd coef2, coef2, 0
8464 mova t2, [pw_2000]
8465 mova Tm0, [tab_Tm]
8466 mova Tm1, [tab_Tm + 16]
8467
8468 mov r4d, %2
8469 cmp r5m, byte 0
8470 je .loopH
8471 sub srcq, srcstrideq
8472 add r4d, 3
8473
8474 .loopH:
8475 PROCESS_CHROMA_W%1 t0, t1, t2
8476 add srcq, srcstrideq
8477 add dstq, dststrideq
8478
8479 dec r4d
8480 jnz .loopH
8481
8482 RET
8483 %endmacro
8484
8485 FILTER_HORIZ_CHROMA 6, 8
8486 FILTER_HORIZ_CHROMA 12, 16
8487
8488 FILTER_HORIZ_CHROMA 6, 16
8489 FILTER_HORIZ_CHROMA 12, 32
8490
8491 %macro PROCESS_CHROMA_W8 3
8492 movu %1, [srcq]
8493 pshufb %2, %1, Tm0
8494 pmaddubsw %2, coef2
8495 pshufb %1, %1, Tm1
8496 pmaddubsw %1, coef2
8497 phaddw %2, %1
8498 psubw %2, %3
8499 movu [dstq], %2
8500 %endmacro
8501
8502 ;-----------------------------------------------------------------------------------------------------------------------------
8503 ; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8504 ;-----------------------------------------------------------------------------------------------------------------------------
8505 %macro FILTER_HORIZ_CHROMA_8xN 2
8506 INIT_XMM sse4
8507 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
8508 %define coef2 m5
8509 %define Tm0 m4
8510 %define Tm1 m3
8511 %define t2 m2
8512 %define t1 m1
8513 %define t0 m0
8514
8515 dec srcq
8516 mov r4d, r4m
8517 add dststrided, dststrided
8518
8519 %ifdef PIC
8520 lea r6, [tab_ChromaCoeff]
8521 movd coef2, [r6 + r4 * 4]
8522 %else
8523 movd coef2, [tab_ChromaCoeff + r4 * 4]
8524 %endif
8525
8526 pshufd coef2, coef2, 0
8527 mova t2, [pw_2000]
8528 mova Tm0, [tab_Tm]
8529 mova Tm1, [tab_Tm + 16]
8530
8531 mov r4d, %2
8532 cmp r5m, byte 0
8533 je .loopH
8534 sub srcq, srcstrideq
8535 add r4d, 3
8536
8537 .loopH:
8538 PROCESS_CHROMA_W8 t0, t1, t2
8539 add srcq, srcstrideq
8540 add dstq, dststrideq
8541
8542 dec r4d
8543 jnz .loopH
8544
8545 RET
8546 %endmacro
8547
8548 FILTER_HORIZ_CHROMA_8xN 8, 2
8549 FILTER_HORIZ_CHROMA_8xN 8, 4
8550 FILTER_HORIZ_CHROMA_8xN 8, 6
8551 FILTER_HORIZ_CHROMA_8xN 8, 8
8552 FILTER_HORIZ_CHROMA_8xN 8, 16
8553 FILTER_HORIZ_CHROMA_8xN 8, 32
8554
8555 FILTER_HORIZ_CHROMA_8xN 8, 12
8556 FILTER_HORIZ_CHROMA_8xN 8, 64
8557
8558 %macro PROCESS_CHROMA_W16 4
8559 movu %1, [srcq]
8560 pshufb %2, %1, Tm0
8561 pmaddubsw %2, coef2
8562 pshufb %1, %1, Tm1
8563 pmaddubsw %1, coef2
8564 phaddw %2, %1
8565 movu %1, [srcq + 8]
8566 pshufb %4, %1, Tm0
8567 pmaddubsw %4, coef2
8568 pshufb %1, %1, Tm1
8569 pmaddubsw %1, coef2
8570 phaddw %4, %1
8571 psubw %2, %3
8572 psubw %4, %3
8573 movu [dstq], %2
8574 movu [dstq + 16], %4
8575 %endmacro
8576
8577 %macro PROCESS_CHROMA_W24 4
8578 movu %1, [srcq]
8579 pshufb %2, %1, Tm0
8580 pmaddubsw %2, coef2
8581 pshufb %1, %1, Tm1
8582 pmaddubsw %1, coef2
8583 phaddw %2, %1
8584 movu %1, [srcq + 8]
8585 pshufb %4, %1, Tm0
8586 pmaddubsw %4, coef2
8587 pshufb %1, %1, Tm1
8588 pmaddubsw %1, coef2
8589 phaddw %4, %1
8590 psubw %2, %3
8591 psubw %4, %3
8592 movu [dstq], %2
8593 movu [dstq + 16], %4
8594 movu %1, [srcq + 16]
8595 pshufb %2, %1, Tm0
8596 pmaddubsw %2, coef2
8597 pshufb %1, %1, Tm1
8598 pmaddubsw %1, coef2
8599 phaddw %2, %1
8600 psubw %2, %3
8601 movu [dstq + 32], %2
8602 %endmacro
8603
8604 %macro PROCESS_CHROMA_W32 4
8605 movu %1, [srcq]
8606 pshufb %2, %1, Tm0
8607 pmaddubsw %2, coef2
8608 pshufb %1, %1, Tm1
8609 pmaddubsw %1, coef2
8610 phaddw %2, %1
8611 movu %1, [srcq + 8]
8612 pshufb %4, %1, Tm0
8613 pmaddubsw %4, coef2
8614 pshufb %1, %1, Tm1
8615 pmaddubsw %1, coef2
8616 phaddw %4, %1
8617 psubw %2, %3
8618 psubw %4, %3
8619 movu [dstq], %2
8620 movu [dstq + 16], %4
8621 movu %1, [srcq + 16]
8622 pshufb %2, %1, Tm0
8623 pmaddubsw %2, coef2
8624 pshufb %1, %1, Tm1
8625 pmaddubsw %1, coef2
8626 phaddw %2, %1
8627 movu %1, [srcq + 24]
8628 pshufb %4, %1, Tm0
8629 pmaddubsw %4, coef2
8630 pshufb %1, %1, Tm1
8631 pmaddubsw %1, coef2
8632 phaddw %4, %1
8633 psubw %2, %3
8634 psubw %4, %3
8635 movu [dstq + 32], %2
8636 movu [dstq + 48], %4
8637 %endmacro
8638
8639 %macro PROCESS_CHROMA_W16o 5
8640 movu %1, [srcq + %5]
8641 pshufb %2, %1, Tm0
8642 pmaddubsw %2, coef2
8643 pshufb %1, %1, Tm1
8644 pmaddubsw %1, coef2
8645 phaddw %2, %1
8646 movu %1, [srcq + %5 + 8]
8647 pshufb %4, %1, Tm0
8648 pmaddubsw %4, coef2
8649 pshufb %1, %1, Tm1
8650 pmaddubsw %1, coef2
8651 phaddw %4, %1
8652 psubw %2, %3
8653 psubw %4, %3
8654 movu [dstq + %5 * 2], %2
8655 movu [dstq + %5 * 2 + 16], %4
8656 %endmacro
8657
8658 %macro PROCESS_CHROMA_W48 4
8659 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8660 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8661 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
8662 %endmacro
8663
8664 %macro PROCESS_CHROMA_W64 4
8665 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8666 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8667 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
8668 PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
8669 %endmacro
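; Widths of 48 and 64 are built by repeating the 16-pixel helper PROCESS_CHROMA_W16o
; at source offsets 0/16/32(/48); the destination offset is doubled because the ps
; output is 16 bits per pixel.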
8670
8671 ;------------------------------------------------------------------------------------------------------------------------------
8672 ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
8673 ;------------------------------------------------------------------------------------------------------------------------------
8674 %macro FILTER_HORIZ_CHROMA_WxN 2
8675 INIT_XMM sse4
8676 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
8677 %define coef2 m6
8678 %define Tm0 m5
8679 %define Tm1 m4
8680 %define t3 m3
8681 %define t2 m2
8682 %define t1 m1
8683 %define t0 m0
8684
8685 dec srcq
8686 mov r4d, r4m
8687 add dststrided, dststrided
8688
8689 %ifdef PIC
8690 lea r6, [tab_ChromaCoeff]
8691 movd coef2, [r6 + r4 * 4]
8692 %else
8693 movd coef2, [tab_ChromaCoeff + r4 * 4]
8694 %endif
8695
8696 pshufd coef2, coef2, 0
8697 mova t2, [pw_2000]
8698 mova Tm0, [tab_Tm]
8699 mova Tm1, [tab_Tm + 16]
8700
8701 mov r4d, %2
8702 cmp r5m, byte 0
8703 je .loopH
8704 sub srcq, srcstrideq
8705 add r4d, 3
8706
8707 .loopH:
8708 PROCESS_CHROMA_W%1 t0, t1, t2, t3
8709 add srcq, srcstrideq
8710 add dstq, dststrideq
8711
8712 dec r4d
8713 jnz .loopH
8714
8715 RET
8716 %endmacro
8717
8718 FILTER_HORIZ_CHROMA_WxN 16, 4
8719 FILTER_HORIZ_CHROMA_WxN 16, 8
8720 FILTER_HORIZ_CHROMA_WxN 16, 12
8721 FILTER_HORIZ_CHROMA_WxN 16, 16
8722 FILTER_HORIZ_CHROMA_WxN 16, 32
8723 FILTER_HORIZ_CHROMA_WxN 24, 32
8724 FILTER_HORIZ_CHROMA_WxN 32, 8
8725 FILTER_HORIZ_CHROMA_WxN 32, 16
8726 FILTER_HORIZ_CHROMA_WxN 32, 24
8727 FILTER_HORIZ_CHROMA_WxN 32, 32
8728
8729 FILTER_HORIZ_CHROMA_WxN 16, 24
8730 FILTER_HORIZ_CHROMA_WxN 16, 64
8731 FILTER_HORIZ_CHROMA_WxN 24, 64
8732 FILTER_HORIZ_CHROMA_WxN 32, 48
8733 FILTER_HORIZ_CHROMA_WxN 32, 64
8734
8735 FILTER_HORIZ_CHROMA_WxN 64, 64
8736 FILTER_HORIZ_CHROMA_WxN 64, 32
8737 FILTER_HORIZ_CHROMA_WxN 64, 48
8738 FILTER_HORIZ_CHROMA_WxN 48, 64
8739 FILTER_HORIZ_CHROMA_WxN 64, 16
8740
8741
8742 ;---------------------------------------------------------------------------------------------------------------
8743 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8744 ;---------------------------------------------------------------------------------------------------------------
8745 %macro FILTER_V_PS_W16n 2
8746 INIT_XMM sse4
8747 cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
8748
8749 mov r4d, r4m
8750 sub r0, r1
8751 add r3d, r3d
8752
8753 %ifdef PIC
8754 lea r5, [tab_ChromaCoeff]
8755 movd m0, [r5 + r4 * 4]
8756 %else
8757 movd m0, [tab_ChromaCoeff + r4 * 4]
8758 %endif
8759
8760 pshufb m1, m0, [tab_Vm]
8761 pshufb m0, [tab_Vm + 16]
8762 mov r4d, %2/2
8763
8764 .loop:
8765
8766 mov r6d, %1/16
8767
8768 .loopW:
8769
8770 movu m2, [r0]
8771 movu m3, [r0 + r1]
8772
8773 punpcklbw m4, m2, m3
8774 punpckhbw m2, m3
8775
8776 pmaddubsw m4, m1
8777 pmaddubsw m2, m1
8778
8779 lea r5, [r0 + 2 * r1]
8780 movu m5, [r5]
8781 movu m7, [r5 + r1]
8782
8783 punpcklbw m6, m5, m7
8784 pmaddubsw m6, m0
8785 paddw m4, m6
8786
8787 punpckhbw m6, m5, m7
8788 pmaddubsw m6, m0
8789 paddw m2, m6
8790
8791 mova m6, [pw_2000]
8792
8793 psubw m4, m6
8794 psubw m2, m6
8795
8796 movu [r2], m4
8797 movu [r2 + 16], m2
8798
8799 punpcklbw m4, m3, m5
8800 punpckhbw m3, m5
8801
8802 pmaddubsw m4, m1
8803 pmaddubsw m3, m1
8804
8805 movu m5, [r5 + 2 * r1]
8806
8807 punpcklbw m2, m7, m5
8808 punpckhbw m7, m5
8809
8810 pmaddubsw m2, m0
8811 pmaddubsw m7, m0
8812
8813 paddw m4, m2
8814 paddw m3, m7
8815
8816 psubw m4, m6
8817 psubw m3, m6
8818
8819 movu [r2 + r3], m4
8820 movu [r2 + r3 + 16], m3
8821
8822 add r0, 16
8823 add r2, 32
8824 dec r6d
8825 jnz .loopW
8826
8827 lea r0, [r0 + r1 * 2 - %1]
8828 lea r2, [r2 + r3 * 2 - %1 * 2]
8829
8830 dec r4d
8831 jnz .loop
8832 RET
8833 %endmacro
8834
8835 FILTER_V_PS_W16n 64, 64
8836 FILTER_V_PS_W16n 64, 32
8837 FILTER_V_PS_W16n 64, 48
8838 FILTER_V_PS_W16n 48, 64
8839 FILTER_V_PS_W16n 64, 16
8840
8841
8842 ;------------------------------------------------------------------------------------------------------------
8843 ; void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8844 ;------------------------------------------------------------------------------------------------------------
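; 2x4 special case, written out without a loop: row pairs are interleaved with
; punpcklbw, multiplied with pmaddubsw and reduced with phaddw, so each phaddw
; result carries two output rows of two samples each (the pw_2000 offset is
; removed afterwards).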
8845 INIT_XMM sse4
8846 cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
8847
8848 mov r4d, r4m
8849 sub r0, r1
8850 add r3d, r3d
8851
8852 %ifdef PIC
8853 lea r5, [tab_ChromaCoeff]
8854 movd m0, [r5 + r4 * 4]
8855 %else
8856 movd m0, [tab_ChromaCoeff + r4 * 4]
8857 %endif
8858
8859 pshufb m0, [tab_Cm]
8860
8861 lea r5, [3 * r1]
8862
8863 movd m2, [r0]
8864 movd m3, [r0 + r1]
8865 movd m4, [r0 + 2 * r1]
8866 movd m5, [r0 + r5]
8867
8868 punpcklbw m2, m3
8869 punpcklbw m6, m4, m5
8870 punpcklbw m2, m6
8871
8872 pmaddubsw m2, m0
8873
8874 lea r0, [r0 + 4 * r1]
8875 movd m6, [r0]
8876
8877 punpcklbw m3, m4
8878 punpcklbw m1, m5, m6
8879 punpcklbw m3, m1
8880
8881 pmaddubsw m3, m0
8882 phaddw m2, m3
8883
8884 mova m1, [pw_2000]
8885
8886 psubw m2, m1
8887
8888 movd [r2], m2
8889 pextrd [r2 + r3], m2, 2
8890
8891 movd m2, [r0 + r1]
8892
8893 punpcklbw m4, m5
8894 punpcklbw m3, m6, m2
8895 punpcklbw m4, m3
8896
8897 pmaddubsw m4, m0
8898
8899 movd m3, [r0 + 2 * r1]
8900
8901 punpcklbw m5, m6
8902 punpcklbw m2, m3
8903 punpcklbw m5, m2
8904
8905 pmaddubsw m5, m0
8906 phaddw m4, m5
8907 psubw m4, m1
8908
8909 lea r2, [r2 + 2 * r3]
8910 movd [r2], m4
8911 pextrd [r2 + r3], m4, 2
8912
8913 RET
8914
8915 ;-------------------------------------------------------------------------------------------------------------
8916 ; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8917 ;-------------------------------------------------------------------------------------------------------------
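; Same scheme as the 2x4 kernel above, wrapped in a loop that produces four
; output rows per iteration (%2/4 iterations in total).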
8918 %macro FILTER_V_PS_W2 2
8919 INIT_XMM sse4
8920 cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
8921
8922 mov r4d, r4m
8923 sub r0, r1
8924 add r3d, r3d
8925
8926 %ifdef PIC
8927 lea r5, [tab_ChromaCoeff]
8928 movd m0, [r5 + r4 * 4]
8929 %else
8930 movd m0, [tab_ChromaCoeff + r4 * 4]
8931 %endif
8932
8933 pshufb m0, [tab_Cm]
8934
8935 mova m1, [pw_2000]
8936 lea r5, [3 * r1]
8937 mov r4d, %2/4
8938 .loop:
8939 movd m2, [r0]
8940 movd m3, [r0 + r1]
8941 movd m4, [r0 + 2 * r1]
8942 movd m5, [r0 + r5]
8943
8944 punpcklbw m2, m3
8945 punpcklbw m6, m4, m5
8946 punpcklbw m2, m6
8947
8948 pmaddubsw m2, m0
8949
8950 lea r0, [r0 + 4 * r1]
8951 movd m6, [r0]
8952
8953 punpcklbw m3, m4
8954 punpcklbw m7, m5, m6
8955 punpcklbw m3, m7
8956
8957 pmaddubsw m3, m0
8958
8959 phaddw m2, m3
8960 psubw m2, m1
8961
8962
8963 movd [r2], m2
8964 pshufd m2, m2, 2
8965 movd [r2 + r3], m2
8966
8967 movd m2, [r0 + r1]
8968
8969 punpcklbw m4, m5
8970 punpcklbw m3, m6, m2
8971 punpcklbw m4, m3
8972
8973 pmaddubsw m4, m0
8974
8975 movd m3, [r0 + 2 * r1]
8976
8977 punpcklbw m5, m6
8978 punpcklbw m2, m3
8979 punpcklbw m5, m2
8980
8981 pmaddubsw m5, m0
8982
8983 phaddw m4, m5
8984
8985 psubw m4, m1
8986
8987 lea r2, [r2 + 2 * r3]
8988 movd [r2], m4
8989 pshufd m4, m4, 2
8990 movd [r2 + r3], m4
8991
8992 lea r2, [r2 + 2 * r3]
8993
8994 dec r4d
8995 jnz .loop
8996
8997 RET
8998 %endmacro
8999
9000 FILTER_V_PS_W2 2, 8
9001
9002 FILTER_V_PS_W2 2, 16
9003
9004 ;-----------------------------------------------------------------------------------------------------------------
9005 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9006 ;-----------------------------------------------------------------------------------------------------------------
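; "ss" means 16-bit input and 16-bit output: products are accumulated in 32
; bits, shifted right by 6 (the filter precision) and packed back to int16
; with saturation (packssdw).  coeffIdx is scaled by 32 because each entry of
; tab_ChromaCoeffV occupies two 16-byte rows of interleaved coefficient pairs.
; Illustrative C-style sketch (assumption only; src has already been moved one
; row above the first output row):
;     sum    = c[0]*src[x] + c[1]*src[x + stride]
;            + c[2]*src[x + 2*stride] + c[3]*src[x + 3*stride];   // int32
;     dst[x] = (int16_t)clamp(sum >> 6, -32768, 32767);   // saturate to int16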
9007 %macro FILTER_VER_CHROMA_SS 2
9008 INIT_XMM sse2
9009 cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6, 0-gprsize
9010
9011 add r1d, r1d
9012 add r3d, r3d
9013 sub r0, r1
9014 shl r4d, 5
9015
9016 %ifdef PIC
9017 lea r5, [tab_ChromaCoeffV]
9018 lea r6, [r5 + r4]
9019 %else
9020 lea r6, [tab_ChromaCoeffV + r4]
9021 %endif
9022
9023 mov dword [rsp], %2/4
9024
9025 .loopH:
9026 mov r4d, (%1/4)
9027 .loopW:
9028 PROCESS_CHROMA_SP_W4_4R
9029
9030 psrad m0, 6
9031 psrad m1, 6
9032 psrad m2, 6
9033 psrad m3, 6
9034
9035 packssdw m0, m1
9036 packssdw m2, m3
9037
9038 movlps [r2], m0
9039 movhps [r2 + r3], m0
9040 lea r5, [r2 + 2 * r3]
9041 movlps [r5], m2
9042 movhps [r5 + r3], m2
9043
9044 lea r5, [4 * r1 - 2 * 4]
9045 sub r0, r5
9046 add r2, 2 * 4
9047
9048 dec r4d
9049 jnz .loopW
9050
9051 lea r0, [r0 + 4 * r1 - 2 * %1]
9052 lea r2, [r2 + 4 * r3 - 2 * %1]
9053
9054 dec dword [rsp]
9055 jnz .loopH
9056
9057 RET
9058 %endmacro
9059
9060 FILTER_VER_CHROMA_SS 4, 4
9061 FILTER_VER_CHROMA_SS 4, 8
9062 FILTER_VER_CHROMA_SS 16, 16
9063 FILTER_VER_CHROMA_SS 16, 8
9064 FILTER_VER_CHROMA_SS 16, 12
9065 FILTER_VER_CHROMA_SS 12, 16
9066 FILTER_VER_CHROMA_SS 16, 4
9067 FILTER_VER_CHROMA_SS 4, 16
9068 FILTER_VER_CHROMA_SS 32, 32
9069 FILTER_VER_CHROMA_SS 32, 16
9070 FILTER_VER_CHROMA_SS 16, 32
9071 FILTER_VER_CHROMA_SS 32, 24
9072 FILTER_VER_CHROMA_SS 24, 32
9073 FILTER_VER_CHROMA_SS 32, 8
9074
9075 FILTER_VER_CHROMA_SS 16, 24
9076 FILTER_VER_CHROMA_SS 12, 32
9077 FILTER_VER_CHROMA_SS 4, 32
9078 FILTER_VER_CHROMA_SS 32, 64
9079 FILTER_VER_CHROMA_SS 16, 64
9080 FILTER_VER_CHROMA_SS 32, 48
9081 FILTER_VER_CHROMA_SS 24, 64
9082
9083 FILTER_VER_CHROMA_SS 64, 64
9084 FILTER_VER_CHROMA_SS 64, 32
9085 FILTER_VER_CHROMA_SS 64, 48
9086 FILTER_VER_CHROMA_SS 48, 64
9087 FILTER_VER_CHROMA_SS 64, 16
9088
9089
9090 ;---------------------------------------------------------------------------------------------------------------------
9091 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9092 ;---------------------------------------------------------------------------------------------------------------------
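; Width-2 "ss" variant: PROCESS_CHROMA_SP_W2_4R (defined elsewhere in this
; file) computes four rows of two samples, which are then scattered to the
; four destination rows with movd/pextrd.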
9093 %macro FILTER_VER_CHROMA_SS_W2_4R 2
9094 INIT_XMM sse4
9095 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
9096
9097 add r1d, r1d
9098 add r3d, r3d
9099 sub r0, r1
9100 shl r4d, 5
9101
9102 %ifdef PIC
9103 lea r5, [tab_ChromaCoeffV]
9104 lea r5, [r5 + r4]
9105 %else
9106 lea r5, [tab_ChromaCoeffV + r4]
9107 %endif
9108
9109 mov r4d, (%2/4)
9110
9111 .loopH:
9112 PROCESS_CHROMA_SP_W2_4R r5
9113
9114 psrad m0, 6
9115 psrad m2, 6
9116
9117 packssdw m0, m2
9118
9119 movd [r2], m0
9120 pextrd [r2 + r3], m0, 1
9121 lea r2, [r2 + 2 * r3]
9122 pextrd [r2], m0, 2
9123 pextrd [r2 + r3], m0, 3
9124
9125 lea r2, [r2 + 2 * r3]
9126
9127 dec r4d
9128 jnz .loopH
9129
9130 RET
9131 %endmacro
9132
9133 FILTER_VER_CHROMA_SS_W2_4R 2, 4
9134 FILTER_VER_CHROMA_SS_W2_4R 2, 8
9135
9136 FILTER_VER_CHROMA_SS_W2_4R 2, 16
9137
9138 ;---------------------------------------------------------------------------------------------------------------
9139 ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9140 ;---------------------------------------------------------------------------------------------------------------
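; 4x2 "ss" special case: both output rows are computed inline, using the same
; 32-bit accumulate, shift-by-6 and saturating-pack sequence as the generic
; macro above.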
9141 INIT_XMM sse2
9142 cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
9143
9144 add r1d, r1d
9145 add r3d, r3d
9146 sub r0, r1
9147 shl r4d, 5
9148
9149 %ifdef PIC
9150 lea r5, [tab_ChromaCoeffV]
9151 lea r5, [r5 + r4]
9152 %else
9153 lea r5, [tab_ChromaCoeffV + r4]
9154 %endif
9155
9156 movq m0, [r0]
9157 movq m1, [r0 + r1]
9158 punpcklwd m0, m1 ;m0=[0 1]
9159 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
9160
9161 lea r0, [r0 + 2 * r1]
9162 movq m2, [r0]
9163 punpcklwd m1, m2 ;m1=[1 2]
9164 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
9165
9166 movq m3, [r0 + r1]
9167 punpcklwd m2, m3 ;m2=[2 3]
9168 pmaddwd m2, [r5 + 1 * 16]
9169 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
9170 psrad m0, 6
9171
9172 movq m2, [r0 + 2 * r1]
9173 punpcklwd m3, m2 ;m3=[3 4]
9174 pmaddwd m3, [r5 + 1 * 16]
9175 paddd m1, m3 ;m1=[1+2+3+4] Row2 done
9176 psrad m1, 6
9177
9178 packssdw m0, m1
9179
9180 movlps [r2], m0
9181 movhps [r2 + r3], m0
9182
9183 RET
9184
9185 ;-------------------------------------------------------------------------------------------------------------------
9186 ; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9187 ;-------------------------------------------------------------------------------------------------------------------
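; Width 6 is handled as a 4-wide block (PROCESS_CHROMA_SP_W4_4R) followed by a
; 2-wide block (PROCESS_CHROMA_SP_W2_4R) over the same four rows.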
9188 %macro FILTER_VER_CHROMA_SS_W6_H4 2
9189 INIT_XMM sse4
9190 cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
9191
9192 add r1d, r1d
9193 add r3d, r3d
9194 sub r0, r1
9195 shl r4d, 5
9196
9197 %ifdef PIC
9198 lea r5, [tab_ChromaCoeffV]
9199 lea r6, [r5 + r4]
9200 %else
9201 lea r6, [tab_ChromaCoeffV + r4]
9202 %endif
9203
9204 mov r4d, %2/4
9205
9206 .loopH:
9207 PROCESS_CHROMA_SP_W4_4R
9208
9209 psrad m0, 6
9210 psrad m1, 6
9211 psrad m2, 6
9212 psrad m3, 6
9213
9214 packssdw m0, m1
9215 packssdw m2, m3
9216
9217 movlps [r2], m0
9218 movhps [r2 + r3], m0
9219 lea r5, [r2 + 2 * r3]
9220 movlps [r5], m2
9221 movhps [r5 + r3], m2
9222
9223 lea r5, [4 * r1 - 2 * 4]
9224 sub r0, r5
9225 add r2, 2 * 4
9226
9227 PROCESS_CHROMA_SP_W2_4R r6
9228
9229 psrad m0, 6
9230 psrad m2, 6
9231
9232 packssdw m0, m2
9233
9234 movd [r2], m0
9235 pextrd [r2 + r3], m0, 1
9236 lea r2, [r2 + 2 * r3]
9237 pextrd [r2], m0, 2
9238 pextrd [r2 + r3], m0, 3
9239
9240 sub r0, 2 * 4
9241 lea r2, [r2 + 2 * r3 - 2 * 4]
9242
9243 dec r4d
9244 jnz .loopH
9245
9246 RET
9247 %endmacro
9248
9249 FILTER_VER_CHROMA_SS_W6_H4 6, 8
9250
9251 FILTER_VER_CHROMA_SS_W6_H4 6, 16
9252
9253
9254 ;----------------------------------------------------------------------------------------------------------------
9255 ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9256 ;----------------------------------------------------------------------------------------------------------------
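; Width-8 "ss" variant: PROCESS_CHROMA_SP_W8_2R (defined elsewhere in this
; file) produces two full rows per iteration; the 32-bit sums are shifted
; right by 6, packed to int16 with saturation and stored.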
9257 %macro FILTER_VER_CHROMA_SS_W8_H2 2
9258 INIT_XMM sse2
9259 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
9260
9261 add r1d, r1d
9262 add r3d, r3d
9263 sub r0, r1
9264 shl r4d, 5
9265
9266 %ifdef PIC
9267 lea r5, [tab_ChromaCoeffV]
9268 lea r5, [r5 + r4]
9269 %else
9270 lea r5, [tab_ChromaCoeffV + r4]
9271 %endif
9272
9273 mov r4d, %2/2
9274 .loopH:
9275 PROCESS_CHROMA_SP_W8_2R
9276
9277 psrad m0, 6
9278 psrad m1, 6
9279 psrad m2, 6
9280 psrad m3, 6
9281
9282 packssdw m0, m1
9283 packssdw m2, m3
9284
9285 movu [r2], m0
9286 movu [r2 + r3], m2
9287
9288 lea r2, [r2 + 2 * r3]
9289
9290 dec r4d
9291 jnz .loopH
9292
9293 RET
9294 %endmacro
9295
9296 FILTER_VER_CHROMA_SS_W8_H2 8, 2
9297 FILTER_VER_CHROMA_SS_W8_H2 8, 4
9298 FILTER_VER_CHROMA_SS_W8_H2 8, 6
9299 FILTER_VER_CHROMA_SS_W8_H2 8, 8
9300 FILTER_VER_CHROMA_SS_W8_H2 8, 16
9301 FILTER_VER_CHROMA_SS_W8_H2 8, 32
9302
9303 FILTER_VER_CHROMA_SS_W8_H2 8, 12
9304 FILTER_VER_CHROMA_SS_W8_H2 8, 64
9305
9306 ;-----------------------------------------------------------------------------------------------------------------
9307 ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9308 ;-----------------------------------------------------------------------------------------------------------------
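; 8-tap luma "ss" filter.  coeffIdx is scaled by 64 because each entry of
; tab_LumaCoeffV occupies four 16-byte rows of interleaved coefficient pairs.
; The inner loop computes a 4x4 block: four 32-bit row accumulators each
; gather eight taps before being shifted right by 6 and packed to int16 with
; saturation.  Illustrative per-sample sketch (assumption only; src has
; already been moved three rows above the first output row):
;     sum = 0;  for (k = 0; k < 8; k++) sum += c[k] * src[x + k*stride];
;     dst[x] = (int16_t)clamp(sum >> 6, -32768, 32767);   // saturate to int16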
9309 %macro FILTER_VER_LUMA_SS 2
9310 INIT_XMM sse2
9311 cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7, 0-gprsize
9312
9313 add r1d, r1d
9314 add r3d, r3d
9315 lea r5, [3 * r1]
9316 sub r0, r5
9317 shl r4d, 6
9318
9319 %ifdef PIC
9320 lea r5, [tab_LumaCoeffV]
9321 lea r6, [r5 + r4]
9322 %else
9323 lea r6, [tab_LumaCoeffV + r4]
9324 %endif
9325
9326 mov dword [rsp], %2/4
9327 .loopH:
9328 mov r4d, (%1/4)
9329 .loopW:
9330 movq m0, [r0]
9331 movq m1, [r0 + r1]
9332 punpcklwd m0, m1 ;m0=[0 1]
9333 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
9334
9335 lea r0, [r0 + 2 * r1]
9336 movq m4, [r0]
9337 punpcklwd m1, m4 ;m1=[1 2]
9338 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
9339
9340 movq m5, [r0 + r1]
9341 punpcklwd m4, m5 ;m4=[2 3]
9342 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
9343 pmaddwd m4, [r6 + 1 * 16]
9344 paddd m0, m4 ;m0=[0+1+2+3] Row1
9345
9346 lea r0, [r0 + 2 * r1]
9347 movq m4, [r0]
9348 punpcklwd m5, m4 ;m5=[3 4]
9349 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
9350 pmaddwd m5, [r6 + 1 * 16]
9351 paddd m1, m5 ;m1 = [1+2+3+4] Row2
9352
9353 movq m5, [r0 + r1]
9354 punpcklwd m4, m5 ;m4=[4 5]
9355 pmaddwd m6, m4, [r6 + 1 * 16]
9356 paddd m2, m6 ;m2=[2+3+4+5] Row3
9357 pmaddwd m4, [r6 + 2 * 16]
9358 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
9359
9360 lea r0, [r0 + 2 * r1]
9361 movq m4, [r0]
9362 punpcklwd m5, m4 ;m5=[5 6]
9363 pmaddwd m6, m5, [r6 + 1 * 16]
9364 paddd m3, m6 ;m3=[3+4+5+6] Row4
9365 pmaddwd m5, [r6 + 2 * 16]
9366 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
9367
9368 movq m5, [r0 + r1]
9369 punpcklwd m4, m5 ;m4=[6 7]
9370 pmaddwd m6, m4, [r6 + 2 * 16]
9371 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
9372 pmaddwd m4, [r6 + 3 * 16]
9373 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
9374 psrad m0, 6
9375
9376 lea r0, [r0 + 2 * r1]
9377 movq m4, [r0]
9378 punpcklwd m5, m4 ;m5=[7 8]
9379 pmaddwd m6, m5, [r6 + 2 * 16]
9380 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
9381 pmaddwd m5, [r6 + 3 * 16]
9382 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
9383 psrad m1, 6
9384
9385 packssdw m0, m1
9386
9387 movlps [r2], m0
9388 movhps [r2 + r3], m0
9389
9390 movq m5, [r0 + r1]
9391 punpcklwd m4, m5 ;m4=[8 9]
9392 pmaddwd m4, [r6 + 3 * 16]
9393 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
9394 psrad m2, 6
9395
9396 movq m4, [r0 + 2 * r1]
9397 punpcklwd m5, m4 ;m5=[9 10]
9398 pmaddwd m5, [r6 + 3 * 16]
9399 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
9400 psrad m3, 6
9401
9402 packssdw m2, m3
9403
9404 movlps [r2 + 2 * r3], m2
9405 lea r5, [3 * r3]
9406 movhps [r2 + r5], m2
9407
9408 lea r5, [8 * r1 - 2 * 4]
9409 sub r0, r5
9410 add r2, 2 * 4
9411
9412 dec r4d
9413 jnz .loopW
9414
9415 lea r0, [r0 + 4 * r1 - 2 * %1]
9416 lea r2, [r2 + 4 * r3 - 2 * %1]
9417
9418 dec dword [rsp]
9419 jnz .loopH
9420
9421 RET
9422 %endmacro
9423
9424 FILTER_VER_LUMA_SS 4, 4
9425 FILTER_VER_LUMA_SS 8, 8
9426 FILTER_VER_LUMA_SS 8, 4
9427 FILTER_VER_LUMA_SS 4, 8
9428 FILTER_VER_LUMA_SS 16, 16
9429 FILTER_VER_LUMA_SS 16, 8
9430 FILTER_VER_LUMA_SS 8, 16
9431 FILTER_VER_LUMA_SS 16, 12
9432 FILTER_VER_LUMA_SS 12, 16
9433 FILTER_VER_LUMA_SS 16, 4
9434 FILTER_VER_LUMA_SS 4, 16
9435 FILTER_VER_LUMA_SS 32, 32
9436 FILTER_VER_LUMA_SS 32, 16
9437 FILTER_VER_LUMA_SS 16, 32
9438 FILTER_VER_LUMA_SS 32, 24
9439 FILTER_VER_LUMA_SS 24, 32
9440 FILTER_VER_LUMA_SS 32, 8
9441 FILTER_VER_LUMA_SS 8, 32
9442 FILTER_VER_LUMA_SS 64, 64
9443 FILTER_VER_LUMA_SS 64, 32
9444 FILTER_VER_LUMA_SS 32, 64
9445 FILTER_VER_LUMA_SS 64, 48
9446 FILTER_VER_LUMA_SS 48, 64
9447 FILTER_VER_LUMA_SS 64, 16
9448 FILTER_VER_LUMA_SS 16, 64