Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / ipfilter8.asm
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Min Chen <chenm003@163.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
6 ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7 ;*
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
12 ;*
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;*
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at license @ x265.com.
24 ;*****************************************************************************/
25
26 %include "x86inc.asm"
27 %include "x86util.asm"
28
SECTION_RODATA 32

; pshufb masks for the 4-tap filters: every group of four indices selects one
; sliding 4-byte window (pixel n .. n+3).  Row 0 covers windows 0-3, row 1
; windows 4-7 and row 2 windows 8-11.
tab_Tm:             db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                    db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
                    db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

; pshufb masks for the 8-tap filters: every group of eight indices selects one
; sliding 8-byte window.  Kept 32-byte aligned because the AVX2 code loads the
; first two rows with a single aligned 32-byte mova.
ALIGN 32
tab_Lm:             db  0,  1,  2,  3,  4,  5,  6,  7,  1,  2,  3,  4,  5,  6,  7,  8
                    db  2,  3,  4,  5,  6,  7,  8,  9,  3,  4,  5,  6,  7,  8,  9, 10
                    db  4,  5,  6,  7,  8,  9, 10, 11,  5,  6,  7,  8,  9, 10, 11, 12
                    db  6,  7,  8,  9, 10, 11, 12, 13,  7,  8,  9, 10, 11, 12, 13, 14

; Byte-pair broadcast masks: row 0 repeats bytes {0,1}, row 1 repeats bytes {2,3}.
tab_Vm:             times 8 db 0, 1
                    times 8 db 2, 3

; Reorders a 4-tap coefficient dword from c0,c1,c2,c3 to c0,c2,c1,c3 so it
; lines up with rows interleaved as (row0,row2,row1,row3) by the vertical code.
tab_Cm:             times 4 db 0, 2, 1, 3

; pmulhrsw by 512 computes (x * 512 + 0x4000) >> 15, i.e. (x + 32) >> 6.
tab_c_512:          times 8 dw 512

; Rounding constant added before the ">> 12" in FILTER_HV8_END.
; NOTE(review): the 8192*64 term presumably cancels the pw_2000 bias applied to
; the intermediate rows (bias * coefficient sum of 64) - confirm against pw_2000.
tab_c_526336:       times 4 dd 8192*64+2048

; 4-tap chroma coefficients, one 4-byte row per coeffIdx (0..7).
tab_ChromaCoeff:    db  0, 64,  0,  0
                    db -2, 58, 10, -2
                    db -4, 54, 16, -2
                    db -6, 46, 28, -4
                    db -4, 36, 36, -4
                    db -4, 28, 46, -6
                    db -2, 16, 54, -4
                    db -2, 10, 58, -2

; The same chroma taps widened to words and replicated as (c0,c1) / (c2,c3)
; pairs: two 16-byte rows (32 bytes) per coeffIdx.
tab_ChromaCoeffV:   times 4 dw  0, 64
                    times 4 dw  0,  0

                    times 4 dw -2, 58
                    times 4 dw 10, -2

                    times 4 dw -4, 54
                    times 4 dw 16, -2

                    times 4 dw -6, 46
                    times 4 dw 28, -4

                    times 4 dw -4, 36
                    times 4 dw 36, -4

                    times 4 dw -4, 28
                    times 4 dw 46, -6

                    times 4 dw -2, 16
                    times 4 dw 54, -4

                    times 4 dw -2, 10
                    times 4 dw 58, -2

; 8-tap luma coefficients, one 8-byte row per coeffIdx (0..3).
tab_LumaCoeff:      db  0,  0,   0, 64,  0,   0,  0,  0
                    db -1,  4, -10, 58, 17,  -5,  1,  0
                    db -1,  4, -11, 40, 40, -11,  4, -1
                    db  0,  1,  -5, 17, 58, -10,  4, -1

; Luma taps as replicated word pairs: four 16-byte rows (64 bytes) per coeffIdx,
; which is why the HV code scales the index with "shl r5, 6".
tab_LumaCoeffV:     times 4 dw   0,   0
                    times 4 dw   0,  64
                    times 4 dw   0,   0
                    times 4 dw   0,   0

                    times 4 dw  -1,   4
                    times 4 dw -10,  58
                    times 4 dw  17,  -5
                    times 4 dw   1,   0

                    times 4 dw  -1,   4
                    times 4 dw -11,  40
                    times 4 dw  40, -11
                    times 4 dw   4,  -1

                    times 4 dw   0,   1
                    times 4 dw  -5,  17
                    times 4 dw  58, -10
                    times 4 dw   4,  -1

; Luma taps as replicated byte pairs: four 16-byte rows (64 bytes) per coeffIdx.
tab_LumaCoeffVer:   times 8 db   0,   0
                    times 8 db   0,  64
                    times 8 db   0,   0
                    times 8 db   0,   0

                    times 8 db  -1,   4
                    times 8 db -10,  58
                    times 8 db  17,  -5
                    times 8 db   1,   0

                    times 8 db  -1,   4
                    times 8 db -11,  40
                    times 8 db  40, -11
                    times 8 db   4,  -1

                    times 8 db   0,   1
                    times 8 db  -5,  17
                    times 8 db  58, -10
                    times 8 db   4,  -1

tab_c_128:          times 16 db 0x80
tab_c_64_n64:       times 8 db 64, -64
128
129
130 SECTION .text
131
132 cextern idct4_shuf1
133 cextern pw_1
134 cextern pw_512
135 cextern pw_2000
136
;-----------------------------------------------------------------------------
; FILTER_H4_w2_2 t0, t1, c512
;   Filters two rows of a 2-pixel-wide block with the 4-tap filter.
;   In:      srcq/srcstrideq, dstq/dststrideq, Tm0 = tab_Tm row 0,
;            coef2 = 4 taps broadcast to every dword, %3 = vector of 512
;   Clobber: %1, %2, r4
;   Note:    reads 8 source bytes per row starting at src - 1.
;-----------------------------------------------------------------------------
%macro FILTER_H4_w2_2 3
    movh        %2, [srcq - 1]
    pshufb      %2, %2, Tm0                 ; row 0: 4-byte windows for pixels 0..3
    movh        %1, [srcq + srcstrideq - 1]
    pshufb      %1, %1, Tm0                 ; row 1: same windows
    punpcklqdq  %2, %1                      ; keep pixels 0,1 of row 0 | pixels 0,1 of row 1
    pmaddubsw   %2, coef2
    phaddw      %2, %2                      ; words: r0p0 r0p1 r1p0 r1p1 (repeated)
    pmulhrsw    %2, %3                      ; (x + 32) >> 6
    packuswb    %2, %2
    movd        r4d, %2                     ; only the low 32 bits are needed
    mov         [dstq], r4w                 ; row 0, two pixels
    shr         r4d, 16
    mov         [dstq + dststrideq], r4w    ; row 1, two pixels
%endmacro

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   %1 = block height (even).  Heights up to 8 are fully unrolled, taller
;   blocks use a counted loop; both process two rows per step.
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PP_2xN 1
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0             ; broadcast the 4 taps to every dword
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

%if %1 <= 8
%rep %1 / 2
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep
%else
    mov         r5d, %1 / 2                 ; row pairs left

.loop:
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .loop
%endif

    RET
%endmacro

IPFILTER_CHROMA_PP_2xN 4
IPFILTER_CHROMA_PP_2xN 8
IPFILTER_CHROMA_PP_2xN 16
251
;-----------------------------------------------------------------------------
; FILTER_H4_w4_2 t0, t1, c512
;   Filters two rows of a 4-pixel-wide block with the 4-tap filter.
;   In:      srcq/srcstrideq, dstq/dststrideq, Tm0 = tab_Tm row 0,
;            coef2 = 4 taps broadcast to every dword, %3 = vector of 512
;   Clobber: %1, %2
;   Note:    reads 8 source bytes per row starting at src - 1.
;-----------------------------------------------------------------------------
%macro FILTER_H4_w4_2 3
    movh        %2, [srcq - 1]
    pshufb      %2, %2, Tm0                 ; row 0: 4-byte windows for pixels 0..3
    pmaddubsw   %2, coef2
    movh        %1, [srcq + srcstrideq - 1]
    pshufb      %1, %1, Tm0                 ; row 1: same windows
    pmaddubsw   %1, coef2
    phaddw      %2, %1                      ; words: row 0 pixels 0..3 | row 1 pixels 0..3
    pmulhrsw    %2, %3                      ; (x + 32) >> 6
    packuswb    %2, %2
    movd        [dstq], %2                  ; row 0
    palignr     %2, %2, 4                   ; bring row 1 down to the low dword
    movd        [dstq + dststrideq], %2     ; row 1
%endmacro

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   %1 = block height (even).  Height 2 is a single step, heights up to 16 are
;   fully unrolled, taller blocks use a counted loop (two rows per step).
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PP_4xN 1
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0             ; broadcast the 4 taps to every dword
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

%if %1 == 2
    FILTER_H4_w4_2 t0, t1, t2
%elif %1 <= 16
%rep %1 / 2
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep
%else
    mov         r5d, %1 / 2                 ; row pairs left

.loop:
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .loop
%endif

    RET
%endmacro

IPFILTER_CHROMA_PP_4xN 2
IPFILTER_CHROMA_PP_4xN 4
IPFILTER_CHROMA_PP_4xN 8
IPFILTER_CHROMA_PP_4xN 16
IPFILTER_CHROMA_PP_4xN 32

425
426
;-----------------------------------------------------------------------------
; One-row 4-tap horizontal filter kernels, selected by width through
; FILTER_H4_w%1.  Shared contract:
;   In:      srcq, dstq, coef2 = 4 taps broadcast to every dword,
;            Tm0/Tm1 = tab_Tm rows 0/1, c512 = vector of 512
;   Clobber: the temporaries passed as arguments
;   Every kernel loads 16 source bytes starting at src - 1 (+ column offset).
; Widths 6, 8 and 12 take (t0, t1, c512); widths 16 and up take
; (t0, t1, c512, t3).  The "o" variants take an extra column offset.
;-----------------------------------------------------------------------------

; 8 pixels at column offset %4.  Args: t0, t1, c512, offset
%macro FILTER_H4_w8o 4
    movu        %1, [srcq + %4 - 1]
    pshufb      %2, %1, Tm0                 ; windows for pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                 ; windows for pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3                      ; (x + 32) >> 6
    packuswb    %2, %2
    movh        [dstq + %4], %2
%endmacro

; 16 pixels at column offset %5.  Args: t0, t1, c512, t3, offset
%macro FILTER_H4_w16o 5
    movu        %1, [srcq + %5 - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1                      ; pixels 0..7
    movu        %1, [srcq + %5 - 1 + 8]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1                      ; pixels 8..15
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq + %5], %2
%endmacro

; 6 pixels: computes 8, stores a dword plus a word.
%macro FILTER_H4_w6 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movd        [dstq], %2                  ; pixels 0..3
    pextrw      [dstq + 4], %2, 2           ; pixels 4..5
%endmacro

%macro FILTER_H4_w8 3
    FILTER_H4_w8o %1, %2, %3, 0
%endmacro

; 12 pixels: 8 from the first load, 4 from a second load at +8.
%macro FILTER_H4_w12 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3                      ; pixels 0..7 as words
    movu        %1, [srcq - 1 + 8]
    pshufb      %1, %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %1, %1
    pmulhrsw    %1, %3                      ; pixels 8..11 as words
    packuswb    %2, %1
    movh        [dstq], %2                  ; pixels 0..7
    pextrd      [dstq + 8], %2, 2           ; pixels 8..11
%endmacro

%macro FILTER_H4_w16 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
%endmacro

%macro FILTER_H4_w24 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w8o  %1, %2, %3, 16
%endmacro

%macro FILTER_H4_w32 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w16o %1, %2, %3, %4, 16
%endmacro

%macro FILTER_H4_w48 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w16o %1, %2, %3, %4, 16
    FILTER_H4_w16o %1, %2, %3, %4, 32
%endmacro

%macro FILTER_H4_w64 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w16o %1, %2, %3, %4, 16
    FILTER_H4_w16o %1, %2, %3, %4, 32
    FILTER_H4_w16o %1, %2, %3, %4, 48
%endmacro
583
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   %1 = width (selects the FILTER_H4_w%1 kernel taking three arguments)
;   %2 = height; one row is filtered per loop iteration
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
%define coef2       m5
%define Tm0         m4
%define Tm1         m3
%define t2          m2
%define t1          m1
%define t0          m0

    ; constants that do not depend on coeffIdx
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    ; the 4 taps for coeffIdx, broadcast to every dword
    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    mov         r5d, %2                     ; rows left (r5 is free again here)

.loop:
    FILTER_H4_w%1 t0, t1, t2
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r5d
    jnz         .loop

    RET
%endmacro

; width 6
IPFILTER_CHROMA  6,  8
IPFILTER_CHROMA  6, 16

; width 8
IPFILTER_CHROMA  8,  2
IPFILTER_CHROMA  8,  4
IPFILTER_CHROMA  8,  6
IPFILTER_CHROMA  8,  8
IPFILTER_CHROMA  8, 12
IPFILTER_CHROMA  8, 16
IPFILTER_CHROMA  8, 32
IPFILTER_CHROMA  8, 64

; width 12
IPFILTER_CHROMA 12, 16
IPFILTER_CHROMA 12, 32
638
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   Wide variant: %1 = width (selects the FILTER_H4_w%1 kernel taking four
;   arguments), %2 = height; one row is filtered per loop iteration.
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA_W 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
%define coef2       m6
%define Tm0         m5
%define Tm1         m4
%define t3          m3
%define t2          m2
%define t1          m1
%define t0          m0

    ; constants that do not depend on coeffIdx
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    ; the 4 taps for coeffIdx, broadcast to every dword
    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    mov         r5d, %2                     ; rows left (r5 is free again here)

.loop:
    FILTER_H4_w%1 t0, t1, t2, t3
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r5d
    jnz         .loop

    RET
%endmacro

; width 16
IPFILTER_CHROMA_W 16,  4
IPFILTER_CHROMA_W 16,  8
IPFILTER_CHROMA_W 16, 12
IPFILTER_CHROMA_W 16, 16
IPFILTER_CHROMA_W 16, 24
IPFILTER_CHROMA_W 16, 32
IPFILTER_CHROMA_W 16, 64

; width 24
IPFILTER_CHROMA_W 24, 32
IPFILTER_CHROMA_W 24, 64

; width 32
IPFILTER_CHROMA_W 32,  8
IPFILTER_CHROMA_W 32, 16
IPFILTER_CHROMA_W 32, 24
IPFILTER_CHROMA_W 32, 32
IPFILTER_CHROMA_W 32, 48
IPFILTER_CHROMA_W 32, 64

; width 48
IPFILTER_CHROMA_W 48, 64

; width 64
IPFILTER_CHROMA_W 64, 16
IPFILTER_CHROMA_W 64, 32
IPFILTER_CHROMA_W 64, 48
IPFILTER_CHROMA_W 64, 64

702
703
;-----------------------------------------------------------------------------
; FILTER_H8_W8 t0, t1, t2, t3, coef, c512, src [, dst]
;   8-tap horizontal filter for 8 pixels of one row.
;   coef = 8 taps repeated in both qwords; src = memory operand to load from.
;   With 7 arguments the unrounded 16-bit sums are left in t1 (%2) and c512
;   is not referenced.  With 8 arguments the sums are rounded with pmulhrsw
;   by c512, packed to bytes and the 8 pixels are stored to dst.
;   Clobber: t0..t3
;-----------------------------------------------------------------------------
%macro FILTER_H8_W8 7-8
    movu        %1, %7

    ; build the eight sliding 8-byte windows, two per register
    pshufb      %2, %1, [tab_Lm +  0]       ; pixels 0,1
    pshufb      %3, %1, [tab_Lm + 16]       ; pixels 2,3
    pshufb      %4, %1, [tab_Lm + 32]       ; pixels 4,5
    pshufb      %1, %1, [tab_Lm + 48]       ; pixels 6,7

    pmaddubsw   %2, %5
    pmaddubsw   %3, %5
    pmaddubsw   %4, %5
    pmaddubsw   %1, %5

    ; reduce the four partial sums of each pixel to one word
    phaddw      %2, %3
    phaddw      %4, %1
    phaddw      %2, %4
%if %0 == 8
    pmulhrsw    %2, %6
    packuswb    %2, %2
    movh        %8, %2
%endif
%endmacro

;-----------------------------------------------------------------------------
; FILTER_H8_W4 t0, t1
;   8-tap horizontal filter for 4 pixels; unrounded 16-bit sums end up in t1.
;   Implicit inputs: r0 = source row, r5 = column offset, m3 = coefficients.
;   Clobber: t0, t1, m7
;-----------------------------------------------------------------------------
%macro FILTER_H8_W4 2
    movu        %1, [r0 - 3 + r5]
    pshufb      %2, %1, [tab_Lm]            ; pixels 0,1
    pshufb      m7, %1, [tab_Lm + 16]       ; pixels 2,3
    pmaddubsw   %2, m3
    pmaddubsw   m7, m3
    phaddw      %2, m7
    phaddw      %2, %2
%endmacro
733
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;   %1 = width, %2 = height, %3 = pp (8-bit pixel output) or ps (16-bit output)
;   ps only: dstStride is doubled to a byte stride, and when isRowExt is
;   non-zero the source starts 3 rows earlier and 7 extra rows are produced.
;----------------------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8

    mov         r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        m3, [r6 + r4 * 8]
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  m3, m3                      ; 8 taps in both qwords

    ; m2 = rounding multiplier (pp) or bias to subtract (ps)
%ifidn %3, pp
    mova        m2, [tab_c_512]
%else
    mova        m2, [pw_2000]
%endif

    mov         r4d, %2                     ; rows left
%ifidn %3, ps
    add         r3, r3                      ; 16-bit output: stride in bytes
    cmp         r5m, byte 0                 ; isRowExt?
    je          .loopH
    lea         r6, [r1 + 2 * r1]
    sub         r0, r6                      ; start 3 rows above
    add         r4d, 7                      ; and emit 7 extra rows
%endif

.loopH:
    xor         r5, r5                      ; r5 = column offset within the row

    ; full 8-pixel groups
%ifidn %3, pp
%rep %1 / 8
    FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
    add         r5, 8
%endrep
%else
%rep %1 / 8
    FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
    psubw       m1, m2
    movu        [r2 + 2 * r5], m1
    add         r5, 8
%endrep
%endif

    ; trailing 4-pixel group (widths 4 and 12)
%rep (%1 % 8) / 4
    FILTER_H8_W4 m0, m1
%ifidn %3, pp
    pmulhrsw    m1, m2
    packuswb    m1, m1
    movd        [r2 + r5], m1
%else
    psubw       m1, m2
    movh        [r2 + 2 * r5], m1
%endif
%endrep

    add         r0, r1
    add         r2, r3

    dec         r4d
    jnz         .loopH
    RET
%endmacro
799
800
; FILTER_H8_ROW4_AVX2 dst, src
;   Loads 16 source bytes into both lanes and produces, for 4 pixels, two
;   dword partial sums each (low lane: pixels 0,1; high lane: pixels 2,3).
;   Expects m0 = coefficients, m1 = tab_Lm shuffle, m2 = words of 1.
%macro FILTER_H8_ROW4_AVX2 2
    vbroadcasti128  %1, %2                  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          %1, m1
    pmaddubsw       %1, m0
    pmaddwd         %1, m2
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_4x4, 4,6,6
    mov             r4d, r4m                ; coeffIdx

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    ; register map
    ; m0 - interpolation coefficients (8 taps in every qword)
    ; m1 - shuffle order table
    ; m2 - constant word 1
    mova            m1, [tab_Lm]
    vpbroadcastd    m2, [pw_1]

    sub             r0, 3                   ; the 8-tap window starts 3 pixels to the left

    ; rows 0-1
    FILTER_H8_ROW4_AVX2 m3, [r0]
    FILTER_H8_ROW4_AVX2 m4, [r0 + r1]
    phaddd          m3, m4                  ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]

    ; rows 2-3
    lea             r0, [r0 + r1 * 2]
    FILTER_H8_ROW4_AVX2 m4, [r0]
    FILTER_H8_ROW4_AVX2 m5, [r0 + r1]
    phaddd          m4, m5                  ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]

    packssdw        m3, m4                  ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pmulhrsw        m3, [pw_512]            ; round: (x + 32) >> 6
    vextracti128    xm4, m3, 1
    packuswb        xm3, xm4                ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pshufb          xm3, [idct4_shuf1]      ; [row3 row1 row2 row0]

    ; after the shuffle, dwords 0,2,1,3 hold rows 0,1,2,3
    lea             r0, [r3 * 3]
    movd            [r2], xm3
    pextrd          [r2 + r3], xm3, 2
    pextrd          [r2 + r3 * 2], xm3, 1
    pextrd          [r2 + r0], xm3, 3
    RET
856
857
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   SSE4 pp instances for the widths that are not a multiple of 8.
;--------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA  4,  4, pp
IPFILTER_LUMA  4,  8, pp
IPFILTER_LUMA  4, 16, pp
IPFILTER_LUMA 12, 16, pp
865
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   %1 = width (multiple of 8), %2 = height.
;   The 8 taps are split into a low and a high half of 4 taps each, and both
;   halves are applied to 4-byte windows taken from the tab_Tm shuffles.
;--------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PP_W8 2
INIT_XMM sse4
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
    mov         r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea         r5, [tab_LumaCoeff]
    movh        m3, [r5 + r4 * 8]
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    pshufd      m0, m3, 0                   ; m0 = taps 0..3 (coeff-L)
    pshufd      m1, m3, 0x55                ; m1 = taps 4..7 (coeff-H)
    mova        m2, [pw_512]                ; m2 = 512
    lea         r5, [tab_Tm]                ; r5 = shuffle table

    mov         r4d, %2                     ; rows left
.loopH:
%assign x 0
%rep %1 / 8
    movu        m3, [r0 - 3 + x]            ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb      m4, m3, [r5 + 0*16]         ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
    pshufb      m5, m3, [r5 + 1*16]         ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
    pshufb      m3, [r5 + 2*16]             ; m3 = [E D C B D C B A C B A 9 B A 9 8]

    ; pixels 0..3: low taps on m4, high taps on m5
    pmaddubsw   m6, m5, m1
    pmaddubsw   m4, m0
    paddw       m4, m6

    ; pixels 4..7: low taps on m5, high taps on m3
    pmaddubsw   m5, m0
    pmaddubsw   m3, m1
    paddw       m5, m3

    phaddw      m4, m5                      ; one word per pixel
    pmulhrsw    m4, m2                      ; (x + 32) >> 6
    packuswb    m4, m4
    movh        [r2 + x], m4
%assign x x+8
%endrep

    add         r0, r1
    add         r2, r3

    dec         r4d
    jnz         .loopH
    RET
%endmacro

; width 8
IPFILTER_LUMA_PP_W8  8,  4
IPFILTER_LUMA_PP_W8  8,  8
IPFILTER_LUMA_PP_W8  8, 16
IPFILTER_LUMA_PP_W8  8, 32

; width 16
IPFILTER_LUMA_PP_W8 16,  4
IPFILTER_LUMA_PP_W8 16,  8
IPFILTER_LUMA_PP_W8 16, 12
IPFILTER_LUMA_PP_W8 16, 16
IPFILTER_LUMA_PP_W8 16, 32
IPFILTER_LUMA_PP_W8 16, 64

; width 24
IPFILTER_LUMA_PP_W8 24, 32

; width 32
IPFILTER_LUMA_PP_W8 32,  8
IPFILTER_LUMA_PP_W8 32, 16
IPFILTER_LUMA_PP_W8 32, 24
IPFILTER_LUMA_PP_W8 32, 32
IPFILTER_LUMA_PP_W8 32, 64

; width 48
IPFILTER_LUMA_PP_W8 48, 64

; width 64
IPFILTER_LUMA_PP_W8 64, 16
IPFILTER_LUMA_PP_W8 64, 32
IPFILTER_LUMA_PP_W8 64, 48
IPFILTER_LUMA_PP_W8 64, 64
935
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;   SSE4 ps instances, grouped by width.
;----------------------------------------------------------------------------------------------------------------------------
; width 4
IPFILTER_LUMA  4,  4, ps
IPFILTER_LUMA  4,  8, ps
IPFILTER_LUMA  4, 16, ps

; width 8
IPFILTER_LUMA  8,  4, ps
IPFILTER_LUMA  8,  8, ps
IPFILTER_LUMA  8, 16, ps
IPFILTER_LUMA  8, 32, ps

; width 12
IPFILTER_LUMA 12, 16, ps

; width 16
IPFILTER_LUMA 16,  4, ps
IPFILTER_LUMA 16,  8, ps
IPFILTER_LUMA 16, 12, ps
IPFILTER_LUMA 16, 16, ps
IPFILTER_LUMA 16, 32, ps
IPFILTER_LUMA 16, 64, ps

; width 24
IPFILTER_LUMA 24, 32, ps

; width 32
IPFILTER_LUMA 32,  8, ps
IPFILTER_LUMA 32, 16, ps
IPFILTER_LUMA 32, 24, ps
IPFILTER_LUMA 32, 32, ps
IPFILTER_LUMA 32, 64, ps

; width 48
IPFILTER_LUMA 48, 64, ps

; width 64
IPFILTER_LUMA 64, 16, ps
IPFILTER_LUMA 64, 32, ps
IPFILTER_LUMA 64, 48, ps
IPFILTER_LUMA 64, 64, ps
964
;-----------------------------------------------------------------------------
; Interpolate HV - vertical pass helpers
;   r0 = buffer of 16-bit intermediate rows, 16 bytes (8 words) per row
;   r5 = vertical coefficient table: 16-byte rows of replicated tap pairs
;   Two output rows are accumulated at once as dword sums, split into the
;   low (L) and high (H) four pixels.
;-----------------------------------------------------------------------------

; FILTER_HV8_START t0, t1, t2, t3, t4, off_src, off_coeff
;   Loads rows off_src+0..+2 and starts both sums with tap pair off_coeff.
;   Out: %3 = row-0 sum L, %5 = row-0 sum H,
;        %4 = row-1 sum L, %1 = row-1 sum H,
;        %2 = row off_src+2, still unmodified, for the following MID step
%macro FILTER_HV8_START 7
    mova        %5, [r0 + (%6 + 0) * 16]
    mova        %1, [r0 + (%6 + 1) * 16]
    mova        %2, [r0 + (%6 + 2) * 16]

    ; interleave neighbouring rows so pmaddwd applies one tap pair per dword
    punpcklwd   %3, %5, %1
    punpckhwd   %5, %1
    punpcklwd   %4, %1, %2
    punpckhwd   %1, %2

    pmaddwd     %3, [r5 + (%7) * 16]        ; R3 = L[0+1] -- Row 0
    pmaddwd     %5, [r5 + (%7) * 16]        ; R0 = H[0+1]
    pmaddwd     %4, [r5 + (%7) * 16]        ; R4 = L[1+2] -- Row 1
    pmaddwd     %1, [r5 + (%7) * 16]        ; R1 = H[1+2]
%endmacro ; FILTER_HV8_START

; FILTER_HV8_MID next, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff
;   Adds the next tap pair to both running sums.  Loads rows off_src and
;   off_src+1; prevRow must hold row off_src-1.
;   Out: %1 = row off_src+1 (the prevRow of the next MID step)
;   Clobber: %2, %7, %8
%macro FILTER_HV8_MID 10
    mova        %8, [r0 + (%9 + 0) * 16]

    ; output row 0: (prevRow, row off_src)
    punpcklwd   %7, %2, %8
    punpckhwd   %2, %8
    pmaddwd     %7, [r5 + %10 * 16]
    pmaddwd     %2, [r5 + %10 * 16]
    paddd       %3, %7                      ; row 0, low half
    paddd       %5, %2                      ; row 0, high half

    ; output row 1: (row off_src, row off_src+1)
    mova        %1, [r0 + (%9 + 1) * 16]
    punpcklwd   %7, %8, %1
    punpckhwd   %8, %1
    pmaddwd     %7, [r5 + %10 * 16]
    pmaddwd     %8, [r5 + %10 * 16]
    paddd       %4, %7                      ; row 1, low half
    paddd       %6, %8                      ; row 1, high half
%endmacro ; FILTER_HV8_MID

; FILTER_HV8_END sum0L, sum0H, sum1L, sum1H
;   Round and saturate: adds tab_c_526336, shifts right by 12 and packs to
;   bytes.  Out: %1 = 8 pixels of row 0 (low qword) | 8 pixels of row 1 (high).
;   %3 is left holding the row-1 words.
%macro FILTER_HV8_END 4
    ; row 0
    paddd       %1, [tab_c_526336]
    paddd       %2, [tab_c_526336]
    psrad       %1, 12
    psrad       %2, 12
    packssdw    %1, %2

    ; row 1
    paddd       %3, [tab_c_526336]
    paddd       %4, [tab_c_526336]
    psrad       %3, 12
    psrad       %4, 12
    packssdw    %3, %4

    ; TODO: would merging the rows earlier be better? Packing per row keeps
    ; the dependency chains short.
    packuswb    %1, %3
%endmacro ; FILTER_HV8_END
1015
;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;   Pass 1 filters 8+7 rows horizontally into a 15*16-byte stack buffer of
;   16-bit values; pass 2 filters that buffer vertically, two rows per step.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
%define coef        m7
%define stk_buf     rsp

    mov         r4d, r4m                    ; idxX
    mov         r5d, r5m                    ; idxY

%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        coef, [r6 + r4 * 8]
%else
    movh        coef, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  coef, coef                  ; 8 horizontal taps in both qwords

    ; move to row -3
    lea         r6, [r1 + r1 * 2]
    sub         r0, r6

    mov         r4, stk_buf                 ; r4 = write pointer into the buffer
    xor         r6, r6                      ; r6 = rows done

.loopH:
    ; 7-argument form: the sums stay in m1 and the c512 operand is not used
    FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
    psubw       m1, [pw_2000]
    mova        [r4], m1

    add         r0, r1
    add         r4, 16
    inc         r6
    cmp         r6, 8+7
    jnz         .loopH

    ; ready for phase V
    ; from here on every mN is free (coef/m7 is reused as a temporary)

    ; load coeff table: 64 bytes of vertical taps per idxY
    shl         r5, 6
    lea         r6, [tab_LumaCoeffV]
    lea         r5, [r5 + r6]

    ; load intermediate buffer
    mov         r0, stk_buf

    ; register mapping
    ; r0 - src (intermediate rows)
    ; r5 - coeff
    ; r6 - loop_i

    ; let's go
    xor         r6, r6

    ; TODO: this loop has more than 70 instructions, which is probably more
    ; than the Intel loop decode cache can hold
.loopV:

    FILTER_HV8_START m1, m2, m3, m4, m0,             0, 0
    FILTER_HV8_MID   m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID   m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID   m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END   m3, m0, m4, m1

    movh        [r2], m3                    ; output row 2*i
    movhps      [r2 + r3], m3               ; output row 2*i + 1

    lea         r0, [r0 + 16 * 2]           ; advance two intermediate rows
    lea         r2, [r2 + r3 * 2]

    inc         r6
    cmp         r6, 8/2
    jnz         .loopV

    RET
1092
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   %1 = width (the body only supports 2), %2 = height (multiple of 4).
;   Each pass filters four output rows from seven source rows.  Rows are
;   interleaved as (row0,row2,row1,row3), which is why the coefficients are
;   reordered through tab_Cm.  Height 4 is emitted without loop overhead.
;   Note: every source row is read with a 4-byte load.
;-----------------------------------------------------------------------------
%macro FILTER_V4_W2_H4 2
%if %1 != 2
    %error "FILTER_V4_W2_H4 only supports a width of 2"
%endif
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; the 4-tap window starts one row above

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                ; taps as c0,c2,c1,c3 in every dword

    mova        m1, [tab_c_512]

%if %2 > 4
    mov         r4d, %2                     ; rows left
%endif
    lea         r5, [3 * r1]

.loop:
    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6                      ; rows 0,2,1,3 interleaved per pixel

    pmaddubsw   m2, m0                      ; output row 0 (partial sums)

    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]                    ; source row 4

    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7                      ; rows 1,3,2,4

    pmaddubsw   m3, m0                      ; output row 1 (partial sums)

    phaddw      m2, m3                      ; words: output row 0 | output row 1

    pmulhrsw    m2, m1                      ; (x + 32) >> 6

    movd        m7, [r0 + r1]               ; source row 5

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3                      ; rows 2,4,3,5

    pmaddubsw   m4, m0                      ; output row 2 (partial sums)

    movd        m3, [r0 + 2 * r1]           ; source row 6

    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7                      ; rows 3,5,4,6

    pmaddubsw   m5, m0                      ; output row 3 (partial sums)

    phaddw      m4, m5                      ; words: output row 2 | output row 3

    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; 4 bytes per output row

    ; only the first two pixels of every output row are stored
    pextrw      [r2], m2, 0
    pextrw      [r2 + r3], m2, 2
    lea         r2, [r2 + 2 * r3]
    pextrw      [r2], m2, 4
    pextrw      [r2 + r3], m2, 6

%if %2 > 4
    lea         r2, [r2 + 2 * r3]

    sub         r4d, 4
    jnz         .loop
%endif
    RET
%endmacro

FILTER_V4_W2_H4 2, 4

FILTER_V4_W2_H4 2, 8

FILTER_V4_W2_H4 2, 16

1251
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;   Two output rows of 4 pixels from five source rows.  Rows are interleaved
;   as (row0,row2,row1,row3), matching the tap order produced by tab_Cm.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x2, 4, 6, 6

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; the 4-tap window starts one row above

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                ; taps as c0,c2,c1,c3 in every dword
    lea         r5, [r1 * 3]

    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    punpcklbw   m2, m3
    punpcklbw   m1, m4, m5
    punpcklbw   m2, m1                      ; rows 0,2,1,3 interleaved per pixel

    pmaddubsw   m2, m0                      ; output row 0 (partial sums)

    movd        m1, [r0 + 4 * r1]           ; source row 4

    punpcklbw   m3, m4
    punpcklbw   m5, m1
    punpcklbw   m3, m5                      ; rows 1,3,2,4

    pmaddubsw   m3, m0                      ; output row 1 (partial sums)

    phaddw      m2, m3                      ; words: output row 0 | output row 1

    pmulhrsw    m2, [tab_c_512]             ; (x + 32) >> 6
    packuswb    m2, m2
    movd        [r2], m2                    ; output row 0
    pextrd      [r2 + r3], m2, 1            ; output row 1

    RET
1298
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Straight-line 4-tap vertical filter for a 4x4 block.
;   r0 = src - srcStride, r4 = 3 * srcStride, r5 = r0 + 4 * srcStride
;   m0 = tap bytes reordered through tab_Cm, m1 = constant from tab_c_512
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x4, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m0, [tab_Cm]
    mova        m1, [tab_c_512]

    lea         r4, [r1 * 3]                ; coeffIdx no longer needed
    lea         r5, [r0 + 4 * r1]           ; r5 -> row E

    ; input rows A, B, C, D
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; output row 0 from A B C D
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6
    pmaddubsw   m2, m0

    ; output row 1 from B C D E
    movd        m6, [r5]
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; rows 0 and 1

    ; output row 2 from C D E F
    movd        m7, [r5 + r1]
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 from D E F G
    movd        m3, [r5 + 2 * r1]
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; rows 2 and 3

    pmulhrsw    m2, m1
    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; one dword per output row

    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3
    RET
1371
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / pixel out, for 4-wide blocks.  Each loop
; pass emits four rows of four bytes, so %2 has to be a multiple of 4.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride
;   r4 = rows still to write
;   m0 = tap bytes reordered through tab_Cm, m1 = constant from tab_c_512
;-----------------------------------------------------------------------------
%macro FILTER_V4_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m0, [tab_Cm]

    mova        m1, [tab_c_512]
    lea         r5, [3 * r1]
    mov         r4d, %2

.next4rows:
    ; input rows A, B, C, D
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]
    lea         r0, [r0 + 4 * r1]           ; r0 -> row E

    ; output row 0 from A B C D
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6
    pmaddubsw   m2, m0

    ; output row 1 from B C D E
    movd        m6, [r0]
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; rows 0 and 1

    ; output row 2 from C D E F
    movd        m7, [r0 + r1]
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 from D E F G
    movd        m3, [r0 + 2 * r1]
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; rows 2 and 3

    pmulhrsw    m2, m1
    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; one dword per output row

    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3
    lea         r2, [r2 + 2 * r3]

    sub         r4d, 4                      ; r4 was loaded through r4d, upper half is zero
    jnz         .next4rows
    RET
%endmacro

FILTER_V4_W4_H4 4, 8
FILTER_V4_W4_H4 4, 16
FILTER_V4_W4_H4 4, 32
1459
; One 8-wide output row for the FILTER_V4_W8_8xN kernels.
; In:  m1, m2 = rows for the first tap pair, m3, m0 = rows for the second pair
;      m6 / m5 = first / second tap pair, m4 = pmulhrsw multiplier
; Out: m1 = packed result bytes; m7 is clobbered, m0 / m2 / m3 are preserved
%macro FILTER_V4_W8_H2 0
    punpcklbw   m7, m3, m0
    pmaddubsw   m7, m5

    punpcklbw   m1, m2
    pmaddubsw   m1, m6

    paddw       m1, m7
    pmulhrsw    m1, m4
    packuswb    m1, m1
%endmacro
1472
; One 8-wide output row for the FILTER_V4_W8_8xN kernels.
; In:  m2, m3 = rows for the first tap pair, m0, m1 = rows for the second pair
;      m6 / m5 = first / second tap pair, m4 = pmulhrsw multiplier
; Out: m2 = packed result bytes; m7 is clobbered, m0 / m1 / m3 are preserved
%macro FILTER_V4_W8_H3 0
    punpcklbw   m7, m0, m1
    pmaddubsw   m7, m5

    punpcklbw   m2, m3
    pmaddubsw   m2, m6

    paddw       m2, m7
    pmulhrsw    m2, m4
    packuswb    m2, m2
%endmacro
1485
; One 8-wide output row for the FILTER_V4_W8_8xN kernels.
; In:  m3, m0 = rows for the first tap pair, m1, m2 = rows for the second pair
;      m6 / m5 = first / second tap pair, m4 = pmulhrsw multiplier
; Out: m3 = packed result bytes; m7 is clobbered, m0 / m1 / m2 are preserved
%macro FILTER_V4_W8_H4 0
    punpcklbw   m7, m1, m2
    pmaddubsw   m7, m5

    punpcklbw   m3, m0
    pmaddubsw   m3, m6

    paddw       m3, m7
    pmulhrsw    m3, m4
    packuswb    m3, m3
%endmacro
1498
; One 8-wide output row for the FILTER_V4_W8_8xN kernels.
; In:  m0, m1 = rows for the first tap pair, m2, m3 = rows for the second pair
;      m6 / m5 = first / second tap pair, m4 = pmulhrsw multiplier
; Out: m0 = packed result bytes; m7 is clobbered, m1 / m2 / m3 are preserved
%macro FILTER_V4_W8_H5 0
    punpcklbw   m7, m2, m3
    pmaddubsw   m7, m5

    punpcklbw   m0, m1
    pmaddubsw   m0, m6

    paddw       m0, m7
    pmulhrsw    m0, m4
    packuswb    m0, m0
%endmacro
1511
; Opens interp_4tap_vert_pp_%1x%2 through FILTER_V4_W8 (which writes output
; row 0) and appends output row 1.  The caller supplies the closing RET.
%macro FILTER_V4_W8_8x2 2
    FILTER_V4_W8 %1, %2

    ; output row 1: m1..m3 still hold the rows loaded by FILTER_V4_W8
    movq        m0, [r0 + 4 * r1]
    FILTER_V4_W8_H2
    movh        [r2 + r3], m1
%endmacro
1520
; Extends FILTER_V4_W8_8x2 with output rows 2 and 3.
; Leaves r6 = r0 + 4 * srcStride and r5 = r2 + 2 * dstStride.
%macro FILTER_V4_W8_8x4 2
    FILTER_V4_W8_8x2 %1, %2

    ; output row 2
    lea         r6, [r0 + 4 * r1]
    movq        m1, [r6 + r1]
    FILTER_V4_W8_H3
    movh        [r2 + 2 * r3], m2

    ; output row 3
    lea         r5, [r2 + 2 * r3]
    movq        m2, [r6 + 2 * r1]
    FILTER_V4_W8_H4
    movh        [r5 + r3], m3
%endmacro
1539
; Extends FILTER_V4_W8_8x4 with output rows 4 and 5.
; r6 arrives as r0 + 4 * srcStride and is advanced by two more rows here.
%macro FILTER_V4_W8_8x6 2
    FILTER_V4_W8_8x4 %1, %2

    ; output row 4
    lea         r6, [r6 + 2 * r1]
    movq        m3, [r6 + r1]
    FILTER_V4_W8_H5
    movh        [r2 + 4 * r3], m0

    ; output row 5
    lea         r5, [r2 + 4 * r3]
    movq        m0, [r0 + 8 * r1]
    FILTER_V4_W8_H2
    movh        [r5 + r3], m1
%endmacro
1558
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Common prologue of the short 8-wide pp kernels: declares the function,
; loads the taps and writes output row 0.  It does not emit RET.
;
; State left for the FILTER_V4_W8_8xN / FILTER_V4_W8_Hn macros:
;   r0 = src - srcStride, r5 = r0 + 2 * srcStride
;   m1, m2, m3 = source rows at r0 + 1, 2, 3 strides (low 8 bytes)
;   m6 = taps 0,1 repeated, m5 = taps 2,3 repeated, m4 = constant from tab_c_512
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        m5, [r6 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]

    lea         r5, [r0 + 2 * r1]
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r5]
    movq        m3, [r5 + r1]

    ; output row 0
    punpcklbw   m0, m1
    pmaddubsw   m0, m6
    punpcklbw   m4, m2, m3
    pmaddubsw   m4, m5
    paddw       m0, m4

    mova        m4, [tab_c_512]
    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0
%endmacro
1599
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Each kernel is the matching FILTER_V4_W8_8xN expansion (which opens the
; function through cglobal) followed by the function's RET.
;-----------------------------------------------------------------------------
FILTER_V4_W8_8x2 8, 2
    RET

FILTER_V4_W8_8x4 8, 4
    RET

FILTER_V4_W8_8x6 8, 6
    RET
1620
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for a 4-wide, 2-row block.
; The filtered sums are written as 16-bit words after subtracting the
; constant at pw_2000 (defined elsewhere in this file).
;   r0 = src - srcStride, r5 = r0 + 2 * srcStride, r3 = dstStride in bytes
;   m0 = tap bytes reordered through tab_Cm
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x2, 4, 6, 6

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    ; input rows A, B, C, D
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    lea         r5, [r0 + 2 * r1]
    movd        m4, [r5]
    movd        m5, [r5 + r1]

    ; output row 0 from A B C D
    punpcklbw   m2, m3
    punpcklbw   m1, m4, m5
    punpcklbw   m2, m1

    pmaddubsw   m2, m0

    ; output row 1 from B C D E
    movd        m1, [r0 + 4 * r1]

    punpcklbw   m3, m4
    punpcklbw   m5, m1
    punpcklbw   m3, m5

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword: row 0, high qword: row 1

    psubw       m2, [pw_2000]
    movh        [r2], m2
    movhps      [r2 + r3], m2

    RET
1667
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for a 4x4 block.
;   r0 = src - srcStride, r4 = 3 * srcStride, r5 = r0 + 4 * srcStride
;   r3 = dstStride in bytes
;   m0 = tap bytes reordered through tab_Cm
;   m1 = scratch at first, then the constant from pw_2000
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x4, 4, 6, 7

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    lea         r4, [r1 * 3]
    lea         r5, [r0 + 4 * r1]

    ; input rows A, B, C, D
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; output row 0 from A B C D
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    ; output row 1 from B C D E
    movd        m6, [r5]

    punpcklbw   m3, m4
    punpcklbw   m1, m5, m6
    punpcklbw   m3, m1

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; rows 0 and 1

    mova        m1, [pw_2000]

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    ; output row 2 from C D E F
    movd        m2, [r5 + r1]

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    ; output row 3 from D E F G
    movd        m3, [r5 + 2 * r1]

    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; rows 2 and 3

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    RET
1741
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 4-wide blocks.  Each
; loop pass writes four rows, so %2 has to be a multiple of 4.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 4)
;   m0 = tap bytes reordered through tab_Cm, m1 = constant from pw_2000
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    mova        m1, [pw_2000]

    mov         r4d, %2/4
    lea         r5, [3 * r1]

.loop:
    ; input rows A, B, C, D
    movd        m2, [r0]
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    ; output row 0 from A B C D
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    ; output row 1 from B C D E
    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]

    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; rows 0 and 1

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    ; output row 2 from C D E F
    movd        m2, [r0 + r1]

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    ; output row 3 from D E F G
    movd        m3, [r0 + 2 * r1]

    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; rows 2 and 3

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W4_H4 4, 8
FILTER_V_PS_W4_H4 4, 16

FILTER_V_PS_W4_H4 4, 32
1828
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 8-wide blocks whose
; height is only required to be even: two rows are written per loop pass.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 2)
;   m6 = taps 0,1 repeated, m5 = taps 2,3 repeated, m4 = constant from pw_2000
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]
    mova        m4, [pw_2000]

    mov         r4d, %2/2
    lea         r5, [3 * r1]

.loopH:
    ; input rows A, B, C, D
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                      ; A|B
    punpcklbw   m1, m2                      ; B|C
    punpcklbw   m2, m3                      ; C|D

    ; output row 0 = A|B * taps01 + C|D * taps23
    pmaddubsw   m0, m6
    pmaddubsw   m2, m5

    paddw       m0, m2

    psubw       m0, m4
    movu        [r2], m0

    ; output row 1 = B|C * taps01 + D|E * taps23
    movq        m0, [r0 + 4 * r1]

    punpcklbw   m3, m0

    pmaddubsw   m1, m6
    pmaddubsw   m3, m5

    paddw       m1, m3
    psubw       m1, m4

    movu        [r2 + r3], m1

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro

FILTER_V_PS_W8_H8_H16_H2 8, 2
FILTER_V_PS_W8_H8_H16_H2 8, 4
FILTER_V_PS_W8_H8_H16_H2 8, 6

FILTER_V_PS_W8_H8_H16_H2 8, 12
FILTER_V_PS_W8_H8_H16_H2 8, 64
1899
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 8-wide blocks.  Four
; rows are written per loop pass, so %2 has to be a multiple of 4.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 4)
;   m6 = taps 0,1 repeated, m5 = taps 2,3 repeated, m4 = constant from pw_2000
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H32 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]
    mova        m4, [pw_2000]

    mov         r4d, %2/4
    lea         r5, [3 * r1]

.loop:
    ; input rows A, B, C, D
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                      ; A|B
    punpcklbw   m1, m2                      ; B|C
    punpcklbw   m2, m3                      ; C|D

    ; output row 0 = A|B * taps01 + C|D * taps23 (C|D is reused for row 2)
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7

    psubw       m0, m4
    movu        [r2], m0

    ; output row 1 = B|C * taps01 + D|E * taps23 (D|E is reused for row 3)
    lea         r0, [r0 + 4 * r1]
    movq        m0, [r0]

    punpcklbw   m3, m0                      ; D|E

    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7

    psubw       m1, m4
    movu        [r2 + r3], m1

    ; output row 2 = C|D * taps01 + E|F * taps23
    movq        m1, [r0 + r1]

    punpcklbw   m0, m1                      ; E|F

    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0

    psubw       m2, m4
    lea         r2, [r2 + 2 * r3]
    movu        [r2], m2

    ; output row 3 = D|E * taps01 + F|G * taps23
    movq        m2, [r0 + 2 * r1]

    punpcklbw   m1, m2                      ; F|G

    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1
    psubw       m3, m4

    movu        [r2 + r3], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W8_H8_H16_H32 8, 8
FILTER_V_PS_W8_H8_H16_H32 8, 16
FILTER_V_PS_W8_H8_H16_H32 8, 32
1991
;------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 6-wide blocks.  Eight
; columns are computed per row but only six words are stored (a qword plus
; a dword).  Four rows are written per loop pass, so %2 has to be a
; multiple of 4.  %1 is accepted but not referenced.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 4)
;   m6 = taps 0,1 repeated, m5 = taps 2,3 repeated, m4 = constant from pw_2000
;------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W6 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]
    mova        m4, [pw_2000]
    lea         r5, [3 * r1]
    mov         r4d, %2/4

.loop:
    ; input rows A, B, C, D
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                      ; A|B
    punpcklbw   m1, m2                      ; B|C
    punpcklbw   m2, m3                      ; C|D

    ; output row 0
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7
    psubw       m0, m4

    movh        [r2], m0                    ; words 0..3
    pshufd      m0, m0, 2                   ; bring dword 2 (words 4,5) to the bottom
    movd        [r2 + 8], m0

    ; output row 1
    lea         r0, [r0 + 4 * r1]
    movq        m0, [r0]
    punpcklbw   m3, m0                      ; D|E

    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7
    psubw       m1, m4

    movh        [r2 + r3], m1
    pshufd      m1, m1, 2
    movd        [r2 + r3 + 8], m1

    ; output row 2
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; E|F

    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0
    psubw       m2, m4

    lea         r2, [r2 + 2 * r3]
    movh        [r2], m2
    pshufd      m2, m2, 2
    movd        [r2 + 8], m2

    ; output row 3
    movq        m2, [r0 + 2 * r1]
    punpcklbw   m1, m2                      ; F|G

    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1
    psubw       m3, m4

    movh        [r2 + r3], m3
    pshufd      m3, m3, 2
    movd        [r2 + r3 + 8], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W6 6, 8
FILTER_V_PS_W6 6, 16
2086
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 12-wide blocks.  Each
; source row is read as a full 16-byte vector; 12 result words are stored
; per row (16 bytes plus 8 bytes).  Two rows are written per loop pass.
; %1 is accepted but not referenced.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src), r1 = srcStride
;   r2 = dst row pointer, r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 2)
;   m1 = taps 0,1 repeated, m0 = taps 2,3 repeated
;   m6 = scratch, then the constant from pw_2000 (reloaded every pass)
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W12 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2

.loop:
    ; rows A, B -> first tap pair of output row 0
    movu        m2, [r0]
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    ; rows C, D -> second tap pair of output row 0
    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]
    movu        m7, [r0 + r1]

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                    ; columns 0..7
    movh        [r2 + 16], m2               ; columns 8..11

    ; output row 1 = B|C * taps01 + D|E * taps23
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r0 + 2 * r1]

    punpcklbw   m5, m7, m2
    punpckhbw   m7, m2

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4
    movh        [r2 + r3 + 16], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W12 12, 16
FILTER_V_PS_W12 12, 32
2172
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 16-wide blocks.  Two
; rows (32 bytes of output each) are written per loop pass, so %2 has to
; be even.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src), r1 = srcStride
;   r2 = dst row pointer, r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 2)
;   m1 = taps 0,1 repeated, m0 = taps 2,3 repeated
;   m6 = scratch, then the constant from pw_2000 (reloaded every pass)
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W16 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]
    mov         r4d, %2/2

.loop:
    ; rows A, B -> first tap pair of output row 0
    movu        m2, [r0]
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    ; rows C, D -> second tap pair of output row 0
    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]
    movu        m7, [r0 + r1]

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                    ; columns 0..7
    movu        [r2 + 16], m2               ; columns 8..15

    ; output row 1 = B|C * taps01 + D|E * taps23
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]

    punpcklbw   m2, m7, m5
    punpckhbw   m7, m5

    pmaddubsw   m2, m0
    pmaddubsw   m7, m0

    paddw       m4, m2
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4
    movu        [r2 + r3 + 16], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W16 16, 4
FILTER_V_PS_W16 16, 8
FILTER_V_PS_W16 16, 12
FILTER_V_PS_W16 16, 16
FILTER_V_PS_W16 16, 32

FILTER_V_PS_W16 16, 24
FILTER_V_PS_W16 16, 64
2263
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 24-wide blocks.  Each
; loop pass writes two rows: columns 0..15 through 16-byte loads, then
; columns 16..23 through 8-byte loads.  %2 has to be even; %1 is accepted
; but not referenced.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src), r1 = srcStride
;   r5 = r0 + 2 * srcStride (becomes the next pass's r0)
;   r2 = dst row pointer, r3 = dstStride in bytes
;   r4 = loop passes left (%2 / 2)
;   m1 = taps 0,1 repeated, m0 = taps 2,3 repeated
;   m6 = scratch, then the constant from pw_2000 (reloaded every pass)
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V4_PS_W24 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2

.loop:
    ; ---- columns 0..15 ----
    ; rows A, B -> first tap pair of output row 0
    movu        m2, [r0]
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]

    ; rows C, D -> second tap pair of output row 0
    movu        m5, [r5]
    movu        m7, [r5 + r1]

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4
    movu        [r2 + 16], m2

    ; output row 1 = B|C * taps01 + D|E * taps23
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r5 + 2 * r1]

    punpcklbw   m5, m7, m2
    punpckhbw   m7, m2

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4
    movu        [r2 + r3 + 16], m3

    ; ---- columns 16..23 ----
    movq        m2, [r0 + 16]
    movq        m3, [r0 + r1 + 16]
    movq        m4, [r5 + 16]
    movq        m5, [r5 + r1 + 16]

    ; output row 0
    punpcklbw   m2, m3
    punpcklbw   m7, m4, m5

    pmaddubsw   m2, m1
    pmaddubsw   m7, m0

    paddw       m2, m7
    psubw       m2, m6

    movu        [r2 + 32], m2

    ; output row 1
    movq        m2, [r5 + 2 * r1 + 16]

    punpcklbw   m3, m4
    punpcklbw   m5, m2

    pmaddubsw   m3, m1
    pmaddubsw   m5, m0

    paddw       m3, m5
    psubw       m3, m6

    movu        [r2 + r3 + 32], m3

    mov         r0, r5                      ; advance two source rows
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V4_PS_W24 24, 32

FILTER_V4_PS_W24 24, 64
2381
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / int16_t out, for 32-wide blocks.  One
; output row (64 bytes) is written per loop pass, as two 16-column halves.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src), r1 = srcStride
;   r5 = r0 + 2 * srcStride
;   r2 = dst row pointer, r3 = dstStride in bytes
;   r4 = rows left (%2)
;   m1 = taps 0,1 repeated, m0 = taps 2,3 repeated, m7 = constant from pw_2000
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W32 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1
    add         r3, r3                      ; int16_t stride -> bytes; full-width so an
                                            ; intptr_t stride is not truncated to 32 bits

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mova        m7, [pw_2000]

    mov         r4d, %2

.loop:
    ; ---- columns 0..15 ----
    movu        m2, [r0]
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]
    movu        m3, [r5]
    movu        m5, [r5 + r1]

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    psubw       m4, m7
    psubw       m2, m7

    movu        [r2], m4
    movu        [r2 + 16], m2

    ; ---- columns 16..31 ----
    movu        m2, [r0 + 16]
    movu        m3, [r0 + r1 + 16]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    movu        m3, [r5 + 16]
    movu        m5, [r5 + r1 + 16]

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    psubw       m4, m7
    psubw       m2, m7

    movu        [r2 + 32], m4
    movu        [r2 + 48], m2

    lea         r0, [r0 + r1]
    lea         r2, [r2 + r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W32 32, 8
FILTER_V_PS_W32 32, 16
FILTER_V_PS_W32 32, 24
FILTER_V_PS_W32 32, 32

FILTER_V_PS_W32 32, 48
FILTER_V_PS_W32 32, 64
2478
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / pixel out, for 8-wide blocks.  Four rows
; are written per loop pass, so %2 has to be a multiple of 4.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride
;   r4 = rows still to write
;   m6 = taps 0,1 repeated, m5 = taps 2,3 repeated, m4 = constant from tab_c_512
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8_H8_H16_H32 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]
    mova        m4, [tab_c_512]

    mov         r4d, %2
    lea         r5, [r1 * 3]

.next4rows:
    ; input rows A, B, C, D
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]
    lea         r0, [r0 + 4 * r1]           ; r0 -> row E

    punpcklbw   m0, m1                      ; A|B
    punpcklbw   m1, m2                      ; B|C
    punpcklbw   m2, m3                      ; C|D

    ; output row 0 = A|B * taps01 + C|D * taps23 (C|D is reused for row 2)
    pmaddubsw   m7, m2, m5
    pmaddubsw   m0, m6
    paddw       m0, m7
    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0

    ; output row 1 = B|C * taps01 + D|E * taps23 (D|E is reused for row 3)
    movq        m0, [r0]
    punpcklbw   m3, m0                      ; D|E
    pmaddubsw   m7, m3, m5
    pmaddubsw   m1, m6
    paddw       m1, m7
    pmulhrsw    m1, m4
    packuswb    m1, m1
    movh        [r2 + r3], m1

    ; output row 2 = C|D * taps01 + E|F * taps23
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; E|F
    pmaddubsw   m2, m6
    pmaddubsw   m0, m5
    paddw       m2, m0
    pmulhrsw    m2, m4

    ; output row 3 = D|E * taps01 + F|G * taps23
    movq        m7, [r0 + 2 * r1]
    punpcklbw   m1, m7                      ; F|G
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5
    paddw       m3, m1
    pmulhrsw    m3, m4

    packuswb    m2, m3                      ; low qword: row 2, high qword: row 3

    lea         r2, [r2 + 2 * r3]
    movh        [r2], m2
    movhps      [r2 + r3], m2
    lea         r2, [r2 + 2 * r3]

    sub         r4d, 4                      ; r4 was loaded through r4d, upper half is zero
    jnz         .next4rows
    RET
%endmacro

FILTER_V4_W8_H8_H16_H32 8, 8
FILTER_V4_W8_H8_H16_H32 8, 16
FILTER_V4_W8_H8_H16_H32 8, 32

FILTER_V4_W8_H8_H16_H32 8, 12
FILTER_V4_W8_H8_H16_H32 8, 64
2575
2576
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / pixel out, for 6-wide blocks.  Eight
; columns are computed per row but only six bytes are stored (a dword plus
; a word).  Four rows are written per loop pass, so %2 has to be a multiple
; of 4.  %1 is accepted but not referenced.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src)
;   r1 = srcStride            r5 = 3 * srcStride
;   r2 = dst row pointer      r3 = dstStride
;   r4 = rows still to write
;   m6 = taps 0,1 repeated, m5 = taps 2,3 repeated, m4 = constant from tab_c_512
;-----------------------------------------------------------------------------
%macro FILTER_V4_W6_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]
    mova        m4, [tab_c_512]

    lea         r5, [3 * r1]
    mov         r4d, %2

.next4rows:
    ; input rows A, B, C, D
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]
    lea         r0, [r0 + 4 * r1]           ; r0 -> row E

    punpcklbw   m0, m1                      ; A|B
    punpcklbw   m1, m2                      ; B|C
    punpcklbw   m2, m3                      ; C|D

    ; output row 0
    pmaddubsw   m7, m2, m5
    pmaddubsw   m0, m6
    paddw       m0, m7
    pmulhrsw    m0, m4
    packuswb    m0, m0
    movd        [r2], m0                    ; bytes 0..3
    pextrw      [r2 + 4], m0, 2             ; bytes 4..5

    ; output row 1
    movq        m0, [r0]
    punpcklbw   m3, m0                      ; D|E
    pmaddubsw   m7, m3, m5
    pmaddubsw   m1, m6
    paddw       m1, m7
    pmulhrsw    m1, m4
    packuswb    m1, m1
    movd        [r2 + r3], m1
    pextrw      [r2 + r3 + 4], m1, 2

    ; output row 2
    movq        m1, [r0 + r1]
    punpcklbw   m7, m0, m1                  ; E|F
    pmaddubsw   m7, m5
    pmaddubsw   m2, m6
    paddw       m2, m7
    pmulhrsw    m2, m4
    packuswb    m2, m2
    lea         r2, [r2 + 2 * r3]
    movd        [r2], m2
    pextrw      [r2 + 4], m2, 2

    ; output row 3
    movq        m2, [r0 + 2 * r1]
    punpcklbw   m1, m2                      ; F|G
    pmaddubsw   m1, m5
    pmaddubsw   m3, m6
    paddw       m3, m1
    pmulhrsw    m3, m4
    packuswb    m3, m3
    movd        [r2 + r3], m3
    pextrw      [r2 + r3 + 4], m3, 2

    lea         r2, [r2 + 2 * r3]

    sub         r4d, 4                      ; r4 was loaded through r4d, upper half is zero
    jnz         .next4rows
    RET
%endmacro

FILTER_V4_W6_H4 6, 8
FILTER_V4_W6_H4 6, 16
2674
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / pixel out, for 12-wide blocks.  Each
; source row is read as a full 16-byte vector; 12 result bytes are stored
; per row (a qword plus a dword).  Two rows are written per loop pass, so
; %2 has to be even.  %1 is accepted but not referenced.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src), r1 = srcStride
;   r2 = dst row pointer, r3 = dstStride
;   r4 = rows still to write
;   m1 = taps 0,1 repeated, m0 = taps 2,3 repeated
;   m6 = scratch, then the constant from tab_c_512 (reloaded every pass)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W12_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2

.next2rows:
    ; rows A, B -> first tap pair of output row 0
    movu        m2, [r0]
    movu        m3, [r0 + r1]
    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3
    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    ; rows C, D -> second tap pair of output row 0
    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]
    movu        m7, [r0 + r1]

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    mova        m6, [tab_c_512]
    pmulhrsw    m4, m6
    pmulhrsw    m2, m6
    packuswb    m4, m2

    movh        [r2], m4                    ; columns 0..7
    pextrd      [r2 + 8], m4, 2             ; columns 8..11

    ; output row 1 = B|C * taps01 + D|E * taps23
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5
    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]
    punpcklbw   m2, m7, m5
    punpckhbw   m7, m5
    pmaddubsw   m2, m0
    pmaddubsw   m7, m0

    paddw       m4, m2
    paddw       m3, m7

    pmulhrsw    m4, m6
    pmulhrsw    m3, m6
    packuswb    m4, m3

    movh        [r2 + r3], m4
    pextrd      [r2 + r3 + 8], m4, 2

    lea         r2, [r2 + 2 * r3]

    sub         r4d, 2                      ; r4 was loaded through r4d, upper half is zero
    jnz         .next2rows
    RET
%endmacro

FILTER_V4_W12_H2 12, 16
FILTER_V4_W12_H2 12, 32
2764
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, pixel in / pixel out, for 16-wide blocks.  Two rows
; are written per loop pass, so %2 has to be even.  %1 is accepted but not
; referenced.
;
; Loop registers:
;   r0 = source row pointer (starts one row above src), r1 = srcStride
;   r2 = dst row pointer, r3 = dstStride
;   r4 = loop passes left (%2 / 2)
;   m1 = taps 0,1 repeated, m0 = taps 2,3 repeated
;   m7 = scratch, then the constant from tab_c_512 (reloaded every pass)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2

.next2rows:
    ; rows A, B -> first tap pair of output row 0
    movu        m2, [r0]
    movu        m3, [r0 + r1]
    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3
    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    ; rows C, D -> second tap pair of output row 0
    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0]
    movu        m6, [r0 + r1]

    punpcklbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m4, m7

    punpckhbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m2, m7

    mova        m7, [tab_c_512]
    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2

    movu        [r2], m4

    ; output row 1 = B|C * taps01 + D|E * taps23
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5
    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]
    punpcklbw   m2, m6, m5
    punpckhbw   m6, m5
    pmaddubsw   m2, m0
    pmaddubsw   m6, m0

    paddw       m4, m2
    paddw       m3, m6

    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3

    movu        [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .next2rows
    RET
%endmacro

FILTER_V4_W16_H2 16, 4
FILTER_V4_W16_H2 16, 8
FILTER_V4_W16_H2 16, 12
FILTER_V4_W16_H2 16, 16
FILTER_V4_W16_H2 16, 32

FILTER_V4_W16_H2 16, 24
FILTER_V4_W16_H2 16, 64
2857
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 8-bit in / 8-bit out, 24 columns wide.
; Each loop pass produces two output rows: columns 0-15 with full-width
; loads, then columns 16-23 with 8-byte loads.
; Macro arguments: %1 = width (unused, the symbol name hard-codes 24)
;                  %2 = height (consumed two rows at a time)
; Registers:       r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;                  r4 = coeffIdx, afterwards the remaining row count
;                  r5 = table base (PIC builds), then src + 2 * srcStride
;                  m1 = taps (c0,c1) pairs, m0 = taps (c2,c3) pairs
;                  m6 = tab_c_512 (defined elsewhere; presumably words of 512,
;                       making pmulhrsw a rounding shift right by 6 -- confirm)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W24 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; first tap reads one row above the output row

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]           ; 4 bytes of taps per coeffIdx
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]            ; (c0,c1) pairs
    pshufb      m0, [tab_Vm + 16]           ; (c2,c3) pairs

    mov         r4d, %2                     ; rows left

.loop:
    ; ---- columns 0-15 ----
    movu        m2, [r0]                    ; input row 0
    movu        m3, [r0 + r1]               ; input row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> input row 2
    movu        m5, [r5]                    ; input row 2
    movu        m7, [r5 + r1]               ; input row 3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [tab_c_512]             ; reloaded every pass: m6 is also the temporary above

    pmulhrsw    m4, m6
    pmulhrsw    m2, m6

    packuswb    m4, m2

    movu        [r2], m4                    ; output row 0, columns 0-15

    punpcklbw   m4, m3, m5                  ; rows 1/2
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r5 + 2 * r1]           ; input row 4

    punpcklbw   m5, m7, m2                  ; rows 3/4
    punpckhbw   m7, m2

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    pmulhrsw    m4, m6
    pmulhrsw    m3, m6

    packuswb    m4, m3

    movu        [r2 + r3], m4               ; output row 1, columns 0-15

    ; ---- columns 16-23 ----
    movq        m2, [r0 + 16]               ; input row 0
    movq        m3, [r0 + r1 + 16]          ; input row 1
    movq        m4, [r5 + 16]               ; input row 2
    movq        m5, [r5 + r1 + 16]          ; input row 3

    punpcklbw   m2, m3
    punpcklbw   m4, m5

    pmaddubsw   m2, m1
    pmaddubsw   m4, m0

    paddw       m2, m4

    pmulhrsw    m2, m6                      ; output row 0 as words

    ; m3 (row 1) and m5 (row 3) were only read above, so they are reused
    ; here; m4 was overwritten and row 2 has to be fetched again.
    movq        m4, [r5 + 16]               ; input row 2
    movq        m7, [r5 + 2 * r1 + 16]      ; input row 4

    punpcklbw   m3, m4
    punpcklbw   m5, m7

    pmaddubsw   m3, m1
    pmaddubsw   m5, m0

    paddw       m3, m5

    pmulhrsw    m3, m6                      ; output row 1 as words
    packuswb    m2, m3                      ; low qword = row 0, high qword = row 1

    movh        [r2 + 16], m2
    movhps      [r2 + r3 + 16], m2

    mov         r0, r5                      ; src += 2 * srcStride
    lea         r2, [r2 + 2 * r3]

    sub         r4d, 2                      ; counter was set with a 32-bit mov
    jnz         .loop
    RET
%endmacro

FILTER_V4_W24 24, 32

FILTER_V4_W24 24, 64
2980
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 8-bit in / 8-bit out. The body always processes
; 32 columns (two 16-byte halves) and one output row per loop pass.
; Macro arguments: %1 = width used in the symbol name, %2 = height
; Registers:       r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;                  r4 = coeffIdx, afterwards the row counter
;                  r5 = table base (PIC builds), then src + 2 * srcStride
;                  m1 = taps (c0,c1) pairs, m0 = taps (c2,c3) pairs
;                  m7 = tab_c_512 (defined elsewhere; presumably words of 512,
;                       making pmulhrsw a rounding shift right by 6 -- confirm)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W32 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; first tap reads one row above the output row

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]           ; 4 bytes of taps per coeffIdx
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]            ; (c0,c1) pairs
    pshufb      m0, [tab_Vm + 16]           ; (c2,c3) pairs

    mova        m7, [tab_c_512]             ; loop-invariant scale factor

    mov         r4d, %2                     ; rows left

.loop:
    ; ---- columns 0-15 ----
    movu        m2, [r0]                    ; input row 0
    movu        m3, [r0 + r1]               ; input row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> input row 2
    movu        m3, [r5]                    ; input row 2
    movu        m5, [r5 + r1]               ; input row 3

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7

    packuswb    m4, m2

    movu        [r2], m4

    ; ---- columns 16-31 ----
    movu        m2, [r0 + 16]               ; input row 0
    movu        m3, [r0 + r1 + 16]          ; input row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    movu        m3, [r5 + 16]               ; input row 2
    movu        m5, [r5 + r1 + 16]          ; input row 3

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7

    packuswb    m4, m2

    movu        [r2 + 16], m4

    lea         r0, [r0 + r1]               ; next row
    lea         r2, [r2 + r3]

    dec         r4d                         ; counter was set with a 32-bit mov
    jnz         .loop
    RET
%endmacro

FILTER_V4_W32 32, 8
FILTER_V4_W32 32, 16
FILTER_V4_W32 32, 24
FILTER_V4_W32 32, 32

FILTER_V4_W32 32, 48
FILTER_V4_W32 32, 64
3078
3079
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap vertical filter, 8-bit in / 8-bit out, for widths that are a
; multiple of 16. The outer loop walks row pairs, the inner loop walks
; 16-column strips.
; Macro arguments: %1 = width (multiple of 16), %2 = height (even)
; Registers:       r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;                  r4 = coeffIdx, afterwards the row-pair counter
;                  r5 = table base (PIC builds), then src + 2 * srcStride
;                  r6 = strip counter
;                  m1 = taps (c0,c1) pairs, m0 = taps (c2,c3) pairs
; tab_c_512 is defined elsewhere (presumably words of 512, making pmulhrsw a
; rounding shift right by 6 -- confirm).
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16n_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; first tap reads one row above the output row

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]           ; 4 bytes of taps per coeffIdx
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]            ; (c0,c1) pairs
    pshufb      m0, [tab_Vm + 16]           ; (c2,c3) pairs

    mov         r4d, %2/2                   ; number of row pairs

.loop:

    mov         r6d, %1/16                  ; number of 16-column strips

.loopW:

    movu        m2, [r0]                    ; input row 0
    movu        m3, [r0 + r1]               ; input row 1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> input row 2
    movu        m5, [r5]                    ; input row 2
    movu        m6, [r5 + r1]               ; input row 3

    punpckhbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m2, m7

    punpcklbw   m7, m5, m6
    pmaddubsw   m7, m0
    paddw       m4, m7

    mova        m7, [tab_c_512]             ; reloaded every pass: m7 is also the temporary above

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7

    packuswb    m4, m2

    movu        [r2], m4                    ; output row 0

    punpcklbw   m4, m3, m5                  ; rows 1/2
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r5 + 2 * r1]           ; input row 4

    punpcklbw   m2, m6, m5                  ; rows 3/4
    punpckhbw   m6, m5

    pmaddubsw   m2, m0
    pmaddubsw   m6, m0

    paddw       m4, m2
    paddw       m3, m6

    pmulhrsw    m4, m7
    pmulhrsw    m3, m7

    packuswb    m4, m3

    movu        [r2 + r3], m4               ; output row 1

    add         r0, 16                      ; next strip
    add         r2, 16
    dec         r6d
    jnz         .loopW

    lea         r0, [r0 + r1 * 2 - %1]      ; back to column 0, two rows down
    lea         r2, [r2 + r3 * 2 - %1]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V4_W16n_H2 64, 64
FILTER_V4_W16n_H2 64, 32
FILTER_V4_W16n_H2 64, 48
FILTER_V4_W16n_H2 48, 64
FILTER_V4_W16n_H2 64, 16
3180
3181
;-----------------------------------------------------------------------------
; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
;
; Converts a block of 8-bit pixels to 16-bit words, four rows at a time and
; eight columns per inner pass. The destination row pitch is FENC_STRIDE
; words (FENC_STRIDE * 2 bytes).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = width, r4 = height
;            r5 = column offset, r6 = scratch source pointer
;            m4 = tab_c_128, m5 = tab_c_64_n64 (both defined elsewhere;
;            presumably bytes of 128 and (64,-64) byte pairs, which would
;            make each result pixel * 64 - 8192 -- confirm)
; The row loop subtracts 4 and exits on zero, so height is expected to be a
; multiple of 4.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal luma_p2s, 3, 7, 6

    ; load width and height
    mov         r3d, r3m
    mov         r4d, r4m

    ; load constant
    mova        m4, [tab_c_128]
    mova        m5, [tab_c_64_n64]

.loopH:

    xor         r5d, r5d                    ; column offset = 0
.loopW:
    lea         r6, [r0 + r5]

    movh        m0, [r6]                    ; row 0, 8 pixels
    punpcklbw   m0, m4                      ; interleave each pixel with the m4 constant
    pmaddubsw   m0, m5

    movh        m1, [r6 + r1]               ; row 1
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r6 + r1 * 2]           ; row 2
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    lea         r6, [r6 + r1 * 2]
    movh        m3, [r6 + r1]               ; row 3
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    add         r5, 8
    cmp         r5, r3
    jg          .width4                     ; fewer than 8 columns were left
    ; r5 already points past this group, hence the -16 byte bias.
    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
    je          .nextH                      ; flags from the cmp above (movu leaves them alone)
    jmp         .loopW

.width4:
    ; Store only the low four words of each row.
    movh        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movh        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movh        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movh        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3

.nextH:
    lea         r0, [r0 + r1 * 4]           ; four source rows down
    add         r2, FENC_STRIDE * 8         ; four destination rows down

    sub         r4d, 4
    jnz         .loopH

    RET
3243
;-----------------------------------------------------------------------------
; PROCESS_LUMA_W4_4R
;
; 8-tap vertical filter core for a 4-column, 4-row tile of 8-bit pixels.
; In:   r0 = first of the 11 source rows read, r1 = source stride
;       r6 = coefficient block: four 16-byte rows at r6 + 0/16/32/48
; Out:  m4 = word sums for output rows 1-2 (low qword / high qword)
;       m5 = word sums for output rows 3-4
;       r0 advanced by 8 * r1
; Uses: m0, m1, m2 as temporaries; m3 is not touched.
; The bracket notation in the comments lists source row indices.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_W4_4R 0
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2=[0 1]

    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1=[1 2]
    punpcklqdq  m2, m1                      ; m2=[0 1 1 2]
    pmaddubsw   m4, m2, [r6 + 0 * 16]       ; m4=[0+1 1+2]

    movd        m1, [r0 + r1]
    punpcklbw   m5, m0, m1                  ; m5=[2 3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1=[3 4]
    punpcklqdq  m5, m1                      ; m5=[2 3 3 4]
    pmaddubsw   m2, m5, [r6 + 1 * 16]       ; m2=[2+3 3+4]
    paddw       m4, m2                      ; m4=[0+1+2+3 1+2+3+4]                   Row1-2
    pmaddubsw   m5, [r6 + 0 * 16]           ; m5=[2+3 3+4]                           Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2=[4 5]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1=[5 6]
    punpcklqdq  m2, m1                      ; m2=[4 5 5 6]
    pmaddubsw   m1, m2, [r6 + 2 * 16]       ; m1=[4+5 5+6]
    paddw       m4, m1                      ; m4=[0+1+2+3+4+5 1+2+3+4+5+6]           Row1-2
    pmaddubsw   m2, [r6 + 1 * 16]           ; m2=[4+5 5+6]
    paddw       m5, m2                      ; m5=[2+3+4+5 3+4+5+6]                   Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2=[6 7]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1=[7 8]
    punpcklqdq  m2, m1                      ; m2=[6 7 7 8]
    pmaddubsw   m1, m2, [r6 + 3 * 16]       ; m1=[6+7 7+8]
    paddw       m4, m1                      ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8]   Row1-2 end
    pmaddubsw   m2, [r6 + 2 * 16]           ; m2=[6+7 7+8]
    paddw       m5, m2                      ; m5=[2+3+4+5+6+7 3+4+5+6+7+8]           Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2=[8 9]
    movd        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0                      ; m1=[9 10]
    punpcklqdq  m2, m1                      ; m2=[8 9 9 10]
    pmaddubsw   m2, [r6 + 3 * 16]           ; m2=[8+9 9+10]
    paddw       m5, m2                      ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10]  Row3-4 end
%endmacro
3295
;-----------------------------------------------------------------------------
; PROCESS_LUMA_W8_4R
;
; 8-tap vertical filter core for an 8-column, 4-row tile of 8-bit pixels.
; In:   r0 = first of the 11 source rows read, r1 = source stride
;       r6 = coefficient block: four 16-byte rows at r6 + 0/16/32/48
; Out:  m7, m6, m5, m4 = word sums for output rows 1, 2, 3, 4
;       r0 advanced by 8 * r1
; Uses: m0, m1, m2 as temporaries; m3 is not touched.
; The bracket notation in the comments lists source row indices.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_W8_4R 0
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m7, m0, [r6 + 0 * 16]       ; m7=[0+1]               Row1

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m6, m1, [r6 + 0 * 16]       ; m6=[1+2]               Row2

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m5, m0, [r6 + 0 * 16]       ; m5=[2+3]               Row3
    pmaddubsw   m0, [r6 + 1 * 16]
    paddw       m7, m0                      ; m7=[0+1+2+3]           Row1

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m4, m1, [r6 + 0 * 16]       ; m4=[3+4]               Row4
    pmaddubsw   m1, [r6 + 1 * 16]
    paddw       m6, m1                      ; m6=[1+2+3+4]           Row2

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m2, m0, [r6 + 1 * 16]
    pmaddubsw   m0, [r6 + 2 * 16]
    paddw       m7, m0                      ; m7=[0+1+2+3+4+5]       Row1
    paddw       m5, m2                      ; m5=[2+3+4+5]           Row3

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m2, m1, [r6 + 1 * 16]
    pmaddubsw   m1, [r6 + 2 * 16]
    paddw       m6, m1                      ; m6=[1+2+3+4+5+6]       Row2
    paddw       m4, m2                      ; m4=[3+4+5+6]           Row4

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m2, m0, [r6 + 2 * 16]
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m7, m0                      ; m7=[0+1+2+3+4+5+6+7]   Row1 end
    paddw       m5, m2                      ; m5=[2+3+4+5+6+7]       Row3

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m2, m1, [r6 + 2 * 16]
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m6, m1                      ; m6=[1+2+3+4+5+6+7+8]   Row2 end
    paddw       m4, m2                      ; m4=[3+4+5+6+7+8]       Row4

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m5, m0                      ; m5=[2+3+4+5+6+7+8+9]   Row3 end

    movq        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m4, m1                      ; m4=[3+4+5+6+7+8+9+10]  Row4 end
%endmacro
3360
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter for 4-column blocks, four rows per loop pass.
; Macro arguments: %1 = width (4), %2 = height (multiple of 4), %3 = pp or ps
;   pp: sums are scaled with pmulhrsw against tab_c_512 and packed to bytes.
;   ps: pw_2000 is subtracted and 16-bit words are stored, so dstStride is
;       doubled to a byte pitch first.
;   (tab_c_512 and pw_2000 are defined elsewhere; their values are not
;    visible here.)
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;            r4 = coeffIdx (scaled by 64 = one 4 x 16-byte coefficient block),
;                 afterwards the loop counter
;            r5 = scratch, then 4 * srcStride
;            r6 = coefficient block for PROCESS_LUMA_W4_4R
;            m3 = rounding / offset constant
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_4xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
    lea         r5, [3 * r1]
    sub         r0, r5                      ; the filter window starts three rows above
    shl         r4d, 6                      ; coeffIdx * 64 bytes
%ifidn %3,ps
    ; NOTE(review): a 32-bit add zero-extends, so a negative dstStride would
    ; not survive this doubling -- confirm callers never pass one.
    add         r3d, r3d
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [tab_c_512]
%else
    mova        m3, [pw_2000]
%endif

    mov         r4d, %2/4                   ; number of 4-row passes
    lea         r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W4_4R                      ; m4 = rows 1-2, m5 = rows 3-4, r0 += 8 * r1

%ifidn %3,pp
    pmulhrsw    m4, m3
    pmulhrsw    m5, m3

    packuswb    m4, m5                      ; one output row per dword

    movd        [r2], m4
    pextrd      [r2 + r3], m4, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m4, 2
    pextrd      [r2 + r3], m4, 3
%else
    psubw       m4, m3
    psubw       m5, m3

    movlps      [r2], m4
    movhps      [r2 + r3], m4
    lea         r2, [r2 + 2 * r3]
    movlps      [r2], m5
    movhps      [r2 + r3], m5
%endif

    sub         r0, r5                      ; net source advance: 8 - 4 = 4 rows
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro

;-------------------------------------------------------------------------------------------------------------
; Instantiations: interp_8tap_vert_pp_4x4, _4x8, _4x16 and interp_8tap_vert_ps_4x4, _4x8, _4x16
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 4, pp
FILTER_VER_LUMA_4xN 4, 8, pp
FILTER_VER_LUMA_4xN 4, 16, pp

FILTER_VER_LUMA_4xN 4, 4, ps
FILTER_VER_LUMA_4xN 4, 8, ps
FILTER_VER_LUMA_4xN 4, 16, ps
3453
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter for 8-column blocks, four rows per loop pass.
; Macro arguments: %1 = width (8), %2 = height (multiple of 4), %3 = pp or ps
;   pp: sums are scaled with pmulhrsw against tab_c_512 and packed to bytes.
;   ps: pw_2000 is subtracted and 16-bit words are stored, so dstStride is
;       doubled to a byte pitch first.
;   (tab_c_512 and pw_2000 are defined elsewhere; their values are not
;    visible here.)
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;            r4 = coeffIdx (scaled by 64), afterwards the loop counter
;            r5 = scratch, then 4 * srcStride
;            r6 = coefficient block for PROCESS_LUMA_W8_4R
;            m3 = rounding / offset constant
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_8xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea         r5, [3 * r1]
    sub         r0, r5                      ; the filter window starts three rows above
    shl         r4d, 6                      ; coeffIdx * 64 bytes

%ifidn %3,ps
    ; NOTE(review): a 32-bit add zero-extends, so a negative dstStride would
    ; not survive this doubling -- confirm callers never pass one.
    add         r3d, r3d
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [tab_c_512]
%else
    mova        m3, [pw_2000]
%endif

    mov         r4d, %2/4                   ; number of 4-row passes
    lea         r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W8_4R                      ; m7, m6, m5, m4 = rows 1-4, r0 += 8 * r1

%ifidn %3,pp
    pmulhrsw    m7, m3
    pmulhrsw    m6, m3
    pmulhrsw    m5, m3
    pmulhrsw    m4, m3

    packuswb    m7, m6                      ; low qword = row 1, high qword = row 2
    packuswb    m5, m4                      ; low qword = row 3, high qword = row 4

    movlps      [r2], m7
    movhps      [r2 + r3], m7
    lea         r2, [r2 + 2 * r3]
    movlps      [r2], m5
    movhps      [r2 + r3], m5
%else
    psubw       m7, m3
    psubw       m6, m3
    psubw       m5, m3
    psubw       m4, m3

    movu        [r2], m7
    movu        [r2 + r3], m6
    lea         r2, [r2 + 2 * r3]
    movu        [r2], m5
    movu        [r2 + r3], m4
%endif

    sub         r0, r5                      ; net source advance: 8 - 4 = 4 rows
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro

;-------------------------------------------------------------------------------------------------------------
; Instantiations: interp_8tap_vert_pp_8x4, _8x8, _8x16, _8x32 and interp_8tap_vert_ps_8x4, _8x8, _8x16, _8x32
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 4, pp
FILTER_VER_LUMA_8xN 8, 8, pp
FILTER_VER_LUMA_8xN 8, 16, pp
FILTER_VER_LUMA_8xN 8, 32, pp

FILTER_VER_LUMA_8xN 8, 4, ps
FILTER_VER_LUMA_8xN 8, 8, ps
FILTER_VER_LUMA_8xN 8, 16, ps
FILTER_VER_LUMA_8xN 8, 32, ps
3562
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter for 12-column blocks. Each loop pass produces
; four rows: columns 0-7 via PROCESS_LUMA_W8_4R, then columns 8-11 via
; PROCESS_LUMA_W4_4R.
; Macro arguments: %1 = width (12), %2 = height (multiple of 4), %3 = pp or ps
;   pp: sums are scaled with pmulhrsw against tab_c_512 and packed to bytes.
;   ps: pw_2000 is subtracted and 16-bit words are stored, so dstStride is
;       doubled to a byte pitch first.
;   (tab_c_512 and pw_2000 are defined elsewhere; their values are not
;    visible here.)
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;            r4 = coeffIdx (scaled by 64), afterwards the loop counter
;            r5 = scratch, r6 = coefficient block, m3 = rounding / offset constant
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_12xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea         r5, [3 * r1]
    sub         r0, r5                      ; the filter window starts three rows above
    shl         r4d, 6                      ; coeffIdx * 64 bytes
%ifidn %3,ps
    ; NOTE(review): a 32-bit add zero-extends, so a negative dstStride would
    ; not survive this doubling -- confirm callers never pass one.
    add         r3d, r3d
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [tab_c_512]
%else
    mova        m3, [pw_2000]
%endif

    mov         r4d, %2/4                   ; number of 4-row passes

.loopH:
    ; ---- columns 0-7 ----
    PROCESS_LUMA_W8_4R                      ; m7, m6, m5, m4 = rows 1-4, r0 += 8 * r1

%ifidn %3,pp
    pmulhrsw    m7, m3
    pmulhrsw    m6, m3
    pmulhrsw    m5, m3
    pmulhrsw    m4, m3

    packuswb    m7, m6
    packuswb    m5, m4

    movlps      [r2], m7
    movhps      [r2 + r3], m7
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m5
    movhps      [r5 + r3], m5
%else
    psubw       m7, m3
    psubw       m6, m3
    psubw       m5, m3
    psubw       m4, m3

    movu        [r2], m7
    movu        [r2 + r3], m6
    lea         r5, [r2 + 2 * r3]
    movu        [r5], m5
    movu        [r5 + r3], m4
%endif

    lea         r5, [8 * r1 - 8]
    sub         r0, r5                      ; back to the first row, 8 columns right
%ifidn %3,pp
    add         r2, 8
%else
    add         r2, 16
%endif

    ; ---- columns 8-11 ----
    PROCESS_LUMA_W4_4R                      ; m4 = rows 1-2, m5 = rows 3-4, r0 += 8 * r1

%ifidn %3,pp
    pmulhrsw    m4, m3
    pmulhrsw    m5, m3

    packuswb    m4, m5                      ; one output row per dword

    movd        [r2], m4
    pextrd      [r2 + r3], m4, 1
    lea         r5, [r2 + 2 * r3]
    pextrd      [r5], m4, 2
    pextrd      [r5 + r3], m4, 3
%else
    psubw       m4, m3
    psubw       m5, m3

    movlps      [r2], m4
    movhps      [r2 + r3], m4
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m5
    movhps      [r5 + r3], m5
%endif

    lea         r5, [4 * r1 + 8]
    sub         r0, r5                      ; net: four rows down, back to column 0
%ifidn %3,pp
    lea         r2, [r2 + 4 * r3 - 8]
%else
    lea         r2, [r2 + 4 * r3 - 16]
%endif

    dec         r4d
    jnz         .loopH

    RET
%endmacro

;-------------------------------------------------------------------------------------------------------------
; Instantiations: interp_8tap_vert_pp_12x16 and interp_8tap_vert_ps_12x16
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_12xN 12, 16, pp

FILTER_VER_LUMA_12xN 12, 16, ps
3676
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter for widths that are a multiple of 8. The outer
; loop walks groups of four rows, the inner loop walks 8-column strips.
; Macro arguments: %1 = width, %2 = height (multiple of 4), %3 = pp or ps
;   pp: sums are scaled with pmulhrsw against tab_c_512 and packed to bytes.
;   ps: pw_2000 is subtracted and 16-bit words are stored, so dstStride is
;       doubled to a byte pitch first.
;   (tab_c_512 and pw_2000 are defined elsewhere; their values are not
;    visible here.)
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
;            r4 = coeffIdx (scaled by 64), afterwards the strip counter
;            r5 = scratch, r6 = coefficient block, m3 = rounding / offset constant
;            [rsp] = row-group counter (one gprsize stack slot is reserved)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
    lea         r5, [3 * r1]
    sub         r0, r5                      ; the filter window starts three rows above
    shl         r4d, 6                      ; coeffIdx * 64 bytes
%ifidn %3,ps
    ; NOTE(review): a 32-bit add zero-extends, so a negative dstStride would
    ; not survive this doubling -- confirm callers never pass one.
    add         r3d, r3d
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [tab_c_512]
%else
    mova        m3, [pw_2000]
%endif
    mov         dword [rsp], %2/4           ; number of 4-row groups

.loopH:
    mov         r4d, (%1/8)                 ; number of 8-column strips
.loopW:
    PROCESS_LUMA_W8_4R                      ; m7, m6, m5, m4 = rows 1-4, r0 += 8 * r1
%ifidn %3,pp
    pmulhrsw    m7, m3
    pmulhrsw    m6, m3
    pmulhrsw    m5, m3
    pmulhrsw    m4, m3

    packuswb    m7, m6                      ; low qword = row 1, high qword = row 2
    packuswb    m5, m4                      ; low qword = row 3, high qword = row 4

    movlps      [r2], m7
    movhps      [r2 + r3], m7
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m5
    movhps      [r5 + r3], m5
%else
    psubw       m7, m3
    psubw       m6, m3
    psubw       m5, m3
    psubw       m4, m3

    movu        [r2], m7
    movu        [r2 + r3], m6
    lea         r5, [r2 + 2 * r3]
    movu        [r5], m5
    movu        [r5 + r3], m4
%endif

    lea         r5, [8 * r1 - 8]
    sub         r0, r5                      ; back to the first row, 8 columns right
%ifidn %3,pp
    add         r2, 8
%else
    add         r2, 16
%endif
    dec         r4d
    jnz         .loopW

    lea         r0, [r0 + 4 * r1 - %1]      ; four rows down, back to column 0
%ifidn %3,pp
    lea         r2, [r2 + 4 * r3 - %1]
%else
    lea         r2, [r2 + 4 * r3 - 2 * %1]
%endif

    dec         dword [rsp]
    jnz         .loopH

    RET
%endmacro

FILTER_VER_LUMA 16, 4, pp
FILTER_VER_LUMA 16, 8, pp
FILTER_VER_LUMA 16, 12, pp
FILTER_VER_LUMA 16, 16, pp
FILTER_VER_LUMA 16, 32, pp
FILTER_VER_LUMA 16, 64, pp
FILTER_VER_LUMA 24, 32, pp
FILTER_VER_LUMA 32, 8, pp
FILTER_VER_LUMA 32, 16, pp
FILTER_VER_LUMA 32, 24, pp
FILTER_VER_LUMA 32, 32, pp
FILTER_VER_LUMA 32, 64, pp
FILTER_VER_LUMA 48, 64, pp
FILTER_VER_LUMA 64, 16, pp
FILTER_VER_LUMA 64, 32, pp
FILTER_VER_LUMA 64, 48, pp
FILTER_VER_LUMA 64, 64, pp

FILTER_VER_LUMA 16, 4, ps
FILTER_VER_LUMA 16, 8, ps
FILTER_VER_LUMA 16, 12, ps
FILTER_VER_LUMA 16, 16, ps
FILTER_VER_LUMA 16, 32, ps
FILTER_VER_LUMA 16, 64, ps
FILTER_VER_LUMA 24, 32, ps
FILTER_VER_LUMA 32, 8, ps
FILTER_VER_LUMA 32, 16, ps
FILTER_VER_LUMA 32, 24, ps
FILTER_VER_LUMA 32, 32, ps
FILTER_VER_LUMA 32, 64, ps
FILTER_VER_LUMA 48, 64, ps
FILTER_VER_LUMA 64, 16, ps
FILTER_VER_LUMA 64, 32, ps
FILTER_VER_LUMA 64, 48, ps
FILTER_VER_LUMA 64, 64, ps
3793
;-----------------------------------------------------------------------------
; PROCESS_LUMA_SP_W4_4R
;
; 8-tap vertical filter core for a 4-column, 4-row tile of 16-bit samples.
; In:   r0 = first of the 11 source rows read, r1 = source stride in bytes
;       r6 = coefficient block: four 16-byte rows at r6 + 0/16/32/48
; Out:  m0, m1, m2, m3 = dword sums for output rows 1, 2, 3, 4
;       r0 advanced by 8 * r1
; Uses: m4, m5, m6 as temporaries; m7 is not touched.
; The bracket notation in the comments lists source row indices.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_SP_W4_4R 0
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklwd   m0, m1                      ; m0=[0 1]
    pmaddwd     m0, [r6 + 0 * 16]           ; m0=[0+1]               Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m1, m4                      ; m1=[1 2]
    pmaddwd     m1, [r6 + 0 * 16]           ; m1=[1+2]               Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[2 3]
    pmaddwd     m2, m4, [r6 + 0 * 16]       ; m2=[2+3]               Row3
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m0, m4                      ; m0=[0+1+2+3]           Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ; m5=[3 4]
    pmaddwd     m3, m5, [r6 + 0 * 16]       ; m3=[3+4]               Row4
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m1, m5                      ; m1=[1+2+3+4]           Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[4 5]
    pmaddwd     m6, m4, [r6 + 1 * 16]
    paddd       m2, m6                      ; m2=[2+3+4+5]           Row3
    pmaddwd     m4, [r6 + 2 * 16]
    paddd       m0, m4                      ; m0=[0+1+2+3+4+5]       Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ; m5=[5 6]
    pmaddwd     m6, m5, [r6 + 1 * 16]
    paddd       m3, m6                      ; m3=[3+4+5+6]           Row4
    pmaddwd     m5, [r6 + 2 * 16]
    paddd       m1, m5                      ; m1=[1+2+3+4+5+6]       Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[6 7]
    pmaddwd     m6, m4, [r6 + 2 * 16]
    paddd       m2, m6                      ; m2=[2+3+4+5+6+7]       Row3
    pmaddwd     m4, [r6 + 3 * 16]
    paddd       m0, m4                      ; m0=[0+1+2+3+4+5+6+7]   Row1 end

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ; m5=[7 8]
    pmaddwd     m6, m5, [r6 + 2 * 16]
    paddd       m3, m6                      ; m3=[3+4+5+6+7+8]       Row4
    pmaddwd     m5, [r6 + 3 * 16]
    paddd       m1, m5                      ; m1=[1+2+3+4+5+6+7+8]   Row2 end

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[8 9]
    pmaddwd     m4, [r6 + 3 * 16]
    paddd       m2, m4                      ; m2=[2+3+4+5+6+7+8+9]   Row3 end

    movq        m4, [r0 + 2 * r1]
    punpcklwd   m5, m4                      ; m5=[9 10]
    pmaddwd     m5, [r6 + 3 * 16]
    paddd       m3, m5                      ; m3=[3+4+5+6+7+8+9+10]  Row4 end
%endmacro
3858
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-tap vertical luma filter, 16-bit in / 8-bit out. The outer loop walks
; groups of four rows, the inner loop walks 4-column strips.
; Macro arguments: %1 = width (multiple of 4), %2 = height (multiple of 4)
; Registers: r0 = src, r1 = srcStride (doubled to a byte pitch on entry)
;            r2 = dst, r3 = dstStride
;            r4 = coeffIdx (scaled by 64), afterwards the strip counter
;            r5 = scratch, r6 = coefficient block for PROCESS_LUMA_SP_W4_4R
;            m7 = tab_c_526336, added before the arithmetic shift right by 12
;                 (defined elsewhere; presumably dwords of 526336 -- confirm)
;            [rsp] = row-group counter (one gprsize stack slot is reserved)
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize

    ; NOTE(review): a 32-bit add zero-extends, so a negative srcStride would
    ; not survive this doubling -- confirm callers never pass one.
    add         r1d, r1d
    lea         r5, [r1 + 2 * r1]
    sub         r0, r5                      ; the filter window starts three rows above
    shl         r4d, 6                      ; coeffIdx * 64 bytes

%ifdef PIC
    lea         r5, [tab_LumaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffV + r4]
%endif

    mova        m7, [tab_c_526336]

    mov         dword [rsp], %2/4           ; number of 4-row groups
.loopH:
    mov         r4d, (%1/4)                 ; number of 4-column strips
.loopW:
    PROCESS_LUMA_SP_W4_4R                   ; m0..m3 = rows 1-4, r0 += 8 * r1

    paddd       m0, m7
    paddd       m1, m7
    paddd       m2, m7
    paddd       m3, m7

    psrad       m0, 12
    psrad       m1, 12
    psrad       m2, 12
    psrad       m3, 12

    packssdw    m0, m1                      ; rows 1-2 as words
    packssdw    m2, m3                      ; rows 3-4 as words

    packuswb    m0, m2                      ; one output row per dword

    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r5, [r2 + 2 * r3]
    pextrd      [r5], m0, 2
    pextrd      [r5 + r3], m0, 3

    lea         r5, [8 * r1 - 2 * 4]
    sub         r0, r5                      ; back to the first row, 4 samples right
    add         r2, 4

    dec         r4d
    jnz         .loopW

    lea         r0, [r0 + 4 * r1 - 2 * %1]  ; four rows down, back to column 0
    lea         r2, [r2 + 4 * r3 - %1]

    dec         dword [rsp]
    jnz         .loopH

    RET
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_SP 4, 4
FILTER_VER_LUMA_SP 8, 8
FILTER_VER_LUMA_SP 8, 4
FILTER_VER_LUMA_SP 4, 8
FILTER_VER_LUMA_SP 16, 16
FILTER_VER_LUMA_SP 16, 8
FILTER_VER_LUMA_SP 8, 16
FILTER_VER_LUMA_SP 16, 12
FILTER_VER_LUMA_SP 12, 16
FILTER_VER_LUMA_SP 16, 4
FILTER_VER_LUMA_SP 4, 16
FILTER_VER_LUMA_SP 32, 32
FILTER_VER_LUMA_SP 32, 16
FILTER_VER_LUMA_SP 16, 32
FILTER_VER_LUMA_SP 32, 24
FILTER_VER_LUMA_SP 24, 32
FILTER_VER_LUMA_SP 32, 8
FILTER_VER_LUMA_SP 8, 32
FILTER_VER_LUMA_SP 64, 64
FILTER_VER_LUMA_SP 64, 32
FILTER_VER_LUMA_SP 32, 64
FILTER_VER_LUMA_SP 64, 48
FILTER_VER_LUMA_SP 48, 64
FILTER_VER_LUMA_SP 64, 16
FILTER_VER_LUMA_SP 16, 64
3951
;-----------------------------------------------------------------------------
; chroma_p2s: converts a block of 8-bit pixels to 16-bit words, two rows at
; a time and eight columns per inner pass. The destination row pitch is
; FENC_STRIDE / 2 words (FENC_STRIDE / 2 * 2 bytes).
; Arguments (by use): r0 = src, r1 = src stride, r2 = dst,
;                     r3 = width, r4 = height (both loaded as 32-bit values)
; Registers: r5 = column offset, r6 = scratch pointer
;            m2 = tab_c_128, m3 = tab_c_64_n64 (both defined elsewhere;
;            presumably bytes of 128 and (64,-64) byte pairs, which would
;            make each result pixel * 64 - 8192 -- confirm)
; The row loop subtracts 2 and exits on zero, so height is expected to be even.
;
; TODO: processing U and V together would be faster, but needs more registers.
; TODO: using separate paths for heights that are a multiple of 4 and for other
;       heights may improve performance by about 10%, but the code becomes more
;       complex, so it is disabled.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal chroma_p2s, 3, 7, 4

    ; load width and height
    mov         r3d, r3m
    mov         r4d, r4m

    ; load constant
    mova        m2, [tab_c_128]
    mova        m3, [tab_c_64_n64]

.loopH:

    xor         r5d, r5d                    ; column offset = 0
.loopW:
    lea         r6, [r0 + r5]

    movh        m0, [r6]                    ; row 0, 8 pixels
    punpcklbw   m0, m2                      ; interleave each pixel with the m2 constant
    pmaddubsw   m0, m3

    movh        m1, [r6 + r1]               ; row 1
    punpcklbw   m1, m2
    pmaddubsw   m1, m3

    add         r5d, 8
    cmp         r5d, r3d
    lea         r6, [r2 + r5 * 2]           ; lea keeps the flags from the cmp
    jg          .width4                     ; fewer than 8 columns were left
    ; r5 already points past this group, hence the -16 byte bias.
    movu        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movu        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    je          .nextH
    jmp         .loopW

.width4:
    test        r3d, 4
    jz          .width2                     ; no group of 4 columns left
    test        r3d, 2                      ; result is consumed by the jz below;
                                            ; the stores, lea and pshufd leave flags alone
    movh        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movh        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    lea         r6, [r6 + 8]                ; step past the four words just written
    pshufd      m0, m0, 2                   ; dword 2 (words 4-5) -> dword 0
    pshufd      m1, m1, 2
    jz          .nextH                      ; no trailing pair of columns

.width2:
    movd        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movd        [r6 + FENC_STRIDE / 2 * 2 - 16], m1

.nextH:
    lea         r0, [r0 + r1 * 2]           ; two source rows down
    add         r2, FENC_STRIDE / 2 * 4     ; two destination rows down

    sub         r4d, 2
    jnz         .loopH

    RET
4011
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W4_4R
;
; 4-tap vertical filter core for a 4-column, 4-row tile of 16-bit samples.
; In:   r0 = first of the 7 source rows read, r1 = source stride in bytes
;       r6 = coefficient block: two 16-byte rows at r6 + 0/16
; Out:  m0, m1, m2, m3 = dword sums for output rows 1, 2, 3, 4
;       r0 advanced by 4 * r1
; Uses: m4, m5 as temporaries; m6 and m7 are not touched.
; The bracket notation in the comments lists source row indices.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_SP_W4_4R 0
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklwd   m0, m1                      ; m0=[0 1]
    pmaddwd     m0, [r6 + 0 * 16]           ; m0=[0+1]         Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m1, m4                      ; m1=[1 2]
    pmaddwd     m1, [r6 + 0 * 16]           ; m1=[1+2]         Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[2 3]
    pmaddwd     m2, m4, [r6 + 0 * 16]       ; m2=[2+3]         Row3
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m0, m4                      ; m0=[0+1+2+3]     Row1 done

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ; m5=[3 4]
    pmaddwd     m3, m5, [r6 + 0 * 16]       ; m3=[3+4]         Row4
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m1, m5                      ; m1=[1+2+3+4]     Row2 done

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[4 5]
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m2, m4                      ; m2=[2+3+4+5]     Row3 done

    movq        m4, [r0 + 2 * r1]
    punpcklwd   m5, m4                      ; m5=[5 6]
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m3, m5                      ; m3=[3+4+5+6]     Row4 done
%endmacro
4046
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; %1 = block width, %2 = block height (both handled in steps of 4).
; The block is walked as 4x4 tiles: r4 counts tiles across a row band and the
; dword at [rsp] counts row bands. Sums from PROCESS_CHROMA_SP_W4_4R get the
; tab_c_526336 constant added, are shifted right by 12 and packed to bytes.
; NOTE(review): srcStride is doubled with a 32-bit add, which on x86-64 clears
; the upper half of r1 -- strides are presumably always positive; confirm.
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize

    ; r6 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes (16-bit samples)
    sub         r0, r1                      ; start one row above the block

    mova        m6, [tab_c_526336]
    mov         dword [rsp], %2/4           ; row bands still to do

.rowBand:
    mov         r4d, (%1/4)                 ; tiles still to do in this band

.tile:
    PROCESS_CHROMA_SP_W4_4R

    paddd       m0, m6
    psrad       m0, 12
    paddd       m1, m6
    psrad       m1, 12
    paddd       m2, m6
    psrad       m2, 12
    paddd       m3, m6
    psrad       m3, 12

    packssdw    m0, m1
    packssdw    m2, m3
    packuswb    m0, m2                      ; m0 = 4 rows x 4 bytes

    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r5, [r2 + 2 * r3]
    pextrd      [r5], m0, 2
    pextrd      [r5 + r3], m0, 3

    ; the tile macro moved r0 down 4 rows: go back up and 4 samples to the right
    lea         r5, [4 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 4

    dec         r4d
    jnz         .tile

    ; next band: down 4 rows, back to the left edge
    lea         r0, [r0 + 4 * r1 - 2 * %1]
    lea         r2, [r2 + 4 * r3 - %1]

    dec         dword [rsp]
    jnz         .rowBand

    RET
%endmacro

    FILTER_VER_CHROMA_SP 4, 4
    FILTER_VER_CHROMA_SP 4, 8
    FILTER_VER_CHROMA_SP 16, 16
    FILTER_VER_CHROMA_SP 16, 8
    FILTER_VER_CHROMA_SP 16, 12
    FILTER_VER_CHROMA_SP 12, 16
    FILTER_VER_CHROMA_SP 16, 4
    FILTER_VER_CHROMA_SP 4, 16
    FILTER_VER_CHROMA_SP 32, 32
    FILTER_VER_CHROMA_SP 32, 16
    FILTER_VER_CHROMA_SP 16, 32
    FILTER_VER_CHROMA_SP 32, 24
    FILTER_VER_CHROMA_SP 24, 32
    FILTER_VER_CHROMA_SP 32, 8

    FILTER_VER_CHROMA_SP 16, 24
    FILTER_VER_CHROMA_SP 16, 64
    FILTER_VER_CHROMA_SP 12, 32
    FILTER_VER_CHROMA_SP 4, 32
    FILTER_VER_CHROMA_SP 32, 64
    FILTER_VER_CHROMA_SP 32, 48
    FILTER_VER_CHROMA_SP 24, 64

    FILTER_VER_CHROMA_SP 64, 64
    FILTER_VER_CHROMA_SP 64, 32
    FILTER_VER_CHROMA_SP 64, 48
    FILTER_VER_CHROMA_SP 48, 64
    FILTER_VER_CHROMA_SP 64, 16
4139
4140
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W2_4R coeffReg
; 4-tap vertical filter over a tile of 2 columns x 4 output rows of 16-bit
; samples (7 source rows are read).
;   in : r0 = first source row, r1 = source row pitch in bytes,
;        %1 = register pointing at the coefficients ([%1] and [%1 + 16])
;   out: m0 = 32-bit sums of output rows 1,2 (low / high qword)
;        m2 = 32-bit sums of output rows 3,4 (low / high qword)
;   clobbers m1, m3, m4; r0 ends up advanced by 4 * r1
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_SP_W2_4R 1
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    punpcklwd   m0, m1                      ; m0 = source rows 0,1

    lea         r0, [r0 + 2 * r1]
    movd        m2, [r0]
    punpcklwd   m1, m2                      ; m1 = source rows 1,2
    punpcklqdq  m0, m1                      ; m0 = [0 1 | 1 2]
    pmaddwd     m0, [%1 + 0 * 16]           ; output rows 1,2: first tap pair

    movd        m1, [r0 + r1]
    punpcklwd   m2, m1                      ; m2 = source rows 2,3

    lea         r0, [r0 + 2 * r1]
    movd        m3, [r0]
    punpcklwd   m1, m3                      ; m1 = source rows 3,4
    punpcklqdq  m2, m1                      ; m2 = [2 3 | 3 4]

    pmaddwd     m4, m2, [%1 + 1 * 16]       ; output rows 1,2: second tap pair
    pmaddwd     m2, [%1 + 0 * 16]           ; output rows 3,4: first tap pair
    paddd       m0, m4                      ; output rows 1,2 complete

    movd        m1, [r0 + r1]
    punpcklwd   m3, m1                      ; m3 = source rows 4,5

    movd        m4, [r0 + 2 * r1]
    punpcklwd   m1, m4                      ; m1 = source rows 5,6
    punpcklqdq  m3, m1                      ; m3 = [4 5 | 5 6]
    pmaddwd     m3, [%1 + 1 * 16]           ; output rows 3,4: second tap pair
    paddd       m2, m3                      ; output rows 3,4 complete
%endmacro
4173
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 2-column variant: %1 = width (instantiated with 2 only), %2 = height,
; processed four rows per iteration through PROCESS_CHROMA_SP_W2_4R.
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W2_4R 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6

    ; r5 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    sub         r0, r1                      ; start one row above the block

    mova        m5, [tab_c_526336]
    mov         r4d, (%2/4)                 ; groups of four rows

.fourRows:
    PROCESS_CHROMA_SP_W2_4R r5

    paddd       m0, m5
    psrad       m0, 12
    paddd       m2, m5
    psrad       m2, 12

    packssdw    m0, m2
    packuswb    m0, m0                      ; one 16-bit word (2 pixels) per row

    pextrw      [r2], m0, 0
    pextrw      [r2 + r3], m0, 1
    lea         r2, [r2 + 2 * r3]
    pextrw      [r2], m0, 2
    pextrw      [r2 + r3], m0, 3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .fourRows

    RET
%endmacro

    FILTER_VER_CHROMA_SP_W2_4R 2, 4
    FILTER_VER_CHROMA_SP_W2_4R 2, 8

    FILTER_VER_CHROMA_SP_W2_4R 2, 16
4226
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Straight-line 4x2 case: five source rows are read and two rows of four
; pixels are written.
;--------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_sp_4x2, 5, 6, 5

    ; r5 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    sub         r0, r1                      ; start one row above the block

    mova        m4, [tab_c_526336]

    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklwd   m0, m1                      ; m0 = source rows 0,1
    pmaddwd     m0, [r5 + 0 * 16]           ; output row 1: first tap pair

    lea         r0, [r0 + 2 * r1]
    movq        m2, [r0]
    punpcklwd   m1, m2                      ; m1 = source rows 1,2
    pmaddwd     m1, [r5 + 0 * 16]           ; output row 2: first tap pair

    movq        m3, [r0 + r1]
    punpcklwd   m2, m3                      ; m2 = source rows 2,3
    pmaddwd     m2, [r5 + 1 * 16]
    paddd       m0, m2                      ; output row 1 complete
    paddd       m0, m4
    psrad       m0, 12

    movq        m2, [r0 + 2 * r1]
    punpcklwd   m3, m2                      ; m3 = source rows 3,4
    pmaddwd     m3, [r5 + 1 * 16]
    paddd       m1, m3                      ; output row 2 complete
    paddd       m1, m4
    psrad       m1, 12

    packssdw    m0, m1
    packuswb    m0, m0

    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1

    RET
4277
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 6-column variant (%1 is not used in the body, %2 = height). Each iteration
; produces four rows: columns 0-3 through PROCESS_CHROMA_SP_W4_4R, then
; columns 4-5 through PROCESS_CHROMA_SP_W2_4R.
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W6_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7

    ; r6 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    sub         r0, r1                      ; start one row above the block

    mova        m6, [tab_c_526336]
    mov         r4d, %2/4                   ; groups of four rows

.fourRows:
    ; ---- columns 0..3 ----
    PROCESS_CHROMA_SP_W4_4R

    paddd       m0, m6
    psrad       m0, 12
    paddd       m1, m6
    psrad       m1, 12
    paddd       m2, m6
    psrad       m2, 12
    paddd       m3, m6
    psrad       m3, 12

    packssdw    m0, m1
    packssdw    m2, m3
    packuswb    m0, m2

    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r5, [r2 + 2 * r3]
    pextrd      [r5], m0, 2
    pextrd      [r5 + r3], m0, 3

    ; back up 4 rows and step 4 samples to the right
    lea         r5, [4 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 4

    ; ---- columns 4..5 ----
    PROCESS_CHROMA_SP_W2_4R r6

    paddd       m0, m6
    psrad       m0, 12
    paddd       m2, m6
    psrad       m2, 12

    packssdw    m0, m2
    packuswb    m0, m0

    pextrw      [r2], m0, 0
    pextrw      [r2 + r3], m0, 1
    lea         r2, [r2 + 2 * r3]
    pextrw      [r2], m0, 2
    pextrw      [r2 + r3], m0, 3

    ; return to column 0 of the next group of rows
    sub         r0, 2 * 4
    lea         r2, [r2 + 2 * r3 - 4]

    dec         r4d
    jnz         .fourRows

    RET
%endmacro

    FILTER_VER_CHROMA_SP_W6_H4 6, 8

    FILTER_VER_CHROMA_SP_W6_H4 6, 16
4357
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W8_2R
; 4-tap vertical filter over 8 columns x 2 output rows of 16-bit samples
; (5 source rows are read).
;   in : r0 = first source row, r1 = source row pitch in bytes,
;        r5 = coefficients ([r5] and [r5 + 16])
;   out: m0 / m1 = 32-bit sums of output row 1, columns 0-3 / 4-7
;        m2 / m3 = 32-bit sums of output row 2, columns 0-3 / 4-7
;   clobbers m4, m5, m6; r0 ends up advanced by 2 * r1
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_SP_W8_2R 0
    movu        m1, [r0]
    movu        m3, [r0 + r1]
    punpcklwd   m0, m1, m3
    pmaddwd     m0, [r5 + 0 * 16]           ; row 1 low half: source rows 0,1
    punpckhwd   m1, m3
    pmaddwd     m1, [r5 + 0 * 16]           ; row 1 high half: source rows 0,1

    movu        m4, [r0 + 2 * r1]
    punpcklwd   m2, m3, m4
    pmaddwd     m2, [r5 + 0 * 16]           ; row 2 low half: source rows 1,2
    punpckhwd   m3, m4
    pmaddwd     m3, [r5 + 0 * 16]           ; row 2 high half: source rows 1,2

    lea         r0, [r0 + 2 * r1]
    movu        m5, [r0 + r1]
    punpcklwd   m6, m4, m5
    pmaddwd     m6, [r5 + 1 * 16]           ; row 1 low half: source rows 2,3
    paddd       m0, m6                      ; row 1 low half complete
    punpckhwd   m4, m5
    pmaddwd     m4, [r5 + 1 * 16]           ; row 1 high half: source rows 2,3
    paddd       m1, m4                      ; row 1 high half complete

    movu        m4, [r0 + 2 * r1]
    punpcklwd   m6, m5, m4
    pmaddwd     m6, [r5 + 1 * 16]           ; row 2 low half: source rows 3,4
    paddd       m2, m6                      ; row 2 low half complete
    punpckhwd   m5, m4
    pmaddwd     m5, [r5 + 1 * 16]           ; row 2 high half: source rows 3,4
    paddd       m3, m5                      ; row 2 high half complete
%endmacro
4389
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 8-column variant: %1 = width (instantiated with 8 only), %2 = height,
; processed two rows per iteration through PROCESS_CHROMA_SP_W8_2R.
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W8_H2 2
INIT_XMM sse2
cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8

    ; r5 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    sub         r0, r1                      ; start one row above the block

    mova        m7, [tab_c_526336]
    mov         r4d, %2/2                   ; row pairs

.twoRows:
    PROCESS_CHROMA_SP_W8_2R

    paddd       m0, m7
    psrad       m0, 12
    paddd       m1, m7
    psrad       m1, 12
    paddd       m2, m7
    psrad       m2, 12
    paddd       m3, m7
    psrad       m3, 12

    packssdw    m0, m1
    packssdw    m2, m3
    packuswb    m0, m2                      ; low qword = row 1, high qword = row 2

    movlps      [r2], m0
    movhps      [r2 + r3], m0

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .twoRows

    RET
%endmacro

    FILTER_VER_CHROMA_SP_W8_H2 8, 2
    FILTER_VER_CHROMA_SP_W8_H2 8, 4
    FILTER_VER_CHROMA_SP_W8_H2 8, 6
    FILTER_VER_CHROMA_SP_W8_H2 8, 8
    FILTER_VER_CHROMA_SP_W8_H2 8, 16
    FILTER_VER_CHROMA_SP_W8_H2 8, 32

    FILTER_VER_CHROMA_SP_W8_H2 8, 12
    FILTER_VER_CHROMA_SP_W8_H2 8, 64
4449
4450
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; 2-column variant: %1 = width (instantiated with 2 only), %2 = height.
; When isRowExt is non-zero the filter starts one row earlier and produces
; three extra rows. One movd (two 16-bit results) is stored per row.
;-----------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA_2xN 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
%define coef2       m3
%define Tm0         m2
%define t1          m1
%define t0          m0

    ; broadcast the four coefficient bytes selected by coeffIdx
    mov         r4d, r4m
%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        coef2, [r6 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    dec         srcq                        ; window starts one sample to the left
    add         dststrided, dststrided      ; destination pitch in bytes

    mova        t1, [pw_2000]
    mova        Tm0, [tab_Tm]

    mov         r4d, %2                     ; rows to produce
    cmp         r5m, byte 0
    je          .row
    sub         srcq, srcstrideq            ; isRowExt: begin one row above ...
    add         r4d, 3                      ; ... and emit three more rows

.row:
    movh        t0, [srcq]
    pshufb      t0, t0, Tm0                 ; gather one 4-byte window per output
    pmaddubsw   t0, coef2
    phaddw      t0, t0
    psubw       t0, t1
    movd        [dstq], t0

    lea         srcq, [srcq + srcstrideq]
    lea         dstq, [dstq + dststrideq]

    dec         r4d
    jnz         .row

    RET
%endmacro

    FILTER_HORIZ_CHROMA_2xN 2, 4
    FILTER_HORIZ_CHROMA_2xN 2, 8

    FILTER_HORIZ_CHROMA_2xN 2, 16
4504
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; 4-column variant: %1 = width (instantiated with 4 only), %2 = height.
; When isRowExt is non-zero the filter starts one row earlier and produces
; three extra rows. One movlps (four 16-bit results) is stored per row.
;-----------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA_4xN 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
%define coef2       m3
%define Tm0         m2
%define t1          m1
%define t0          m0

    ; broadcast the four coefficient bytes selected by coeffIdx
    mov         r4d, r4m
%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        coef2, [r6 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    dec         srcq                        ; window starts one sample to the left
    add         dststrided, dststrided      ; destination pitch in bytes

    mova        t1, [pw_2000]
    mova        Tm0, [tab_Tm]

    mov         r4d, %2                     ; rows to produce
    cmp         r5m, byte 0
    je          .row
    sub         srcq, srcstrideq            ; isRowExt: begin one row above ...
    add         r4d, 3                      ; ... and emit three more rows

.row:
    movh        t0, [srcq]
    pshufb      t0, t0, Tm0                 ; gather one 4-byte window per output
    pmaddubsw   t0, coef2
    phaddw      t0, t0
    psubw       t0, t1
    movlps      [dstq], t0

    lea         srcq, [srcq + srcstrideq]
    lea         dstq, [dstq + dststrideq]

    dec         r4d
    jnz         .row
    RET
%endmacro

    FILTER_HORIZ_CHROMA_4xN 4, 2
    FILTER_HORIZ_CHROMA_4xN 4, 4
    FILTER_HORIZ_CHROMA_4xN 4, 8
    FILTER_HORIZ_CHROMA_4xN 4, 16

    FILTER_HORIZ_CHROMA_4xN 4, 32
4559
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W6 tmp, acc, offset
; One row of the horizontal 4-tap filter, 6 outputs stored to [dstq].
;   %1 = scratch register, %2 = result register, %3 = value subtracted from
;   the sums. Uses srcq, dstq, Tm0, Tm1 and coef2 from the enclosing function.
;   Eight sums are computed; four go out with movh, two more with movd.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W6 3
    movu        %1, [srcq]
    pshufb      %2, %1, Tm0                 ; windows for outputs 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                 ; windows for outputs 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    psubw       %2, %3
    movh        [dstq], %2                  ; outputs 0..3
    pshufd      %2, %2, 2                   ; outputs 4,5 to the low dword
    movd        [dstq + 8], %2
%endmacro
4572
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W12 tmp, acc, offset
; One row of the horizontal 4-tap filter, 12 outputs stored to [dstq].
;   %1 = scratch register, %2 = result register, %3 = value subtracted from
;   the sums. Uses srcq, dstq, Tm0, Tm1 and coef2 from the enclosing function.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W12 3
    ; outputs 0..7
    movu        %1, [srcq]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    psubw       %2, %3
    movu        [dstq], %2

    ; outputs 8..11
    movu        %1, [srcq + 8]
    pshufb      %1, %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %1, %1
    psubw       %1, %3
    movh        [dstq + 16], %1
%endmacro
4589
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; %1 = width, %2 = height. The per-row work is delegated to
; PROCESS_CHROMA_W%1, so %1 must name an existing three-argument row macro
; (instantiated below for 6 and 12). When isRowExt is non-zero the filter
; starts one row earlier and produces three extra rows.
;-----------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
%define coef2       m5
%define Tm0         m4
%define Tm1         m3
%define t2          m2
%define t1          m1
%define t0          m0

    ; broadcast the four coefficient bytes selected by coeffIdx
    mov         r4d, r4m
%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        coef2, [r6 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    dec         srcq                        ; window starts one sample to the left
    add         dststrided, dststrided      ; destination pitch in bytes

    mova        t2, [pw_2000]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    mov         r4d, %2                     ; rows to produce
    cmp         r5m, byte 0
    je          .row
    sub         srcq, srcstrideq            ; isRowExt: begin one row above ...
    add         r4d, 3                      ; ... and emit three more rows

.row:
    PROCESS_CHROMA_W%1 t0, t1, t2
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r4d
    jnz         .row

    RET
%endmacro

    FILTER_HORIZ_CHROMA 6, 8
    FILTER_HORIZ_CHROMA 12, 16

    FILTER_HORIZ_CHROMA 6, 16
    FILTER_HORIZ_CHROMA 12, 32
4641
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W8 tmp, acc, offset
; One row of the horizontal 4-tap filter, 8 outputs stored to [dstq].
;   %1 = scratch register, %2 = result register, %3 = value subtracted from
;   the sums. Uses srcq, dstq, Tm0, Tm1 and coef2 from the enclosing function.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W8 3
    movu        %1, [srcq]
    pshufb      %2, %1, Tm0                 ; windows for outputs 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                 ; windows for outputs 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    psubw       %2, %3
    movu        [dstq], %2
%endmacro
4652
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; 8-column variant: %1 = width (instantiated with 8 only), %2 = height.
; Each row is produced by PROCESS_CHROMA_W8. When isRowExt is non-zero the
; filter starts one row earlier and produces three extra rows.
;-----------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA_8xN 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
%define coef2       m5
%define Tm0         m4
%define Tm1         m3
%define t2          m2
%define t1          m1
%define t0          m0

    ; broadcast the four coefficient bytes selected by coeffIdx
    mov         r4d, r4m
%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        coef2, [r6 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    dec         srcq                        ; window starts one sample to the left
    add         dststrided, dststrided      ; destination pitch in bytes

    mova        t2, [pw_2000]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    mov         r4d, %2                     ; rows to produce
    cmp         r5m, byte 0
    je          .row
    sub         srcq, srcstrideq            ; isRowExt: begin one row above ...
    add         r4d, 3                      ; ... and emit three more rows

.row:
    PROCESS_CHROMA_W8 t0, t1, t2
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r4d
    jnz         .row

    RET
%endmacro

    FILTER_HORIZ_CHROMA_8xN 8, 2
    FILTER_HORIZ_CHROMA_8xN 8, 4
    FILTER_HORIZ_CHROMA_8xN 8, 6
    FILTER_HORIZ_CHROMA_8xN 8, 8
    FILTER_HORIZ_CHROMA_8xN 8, 16
    FILTER_HORIZ_CHROMA_8xN 8, 32

    FILTER_HORIZ_CHROMA_8xN 8, 12
    FILTER_HORIZ_CHROMA_8xN 8, 64
4708
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W16 tmp, acc0, offset, acc1
; One row of the horizontal 4-tap filter, 16 outputs stored to [dstq].
;   %1 = scratch, %2 / %4 = result registers for outputs 0-7 / 8-15,
;   %3 = value subtracted from the sums.
; Expressed through PROCESS_CHROMA_W16o with a zero column offset, which
; expands to the same instruction sequence.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W16 4
    PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
%endmacro
4727
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W24 tmp, acc0, offset, acc1
; One row of the horizontal 4-tap filter, 24 outputs stored to [dstq].
;   %1 = scratch, %2 / %4 = result registers, %3 = value subtracted from the
;   sums. Outputs 0-15 come from PROCESS_CHROMA_W16; outputs 16-23 follow.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W24 4
    ; outputs 0..15
    PROCESS_CHROMA_W16 %1, %2, %3, %4

    ; outputs 16..23
    movu        %1, [srcq + 16]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    psubw       %2, %3
    movu        [dstq + 32], %2
%endmacro
4754
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W32 tmp, acc0, offset, acc1
; One row of the horizontal 4-tap filter, 32 outputs stored to [dstq].
;   %1 = scratch, %2 / %4 = result registers, %3 = value subtracted from the
;   sums. Built from two 16-column steps at source offsets 0 and 16, which
;   expand to the same loads, arithmetic and stores as the unrolled form.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W32 4
    PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
    PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
%endmacro
4789
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W16o tmp, acc0, offset, acc1, column
; 16 outputs of one row of the horizontal 4-tap filter, starting at source
; column %5 (a constant). Results are stored at [dstq + %5 * 2].
;   %1 = scratch, %2 / %4 = result registers for the first / second eight
;   outputs, %3 = value subtracted from the sums.
;   Uses srcq, dstq, Tm0, Tm1 and coef2 from the enclosing function.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W16o 5
    ; first eight outputs
    movu        %1, [srcq + %5]
    pshufb      %2, %1, Tm0
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %2, %1

    ; second eight outputs
    movu        %1, [srcq + %5 + 8]
    pshufb      %4, %1, Tm0
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1
    pmaddubsw   %1, coef2
    phaddw      %4, %1

    psubw       %2, %3
    psubw       %4, %3
    movu        [dstq + %5 * 2], %2
    movu        [dstq + %5 * 2 + 16], %4
%endmacro
4808
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W48 tmp, acc0, offset, acc1
; One row of the horizontal 4-tap filter, 48 outputs: the 32-column row step
; (columns 0-31) followed by one 16-column step at column 32.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W48 4
    PROCESS_CHROMA_W32  %1, %2, %3, %4
    PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
%endmacro
4814
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_W64 tmp, acc0, offset, acc1
; One row of the horizontal 4-tap filter, 64 outputs: the 48-column row step
; (columns 0-47) followed by one 16-column step at column 48.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_W64 4
    PROCESS_CHROMA_W48  %1, %2, %3, %4
    PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
%endmacro
4821
;------------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; %1 = width, %2 = height. The per-row work is delegated to
; PROCESS_CHROMA_W%1, so %1 must name an existing four-argument row macro
; (16, 24, 32, 48 and 64 are instantiated below). When isRowExt is non-zero
; the filter starts one row earlier and produces three extra rows.
;------------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA_WxN 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
%define coef2       m6
%define Tm0         m5
%define Tm1         m4
%define t3          m3
%define t2          m2
%define t1          m1
%define t0          m0

    ; broadcast the four coefficient bytes selected by coeffIdx
    mov         r4d, r4m
%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        coef2, [r6 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    dec         srcq                        ; window starts one sample to the left
    add         dststrided, dststrided      ; destination pitch in bytes

    mova        t2, [pw_2000]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    mov         r4d, %2                     ; rows to produce
    cmp         r5m, byte 0
    je          .row
    sub         srcq, srcstrideq            ; isRowExt: begin one row above ...
    add         r4d, 3                      ; ... and emit three more rows

.row:
    PROCESS_CHROMA_W%1 t0, t1, t2, t3
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r4d
    jnz         .row

    RET
%endmacro

    FILTER_HORIZ_CHROMA_WxN 16, 4
    FILTER_HORIZ_CHROMA_WxN 16, 8
    FILTER_HORIZ_CHROMA_WxN 16, 12
    FILTER_HORIZ_CHROMA_WxN 16, 16
    FILTER_HORIZ_CHROMA_WxN 16, 32
    FILTER_HORIZ_CHROMA_WxN 24, 32
    FILTER_HORIZ_CHROMA_WxN 32, 8
    FILTER_HORIZ_CHROMA_WxN 32, 16
    FILTER_HORIZ_CHROMA_WxN 32, 24
    FILTER_HORIZ_CHROMA_WxN 32, 32

    FILTER_HORIZ_CHROMA_WxN 16, 24
    FILTER_HORIZ_CHROMA_WxN 16, 64
    FILTER_HORIZ_CHROMA_WxN 24, 64
    FILTER_HORIZ_CHROMA_WxN 32, 48
    FILTER_HORIZ_CHROMA_WxN 32, 64

    FILTER_HORIZ_CHROMA_WxN 64, 64
    FILTER_HORIZ_CHROMA_WxN 64, 32
    FILTER_HORIZ_CHROMA_WxN 64, 48
    FILTER_HORIZ_CHROMA_WxN 48, 64
    FILTER_HORIZ_CHROMA_WxN 64, 16
4891
4892
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; %1 = width (a multiple of 16), %2 = height (two rows per outer iteration).
; m1 / m0 hold the coefficient byte pairs (0,1) / (2,3) replicated through
; tab_Vm. All eight XMM registers are in use inside the inner loop, which is
; why pw_2000 is reloaded into m6 on every pass.
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W16n 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8

    mov         r4d, r4m                    ; coeffIdx
    add         r3d, r3d                    ; destination pitch in bytes
    sub         r0, r1                      ; start one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]            ; coefficient pair for source rows 0,1
    pshufb      m0, [tab_Vm + 16]           ; coefficient pair for source rows 2,3
    mov         r4d, %2/2                   ; row pairs

.rowPair:
    mov         r6d, %1/16                  ; 16-column strips in this row pair

.strip:
    ; ---- first output row: source rows 0..3 ----
    movu        m2, [r0]
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]
    movu        m5, [r5]
    movu        m7, [r5 + r1]

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4
    movu        [r2 + 16], m2

    ; ---- second output row: source rows 1..4 ----
    punpcklbw   m4, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r5 + 2 * r1]

    punpcklbw   m2, m7, m5
    punpckhbw   m7, m5

    pmaddubsw   m2, m0
    pmaddubsw   m7, m0

    paddw       m4, m2
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4
    movu        [r2 + r3 + 16], m3

    add         r0, 16
    add         r2, 32
    dec         r6d
    jnz         .strip

    ; down two rows, back to the left edge
    lea         r0, [r0 + r1 * 2 - %1]
    lea         r2, [r2 + r3 * 2 - %1 * 2]

    dec         r4d
    jnz         .rowPair
    RET
%endmacro

    FILTER_V_PS_W16n 64, 64
    FILTER_V_PS_W16n 64, 32
    FILTER_V_PS_W16n 64, 48
    FILTER_V_PS_W16n 48, 64
    FILTER_V_PS_W16n 64, 16
4991
4992
;------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Straight-line 2x4 case. For each output row the four source rows are byte-
; interleaved into the order row0,row2,row1,row3 per column; the coefficients
; are permuted the same way through tab_Cm, so pmaddubsw followed by phaddw
; yields the full 4-tap sum. Two output rows share one register (low / high
; qword) and are stored with movd / pextrd.
;------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_2x4, 4, 6, 7

    mov         r4d, r4m                    ; coeffIdx
    add         r3d, r3d                    ; destination pitch in bytes
    sub         r0, r1                      ; start one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    lea         r5, [3 * r1]

    movd        m2, [r0]                    ; source row 0
    movd        m3, [r0 + r1]               ; source row 1
    movd        m4, [r0 + 2 * r1]           ; source row 2
    movd        m5, [r0 + r5]               ; source row 3

    ; output row 0 from source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]                    ; source row 4

    ; output row 1 from source rows 1..4
    punpcklbw   m3, m4
    punpcklbw   m1, m5, m6
    punpcklbw   m3, m1

    pmaddubsw   m3, m0
    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1

    mova        m1, [pw_2000]

    psubw       m2, m1

    movd        [r2], m2
    pextrd      [r2 + r3], m2, 2

    movd        m2, [r0 + r1]               ; source row 5

    ; output row 2 from source rows 2..5
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]           ; source row 6

    ; output row 3 from source rows 3..6
    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0
    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3
    psubw       m4, m1

    lea         r2, [r2 + 2 * r3]
    movd        [r2], m4
    pextrd      [r2 + r3], m4, 2

    RET
5065
;----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 2-column variant (%1 is not used in the body, %2 = height), four output rows
; per iteration. Source rows are byte-interleaved in the order
; row0,row2,row1,row3 and the coefficients are permuted to match through
; tab_Cm, so pmaddubsw followed by phaddw yields the 4-tap sums.
;----------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W2 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    add         r3d, r3d                    ; destination pitch in bytes
    sub         r0, r1                      ; start one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]

    mova        m1, [pw_2000]
    lea         r5, [3 * r1]
    mov         r4d, %2/4                   ; groups of four rows

.fourRows:
    movd        m2, [r0]                    ; source row 0
    movd        m3, [r0 + r1]               ; source row 1
    movd        m4, [r0 + 2 * r1]           ; source row 2
    movd        m5, [r0 + r5]               ; source row 3

    ; output row 0 from source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]                    ; source row 4

    ; output row 1 from source rows 1..4
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1
    psubw       m2, m1

    movd        [r2], m2
    pshufd      m2, m2, 2
    movd        [r2 + r3], m2

    movd        m2, [r0 + r1]               ; source row 5

    ; output row 2 from source rows 2..5
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    movd        m3, [r0 + 2 * r1]           ; source row 6

    ; output row 3 from source rows 3..6
    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3
    psubw       m4, m1

    lea         r2, [r2 + 2 * r3]
    movd        [r2], m4
    pshufd      m4, m4, 2
    movd        [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .fourRows

    RET
%endmacro

    FILTER_V_PS_W2 2, 8

    FILTER_V_PS_W2 2, 16
5154
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; %1 = block width, %2 = block height (both handled in steps of 4).
; The block is walked as 4x4 tiles: r4 counts tiles across a row band and the
; dword at [rsp] counts row bands. Sums from PROCESS_CHROMA_SP_W4_4R are
; shifted right by 6 (no rounding term is added) and stored as 16-bit values.
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS 2
INIT_XMM sse2
cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize

    ; r6 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    add         r3d, r3d                    ; destination pitch in bytes
    sub         r0, r1                      ; start one row above the block

    mov         dword [rsp], %2/4           ; row bands still to do

.rowBand:
    mov         r4d, (%1/4)                 ; tiles still to do in this band

.tile:
    PROCESS_CHROMA_SP_W4_4R

    psrad       m0, 6
    psrad       m1, 6
    packssdw    m0, m1                      ; rows 1,2
    psrad       m2, 6
    psrad       m3, 6
    packssdw    m2, m3                      ; rows 3,4

    movlps      [r2], m0
    movhps      [r2 + r3], m0
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m2
    movhps      [r5 + r3], m2

    ; the tile macro moved r0 down 4 rows: go back up and 4 samples to the right
    lea         r5, [4 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 2 * 4

    dec         r4d
    jnz         .tile

    ; next band: down 4 rows, back to the left edge
    lea         r0, [r0 + 4 * r1 - 2 * %1]
    lea         r2, [r2 + 4 * r3 - 2 * %1]

    dec         dword [rsp]
    jnz         .rowBand

    RET
%endmacro

    FILTER_VER_CHROMA_SS 4, 4
    FILTER_VER_CHROMA_SS 4, 8
    FILTER_VER_CHROMA_SS 16, 16
    FILTER_VER_CHROMA_SS 16, 8
    FILTER_VER_CHROMA_SS 16, 12
    FILTER_VER_CHROMA_SS 12, 16
    FILTER_VER_CHROMA_SS 16, 4
    FILTER_VER_CHROMA_SS 4, 16
    FILTER_VER_CHROMA_SS 32, 32
    FILTER_VER_CHROMA_SS 32, 16
    FILTER_VER_CHROMA_SS 16, 32
    FILTER_VER_CHROMA_SS 32, 24
    FILTER_VER_CHROMA_SS 24, 32
    FILTER_VER_CHROMA_SS 32, 8

    FILTER_VER_CHROMA_SS 16, 24
    FILTER_VER_CHROMA_SS 12, 32
    FILTER_VER_CHROMA_SS 4, 32
    FILTER_VER_CHROMA_SS 32, 64
    FILTER_VER_CHROMA_SS 16, 64
    FILTER_VER_CHROMA_SS 32, 48
    FILTER_VER_CHROMA_SS 24, 64

    FILTER_VER_CHROMA_SS 64, 64
    FILTER_VER_CHROMA_SS 64, 32
    FILTER_VER_CHROMA_SS 64, 48
    FILTER_VER_CHROMA_SS 48, 64
    FILTER_VER_CHROMA_SS 64, 16
5239
5240
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; 2-column variant: %1 = width (instantiated with 2 only), %2 = height,
; processed four rows per iteration through PROCESS_CHROMA_SP_W2_4R.
; Sums are shifted right by 6 and stored as one dword (two 16-bit values)
; per row.
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS_W2_4R 2
INIT_XMM sse4
cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5

    ; r5 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    add         r3d, r3d                    ; destination pitch in bytes
    sub         r0, r1                      ; start one row above the block

    mov         r4d, (%2/4)                 ; groups of four rows

.fourRows:
    PROCESS_CHROMA_SP_W2_4R r5

    psrad       m0, 6
    psrad       m2, 6

    packssdw    m0, m2                      ; one dword per output row

    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m0, 2
    pextrd      [r2 + r3], m0, 3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .fourRows

    RET
%endmacro

    FILTER_VER_CHROMA_SS_W2_4R 2, 4
    FILTER_VER_CHROMA_SS_W2_4R 2, 8

    FILTER_VER_CHROMA_SS_W2_4R 2, 16
5288
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Straight-line 4x2 case: five source rows are read and two rows of four
; 16-bit results are written. Sums are shifted right by 6 without rounding.
;---------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal interp_4tap_vert_ss_4x2, 5, 6, 4

    ; r5 = coefficient entry for coeffIdx (32 bytes per entry)
    shl         r4d, 5
%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    add         r1d, r1d                    ; source pitch in bytes
    add         r3d, r3d                    ; destination pitch in bytes
    sub         r0, r1                      ; start one row above the block

    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklwd   m0, m1                      ; m0 = source rows 0,1
    pmaddwd     m0, [r5 + 0 * 16]           ; output row 1: first tap pair

    lea         r0, [r0 + 2 * r1]
    movq        m2, [r0]
    punpcklwd   m1, m2                      ; m1 = source rows 1,2
    pmaddwd     m1, [r5 + 0 * 16]           ; output row 2: first tap pair

    movq        m3, [r0 + r1]
    punpcklwd   m2, m3                      ; m2 = source rows 2,3
    pmaddwd     m2, [r5 + 1 * 16]
    paddd       m0, m2                      ; output row 1 complete
    psrad       m0, 6

    movq        m2, [r0 + 2 * r1]
    punpcklwd   m3, m2                      ; m3 = source rows 3,4
    pmaddwd     m3, [r5 + 1 * 16]
    paddd       m1, m3                      ; output row 2 complete
    psrad       m1, 6

    packssdw    m0, m1                      ; low qword = row 1, high qword = row 2

    movlps      [r2], m0
    movhps      [r2 + r3], m0

    RET
5335
;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
; Macro parameters:
;   %1 = width; not referenced by the body (the symbol name hard-codes 6)
;   %2 = height; four rows are written per .loopH pass, so %2 must be a non-zero multiple of 4
; Arguments as loaded by cglobal:
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; r6 is the coefficient pointer, r5 is scratch, r4 is reused as the loop counter.
; Each pass handles columns 0-3 first and then columns 4-5 of the same four rows.
%macro FILTER_VER_CHROMA_SS_W6_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6

    ; Double both strides (int16_t samples -> byte offsets). The strides are
    ; intptr_t, so the add is done at full register width: a 32-bit add would
    ; zero-extend on x86-64 and corrupt negative or very large strides.
    add         r1, r1
    add         r3, r3
    sub         r0, r1                      ; src -= one row
    shl         r4d, 5                      ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_ChromaCoeffV + r4]
%endif

    mov         r4d, %2/4                   ; number of 4-row passes

.loopH:
    ; Columns 0-3. Helper macro defined elsewhere in this file; the code below
    ; consumes the 32-bit sums it leaves in m0..m3. It presumably reads its
    ; coefficients through r6 and advances r0 by four rows (confirm in the macro).
    PROCESS_CHROMA_SP_W4_4R

    psrad       m0, 6
    psrad       m1, 6
    psrad       m2, 6
    psrad       m3, 6

    packssdw    m0, m1
    packssdw    m2, m3

    movlps      [r2], m0
    movhps      [r2 + r3], m0
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m2
    movhps      [r5 + r3], m2

    ; Step src back four rows and both pointers right by four samples (8 bytes).
    lea         r5, [4 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 2 * 4

    ; Columns 4-5. Helper macro defined elsewhere; the code below consumes the
    ; sums it leaves in m0 and m2.
    PROCESS_CHROMA_SP_W2_4R r6

    psrad       m0, 6
    psrad       m2, 6

    packssdw    m0, m2

    ; One dword (two samples) of the packed result goes to each of four rows.
    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m0, 2
    pextrd      [r2 + r3], m0, 3

    ; Undo the 8-byte column offset; dst also moves on to the next group of four rows.
    sub         r0, 2 * 4
    lea         r2, [r2 + 2 * r3 - 2 * 4]

    dec         r4d
    jnz         .loopH

    RET
%endmacro

    FILTER_VER_CHROMA_SS_W6_H4 6, 8

    FILTER_VER_CHROMA_SS_W6_H4 6, 16
5403
5404
;----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;----------------------------------------------------------------------------------------------------------------
; Macro parameters:
;   %1 = width used in the symbol name (each row store is one 16-byte vector)
;   %2 = height; two rows are written per .loopH pass, so %2 must be a non-zero multiple of 2
; Arguments as loaded by cglobal:
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; r5 is the coefficient pointer; r4 is reused as the loop counter.
%macro FILTER_VER_CHROMA_SS_W8_H2 2
INIT_XMM sse2
cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7

    ; Double both strides (int16_t samples -> byte offsets). The strides are
    ; intptr_t, so the add is done at full register width: a 32-bit add would
    ; zero-extend on x86-64 and corrupt negative or very large strides.
    add         r1, r1
    add         r3, r3
    sub         r0, r1                      ; src -= one row
    shl         r4d, 5                      ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    mov         r4d, %2/2                   ; number of 2-row passes
.loopH:
    ; Helper macro defined elsewhere in this file; the code below consumes the
    ; 32-bit sums it leaves in m0..m3. It presumably reads its coefficients
    ; through r5 and advances r0 by two rows, since this loop never adjusts r0
    ; itself (confirm in the macro).
    PROCESS_CHROMA_SP_W8_2R

    psrad       m0, 6
    psrad       m1, 6
    psrad       m2, 6
    psrad       m3, 6

    packssdw    m0, m1                      ; first output row, 8 x int16
    packssdw    m2, m3                      ; second output row, 8 x int16

    movu        [r2], m0
    movu        [r2 + r3], m2

    lea         r2, [r2 + 2 * r3]           ; dst += two rows

    dec         r4d
    jnz         .loopH

    RET
%endmacro

    FILTER_VER_CHROMA_SS_W8_H2 8, 2
    FILTER_VER_CHROMA_SS_W8_H2 8, 4
    FILTER_VER_CHROMA_SS_W8_H2 8, 6
    FILTER_VER_CHROMA_SS_W8_H2 8, 8
    FILTER_VER_CHROMA_SS_W8_H2 8, 16
    FILTER_VER_CHROMA_SS_W8_H2 8, 32

    FILTER_VER_CHROMA_SS_W8_H2 8, 12
    FILTER_VER_CHROMA_SS_W8_H2 8, 64
5456
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; Macro parameters:
;   %1 = block width;  processed in 4-sample columns, so %1 must be a non-zero multiple of 4
;   %2 = block height; processed in 4-row bands,      so %2 must be a non-zero multiple of 4
; Arguments as loaded by cglobal:
;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; Register roles in the loops:
;   r4 = column counter (.loopW), r5 = scratch, r6 = coefficient pointer,
;   [rsp] = band counter (.loopH), kept in the stack slot reserved by cglobal
; Each .loopW pass reads 11 source rows of 4 samples and writes a 4x4 tile.
; "Row n" in the comments below counts from the first row read in the pass.
%macro FILTER_VER_LUMA_SS 2
INIT_XMM sse2
cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize

    ; Double both strides (int16_t samples -> byte offsets). The strides are
    ; intptr_t, so the add is done at full register width: a 32-bit add would
    ; zero-extend on x86-64 and corrupt negative or very large strides.
    add         r1, r1
    add         r3, r3
    lea         r5, [3 * r1]
    sub         r0, r5                      ; src -= three rows
    shl         r4d, 6                      ; coeffIdx * 64 = byte offset into tab_LumaCoeffV

%ifdef PIC
    lea         r5, [tab_LumaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffV + r4]
%endif

    mov         dword [rsp], %2/4           ; number of 4-row bands
.loopH:
    mov         r4d, (%1/4)                 ; number of 4-sample columns per band
.loopW:
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklwd   m0, m1                      ;m0=[0 1]
    pmaddwd     m0, [r6 + 0 * 16]           ;m0=[0+1]                   Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m1, m4                      ;m1=[1 2]
    pmaddwd     m1, [r6 + 0 * 16]           ;m1=[1+2]                   Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ;m4=[2 3]
    pmaddwd     m2, m4, [r6 + 0 * 16]       ;m2=[2+3]                   Row3
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m0, m4                      ;m0=[0+1+2+3]               Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ;m5=[3 4]
    pmaddwd     m3, m5, [r6 + 0 * 16]       ;m3=[3+4]                   Row4
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m1, m5                      ;m1=[1+2+3+4]               Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ;m4=[4 5]
    pmaddwd     m6, m4, [r6 + 1 * 16]
    paddd       m2, m6                      ;m2=[2+3+4+5]               Row3
    pmaddwd     m4, [r6 + 2 * 16]
    paddd       m0, m4                      ;m0=[0+1+2+3+4+5]           Row1

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ;m5=[5 6]
    pmaddwd     m6, m5, [r6 + 1 * 16]
    paddd       m3, m6                      ;m3=[3+4+5+6]               Row4
    pmaddwd     m5, [r6 + 2 * 16]
    paddd       m1, m5                      ;m1=[1+2+3+4+5+6]           Row2

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ;m4=[6 7]
    pmaddwd     m6, m4, [r6 + 2 * 16]
    paddd       m2, m6                      ;m2=[2+3+4+5+6+7]           Row3
    pmaddwd     m4, [r6 + 3 * 16]
    paddd       m0, m4                      ;m0=[0+1+2+3+4+5+6+7]       Row1 end
    psrad       m0, 6

    lea         r0, [r0 + 2 * r1]
    movq        m4, [r0]
    punpcklwd   m5, m4                      ;m5=[7 8]
    pmaddwd     m6, m5, [r6 + 2 * 16]
    paddd       m3, m6                      ;m3=[3+4+5+6+7+8]           Row4
    pmaddwd     m5, [r6 + 3 * 16]
    paddd       m1, m5                      ;m1=[1+2+3+4+5+6+7+8]       Row2 end
    psrad       m1, 6

    packssdw    m0, m1                      ; low qword = Row1, high qword = Row2

    movlps      [r2], m0
    movhps      [r2 + r3], m0

    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ;m4=[8 9]
    pmaddwd     m4, [r6 + 3 * 16]
    paddd       m2, m4                      ;m2=[2+3+4+5+6+7+8+9]       Row3 end
    psrad       m2, 6

    movq        m4, [r0 + 2 * r1]
    punpcklwd   m5, m4                      ;m5=[9 10]
    pmaddwd     m5, [r6 + 3 * 16]
    paddd       m3, m5                      ;m3=[3+4+5+6+7+8+9+10]      Row4 end
    psrad       m3, 6

    packssdw    m2, m3                      ; low qword = Row3, high qword = Row4

    movlps      [r2 + 2 * r3], m2
    lea         r5, [3 * r3]
    movhps      [r2 + r5], m2

    ; r0 moved down eight rows during this pass: rewind it and step both
    ; pointers right by four samples (8 bytes) for the next column.
    lea         r5, [8 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 2 * 4

    dec         r4d
    jnz         .loopW

    ; Band finished: return to column 0 and move down four rows.
    lea         r0, [r0 + 4 * r1 - 2 * %1]
    lea         r2, [r2 + 4 * r3 - 2 * %1]

    dec         dword [rsp]
    jnz         .loopH

    RET
%endmacro

    FILTER_VER_LUMA_SS 4, 4
    FILTER_VER_LUMA_SS 8, 8
    FILTER_VER_LUMA_SS 8, 4
    FILTER_VER_LUMA_SS 4, 8
    FILTER_VER_LUMA_SS 16, 16
    FILTER_VER_LUMA_SS 16, 8
    FILTER_VER_LUMA_SS 8, 16
    FILTER_VER_LUMA_SS 16, 12
    FILTER_VER_LUMA_SS 12, 16
    FILTER_VER_LUMA_SS 16, 4
    FILTER_VER_LUMA_SS 4, 16
    FILTER_VER_LUMA_SS 32, 32
    FILTER_VER_LUMA_SS 32, 16
    FILTER_VER_LUMA_SS 16, 32
    FILTER_VER_LUMA_SS 32, 24
    FILTER_VER_LUMA_SS 24, 32
    FILTER_VER_LUMA_SS 32, 8
    FILTER_VER_LUMA_SS 8, 32
    FILTER_VER_LUMA_SS 64, 64
    FILTER_VER_LUMA_SS 64, 32
    FILTER_VER_LUMA_SS 32, 64
    FILTER_VER_LUMA_SS 64, 48
    FILTER_VER_LUMA_SS 48, 64
    FILTER_VER_LUMA_SS 64, 16
    FILTER_VER_LUMA_SS 16, 64