Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / ipfilter16.asm
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5 ;* Murugan Vairavel <murugan@multicorewareinc.com>
6 ;*
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
11 ;*
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;*
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
24
25 %include "x86inc.asm"
26 %include "x86util.asm"
27
28 SECTION_RODATA 32
29
; Per-lane constants. tab_c_32 is the rounding term added before the ">> 6" of
; the chroma "pp" path and tab_c_n32768 is the offset added before the ">> 2"
; of the chroma "ps" path (see FILTER_CHROMA_H).
; NOTE(review): tab_c_524800 and tab_c_n8192 are not referenced by the
; horizontal filters below; presumably used by other filter stages - confirm.
30 tab_c_32: times 4 dd 32
31 tab_c_n32768: times 4 dd -32768
32 tab_c_524800: times 4 dd 524800
33 tab_c_n8192: times 8 dw -8192
34
; pshufb mask: low half selects words 0..3, high half selects words 1..4, so a
; single pmaddwd against a duplicated 4-tap row yields the partial sums of two
; neighbouring output samples.
35 tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
36
; 4-tap chroma coefficients: 8 rows of 4 words (8 bytes per row), indexed by
; coeffIdx * 8 in FILTER_CHROMA_H.
37 tab_ChromaCoeff: dw 0, 64, 0, 0
38 dw -2, 58, 10, -2
39 dw -4, 54, 16, -2
40 dw -6, 46, 28, -4
41 dw -4, 36, 36, -4
42 dw -4, 28, 46, -6
43 dw -2, 16, 54, -4
44 dw -2, 10, 58, -2
45
; The same chroma coefficients stored as tap pairs, each pair repeated 4 times
; (two 16-byte vectors per filter).
; NOTE(review): the "V" suffix suggests the vertical filters use this layout;
; those are not part of the code documented here - confirm.
46 tab_ChromaCoeffV: times 4 dw 0, 64
47 times 4 dw 0, 0
48
49 times 4 dw -2, 58
50 times 4 dw 10, -2
51
52 times 4 dw -4, 54
53 times 4 dw 16, -2
54
55 times 4 dw -6, 46
56 times 4 dw 28, -4
57
58 times 4 dw -4, 36
59 times 4 dw 36, -4
60
61 times 4 dw -4, 28
62 times 4 dw 46, -6
63
64 times 4 dw -2, 16
65 times 4 dw 54, -4
66
67 times 4 dw -2, 10
68 times 4 dw 58, -2
69
; 8-tap luma coefficients: 4 rows of 8 words (16 bytes per row), indexed by
; coeffIdx << 4 in the FILTER_HOR_LUMA_* macros.
70 tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
71 dw -1, 4, -10, 58, 17, -5, 1, 0
72 dw -1, 4, -11, 40, 40, -11, 4, -1
73 dw 0, 1, -5, 17, 58, -10, 4, -1
74
; The same luma coefficients stored as tap pairs, each pair repeated 4 times
; (four 16-byte vectors per filter).
; NOTE(review): presumably consumed by the vertical luma filters - confirm.
75 tab_LumaCoeffV: times 4 dw 0, 0
76 times 4 dw 0, 64
77 times 4 dw 0, 0
78 times 4 dw 0, 0
79
80 times 4 dw -1, 4
81 times 4 dw -10, 58
82 times 4 dw 17, -5
83 times 4 dw 1, 0
84
85 times 4 dw -1, 4
86 times 4 dw -11, 40
87 times 4 dw 40, -11
88 times 4 dw 4, -1
89
90 times 4 dw 0, 1
91 times 4 dw -5, 17
92 times 4 dw 58, -10
93 times 4 dw 4, -1
94
95 SECTION .text
96
97 cextern pd_32
98 cextern pw_pixel_max
99 cextern pd_n32768
100
101 ;------------------------------------------------------------------------------------------------------------
102 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt])
103 ;------------------------------------------------------------------------------------------------------------
; 8-tap horizontal luma filter for 4-sample-wide blocks of 16-bit samples.
; Macro arguments: 1 = block width (4), 2 = block height, 3 = "pp" or "ps".
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; (later reused as the row counter), r6 = scratch.
;   pp: (sum + pd_32) >> 6, then clamped to [0, pw_pixel_max].
;   ps: (sum + pd_n32768) >> 2, packed with signed saturation. The sixth
;       argument (isRowExt, read through r5m) extends the filtered area to
;       start 3 rows above src and cover 7 additional rows when non-zero.
104 %macro FILTER_HOR_LUMA_W4 3
105 INIT_XMM sse4
106 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
107
108 mov r4d, r4m
109 sub r0, 6 ; step back 3 samples (6 bytes): the tap window starts at x - 3
110 shl r4d, 4 ; coeffIdx * 16 = byte offset of one 8-word row in tab_LumaCoeff
111 add r1, r1 ; double both strides: 2 bytes per sample
112 add r3, r3
113
114 %ifdef PIC
115 lea r6, [tab_LumaCoeff]
116 mova m0, [r6 + r4]
117 %else
118 mova m0, [tab_LumaCoeff + r4]
119 %endif
120
121 %ifidn %3, pp
122 mova m1, [pd_32] ; rounding term for the >> 6 below
123 pxor m6, m6 ; lower clip bound (0)
124 mova m7, [pw_pixel_max] ; upper clip bound
125 %else
126 mova m1, [pd_n32768] ; offset applied before the >> 2 below
127 %endif
128
129 mov r4d, %2 ; r4d = number of rows to produce
130 %ifidn %3, ps
131 cmp r5m, byte 0
132 je .loopH
; isRowExt != 0: start 3 rows earlier and produce 7 extra rows
133 lea r6, [r1 + 2 * r1]
134 sub r0, r6
135 add r4d, 7
136 %endif
137
; One iteration filters one row: 4 output samples, 8 bytes stored.
138 .loopH:
139 movu m2, [r0] ; m2 = src[0-7]
140 movu m3, [r0 + 16] ; m3 = src[8-15]
141
142 pmaddwd m4, m2, m0
143 palignr m5, m3, m2, 2 ; m5 = src[1-8]
144 pmaddwd m5, m0
145 phaddd m4, m5
146
147 palignr m5, m3, m2, 4 ; m5 = src[2-9]
148 pmaddwd m5, m0
149 palignr m3, m2, 6 ; m3 = src[3-10]
150 pmaddwd m3, m0
151 phaddd m5, m3
152
153 phaddd m4, m5 ; m4 = four complete 8-tap sums
154 paddd m4, m1
155 %ifidn %3, pp
156 psrad m4, 6
157 packusdw m4, m4
158 CLIPW m4, m6, m7
159 %else
160 psrad m4, 2
161 packssdw m4, m4
162 %endif
163
164 movh [r2], m4 ; store 4 samples
165
166 add r0, r1
167 add r2, r3
168
169 dec r4d
170 jnz .loopH
171 RET
172 %endmacro
173
174 ;------------------------------------------------------------------------------------------------------------
175 ; void interp_8tap_horiz_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
176 ;------------------------------------------------------------------------------------------------------------
; Arguments: width, height, variant.
177 FILTER_HOR_LUMA_W4 4, 4, pp
178 FILTER_HOR_LUMA_W4 4, 8, pp
179 FILTER_HOR_LUMA_W4 4, 16, pp
180
181 ;---------------------------------------------------------------------------------------------------------------------------
182 ; void interp_8tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
183 ;---------------------------------------------------------------------------------------------------------------------------
184 FILTER_HOR_LUMA_W4 4, 4, ps
185 FILTER_HOR_LUMA_W4 4, 8, ps
186 FILTER_HOR_LUMA_W4 4, 16, ps
187
188 ;------------------------------------------------------------------------------------------------------------
189 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt])
190 ;------------------------------------------------------------------------------------------------------------
; 8-tap horizontal luma filter for 8-sample-wide blocks of 16-bit samples.
; Macro arguments: 1 = block width (8), 2 = block height, 3 = "pp" or "ps".
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; (later reused as the row counter), r6 = scratch.
;   pp: (sum + pd_32) >> 6, clamped to [0, pw_pixel_max] (m7 holds the zero).
;   ps: (sum + pd_n32768) >> 2, packed with signed saturation; a non-zero
;       isRowExt (r5m) starts 3 rows above src and adds 7 rows.
191 %macro FILTER_HOR_LUMA_W8 3
192 INIT_XMM sse4
193 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
194
195 add r1, r1 ; double both strides: 2 bytes per sample
196 add r3, r3
197 mov r4d, r4m
198 sub r0, 6 ; step back 3 samples (6 bytes)
199 shl r4d, 4 ; coeffIdx * 16 = byte offset into tab_LumaCoeff
200
201 %ifdef PIC
202 lea r6, [tab_LumaCoeff]
203 mova m0, [r6 + r4]
204 %else
205 mova m0, [tab_LumaCoeff + r4]
206 %endif
207
208 %ifidn %3, pp
209 mova m1, [pd_32]
210 pxor m7, m7 ; lower clip bound (0)
211 %else
212 mova m1, [pd_n32768]
213 %endif
214
215 mov r4d, %2 ; r4d = number of rows to produce
216 %ifidn %3, ps
217 cmp r5m, byte 0
218 je .loopH
; isRowExt != 0: start 3 rows earlier and produce 7 extra rows
219 lea r6, [r1 + 2 * r1]
220 sub r0, r6
221 add r4d, 7
222 %endif
223
; One iteration filters one row: 8 output samples, 16 bytes stored.
224 .loopH:
225 movu m2, [r0] ; m2 = src[0-7]
226 movu m3, [r0 + 16] ; m3 = src[8-15]
227
228 pmaddwd m4, m2, m0
229 palignr m5, m3, m2, 2 ; m5 = src[1-8]
230 pmaddwd m5, m0
231 phaddd m4, m5
232
233 palignr m5, m3, m2, 4 ; m5 = src[2-9]
234 pmaddwd m5, m0
235 palignr m6, m3, m2, 6 ; m6 = src[3-10]
236 pmaddwd m6, m0
237 phaddd m5, m6
238 phaddd m4, m5 ; m4 = sums for outputs 0-3
239 paddd m4, m1
240
241 palignr m5, m3, m2, 8 ; m5 = src[4-11]
242 pmaddwd m5, m0
243 palignr m6, m3, m2, 10 ; m6 = src[5-12]
244 pmaddwd m6, m0
245 phaddd m5, m6
246
247 palignr m6, m3, m2, 12 ; m6 = src[6-13]
248 pmaddwd m6, m0
249 palignr m3, m2, 14 ; m3 = src[7-14]
250 pmaddwd m3, m0
251 phaddd m6, m3
252 phaddd m5, m6 ; m5 = sums for outputs 4-7
253 paddd m5, m1
254 %ifidn %3, pp
255 psrad m4, 6
256 psrad m5, 6
257 packusdw m4, m5
258 CLIPW m4, m7, [pw_pixel_max]
259 %else
260 psrad m4, 2
261 psrad m5, 2
262 packssdw m4, m5
263 %endif
264
265 movu [r2], m4 ; store 8 samples
266
267 add r0, r1
268 add r2, r3
269
270 dec r4d
271 jnz .loopH
272 RET
273 %endmacro
274
275 ;------------------------------------------------------------------------------------------------------------
276 ; void interp_8tap_horiz_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
277 ;------------------------------------------------------------------------------------------------------------
; Arguments: width, height, variant.
278 FILTER_HOR_LUMA_W8 8, 4, pp
279 FILTER_HOR_LUMA_W8 8, 8, pp
280 FILTER_HOR_LUMA_W8 8, 16, pp
281 FILTER_HOR_LUMA_W8 8, 32, pp
282
283 ;---------------------------------------------------------------------------------------------------------------------------
284 ; void interp_8tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
285 ;---------------------------------------------------------------------------------------------------------------------------
286 FILTER_HOR_LUMA_W8 8, 4, ps
287 FILTER_HOR_LUMA_W8 8, 8, ps
288 FILTER_HOR_LUMA_W8 8, 16, ps
289 FILTER_HOR_LUMA_W8 8, 32, ps
290
291 ;--------------------------------------------------------------------------------------------------------------
292 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt])
293 ;--------------------------------------------------------------------------------------------------------------
; 8-tap horizontal luma filter for 12-sample-wide blocks of 16-bit samples.
; Macro arguments: 1 = block width (12), 2 = block height, 3 = "pp" or "ps".
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; (later reused as the row counter), r6 = scratch.
; Each row is produced as 8 samples followed by 4 samples.
;   pp: (sum + pd_32) >> 6, clamped to [0, pw_pixel_max]; m5 is re-zeroed
;       before each clamp because all eight vector registers are in use.
;   ps: (sum + pd_n32768) >> 2, packed with signed saturation; a non-zero
;       isRowExt (r5m) starts 3 rows above src and adds 7 rows.
294 %macro FILTER_HOR_LUMA_W12 3
295 INIT_XMM sse4
296 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
297
298 add r1, r1 ; double both strides: 2 bytes per sample
299 add r3, r3
300 mov r4d, r4m
301 sub r0, 6 ; step back 3 samples (6 bytes)
302 shl r4d, 4 ; coeffIdx * 16 = byte offset into tab_LumaCoeff
303
304 %ifdef PIC
305 lea r6, [tab_LumaCoeff]
306 mova m0, [r6 + r4]
307 %else
308 mova m0, [tab_LumaCoeff + r4]
309 %endif
310 %ifidn %3, pp
311 mova m1, [pd_32]
312 %else
313 mova m1, [pd_n32768]
314 %endif
315
316 mov r4d, %2 ; r4d = number of rows to produce
317 %ifidn %3, ps
318 cmp r5m, byte 0
319 je .loopH
; isRowExt != 0: start 3 rows earlier and produce 7 extra rows
320 lea r6, [r1 + 2 * r1]
321 sub r0, r6
322 add r4d, 7
323 %endif
324
325 .loopH:
; ---- outputs 0-7 ----
326 movu m2, [r0] ; m2 = src[0-7]
327 movu m3, [r0 + 16] ; m3 = src[8-15]
328
329 pmaddwd m4, m2, m0
330 palignr m5, m3, m2, 2 ; m5 = src[1-8]
331 pmaddwd m5, m0
332 phaddd m4, m5
333
334 palignr m5, m3, m2, 4 ; m5 = src[2-9]
335 pmaddwd m5, m0
336 palignr m6, m3, m2, 6 ; m6 = src[3-10]
337 pmaddwd m6, m0
338 phaddd m5, m6
339 phaddd m4, m5
340 paddd m4, m1
341
342 palignr m5, m3, m2, 8 ; m5 = src[4-11]
343 pmaddwd m5, m0
344 palignr m6, m3, m2, 10 ; m6 = src[5-12]
345 pmaddwd m6, m0
346 phaddd m5, m6
347
348 palignr m6, m3, m2, 12 ; m6 = src[6-13]
349 pmaddwd m6, m0
350 palignr m7, m3, m2, 14 ; m7 = src[7-14] (m3 must survive for the next group)
351 pmaddwd m7, m0
352 phaddd m6, m7
353 phaddd m5, m6
354 paddd m5, m1
355 %ifidn %3, pp
356 psrad m4, 6
357 psrad m5, 6
358 packusdw m4, m5
359 pxor m5, m5
360 CLIPW m4, m5, [pw_pixel_max]
361 %else
362 psrad m4, 2
363 psrad m5, 2
364 packssdw m4, m5
365 %endif
366
367 movu [r2], m4
368
; ---- outputs 8-11 ----
369 movu m2, [r0 + 32] ; m2 = src[16-23]
370
371 pmaddwd m4, m3, m0 ; m3 = src[8-15]
372 palignr m5, m2, m3, 2 ; m5 = src[9-16]
373 pmaddwd m5, m0
374 phaddd m4, m5
375
376 palignr m5, m2, m3, 4 ; m5 = src[10-17]
377 pmaddwd m5, m0
378 palignr m2, m3, 6 ; m2 = src[11-18]
379 pmaddwd m2, m0
380 phaddd m5, m2
381 phaddd m4, m5
382 paddd m4, m1
383 %ifidn %3, pp
384 psrad m4, 6
385 packusdw m4, m4
386 pxor m5, m5
387 CLIPW m4, m5, [pw_pixel_max]
388 %else
389 psrad m4, 2
390 packssdw m4, m4
391 %endif
392
393 movh [r2 + 16], m4 ; store the last 4 samples of the row
394
395 add r0, r1
396 add r2, r3
397
398 dec r4d
399 jnz .loopH
400 RET
401 %endmacro
402
403 ;-------------------------------------------------------------------------------------------------------------
404 ; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
405 ;-------------------------------------------------------------------------------------------------------------
; Arguments: width, height, variant.
406 FILTER_HOR_LUMA_W12 12, 16, pp
407
408 ;----------------------------------------------------------------------------------------------------------------------------
409 ; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
410 ;----------------------------------------------------------------------------------------------------------------------------
411 FILTER_HOR_LUMA_W12 12, 16, ps
412
413 ;--------------------------------------------------------------------------------------------------------------
414 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt])
415 ;--------------------------------------------------------------------------------------------------------------
; 8-tap horizontal luma filter for blocks whose width is a multiple of 16
; samples (16-bit samples).
; Macro arguments: 1 = block width, 2 = block height, 3 = "pp" or "ps".
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; (later reused as the row counter), r6 = scratch.
; The row body is unrolled (width / 16) times; the assemble-time variable x is
; the byte offset of the current 16-sample group (it advances by 32 bytes), and
; the src[...] ranges in the comments are relative to that group.
;   pp: (sum + pd_32) >> 6, clamped to [0, pw_pixel_max].
;   ps: (sum + pd_n32768) >> 2, packed with signed saturation; a non-zero
;       isRowExt (r5m) starts 3 rows above src and adds 7 rows.
416 %macro FILTER_HOR_LUMA_W16 3
417 INIT_XMM sse4
418 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
419
420 add r1, r1 ; double both strides: 2 bytes per sample
421 add r3, r3
422 mov r4d, r4m
423 sub r0, 6 ; step back 3 samples (6 bytes)
424 shl r4d, 4 ; coeffIdx * 16 = byte offset into tab_LumaCoeff
425
426 %ifdef PIC
427 lea r6, [tab_LumaCoeff]
428 mova m0, [r6 + r4]
429 %else
430 mova m0, [tab_LumaCoeff + r4]
431 %endif
432
433 %ifidn %3, pp
434 mova m1, [pd_32]
435 %else
436 mova m1, [pd_n32768]
437 %endif
438
439 mov r4d, %2 ; r4d = number of rows to produce
440 %ifidn %3, ps
441 cmp r5m, byte 0
442 je .loopH
; isRowExt != 0: start 3 rows earlier and produce 7 extra rows
443 lea r6, [r1 + 2 * r1]
444 sub r0, r6
445 add r4d, 7
446 %endif
447
448 .loopH:
449 %assign x 0
450 %rep %1 / 16
; ---- outputs 0-7 of this group ----
451 movu m2, [r0 + x] ; m2 = src[0-7]
452 movu m3, [r0 + 16 + x] ; m3 = src[8-15]
453
454 pmaddwd m4, m2, m0
455 palignr m5, m3, m2, 2 ; m5 = src[1-8]
456 pmaddwd m5, m0
457 phaddd m4, m5
458
459 palignr m5, m3, m2, 4 ; m5 = src[2-9]
460 pmaddwd m5, m0
461 palignr m6, m3, m2, 6 ; m6 = src[3-10]
462 pmaddwd m6, m0
463 phaddd m5, m6
464 phaddd m4, m5
465 paddd m4, m1
466
467 palignr m5, m3, m2, 8 ; m5 = src[4-11]
468 pmaddwd m5, m0
469 palignr m6, m3, m2, 10 ; m6 = src[5-12]
470 pmaddwd m6, m0
471 phaddd m5, m6
472
473 palignr m6, m3, m2, 12 ; m6 = src[6-13]
474 pmaddwd m6, m0
475 palignr m7, m3, m2, 14 ; m7 = src[7-14] (m3 must survive for the next half)
476 pmaddwd m7, m0
477 phaddd m6, m7
478 phaddd m5, m6
479 paddd m5, m1
480 %ifidn %3, pp
481 psrad m4, 6
482 psrad m5, 6
483 packusdw m4, m5
484 pxor m5, m5
485 CLIPW m4, m5, [pw_pixel_max]
486 %else
487 psrad m4, 2
488 psrad m5, 2
489 packssdw m4, m5
490 %endif
491 movu [r2 + x], m4
492
; ---- outputs 8-15 of this group ----
493 movu m2, [r0 + 32 + x] ; m2 = src[16-23]
494
495 pmaddwd m4, m3, m0 ; m3 = src[8-15]
496 palignr m5, m2, m3, 2 ; m5 = src[9-16]
497 pmaddwd m5, m0
498 phaddd m4, m5
499
500 palignr m5, m2, m3, 4 ; m5 = src[10-17]
501 pmaddwd m5, m0
502 palignr m6, m2, m3, 6 ; m6 = src[11-18]
503 pmaddwd m6, m0
504 phaddd m5, m6
505 phaddd m4, m5
506 paddd m4, m1
507
508 palignr m5, m2, m3, 8 ; m5 = src[12-19]
509 pmaddwd m5, m0
510 palignr m6, m2, m3, 10 ; m6 = src[13-20]
511 pmaddwd m6, m0
512 phaddd m5, m6
513
514 palignr m6, m2, m3, 12 ; m6 = src[14-21]
515 pmaddwd m6, m0
516 palignr m2, m3, 14 ; m2 = src[15-22]
517 pmaddwd m2, m0
518 phaddd m6, m2
519 phaddd m5, m6
520 paddd m5, m1
521 %ifidn %3, pp
522 psrad m4, 6
523 psrad m5, 6
524 packusdw m4, m5
525 pxor m5, m5
526 CLIPW m4, m5, [pw_pixel_max]
527 %else
528 psrad m4, 2
529 psrad m5, 2
530 packssdw m4, m5
531 %endif
532 movu [r2 + 16 + x], m4
533
534 %assign x x+32
535 %endrep
536
537 add r0, r1
538 add r2, r3
539
540 dec r4d
541 jnz .loopH
542 RET
543 %endmacro
544
545 ;-------------------------------------------------------------------------------------------------------------
546 ; void interp_8tap_horiz_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
547 ;-------------------------------------------------------------------------------------------------------------
; Arguments: width, height, variant. Widths 32, 48 and 64 reuse the 16-wide
; macro, which unrolls its row body width / 16 times.
548 FILTER_HOR_LUMA_W16 16, 4, pp
549 FILTER_HOR_LUMA_W16 16, 8, pp
550 FILTER_HOR_LUMA_W16 16, 12, pp
551 FILTER_HOR_LUMA_W16 16, 16, pp
552 FILTER_HOR_LUMA_W16 16, 32, pp
553 FILTER_HOR_LUMA_W16 16, 64, pp
554
555 ;----------------------------------------------------------------------------------------------------------------------------
556 ; void interp_8tap_horiz_ps_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
557 ;----------------------------------------------------------------------------------------------------------------------------
558 FILTER_HOR_LUMA_W16 16, 4, ps
559 FILTER_HOR_LUMA_W16 16, 8, ps
560 FILTER_HOR_LUMA_W16 16, 12, ps
561 FILTER_HOR_LUMA_W16 16, 16, ps
562 FILTER_HOR_LUMA_W16 16, 32, ps
563 FILTER_HOR_LUMA_W16 16, 64, ps
564
565 ;-------------------------------------------------------------------------------------------------------------
566 ; void interp_8tap_horiz_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
567 ;-------------------------------------------------------------------------------------------------------------
568 FILTER_HOR_LUMA_W16 32, 8, pp
569 FILTER_HOR_LUMA_W16 32, 16, pp
570 FILTER_HOR_LUMA_W16 32, 24, pp
571 FILTER_HOR_LUMA_W16 32, 32, pp
572 FILTER_HOR_LUMA_W16 32, 64, pp
573
574 ;----------------------------------------------------------------------------------------------------------------------------
575 ; void interp_8tap_horiz_ps_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
576 ;----------------------------------------------------------------------------------------------------------------------------
577 FILTER_HOR_LUMA_W16 32, 8, ps
578 FILTER_HOR_LUMA_W16 32, 16, ps
579 FILTER_HOR_LUMA_W16 32, 24, ps
580 FILTER_HOR_LUMA_W16 32, 32, ps
581 FILTER_HOR_LUMA_W16 32, 64, ps
582
583 ;-------------------------------------------------------------------------------------------------------------
584 ; void interp_8tap_horiz_pp_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
585 ;-------------------------------------------------------------------------------------------------------------
586 FILTER_HOR_LUMA_W16 48, 64, pp
587
588 ;----------------------------------------------------------------------------------------------------------------------------
589 ; void interp_8tap_horiz_ps_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
590 ;----------------------------------------------------------------------------------------------------------------------------
591 FILTER_HOR_LUMA_W16 48, 64, ps
592
593 ;-------------------------------------------------------------------------------------------------------------
594 ; void interp_8tap_horiz_pp_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
595 ;-------------------------------------------------------------------------------------------------------------
596 FILTER_HOR_LUMA_W16 64, 16, pp
597 FILTER_HOR_LUMA_W16 64, 32, pp
598 FILTER_HOR_LUMA_W16 64, 48, pp
599 FILTER_HOR_LUMA_W16 64, 64, pp
600
601 ;----------------------------------------------------------------------------------------------------------------------------
602 ; void interp_8tap_horiz_ps_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
603 ;----------------------------------------------------------------------------------------------------------------------------
604 FILTER_HOR_LUMA_W16 64, 16, ps
605 FILTER_HOR_LUMA_W16 64, 32, ps
606 FILTER_HOR_LUMA_W16 64, 48, ps
607 FILTER_HOR_LUMA_W16 64, 64, ps
608
609 ;--------------------------------------------------------------------------------------------------------------
610 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt])
611 ;--------------------------------------------------------------------------------------------------------------
; 8-tap horizontal luma filter for 24-sample-wide blocks of 16-bit samples.
; Macro arguments: 1 = block width (24), 2 = block height, 3 = "pp" or "ps".
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
; (later reused as the row counter), r6 = scratch.
; Each row is produced as three groups of 8 samples; m2 and m3 alternate as
; the "current" and "next" 8-sample source vectors, so neither may be
; overwritten inside a group (m7 is used as the extra temporary).
;   pp: (sum + pd_32) >> 6, clamped to [0, pw_pixel_max].
;   ps: (sum + pd_n32768) >> 2, packed with signed saturation; a non-zero
;       isRowExt (r5m) starts 3 rows above src and adds 7 rows.
612 %macro FILTER_HOR_LUMA_W24 3
613 INIT_XMM sse4
614 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
615
616 add r1, r1 ; double both strides: 2 bytes per sample
617 add r3, r3
618 mov r4d, r4m
619 sub r0, 6 ; step back 3 samples (6 bytes)
620 shl r4d, 4 ; coeffIdx * 16 = byte offset into tab_LumaCoeff
621
622 %ifdef PIC
623 lea r6, [tab_LumaCoeff]
624 mova m0, [r6 + r4]
625 %else
626 mova m0, [tab_LumaCoeff + r4]
627 %endif
628 %ifidn %3, pp
629 mova m1, [pd_32]
630 %else
631 mova m1, [pd_n32768]
632 %endif
633
634 mov r4d, %2 ; r4d = number of rows to produce
635 %ifidn %3, ps
636 cmp r5m, byte 0
637 je .loopH
; isRowExt != 0: start 3 rows earlier and produce 7 extra rows
638 lea r6, [r1 + 2 * r1]
639 sub r0, r6
640 add r4d, 7
641 %endif
642
643 .loopH:
; ---- outputs 0-7 ----
644 movu m2, [r0] ; m2 = src[0-7]
645 movu m3, [r0 + 16] ; m3 = src[8-15]
646
647 pmaddwd m4, m2, m0
648 palignr m5, m3, m2, 2 ; m5 = src[1-8]
649 pmaddwd m5, m0
650 phaddd m4, m5
651
652 palignr m5, m3, m2, 4 ; m5 = src[2-9]
653 pmaddwd m5, m0
654 palignr m6, m3, m2, 6 ; m6 = src[3-10]
655 pmaddwd m6, m0
656 phaddd m5, m6
657 phaddd m4, m5
658 paddd m4, m1
659
660 palignr m5, m3, m2, 8 ; m5 = src[4-11]
661 pmaddwd m5, m0
662 palignr m6, m3, m2, 10 ; m6 = src[5-12]
663 pmaddwd m6, m0
664 phaddd m5, m6
665
666 palignr m6, m3, m2, 12 ; m6 = src[6-13]
667 pmaddwd m6, m0
668 palignr m7, m3, m2, 14 ; m7 = src[7-14]
669 pmaddwd m7, m0
670 phaddd m6, m7
671 phaddd m5, m6
672 paddd m5, m1
673 %ifidn %3, pp
674 psrad m4, 6
675 psrad m5, 6
676 packusdw m4, m5
677 pxor m5, m5
678 CLIPW m4, m5, [pw_pixel_max]
679 %else
680 psrad m4, 2
681 psrad m5, 2
682 packssdw m4, m5
683 %endif
684 movu [r2], m4
685
; ---- outputs 8-15 ----
686 movu m2, [r0 + 32] ; m2 = src[16-23]
687
688 pmaddwd m4, m3, m0 ; m3 = src[8-15]
689 palignr m5, m2, m3, 2 ; m5 = src[9-16]
690 pmaddwd m5, m0
691 phaddd m4, m5
692
693 palignr m5, m2, m3, 4 ; m5 = src[10-17]
694 pmaddwd m5, m0
695 palignr m6, m2, m3, 6 ; m6 = src[11-18]
696 pmaddwd m6, m0
697 phaddd m5, m6
698 phaddd m4, m5
699 paddd m4, m1
700
701 palignr m5, m2, m3, 8 ; m5 = src[12-19]
702 pmaddwd m5, m0
703 palignr m6, m2, m3, 10 ; m6 = src[13-20]
704 pmaddwd m6, m0
705 phaddd m5, m6
706
707 palignr m6, m2, m3, 12 ; m6 = src[14-21]
708 pmaddwd m6, m0
709 palignr m7, m2, m3, 14 ; m7 = src[15-22]
710 pmaddwd m7, m0
711 phaddd m6, m7
712 phaddd m5, m6
713 paddd m5, m1
714 %ifidn %3, pp
715 psrad m4, 6
716 psrad m5, 6
717 packusdw m4, m5
718 pxor m5, m5
719 CLIPW m4, m5, [pw_pixel_max]
720 %else
721 psrad m4, 2
722 psrad m5, 2
723 packssdw m4, m5
724 %endif
725 movu [r2 + 16], m4
726
; ---- outputs 16-23 ----
727 movu m3, [r0 + 48] ; m3 = src[24-31]
728
729 pmaddwd m4, m2, m0 ; m2 = src[16-23]
730 palignr m5, m3, m2, 2 ; m5 = src[17-24]
731 pmaddwd m5, m0
732 phaddd m4, m5
733
734 palignr m5, m3, m2, 4 ; m5 = src[18-25]
735 pmaddwd m5, m0
736 palignr m6, m3, m2, 6 ; m6 = src[19-26]
737 pmaddwd m6, m0
738 phaddd m5, m6
739 phaddd m4, m5
740 paddd m4, m1
741
742 palignr m5, m3, m2, 8 ; m5 = src[20-27]
743 pmaddwd m5, m0
744 palignr m6, m3, m2, 10 ; m6 = src[21-28]
745 pmaddwd m6, m0
746 phaddd m5, m6
747
748 palignr m6, m3, m2, 12 ; m6 = src[22-29]
749 pmaddwd m6, m0
750 palignr m7, m3, m2, 14 ; m7 = src[23-30]
751 pmaddwd m7, m0
752 phaddd m6, m7
753 phaddd m5, m6
754 paddd m5, m1
755 %ifidn %3, pp
756 psrad m4, 6
757 psrad m5, 6
758 packusdw m4, m5
759 pxor m5, m5
760 CLIPW m4, m5, [pw_pixel_max]
761 %else
762 psrad m4, 2
763 psrad m5, 2
764 packssdw m4, m5
765 %endif
766 movu [r2 + 32], m4
767
768 add r0, r1
769 add r2, r3
770
771 dec r4d
772 jnz .loopH
773 RET
774 %endmacro
775
776 ;-------------------------------------------------------------------------------------------------------------
777 ; void interp_8tap_horiz_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
778 ;-------------------------------------------------------------------------------------------------------------
; Arguments: width, height, variant.
779 FILTER_HOR_LUMA_W24 24, 32, pp
780
781 ;----------------------------------------------------------------------------------------------------------------------------
782 ; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
783 ;----------------------------------------------------------------------------------------------------------------------------
784 FILTER_HOR_LUMA_W24 24, 32, ps
785
; 4-tap horizontal chroma filter: two rows of a 2-sample-wide block.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of row 0, r1 = source stride in bytes,
; r2 = dst, r3 = destination stride in bytes, m0 = the 4 coefficients
; duplicated in both halves, m1 = rounding/offset term, m2 = tab_Tm16 shuffle
; mask; pp only: m7 = lower clip bound, m6 = upper clip bound.
; Clobbers m3, m4. Does not advance r0/r2.
786 %macro FILTER_W2_2 1
787 movu m3, [r0]
788 pshufb m3, m3, m2
789 pmaddwd m3, m0
790 movu m4, [r0 + r1]
791 pshufb m4, m4, m2
792 pmaddwd m4, m0
793 phaddd m3, m4 ; m3 = row0 px0, row0 px1, row1 px0, row1 px1
794 paddd m3, m1
795 %ifidn %1, pp
796 psrad m3, 6
797 packusdw m3, m3
798 CLIPW m3, m7, m6
799 %else
800 psrad m3, 2
801 packssdw m3, m3
802 %endif
803 movd [r2], m3 ; row 0: 2 samples
804 pextrd [r2 + r3], m3, 1 ; row 1: 2 samples
805 %endmacro
806
; 4-tap horizontal chroma filter: two rows of a 4-sample-wide block.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of row 0, r1 = source stride in bytes,
; r2 = dst, r3 = destination stride in bytes, m0 = the 4 coefficients
; duplicated in both halves, m1 = rounding/offset term, m2 = tab_Tm16 shuffle
; mask; pp only: m7 = lower clip bound, m6 = upper clip bound.
; Clobbers m3, m4, m5. Does not advance r0/r2.
807 %macro FILTER_W4_2 1
; row 0: samples 0-1 from [r0], samples 2-3 from [r0 + 4]
808 movu m3, [r0]
809 pshufb m3, m3, m2
810 pmaddwd m3, m0
811 movu m4, [r0 + 4]
812 pshufb m4, m4, m2
813 pmaddwd m4, m0
814 phaddd m3, m4
815 paddd m3, m1
816
; row 1
817 movu m5, [r0 + r1]
818 pshufb m5, m5, m2
819 pmaddwd m5, m0
820 movu m4, [r0 + r1 + 4]
821 pshufb m4, m4, m2
822 pmaddwd m4, m0
823 phaddd m5, m4
824 paddd m5, m1
825 %ifidn %1, pp
826 psrad m3, 6
827 psrad m5, 6
828 packusdw m3, m5
829 CLIPW m3, m7, m6
830 %else
831 psrad m3, 2
832 psrad m5, 2
833 packssdw m3, m5
834 %endif
835 movh [r2], m3 ; row 0: 4 samples
836 movhps [r2 + r3], m3 ; row 1: 4 samples
837 %endmacro
838
839 ;-----------------------------------------------------------------------------
840 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt])
841 ;-----------------------------------------------------------------------------
; 4-tap horizontal chroma filter for 2- and 4-sample-wide blocks, fully
; unrolled over the block height in steps of two rows.
; Macro arguments: 1 = width (2 or 4, selects FILTER_W2_2 / FILTER_W4_2),
; 2 = height, 3 = "pp" or "ps", 4 = number of general-purpose registers,
; 5 = number of vector registers, 6 = index of the register used as table base
; under PIC.
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx.
;   pp: (sum + tab_c_32) >> 6, clamped to [0, pw_pixel_max].
;   ps: (sum + tab_c_n32768) >> 2, packed with signed saturation. When the
;       sixth argument (isRowExt, read through r5m) is non-zero, filtering
;       starts one row above src and three extra rows are produced.
842 %macro FILTER_CHROMA_H 6
843 INIT_XMM sse4
844 cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5
845
846 add r3, r3 ; double both strides: 2 bytes per sample
847 add r1, r1
848 sub r0, 2 ; step back 1 sample (2 bytes)
849 mov r4d, r4m
850 add r4d, r4d ; with the "* 4" scale below: coeffIdx * 8 bytes per table row
851
852 %ifdef PIC
853 lea r%6, [tab_ChromaCoeff]
854 movh m0, [r%6 + r4 * 4]
855 %else
856 movh m0, [tab_ChromaCoeff + r4 * 4]
857 %endif
858
859 punpcklqdq m0, m0 ; duplicate the 4 taps into both halves
860 mova m2, [tab_Tm16]
861
862 %ifidn %3, ps
863 mova m1, [tab_c_n32768]
864 cmp r5m, byte 0
865 je .skip
; Row extension: filter the single row above src on its own ...
866 sub r0, r1
867 movu m3, [r0]
868 pshufb m3, m3, m2
869 pmaddwd m3, m0
870
871 %if %1 == 4
872 movu m4, [r0 + 4]
873 pshufb m4, m4, m2
874 pmaddwd m4, m0
875 phaddd m3, m4
876 %else
877 phaddd m3, m3
878 %endif
879
880 paddd m3, m1
881 psrad m3, 2
882 packssdw m3, m3
883
884 %if %1 == 2
885 movd [r2], m3
886 %else
887 movh [r2], m3
888 %endif
889
; ... then one extra pair of rows, before joining the common path.
890 add r0, r1
891 add r2, r3
892 FILTER_W%1_2 %3
893 lea r0, [r0 + 2 * r1]
894 lea r2, [r2 + 2 * r3]
895
896 .skip:
897
898 %else ;%ifidn %3, ps
899 pxor m7, m7 ; lower clip bound (0)
900 mova m6, [pw_pixel_max] ; upper clip bound
901 mova m1, [tab_c_32]
902 %endif ;%ifidn %3, ps
903
; Common path: height / 2 row pairs.
904 FILTER_W%1_2 %3
905
906 %rep (%2/2) - 1
907 lea r0, [r0 + 2 * r1]
908 lea r2, [r2 + 2 * r3]
909 FILTER_W%1_2 %3
910 %endrep
911
912 RET
913 %endmacro
914
; Arguments: width, height, variant, GPR count, vector register count,
; PIC table-base register index. The ps variants take a sixth function
; argument, so they declare 7 GPRs and use r6 instead of r5 as the table base.
915 FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
916 FILTER_CHROMA_H 2, 8, pp, 6, 8, 5
917 FILTER_CHROMA_H 4, 2, pp, 6, 8, 5
918 FILTER_CHROMA_H 4, 4, pp, 6, 8, 5
919 FILTER_CHROMA_H 4, 8, pp, 6, 8, 5
920 FILTER_CHROMA_H 4, 16, pp, 6, 8, 5
921
922 FILTER_CHROMA_H 2, 4, ps, 7, 5, 6
923 FILTER_CHROMA_H 2, 8, ps, 7, 5, 6
924 FILTER_CHROMA_H 4, 2, ps, 7, 6, 6
925 FILTER_CHROMA_H 4, 4, ps, 7, 6, 6
926 FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
927 FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
928
929 FILTER_CHROMA_H 2, 16, pp, 6, 8, 5
930 FILTER_CHROMA_H 4, 32, pp, 6, 8, 5
931 FILTER_CHROMA_H 2, 16, ps, 7, 5, 6
932 FILTER_CHROMA_H 4, 32, ps, 7, 6, 6
933
934
; 4-tap horizontal chroma filter: one row of 6 output samples.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of the row, r2 = dst, m0 = coefficients,
; m1 = rounding/offset term, m2 = shuffle mask; pp only: m6 and m7 are the
; clip operands passed to CLIPW.
; NOTE(review): the clip registers are swapped relative to FILTER_W2_2 /
; FILTER_W4_2 (here m6 comes first, m7 second); the code that loads them is
; not next to this macro - confirm that callers set m6 = min, m7 = max.
; Clobbers m3, m4. Stores 12 bytes at [r2].
935 %macro FILTER_W6_1 1
936 movu m3, [r0]
937 pshufb m3, m3, m2
938 pmaddwd m3, m0
939 movu m4, [r0 + 4]
940 pshufb m4, m4, m2
941 pmaddwd m4, m0
942 phaddd m3, m4 ; m3 = sums for samples 0-3
943 paddd m3, m1
944
945 movu m4, [r0 + 8]
946 pshufb m4, m4, m2
947 pmaddwd m4, m0
948 phaddd m4, m4 ; m4 = sums for samples 4-5 (duplicated)
949 paddd m4, m1
950 %ifidn %1, pp
951 psrad m3, 6
952 psrad m4, 6
953 packusdw m3, m4
954 CLIPW m3, m6, m7
955 %else
956 psrad m3, 2
957 psrad m4, 2
958 packssdw m3, m4
959 %endif
960 movh [r2], m3 ; samples 0-3
961 pextrd [r2 + 8], m3, 2 ; samples 4-5
962 %endmacro
963
; Out-of-line "pp" instance of FILTER_W6_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
964 cglobal chroma_filter_pp_6x1_internal
965 FILTER_W6_1 pp
966 ret
967
; Out-of-line "ps" instance of FILTER_W6_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
968 cglobal chroma_filter_ps_6x1_internal
969 FILTER_W6_1 ps
970 ret
971
; 4-tap horizontal chroma filter: one row of 8 output samples.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of the row, r2 = dst, m0 = coefficients,
; m1 = rounding/offset term, m2 = shuffle mask; pp only: m6 and m7 are the
; clip operands passed to CLIPW (NOTE(review): set by callers not adjacent to
; this macro - confirm m6 = min, m7 = max).
; Clobbers m3, m4, m5. Stores 16 bytes at [r2].
972 %macro FILTER_W8_1 1
973 movu m3, [r0]
974 pshufb m3, m3, m2
975 pmaddwd m3, m0
976 movu m4, [r0 + 4]
977 pshufb m4, m4, m2
978 pmaddwd m4, m0
979 phaddd m3, m4 ; m3 = sums for samples 0-3
980 paddd m3, m1
981
982 movu m5, [r0 + 8]
983 pshufb m5, m5, m2
984 pmaddwd m5, m0
985 movu m4, [r0 + 12]
986 pshufb m4, m4, m2
987 pmaddwd m4, m0
988 phaddd m5, m4 ; m5 = sums for samples 4-7
989 paddd m5, m1
990 %ifidn %1, pp
991 psrad m3, 6
992 psrad m5, 6
993 packusdw m3, m5
994 CLIPW m3, m6, m7
995 %else
996 psrad m3, 2
997 psrad m5, 2
998 packssdw m3, m5
999 %endif
1000 movh [r2], m3
1001 movhps [r2 + 8], m3
1002 %endmacro
1003
; Out-of-line "pp" instance of FILTER_W8_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1004 cglobal chroma_filter_pp_8x1_internal
1005 FILTER_W8_1 pp
1006 ret
1007
; Out-of-line "ps" instance of FILTER_W8_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1008 cglobal chroma_filter_ps_8x1_internal
1009 FILTER_W8_1 ps
1010 ret
1011
; 4-tap horizontal chroma filter: one row of 12 output samples, produced as a
; group of 8 followed by a group of 4.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of the row, r2 = dst, m0 = coefficients,
; m1 = rounding/offset term, m2 = shuffle mask; pp only: m6 and m7 are the
; clip operands passed to CLIPW (NOTE(review): set by callers not adjacent to
; this macro - confirm m6 = min, m7 = max).
; Clobbers m3, m4, m5. Stores 24 bytes at [r2].
1012 %macro FILTER_W12_1 1
; ---- samples 0-7 ----
1013 movu m3, [r0]
1014 pshufb m3, m3, m2
1015 pmaddwd m3, m0
1016 movu m4, [r0 + 4]
1017 pshufb m4, m4, m2
1018 pmaddwd m4, m0
1019 phaddd m3, m4
1020 paddd m3, m1
1021
1022 movu m5, [r0 + 8]
1023 pshufb m5, m5, m2
1024 pmaddwd m5, m0
1025 movu m4, [r0 + 12]
1026 pshufb m4, m4, m2
1027 pmaddwd m4, m0
1028 phaddd m5, m4
1029 paddd m5, m1
1030 %ifidn %1, pp
1031 psrad m3, 6
1032 psrad m5, 6
1033 packusdw m3, m5
1034 CLIPW m3, m6, m7
1035 %else
1036 psrad m3, 2
1037 psrad m5, 2
1038 packssdw m3, m5
1039 %endif
1040 movh [r2], m3
1041 movhps [r2 + 8], m3
1042
; ---- samples 8-11 ----
1043 movu m3, [r0 + 16]
1044 pshufb m3, m3, m2
1045 pmaddwd m3, m0
1046 movu m4, [r0 + 20]
1047 pshufb m4, m4, m2
1048 pmaddwd m4, m0
1049 phaddd m3, m4
1050 paddd m3, m1
1051
1052 %ifidn %1, pp
1053 psrad m3, 6
1054 packusdw m3, m3
1055 CLIPW m3, m6, m7
1056 %else
1057 psrad m3, 2
1058 packssdw m3, m3
1059 %endif
1060 movh [r2 + 16], m3
1061 %endmacro
1062
; Out-of-line "pp" instance of FILTER_W12_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1063 cglobal chroma_filter_pp_12x1_internal
1064 FILTER_W12_1 pp
1065 ret
1066
; Out-of-line "ps" instance of FILTER_W12_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1067 cglobal chroma_filter_ps_12x1_internal
1068 FILTER_W12_1 ps
1069 ret
1070
; 4-tap horizontal chroma filter: one row of 16 output samples, produced as
; two groups of 8.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of the row, r2 = dst, m0 = coefficients,
; m1 = rounding/offset term, m2 = shuffle mask; pp only: m6 and m7 are the
; clip operands passed to CLIPW (NOTE(review): set by callers not adjacent to
; this macro - confirm m6 = min, m7 = max).
; Clobbers m3, m4, m5. Stores 32 bytes at [r2].
1071 %macro FILTER_W16_1 1
; ---- samples 0-7 ----
1072 movu m3, [r0]
1073 pshufb m3, m3, m2
1074 pmaddwd m3, m0
1075 movu m4, [r0 + 4]
1076 pshufb m4, m4, m2
1077 pmaddwd m4, m0
1078 phaddd m3, m4
1079 paddd m3, m1
1080
1081 movu m5, [r0 + 8]
1082 pshufb m5, m5, m2
1083 pmaddwd m5, m0
1084 movu m4, [r0 + 12]
1085 pshufb m4, m4, m2
1086 pmaddwd m4, m0
1087 phaddd m5, m4
1088 paddd m5, m1
1089 %ifidn %1, pp
1090 psrad m3, 6
1091 psrad m5, 6
1092 packusdw m3, m5
1093 CLIPW m3, m6, m7
1094 %else
1095 psrad m3, 2
1096 psrad m5, 2
1097 packssdw m3, m5
1098 %endif
1099 movh [r2], m3
1100 movhps [r2 + 8], m3
1101
; ---- samples 8-15 ----
1102 movu m3, [r0 + 16]
1103 pshufb m3, m3, m2
1104 pmaddwd m3, m0
1105 movu m4, [r0 + 20]
1106 pshufb m4, m4, m2
1107 pmaddwd m4, m0
1108 phaddd m3, m4
1109 paddd m3, m1
1110
1111 movu m5, [r0 + 24]
1112 pshufb m5, m5, m2
1113 pmaddwd m5, m0
1114 movu m4, [r0 + 28]
1115 pshufb m4, m4, m2
1116 pmaddwd m4, m0
1117 phaddd m5, m4
1118 paddd m5, m1
1119 %ifidn %1, pp
1120 psrad m3, 6
1121 psrad m5, 6
1122 packusdw m3, m5
1123 CLIPW m3, m6, m7
1124 %else
1125 psrad m3, 2
1126 psrad m5, 2
1127 packssdw m3, m5
1128 %endif
1129 movh [r2 + 16], m3
1130 movhps [r2 + 24], m3
1131 %endmacro
1132
; Out-of-line "pp" instance of FILTER_W16_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1133 cglobal chroma_filter_pp_16x1_internal
1134 FILTER_W16_1 pp
1135 ret
1136
; Out-of-line "ps" instance of FILTER_W16_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1137 cglobal chroma_filter_ps_16x1_internal
1138 FILTER_W16_1 ps
1139 ret
1140
; 4-tap horizontal chroma filter: one row of 24 output samples, produced as
; three groups of 8.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of the row, r2 = dst, m0 = coefficients,
; m1 = rounding/offset term, m2 = shuffle mask; pp only: m6 and m7 are the
; clip operands passed to CLIPW (NOTE(review): set by callers not adjacent to
; this macro - confirm m6 = min, m7 = max).
; Clobbers m3, m4, m5. Stores 48 bytes at [r2].
1141 %macro FILTER_W24_1 1
; ---- samples 0-7 ----
1142 movu m3, [r0]
1143 pshufb m3, m3, m2
1144 pmaddwd m3, m0
1145 movu m4, [r0 + 4]
1146 pshufb m4, m4, m2
1147 pmaddwd m4, m0
1148 phaddd m3, m4
1149 paddd m3, m1
1150
1151 movu m5, [r0 + 8]
1152 pshufb m5, m5, m2
1153 pmaddwd m5, m0
1154 movu m4, [r0 + 12]
1155 pshufb m4, m4, m2
1156 pmaddwd m4, m0
1157 phaddd m5, m4
1158 paddd m5, m1
1159 %ifidn %1, pp
1160 psrad m3, 6
1161 psrad m5, 6
1162 packusdw m3, m5
1163 CLIPW m3, m6, m7
1164 %else
1165 psrad m3, 2
1166 psrad m5, 2
1167 packssdw m3, m5
1168 %endif
1169 movh [r2], m3
1170 movhps [r2 + 8], m3
1171
; ---- samples 8-15 ----
1172 movu m3, [r0 + 16]
1173 pshufb m3, m3, m2
1174 pmaddwd m3, m0
1175 movu m4, [r0 + 20]
1176 pshufb m4, m4, m2
1177 pmaddwd m4, m0
1178 phaddd m3, m4
1179 paddd m3, m1
1180
1181 movu m5, [r0 + 24]
1182 pshufb m5, m5, m2
1183 pmaddwd m5, m0
1184 movu m4, [r0 + 28]
1185 pshufb m4, m4, m2
1186 pmaddwd m4, m0
1187 phaddd m5, m4
1188 paddd m5, m1
1189 %ifidn %1, pp
1190 psrad m3, 6
1191 psrad m5, 6
1192 packusdw m3, m5
1193 CLIPW m3, m6, m7
1194 %else
1195 psrad m3, 2
1196 psrad m5, 2
1197 packssdw m3, m5
1198 %endif
1199 movh [r2 + 16], m3
1200 movhps [r2 + 24], m3
1201
; ---- samples 16-23 ----
1202 movu m3, [r0 + 32]
1203 pshufb m3, m3, m2
1204 pmaddwd m3, m0
1205 movu m4, [r0 + 36]
1206 pshufb m4, m4, m2
1207 pmaddwd m4, m0
1208 phaddd m3, m4
1209 paddd m3, m1
1210
1211 movu m5, [r0 + 40]
1212 pshufb m5, m5, m2
1213 pmaddwd m5, m0
1214 movu m4, [r0 + 44]
1215 pshufb m4, m4, m2
1216 pmaddwd m4, m0
1217 phaddd m5, m4
1218 paddd m5, m1
1219 %ifidn %1, pp
1220 psrad m3, 6
1221 psrad m5, 6
1222 packusdw m3, m5
1223 CLIPW m3, m6, m7
1224 %else
1225 psrad m3, 2
1226 psrad m5, 2
1227 packssdw m3, m5
1228 %endif
1229 movh [r2 + 32], m3
1230 movhps [r2 + 40], m3
1231 %endmacro
1232
; Out-of-line "pp" instance of FILTER_W24_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1233 cglobal chroma_filter_pp_24x1_internal
1234 FILTER_W24_1 pp
1235 ret
1236
; Out-of-line "ps" instance of FILTER_W24_1. No cglobal arguments are declared:
; inputs and outputs are the registers documented on the macro.
1237 cglobal chroma_filter_ps_24x1_internal
1238 FILTER_W24_1 ps
1239 ret
1240
; 4-tap horizontal chroma filter: one row of 32 output samples, produced as
; four groups of 8.
; Macro argument 1 = "pp" or "ps".
; In: r0 = first tap sample of the row, r2 = dst, m0 = coefficients,
; m1 = rounding/offset term, m2 = shuffle mask; pp only: m6 and m7 are the
; clip operands passed to CLIPW (NOTE(review): set by callers not adjacent to
; this macro - confirm m6 = min, m7 = max).
; Clobbers m3, m4, m5. Stores 64 bytes at [r2].
1241 %macro FILTER_W32_1 1
; ---- samples 0-7 ----
1242 movu m3, [r0]
1243 pshufb m3, m3, m2
1244 pmaddwd m3, m0
1245 movu m4, [r0 + 4]
1246 pshufb m4, m4, m2
1247 pmaddwd m4, m0
1248 phaddd m3, m4
1249 paddd m3, m1
1250
1251 movu m5, [r0 + 8]
1252 pshufb m5, m5, m2
1253 pmaddwd m5, m0
1254 movu m4, [r0 + 12]
1255 pshufb m4, m4, m2
1256 pmaddwd m4, m0
1257 phaddd m5, m4
1258 paddd m5, m1
1259 %ifidn %1, pp
1260 psrad m3, 6
1261 psrad m5, 6
1262 packusdw m3, m5
1263 CLIPW m3, m6, m7
1264 %else
1265 psrad m3, 2
1266 psrad m5, 2
1267 packssdw m3, m5
1268 %endif
1269 movh [r2], m3
1270 movhps [r2 + 8], m3
1271
; ---- samples 8-15 ----
1272 movu m3, [r0 + 16]
1273 pshufb m3, m3, m2
1274 pmaddwd m3, m0
1275 movu m4, [r0 + 20]
1276 pshufb m4, m4, m2
1277 pmaddwd m4, m0
1278 phaddd m3, m4
1279 paddd m3, m1
1280
1281 movu m5, [r0 + 24]
1282 pshufb m5, m5, m2
1283 pmaddwd m5, m0
1284 movu m4, [r0 + 28]
1285 pshufb m4, m4, m2
1286 pmaddwd m4, m0
1287 phaddd m5, m4
1288 paddd m5, m1
1289 %ifidn %1, pp
1290 psrad m3, 6
1291 psrad m5, 6
1292 packusdw m3, m5
1293 CLIPW m3, m6, m7
1294 %else
1295 psrad m3, 2
1296 psrad m5, 2
1297 packssdw m3, m5
1298 %endif
1299 movh [r2 + 16], m3
1300 movhps [r2 + 24], m3
1301
; ---- samples 16-23 ----
1302 movu m3, [r0 + 32]
1303 pshufb m3, m3, m2
1304 pmaddwd m3, m0
1305 movu m4, [r0 + 36]
1306 pshufb m4, m4, m2
1307 pmaddwd m4, m0
1308 phaddd m3, m4
1309 paddd m3, m1
1310
1311 movu m5, [r0 + 40]
1312 pshufb m5, m5, m2
1313 pmaddwd m5, m0
1314 movu m4, [r0 + 44]
1315 pshufb m4, m4, m2
1316 pmaddwd m4, m0
1317 phaddd m5, m4
1318 paddd m5, m1
1319 %ifidn %1, pp
1320 psrad m3, 6
1321 psrad m5, 6
1322 packusdw m3, m5
1323 CLIPW m3, m6, m7
1324 %else
1325 psrad m3, 2
1326 psrad m5, 2
1327 packssdw m3, m5
1328 %endif
1329 movh [r2 + 32], m3
1330 movhps [r2 + 40], m3
1331
; ---- samples 24-31 ----
1332 movu m3, [r0 + 48]
1333 pshufb m3, m3, m2
1334 pmaddwd m3, m0
1335 movu m4, [r0 + 52]
1336 pshufb m4, m4, m2
1337 pmaddwd m4, m0
1338 phaddd m3, m4
1339 paddd m3, m1
1340
1341 movu m5, [r0 + 56]
1342 pshufb m5, m5, m2
1343 pmaddwd m5, m0
1344 movu m4, [r0 + 60]
1345 pshufb m4, m4, m2
1346 pmaddwd m4, m0
1347 phaddd m5, m4
1348 paddd m5, m1
1349 %ifidn %1, pp
1350 psrad m3, 6
1351 psrad m5, 6
1352 packusdw m3, m5
1353 CLIPW m3, m6, m7
1354 %else
1355 psrad m3, 2
1356 psrad m5, 2
1357 packssdw m3, m5
1358 %endif
1359 movh [r2 + 48], m3
1360 movhps [r2 + 56], m3
1361 %endmacro
1362
1363 cglobal chroma_filter_pp_32x1_internal
1364 FILTER_W32_1 pp
1365 ret
1366
1367 cglobal chroma_filter_ps_32x1_internal
1368 FILTER_W32_1 ps
1369 ret
1370
; FILTER_W8o_1 <pp|ps>, <byte offset>
; Horizontal 4-tap filter for eight consecutive outputs located %2 bytes into
; the current row.  Expects r0 = source row, r2 = destination row,
; m0 = coefficients, m1 = rounding/offset, m2 = shuffle mask and, for pp,
; m6 / m7 = CLIPW bounds.  Clobbers m3, m4, m5.
%macro FILTER_W8o_1 2
    ; outputs 0-3: two shuffled loads, multiply-add, then horizontal add
    movu        m3, [r0 + %2]
    movu        m4, [r0 + %2 + 4]
    pshufb      m3, m2
    pshufb      m4, m2
    pmaddwd     m3, m0
    pmaddwd     m4, m0
    phaddd      m3, m4

    ; outputs 4-7 (m4 is reused as the second temporary)
    movu        m5, [r0 + %2 + 8]
    movu        m4, [r0 + %2 + 12]
    pshufb      m5, m2
    pshufb      m4, m2
    pmaddwd     m5, m0
    pmaddwd     m4, m0
    phaddd      m5, m4

    paddd       m3, m1
    paddd       m5, m1
%ifidn %1, pp
    ; pixel output: >> 6, unsigned pack, clamp
    psrad       m3, 6
    psrad       m5, 6
    packusdw    m3, m5
    CLIPW       m3, m6, m7
%else
    ; intermediate output: >> 2, signed pack
    psrad       m3, 2
    psrad       m5, 2
    packssdw    m3, m5
%endif
    movh        [r2 + %2], m3
    movhps      [r2 + %2 + 8], m3
%endmacro
1402
; FILTER_W48_1 <pp|ps>
; One row of 48 samples: six FILTER_W8o_1 groups at byte offsets 0, 16, ... 80.
%macro FILTER_W48_1 1
%assign %%x 0
%rep 6
    FILTER_W8o_1 %1, %%x
%assign %%x %%x + 16
%endrep
%endmacro

; One-row workers called by IPFILTER_CHROMA for 48-wide blocks.
cglobal chroma_filter_pp_48x1_internal
    FILTER_W48_1 pp
    ret

cglobal chroma_filter_ps_48x1_internal
    FILTER_W48_1 ps
    ret
1419
; FILTER_W64_1 <pp|ps>
; One row of 64 samples: eight FILTER_W8o_1 groups at byte offsets 0, 16, ... 112.
%macro FILTER_W64_1 1
%assign %%x 0
%rep 8
    FILTER_W8o_1 %1, %%x
%assign %%x %%x + 16
%endrep
%endmacro

; One-row workers called by IPFILTER_CHROMA for 64-wide blocks.
cglobal chroma_filter_pp_64x1_internal
    FILTER_W64_1 pp
    ret

cglobal chroma_filter_ps_64x1_internal
    FILTER_W64_1 ps
    ret
1438
1439
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments:
;   %1 = width, %2 = height, %3 = output type (pp or ps)
;   %4 = number of the GPR that holds the coefficient-table base under PIC
;   %5 = GPR count, %6 = XMM count (both forwarded to cglobal)
; The ps variant additionally tests a sixth argument (r5m): when it is
; non-zero, three extra rows are filtered starting one row above src before
; the %2 regular rows.
; Each row is produced by chroma_filter_%3_%1x1_internal, which receives
; r0/r2 = row pointers, m0 = coefficients, m1 = rounding/offset,
; m2 = tab_Tm16 and (pp) m6 = 0, m7 = pw_pixel_max.
;-----------------------------------------------------------------------------

INIT_XMM sse4
%macro IPFILTER_CHROMA 6
cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6

    mov         r4d, r4m                    ; coeffIdx
    add         r1, r1                      ; strides are doubled here
    add         r3, r3
    sub         r0, 2                       ; back up 2 bytes in the source row
    add         r4d, r4d                    ; with the *4 below: 8 bytes per table row

%ifdef PIC
    lea         r%4, [tab_ChromaCoeff]
    movh        m0, [r%4 + r4 * 4]
%else
    movh        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    mova        m2, [tab_Tm16]
    punpcklqdq  m0, m0                      ; same four taps in both halves

%ifidn %3, ps
    mova        m1, [tab_c_n32768]
    cmp         r5m, byte 0
    je          .skip
    sub         r0, r1                      ; start one row above src
%rep 3
    call        chroma_filter_%3_%1x1_internal
    add         r0, r1
    add         r2, r3
%endrep
.skip:
%else
    pxor        m6, m6
    mova        m1, [tab_c_32]
    mova        m7, [pw_pixel_max]
%endif

    ; %2 rows; the pointers are advanced between rows but not after the last
%rep %2 - 1
    call        chroma_filter_%3_%1x1_internal
    add         r0, r1
    add         r2, r3
%endrep
    call        chroma_filter_%3_%1x1_internal
    RET
%endmacro

; Emit both the pp and the ps routine for one block size.
%macro IPFILTER_CHROMA_PP_PS 2              ; %1 = width, %2 = height
    IPFILTER_CHROMA %1, %2, pp, 5, 6, 8
    IPFILTER_CHROMA %1, %2, ps, 6, 7, 6
%endmacro

    IPFILTER_CHROMA_PP_PS  6,  8
    IPFILTER_CHROMA_PP_PS  8,  2
    IPFILTER_CHROMA_PP_PS  8,  4
    IPFILTER_CHROMA_PP_PS  8,  6
    IPFILTER_CHROMA_PP_PS  8,  8
    IPFILTER_CHROMA_PP_PS  8, 16
    IPFILTER_CHROMA_PP_PS  8, 32
    IPFILTER_CHROMA_PP_PS 12, 16
    IPFILTER_CHROMA_PP_PS 16,  4
    IPFILTER_CHROMA_PP_PS 16,  8
    IPFILTER_CHROMA_PP_PS 16, 12
    IPFILTER_CHROMA_PP_PS 16, 16
    IPFILTER_CHROMA_PP_PS 16, 32
    IPFILTER_CHROMA_PP_PS 24, 32
    IPFILTER_CHROMA_PP_PS 32,  8
    IPFILTER_CHROMA_PP_PS 32, 16
    IPFILTER_CHROMA_PP_PS 32, 24
    IPFILTER_CHROMA_PP_PS 32, 32

    IPFILTER_CHROMA_PP_PS  6, 16
    IPFILTER_CHROMA_PP_PS  8, 12
    IPFILTER_CHROMA_PP_PS  8, 64
    IPFILTER_CHROMA_PP_PS 12, 32
    IPFILTER_CHROMA_PP_PS 16, 24
    IPFILTER_CHROMA_PP_PS 16, 64
    IPFILTER_CHROMA_PP_PS 24, 64
    IPFILTER_CHROMA_PP_PS 32, 48
    IPFILTER_CHROMA_PP_PS 32, 64

    IPFILTER_CHROMA_PP_PS 48, 64
    IPFILTER_CHROMA_PP_PS 64, 48
    IPFILTER_CHROMA_PP_PS 64, 64
    IPFILTER_CHROMA_PP_PS 64, 32
    IPFILTER_CHROMA_PP_PS 64, 16
1560
1561
; PROCESS_CHROMA_SP_W4_4R
; Vertical 4-tap accumulation for a 4-wide column over four output rows.
;   in : r0 = first input row, r1 = row pitch in bytes,
;        [r6 + 0*16] / [r6 + 1*16] = the two 16-byte coefficient blocks
;   out: m0..m3 = 32-bit sums for output rows 1..4 (no rounding/shift yet)
;        r0 advanced by 4 rows
; Reads seven input rows (0..6).  Clobbers m4, m5.
%macro PROCESS_CHROMA_SP_W4_4R 0
    ; rows 0, 1, 2
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 2
    punpcklwd   m0, m1                      ; m0=[0 1]
    punpcklwd   m1, m4                      ; m1=[1 2]
    pmaddwd     m0, [r6 + 0 * 16]           ; m0=[0+1]         Row1
    pmaddwd     m1, [r6 + 0 * 16]           ; m1=[1+2]         Row2

    ; row 3
    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[2 3]
    pmaddwd     m2, m4, [r6 + 0 * 16]       ; m2=[2+3]         Row3
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m0, m4                      ; m0=[0+1+2+3]     Row1 done

    ; row 4
    movq        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 4
    punpcklwd   m5, m4                      ; m5=[3 4]
    pmaddwd     m3, m5, [r6 + 0 * 16]       ; m3=[3+4]         Row4
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m1, m5                      ; m1=[1+2+3+4]     Row2 done

    ; row 5
    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[4 5]
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m2, m4                      ; m2=[2+3+4+5]     Row3 done

    ; row 6
    movq        m4, [r0 + 2 * r1]
    punpcklwd   m5, m4                      ; m5=[5 6]
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m3, m5                      ; m3=[3+4+5+6]     Row4 done
%endmacro
1596
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width, %2 = height, %3 = ss | ps | sp | pp,
;                  %4 = XMM register count handed to cglobal.
; The block is walked as 4x4 tiles: the column counter lives in r4d, the
; row-group counter in the stack slot at [rsp].
; Per type:  ss: >> 6                     ps: + tab_c_n32768, >> 2
;            pp: + tab_c_32, >> 6, clip   sp: + tab_c_524800, >> 10, clip
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS 4
INIT_XMM sse2
cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize

%ifidn %3, ps
    %assign %%shift 2
%elifidn %3, sp
    %assign %%shift 10
%else
    %assign %%shift 6
%endif

    ; 32-bit adds: the upper halves of r1/r3 are cleared by these writes
    add         r1d, r1d
    add         r3d, r3d
    sub         r0, r1                      ; start one row above src
    shl         r4d, 5                      ; 32 bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_ChromaCoeffV + r4]
%endif

    mov         dword [rsp], %2/4           ; row groups left

    ; m6 = value added before the shift, m7 = upper clip bound (pp/sp only)
%ifidn %3, ps
    mova        m6, [tab_c_n32768]
%elifidn %3, pp
    mova        m7, [pw_pixel_max]
    mova        m6, [tab_c_32]
%elifidn %3, sp
    mova        m7, [pw_pixel_max]
    mova        m6, [tab_c_524800]
%endif

.rowGroup:
    mov         r4d, (%1/4)                 ; tiles left in this row group
.tile:
    PROCESS_CHROMA_SP_W4_4R

%ifnidn %3, ss
    paddd       m0, m6
    paddd       m1, m6
    paddd       m2, m6
    paddd       m3, m6
%endif
    psrad       m0, %%shift
    psrad       m1, %%shift
    psrad       m2, %%shift
    psrad       m3, %%shift

    packssdw    m0, m1
    packssdw    m2, m3
%ifnidn %3, ss
%ifnidn %3, ps
    pxor        m5, m5
    CLIPW2      m0, m2, m5, m7
%endif
%endif

    ; four rows of four samples
    movh        [r2], m0
    movhps      [r2 + r3], m0
    lea         r5, [r2 + 2 * r3]
    movh        [r5], m2
    movhps      [r5 + r3], m2

    ; the kernel moved r0 down 4 rows: go back up and 4 samples to the right
    lea         r5, [4 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 2 * 4

    dec         r4d
    jnz         .tile

    ; next row group: down 4 rows, back to the left edge
    lea         r0, [r0 + 4 * r1 - 2 * %1]
    lea         r2, [r2 + 4 * r3 - 2 * %1]

    dec         dword [rsp]
    jnz         .rowGroup

    RET
%endmacro

; Emit the ss, ps, sp and pp routines for one block size.
%macro FILTER_VER_CHROMA_SS_ALL 2           ; %1 = width, %2 = height
    FILTER_VER_CHROMA_SS %1, %2, ss, 6
    FILTER_VER_CHROMA_SS %1, %2, ps, 7
    FILTER_VER_CHROMA_SS %1, %2, sp, 8
    FILTER_VER_CHROMA_SS %1, %2, pp, 8
%endmacro

    FILTER_VER_CHROMA_SS_ALL  4,  4
    FILTER_VER_CHROMA_SS_ALL  4,  8
    FILTER_VER_CHROMA_SS_ALL 16, 16
    FILTER_VER_CHROMA_SS_ALL 16,  8
    FILTER_VER_CHROMA_SS_ALL 16, 12
    FILTER_VER_CHROMA_SS_ALL 12, 16
    FILTER_VER_CHROMA_SS_ALL 16,  4
    FILTER_VER_CHROMA_SS_ALL  4, 16
    FILTER_VER_CHROMA_SS_ALL 32, 32
    FILTER_VER_CHROMA_SS_ALL 32, 16
    FILTER_VER_CHROMA_SS_ALL 16, 32
    FILTER_VER_CHROMA_SS_ALL 32, 24
    FILTER_VER_CHROMA_SS_ALL 24, 32
    FILTER_VER_CHROMA_SS_ALL 32,  8

    FILTER_VER_CHROMA_SS_ALL 16, 24
    FILTER_VER_CHROMA_SS_ALL 12, 32
    FILTER_VER_CHROMA_SS_ALL  4, 32
    FILTER_VER_CHROMA_SS_ALL 32, 64
    FILTER_VER_CHROMA_SS_ALL 16, 64
    FILTER_VER_CHROMA_SS_ALL 32, 48
    FILTER_VER_CHROMA_SS_ALL 24, 64

    FILTER_VER_CHROMA_SS_ALL 48, 64
    FILTER_VER_CHROMA_SS_ALL 64, 48
    FILTER_VER_CHROMA_SS_ALL 64, 64
    FILTER_VER_CHROMA_SS_ALL 64, 32
    FILTER_VER_CHROMA_SS_ALL 64, 16
1817
1818
; PROCESS_CHROMA_SP_W2_4R <coeff base register>
; Vertical 4-tap accumulation for a 2-wide column over four output rows.
;   in : r0 = first input row, r1 = row pitch in bytes,
;        [%1 + 0*16] / [%1 + 1*16] = the two 16-byte coefficient blocks
;   out: m0 = 32-bit sums for output rows 1-2 (two dwords per row),
;        m2 = 32-bit sums for output rows 3-4; r0 advanced by 4 rows
; Reads seven input rows (0..6).  Clobbers m1, m3, m4.
%macro PROCESS_CHROMA_SP_W2_4R 1
    ; rows 0, 1, 2
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    movd        m2, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 2
    punpcklwd   m0, m1                      ; m0=[0 1]
    punpcklwd   m1, m2                      ; m1=[1 2]
    punpcklqdq  m0, m1                      ; m0=[0 1 1 2]
    pmaddwd     m0, [%1 + 0 * 16]           ; m0=[0+1 1+2]             Row 1-2

    ; rows 3, 4
    movd        m1, [r0 + r1]
    movd        m3, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 4
    punpcklwd   m2, m1                      ; m2=[2 3]
    punpcklwd   m1, m3                      ; m1=[3 4]
    punpcklqdq  m2, m1                      ; m2=[2 3 3 4]
    pmaddwd     m4, m2, [%1 + 1 * 16]       ; m4=[2+3 3+4]             Row 1-2
    pmaddwd     m2, [%1 + 0 * 16]           ; m2=[2+3 3+4]             Row 3-4
    paddd       m0, m4                      ; m0=[0+1+2+3 1+2+3+4]     Row 1-2 done

    ; rows 5, 6
    movd        m1, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    punpcklwd   m3, m1                      ; m3=[4 5]
    punpcklwd   m1, m4                      ; m1=[5 6]
    punpcklqdq  m3, m1                      ; m3=[4 5 5 6]
    pmaddwd     m3, [%1 + 1 * 16]           ; m3=[4+5 5+6]             Row 3-4
    paddd       m2, m3                      ; m2=[2+3+4+5 3+4+5+6]     Row 3-4 done
%endmacro
1851
;---------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_%2_2x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = height, %2 = ss | pp | ps | sp, %3 = XMM register count.
; Four 2-sample rows are produced per loop pass.
; Per type:  ss: >> 6                     ps: + tab_c_n32768, >> 2
;            pp: + tab_c_32, >> 6, clip   sp: + tab_c_524800, >> 10, clip
;---------------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_W2 3
INIT_XMM sse4
cglobal interp_4tap_vert_%2_2x%1, 5, 6, %3

%ifidn %2, ps
    %assign %%shift 2
%elifidn %2, sp
    %assign %%shift 10
%else
    %assign %%shift 6
%endif

    add         r1d, r1d
    add         r3d, r3d
    sub         r0, r1                      ; start one row above src
    shl         r4d, 5                      ; 32 bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    mov         r4d, (%1/4)                 ; loop passes

    ; m5 = value added before the shift; pp/sp: m7 = 0, m6 = pw_pixel_max
%ifidn %2, ps
    mova        m5, [tab_c_n32768]
%elifidn %2, pp
    pxor        m7, m7
    mova        m6, [pw_pixel_max]
    mova        m5, [tab_c_32]
%elifidn %2, sp
    pxor        m7, m7
    mova        m6, [pw_pixel_max]
    mova        m5, [tab_c_524800]
%endif

.rows4:
    PROCESS_CHROMA_SP_W2_4R r5

%ifnidn %2, ss
    paddd       m0, m5
    paddd       m2, m5
%endif
    psrad       m0, %%shift
    psrad       m2, %%shift
%ifidn %2, ss
    packssdw    m0, m2
%elifidn %2, ps
    packssdw    m0, m2
%else
    packusdw    m0, m2
    CLIPW       m0, m7, m6
%endif

    ; one dword (two samples) per output row
    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m0, 2
    pextrd      [r2 + r3], m0, 3
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .rows4

    RET
%endmacro

; Emit the ss, pp, ps and sp routines for one block height.
%macro FILTER_VER_CHROMA_W2_ALL 1           ; %1 = height
    FILTER_VER_CHROMA_W2 %1, ss, 5
    FILTER_VER_CHROMA_W2 %1, pp, 8
    FILTER_VER_CHROMA_W2 %1, ps, 6
    FILTER_VER_CHROMA_W2 %1, sp, 8
%endmacro

    FILTER_VER_CHROMA_W2_ALL 4
    FILTER_VER_CHROMA_W2_ALL 8
    FILTER_VER_CHROMA_W2_ALL 16
1942
1943
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_%2_4x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = height, %2 = ss | pp | ps | sp, %3 = XMM register count.
; Two 4-sample rows are produced per pass.  A 4x2 block needs exactly one
; pass, so the pass counter and the loop are emitted only when %1 is not 2.
; Per type:  ss: >> 6                     ps: + tab_c_n32768, >> 2
;            pp: + tab_c_32, >> 6, clip   sp: + tab_c_524800, >> 10, clip
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_W4 3
INIT_XMM sse4
cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3

    add         r1d, r1d
    add         r3d, r3d
    sub         r0, r1                      ; start one row above src
    shl         r4d, 5                      ; 32 bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    ; The guard tests the height (%1); testing the type (%2) against 2 could
    ; never match, which left a useless counter and branch in the 4x2 routines.
%ifnidn %1, 2
    mov         r4d, %1/2                   ; passes of two rows each
%endif

    ; m4 = value added before the shift; pp/sp: m6 = 0, m5 = pw_pixel_max
%ifnidn %2, ss
%ifnidn %2, ps
    pxor        m6, m6
    mova        m5, [pw_pixel_max]
%ifidn %2, pp
    mova        m4, [tab_c_32]
%else
    mova        m4, [tab_c_524800]
%endif
%else
    mova        m4, [tab_c_n32768]
%endif
%endif

%ifnidn %1, 2
.loop:
%endif

    movh        m0, [r0]
    movh        m1, [r0 + r1]
    punpcklwd   m0, m1                      ; m0=[0 1]
    pmaddwd     m0, [r5 + 0 * 16]           ; m0=[0+1]         Row1

    lea         r0, [r0 + 2 * r1]           ; r0 -> row 2 (start of the next pass)
    movh        m2, [r0]
    punpcklwd   m1, m2                      ; m1=[1 2]
    pmaddwd     m1, [r5 + 0 * 16]           ; m1=[1+2]         Row2

    movh        m3, [r0 + r1]
    punpcklwd   m2, m3                      ; m2=[2 3]
    pmaddwd     m2, [r5 + 1 * 16]
    paddd       m0, m2                      ; m0=[0+1+2+3]     Row1 done

    movh        m2, [r0 + 2 * r1]
    punpcklwd   m3, m2                      ; m3=[3 4]
    pmaddwd     m3, [r5 + 1 * 16]
    paddd       m1, m3                      ; m1=[1+2+3+4]     Row2 done

%ifidn %2, ss
    psrad       m0, 6
    psrad       m1, 6
    packssdw    m0, m1
%elifidn %2, ps
    paddd       m0, m4
    paddd       m1, m4
    psrad       m0, 2
    psrad       m1, 2
    packssdw    m0, m1
%else
    paddd       m0, m4
    paddd       m1, m4
%ifidn %2, pp
    psrad       m0, 6
    psrad       m1, 6
%else
    psrad       m0, 10
    psrad       m1, 10
%endif
    packusdw    m0, m1
    CLIPW       m0, m6, m5
%endif

    movh        [r2], m0
    movhps      [r2 + r3], m0

%ifnidn %1, 2
    lea         r2, [r2 + r3 * 2]
    dec         r4d
    jnz         .loop
%endif

    RET
%endmacro

FILTER_VER_CHROMA_W4 2, ss, 4
FILTER_VER_CHROMA_W4 2, pp, 7
FILTER_VER_CHROMA_W4 2, ps, 5
FILTER_VER_CHROMA_W4 2, sp, 7

FILTER_VER_CHROMA_W4 4, ss, 4
FILTER_VER_CHROMA_W4 4, pp, 7
FILTER_VER_CHROMA_W4 4, ps, 5
FILTER_VER_CHROMA_W4 4, sp, 7
2050
;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_%2_6x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = height, %2 = ss | ps | sp | pp, %3 = XMM register count.
; Every loop pass produces four rows: columns 0-3 via PROCESS_CHROMA_SP_W4_4R,
; then columns 4-5 via PROCESS_CHROMA_SP_W2_4R.
; Per type:  ss: >> 6                     ps: + tab_c_n32768, >> 2
;            pp: + tab_c_32, >> 6, clip   sp: + tab_c_524800, >> 10, clip
;-------------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_W6 3
INIT_XMM sse4
cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3

%ifidn %2, ps
    %assign %%shift 2
%elifidn %2, sp
    %assign %%shift 10
%else
    %assign %%shift 6
%endif

    add         r1d, r1d
    add         r3d, r3d
    sub         r0, r1                      ; start one row above src
    shl         r4d, 5                      ; 32 bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_ChromaCoeffV + r4]
%endif

    mov         r4d, %1/4                   ; loop passes

    ; m6 = value added before the shift, m7 = upper clip bound (pp/sp only)
%ifidn %2, ps
    mova        m6, [tab_c_n32768]
%elifidn %2, pp
    mova        m7, [pw_pixel_max]
    mova        m6, [tab_c_32]
%elifidn %2, sp
    mova        m7, [pw_pixel_max]
    mova        m6, [tab_c_524800]
%endif

.rows4:
    ; ---- columns 0-3 ----
    PROCESS_CHROMA_SP_W4_4R

%ifnidn %2, ss
    paddd       m0, m6
    paddd       m1, m6
    paddd       m2, m6
    paddd       m3, m6
%endif
    psrad       m0, %%shift
    psrad       m1, %%shift
    psrad       m2, %%shift
    psrad       m3, %%shift

    packssdw    m0, m1
    packssdw    m2, m3
%ifnidn %2, ss
%ifnidn %2, ps
    pxor        m5, m5                      ; also serves as the lower bound for CLIPW below
    CLIPW2      m0, m2, m5, m7
%endif
%endif

    movh        [r2], m0
    movhps      [r2 + r3], m0
    lea         r5, [r2 + 2 * r3]
    movh        [r5], m2
    movhps      [r5 + r3], m2

    ; back up 4 rows and step 4 samples right
    lea         r5, [4 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 2 * 4

    ; ---- columns 4-5 ----
    PROCESS_CHROMA_SP_W2_4R r6

%ifnidn %2, ss
    paddd       m0, m6
    paddd       m2, m6
%endif
    psrad       m0, %%shift
    psrad       m2, %%shift
%ifidn %2, ss
    packssdw    m0, m2
%elifidn %2, ps
    packssdw    m0, m2
%else
    packusdw    m0, m2
    CLIPW       m0, m5, m7                  ; m5 is still zero: the W2 kernel does not write it
%endif

    movd        [r2], m0
    pextrd      [r2 + r3], m0, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m0, 2
    pextrd      [r2 + r3], m0, 3

    ; r0 is already 4 rows down; return both pointers to column 0
    sub         r0, 2 * 4
    lea         r2, [r2 + 2 * r3 - 2 * 4]

    dec         r4d
    jnz         .rows4

    RET
%endmacro

; Emit the ss, ps, sp and pp routines for one block height.
%macro FILTER_VER_CHROMA_W6_ALL 1           ; %1 = height
    FILTER_VER_CHROMA_W6 %1, ss, 6
    FILTER_VER_CHROMA_W6 %1, ps, 7
    FILTER_VER_CHROMA_W6 %1, sp, 8
    FILTER_VER_CHROMA_W6 %1, pp, 8
%endmacro

    FILTER_VER_CHROMA_W6_ALL 8
    FILTER_VER_CHROMA_W6_ALL 16
2190
; PROCESS_CHROMA_SP_W8_2R
; Vertical 4-tap accumulation for an 8-wide column over two output rows.
;   in : r0 = first input row, r1 = row pitch in bytes,
;        [r5 + 0*16] / [r5 + 1*16] = the two 16-byte coefficient blocks
;   out: m0 / m1 = 32-bit sums of output row 1 (low / high four samples)
;        m2 / m3 = 32-bit sums of output row 2 (low / high four samples)
;        r0 advanced by 2 rows
; Reads five input rows (0..4).  Clobbers m4, m5, m6.
%macro PROCESS_CHROMA_SP_W8_2R 0
    ; rows 0, 1, 2: first tap pair of both output rows
    movu        m1, [r0]
    movu        m3, [r0 + r1]
    movu        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 2
    punpcklwd   m0, m1, m3
    punpckhwd   m1, m3
    punpcklwd   m2, m3, m4
    punpckhwd   m3, m4
    pmaddwd     m0, [r5 + 0 * 16]           ; m0 = [0l+1l]             Row1l
    pmaddwd     m1, [r5 + 0 * 16]           ; m1 = [0h+1h]             Row1h
    pmaddwd     m2, [r5 + 0 * 16]           ; m2 = [1l+2l]             Row2l
    pmaddwd     m3, [r5 + 0 * 16]           ; m3 = [1h+2h]             Row2h

    ; row 3: second tap pair of output row 1
    movu        m5, [r0 + r1]
    punpcklwd   m6, m4, m5
    punpckhwd   m4, m5
    pmaddwd     m6, [r5 + 1 * 16]           ; m6 = [2l+3l]             Row1l
    pmaddwd     m4, [r5 + 1 * 16]           ; m4 = [2h+3h]             Row1h
    paddd       m0, m6                      ; m0 = [0l+1l+2l+3l]       Row1l sum
    paddd       m1, m4                      ; m1 = [0h+1h+2h+3h]       Row1h sum

    ; row 4: second tap pair of output row 2
    movu        m4, [r0 + 2 * r1]
    punpcklwd   m6, m5, m4
    punpckhwd   m5, m4
    pmaddwd     m6, [r5 + 1 * 16]           ; m6 = [3l+4l]             Row2l
    pmaddwd     m5, [r5 + 1 * 16]           ; m5 = [3h+4h]             Row2h
    paddd       m2, m6                      ; m2 = [1l+2l+3l+4l]       Row2l sum
    paddd       m3, m5                      ; m3 = [1h+2h+3h+4h]       Row2h sum
%endmacro
2222
;----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width (always instantiated as 8), %2 = height,
;                  %3 = ss | sp | ps | pp, %4 = XMM register count.
; Two 8-sample rows are produced per loop pass.
; Per type:  ss: >> 6                     ps: + tab_c_n32768, >> 2
;            pp: + tab_c_32, >> 6, clip   sp: + tab_c_524800, >> 10, clip
;----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_W8 4
INIT_XMM sse2
cglobal interp_4tap_vert_%3_%1x%2, 5, 6, %4

%ifidn %3, ps
    %assign %%shift 2
%elifidn %3, sp
    %assign %%shift 10
%else
    %assign %%shift 6
%endif

    add         r1d, r1d
    add         r3d, r3d
    sub         r0, r1                      ; start one row above src
    shl         r4d, 5                      ; 32 bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_ChromaCoeffV]
    lea         r5, [r5 + r4]
%else
    lea         r5, [tab_ChromaCoeffV + r4]
%endif

    mov         r4d, %2/2                   ; loop passes

    ; m7 = value added before the shift (unused for ss)
%ifidn %3, pp
    mova        m7, [tab_c_32]
%elifidn %3, sp
    mova        m7, [tab_c_524800]
%elifidn %3, ps
    mova        m7, [tab_c_n32768]
%endif

.rows2:
    PROCESS_CHROMA_SP_W8_2R

%ifnidn %3, ss
    paddd       m0, m7
    paddd       m1, m7
    paddd       m2, m7
    paddd       m3, m7
%endif
    psrad       m0, %%shift
    psrad       m1, %%shift
    psrad       m2, %%shift
    psrad       m3, %%shift

    packssdw    m0, m1
    packssdw    m2, m3
%ifnidn %3, ss
%ifnidn %3, ps
    ; the kernel overwrites m5 and m6, so both clip bounds are rebuilt each pass
    pxor        m5, m5
    mova        m6, [pw_pixel_max]
    CLIPW2      m0, m2, m5, m6
%endif
%endif

    movu        [r2], m0
    movu        [r2 + r3], m2
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .rows2

    RET
%endmacro

; Emit the ss, sp, ps and pp routines for one 8-wide block height.
%macro FILTER_VER_CHROMA_W8_ALL 1           ; %1 = height
    FILTER_VER_CHROMA_W8 8, %1, ss, 7
    FILTER_VER_CHROMA_W8 8, %1, sp, 8
    FILTER_VER_CHROMA_W8 8, %1, ps, 8
    FILTER_VER_CHROMA_W8 8, %1, pp, 8
%endmacro

    FILTER_VER_CHROMA_W8_ALL 2
    FILTER_VER_CHROMA_W8_ALL 4
    FILTER_VER_CHROMA_W8_ALL 6
    FILTER_VER_CHROMA_W8_ALL 8
    FILTER_VER_CHROMA_W8_ALL 16
    FILTER_VER_CHROMA_W8_ALL 32

    FILTER_VER_CHROMA_W8_ALL 12
    FILTER_VER_CHROMA_W8_ALL 64
2345
2346
; chroma_p2s: three register arguments (r0 = source, r1 = source stride,
; r2 = destination) plus width and height read from r3m / r4m.
; Every 16-bit sample is shifted left by 4 and tab_c_n8192 is added to it.
; Two rows are handled per outer pass.  The second destination row is stored
; FENC_STRIDE / 2 * 2 bytes after the first, and r2 advances by
; FENC_STRIDE / 2 * 4 bytes per pass.  A row is written in blocks of eight
; samples, followed by an optional 4-sample and an optional 2-sample tail
; selected by bits 2 and 1 of the width.
INIT_XMM sse2
cglobal chroma_p2s, 3, 7, 3

    mov         r3d, r3m                    ; width
    mov         r4d, r4m                    ; height
    mova        m2, [tab_c_n8192]
    add         r1, r1                      ; source stride is doubled here

.rowPair:
    xor         r5d, r5d                    ; r5 = sample index within the row

.block8:
    lea         r6, [r0 + r5 * 2]
    movu        m0, [r6]                    ; eight samples of the upper row
    movu        m1, [r6 + r1]               ; eight samples of the lower row
    psllw       m0, 4
    psllw       m1, 4
    paddw       m0, m2
    paddw       m1, m2

    add         r5d, 8
    lea         r6, [r2 + r5 * 2]           ; points 16 bytes past this block's slot
    cmp         r5d, r3d
    jg          .tail                       ; fewer than eight samples remain
    movu        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movu        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    jl          .block8                     ; flags still come from the cmp above
    ; equal: the row ended exactly on a block boundary

.nextRows:
    lea         r0, [r0 + r1 * 2]
    add         r2, FENC_STRIDE / 2 * 4
    sub         r4d, 2
    jnz         .rowPair

    RET

.tail:
    test        r3d, 4
    jz          .tail2
    movh        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movh        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    lea         r6, [r6 + 8]
    pshufd      m0, m0, 2                   ; move dword 2 (samples 4-5) to the bottom
    pshufd      m1, m1, 2
    test        r3d, 2
    jz          .nextRows

.tail2:
    movd        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movd        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    jmp         .nextRows
2404
; PROCESS_LUMA_VER_W4_4R
; Vertical 8-tap accumulation for a 4-wide column over four output rows.
;   in : r0 = first input row, r1 = row pitch in bytes,
;        [r6 + k*16], k = 0..3 = the four 16-byte coefficient blocks
;   out: m0..m3 = 32-bit sums for output rows 1..4 (no rounding/shift yet)
;        r0 advanced by 8 rows
; Reads eleven input rows (0..10).  Clobbers m4, m5, m6.
%macro PROCESS_LUMA_VER_W4_4R 0
    ; rows 0, 1, 2
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 2
    punpcklwd   m0, m1                      ; m0=[0 1]
    punpcklwd   m1, m4                      ; m1=[1 2]
    pmaddwd     m0, [r6 + 0 * 16]           ; m0=[0+1]                 Row1
    pmaddwd     m1, [r6 + 0 * 16]           ; m1=[1+2]                 Row2

    ; row 3
    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[2 3]
    pmaddwd     m2, m4, [r6 + 0 * 16]       ; m2=[2+3]                 Row3
    pmaddwd     m4, [r6 + 1 * 16]
    paddd       m0, m4                      ; m0=[0+1+2+3]             Row1

    ; row 4
    movq        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 4
    punpcklwd   m5, m4                      ; m5=[3 4]
    pmaddwd     m3, m5, [r6 + 0 * 16]       ; m3=[3+4]                 Row4
    pmaddwd     m5, [r6 + 1 * 16]
    paddd       m1, m5                      ; m1=[1+2+3+4]             Row2

    ; row 5
    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[4 5]
    pmaddwd     m6, m4, [r6 + 1 * 16]
    pmaddwd     m4, [r6 + 2 * 16]
    paddd       m2, m6                      ; m2=[2+3+4+5]             Row3
    paddd       m0, m4                      ; m0=[0+1+2+3+4+5]         Row1

    ; row 6
    movq        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 6
    punpcklwd   m5, m4                      ; m5=[5 6]
    pmaddwd     m6, m5, [r6 + 1 * 16]
    pmaddwd     m5, [r6 + 2 * 16]
    paddd       m3, m6                      ; m3=[3+4+5+6]             Row4
    paddd       m1, m5                      ; m1=[1+2+3+4+5+6]         Row2

    ; row 7
    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[6 7]
    pmaddwd     m6, m4, [r6 + 2 * 16]
    pmaddwd     m4, [r6 + 3 * 16]
    paddd       m2, m6                      ; m2=[2+3+4+5+6+7]         Row3
    paddd       m0, m4                      ; m0=[0+1+2+3+4+5+6+7]     Row1 end

    ; row 8
    movq        m4, [r0 + 2 * r1]
    lea         r0, [r0 + 2 * r1]           ; r0 -> row 8
    punpcklwd   m5, m4                      ; m5=[7 8]
    pmaddwd     m6, m5, [r6 + 2 * 16]
    pmaddwd     m5, [r6 + 3 * 16]
    paddd       m3, m6                      ; m3=[3+4+5+6+7+8]         Row4
    paddd       m1, m5                      ; m1=[1+2+3+4+5+6+7+8]     Row2 end

    ; row 9
    movq        m5, [r0 + r1]
    punpcklwd   m4, m5                      ; m4=[8 9]
    pmaddwd     m4, [r6 + 3 * 16]
    paddd       m2, m4                      ; m2=[2+3+4+5+6+7+8+9]     Row3 end

    ; row 10
    movq        m4, [r0 + 2 * r1]
    punpcklwd   m5, m4                      ; m5=[9 10]
    pmaddwd     m5, [r6 + 3 * 16]
    paddd       m3, m5                      ; m3=[3+4+5+6+7+8+9+10]    Row4 end
%endmacro
2469
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Macro arguments: %1 = width, %2 = height.
; The block is walked as 4x4 tiles: the column counter lives in r4d, the
; row-group counter in the stack slot at [rsp].  Each sum gets pd_32 added,
; is shifted right by 6, packed to words and clipped to [0, pw_pixel_max].
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_PP 2
INIT_XMM sse4
cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize

    add         r1d, r1d
    add         r3d, r3d
    lea         r5, [r1 + 2 * r1]
    sub         r0, r5                      ; start three rows above src
    shl         r4d, 6                      ; 64 bytes of coefficients per index

%ifdef PIC
    lea         r5, [tab_LumaCoeffV]
    lea         r6, [r5 + r4]
%else
    lea         r6, [tab_LumaCoeffV + r4]
%endif

    mov         dword [rsp], %2/4           ; row groups left
    mova        m7, [pd_32]

.rowGroup:
    mov         r4d, (%1/4)                 ; tiles left in this row group
.tile:
    PROCESS_LUMA_VER_W4_4R

    paddd       m0, m7
    psrad       m0, 6
    paddd       m1, m7
    psrad       m1, 6
    paddd       m2, m7
    psrad       m2, 6
    paddd       m3, m7
    psrad       m3, 6

    packssdw    m0, m1
    packssdw    m2, m3

    pxor        m1, m1
    CLIPW2      m0, m2, m1, [pw_pixel_max]

    ; four rows of four samples
    movh        [r2], m0
    movhps      [r2 + r3], m0
    lea         r5, [r2 + 2 * r3]
    movh        [r5], m2
    movhps      [r5 + r3], m2

    ; the kernel moved r0 down 8 rows: go back up and 4 samples to the right
    lea         r5, [8 * r1 - 2 * 4]
    sub         r0, r5
    add         r2, 2 * 4

    dec         r4d
    jnz         .tile

    ; next row group: down 4 rows, back to the left edge
    lea         r0, [r0 + 4 * r1 - 2 * %1]
    lea         r2, [r2 + 4 * r3 - 2 * %1]

    dec         dword [rsp]
    jnz         .rowGroup

    RET
%endmacro

;-------------------------------------------------------------------------------------------------------------
; interp_8tap_vert_pp_%1x%2 instantiations (width, height)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_PP  4,  4
    FILTER_VER_LUMA_PP  8,  8
    FILTER_VER_LUMA_PP  8,  4
    FILTER_VER_LUMA_PP  4,  8
    FILTER_VER_LUMA_PP 16, 16
    FILTER_VER_LUMA_PP 16,  8
    FILTER_VER_LUMA_PP  8, 16
    FILTER_VER_LUMA_PP 16, 12
    FILTER_VER_LUMA_PP 12, 16
    FILTER_VER_LUMA_PP 16,  4
    FILTER_VER_LUMA_PP  4, 16
    FILTER_VER_LUMA_PP 32, 32
    FILTER_VER_LUMA_PP 32, 16
    FILTER_VER_LUMA_PP 16, 32
    FILTER_VER_LUMA_PP 32, 24
    FILTER_VER_LUMA_PP 24, 32
    FILTER_VER_LUMA_PP 32,  8
    FILTER_VER_LUMA_PP  8, 32
    FILTER_VER_LUMA_PP 64, 64
    FILTER_VER_LUMA_PP 64, 32
    FILTER_VER_LUMA_PP 32, 64
    FILTER_VER_LUMA_PP 64, 48
    FILTER_VER_LUMA_PP 48, 64
    FILTER_VER_LUMA_PP 64, 16
    FILTER_VER_LUMA_PP 16, 64
2564
;---------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_VER_LUMA_PS width, height
;   Emits one SSE4 vertical 8-tap luma filter (pixel in, int16_t out) for a fixed block size.
;   The block is walked as 4x4 tiles: width/4 tiles per strip, height/4 strips.
;   Unlike the pp variant the result is offset and shifted by 2 only, with no clipping.
;
; Register roles:
;   r0 = src (pre-biased by -3 rows)    r1 = srcStride in bytes
;   r2 = dst                            r3 = dstStride in bytes
;   r4 = coeffIdx, then tile counter    r5 = scratch
;   r6 = address of the selected tab_LumaCoeffV entry
;   m7 = offset constant                [rsp] = strips left
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_PS 2
INIT_XMM sse4
cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8, 0-gprsize

    ; The strides are intptr_t. Double them (samples are 2 bytes wide) at full
    ; register width: a 32-bit add would zero-extend on x86-64 and discard the
    ; upper half, breaking negative or very large strides.
    add       r1, r1
    add       r3, r3
    lea       r5, [r1 + 2 * r1]
    sub       r0, r5                            ; src -= 3 * srcStride
    shl       r4d, 6                            ; coeffIdx * 64: byte offset into tab_LumaCoeffV

%ifdef PIC
    lea       r5, [tab_LumaCoeffV]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffV + r4]
%endif

    mova      m7, [pd_n32768]                   ; added before the >> 2 (presumably -32768 per dword)

    mov       dword [rsp], %2/4                 ; number of 4-row strips
.loopH:
    mov       r4d, (%1/4)                       ; number of 4-column tiles in this strip
.loopW:
    ; Defined elsewhere in this file. Judging by the code below it leaves four
    ; rows of 32-bit sums in m0..m3 and r0 advanced by 8 source rows.
    PROCESS_LUMA_VER_W4_4R

    paddd     m0, m7
    paddd     m1, m7
    paddd     m2, m7
    paddd     m3, m7

    psrad     m0, 2
    psrad     m1, 2
    psrad     m2, 2
    psrad     m3, 2

    packssdw  m0, m1                            ; rows 0 and 1, saturated to int16
    packssdw  m2, m3                            ; rows 2 and 3, saturated to int16

    movh      [r2], m0
    movhps    [r2 + r3], m0
    lea       r5, [r2 + 2 * r3]
    movh      [r5], m2
    movhps    [r5 + r3], m2

    ; Rewind src by 8 rows and step both pointers 4 samples to the right.
    lea       r5, [8 * r1 - 2 * 4]
    sub       r0, r5
    add       r2, 2 * 4

    dec       r4d
    jnz       .loopW

    ; Back to the left edge of the block, 4 rows further down.
    lea       r0, [r0 + 4 * r1 - 2 * %1]
    lea       r2, [r2 + 4 * r3 - 2 * %1]

    dec       dword [rsp]
    jnz       .loopH

    RET
%endmacro
2627
;---------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; One function is emitted per line below; the arguments are width, height.
;---------------------------------------------------------------------------------------------------------------
; largest dimension 8
    FILTER_VER_LUMA_PS  4,  4
    FILTER_VER_LUMA_PS  8,  8
    FILTER_VER_LUMA_PS  8,  4
    FILTER_VER_LUMA_PS  4,  8
; largest dimension 16
    FILTER_VER_LUMA_PS 16, 16
    FILTER_VER_LUMA_PS 16,  8
    FILTER_VER_LUMA_PS  8, 16
    FILTER_VER_LUMA_PS 16, 12
    FILTER_VER_LUMA_PS 12, 16
    FILTER_VER_LUMA_PS 16,  4
    FILTER_VER_LUMA_PS  4, 16
; largest dimension 32
    FILTER_VER_LUMA_PS 32, 32
    FILTER_VER_LUMA_PS 32, 16
    FILTER_VER_LUMA_PS 16, 32
    FILTER_VER_LUMA_PS 32, 24
    FILTER_VER_LUMA_PS 24, 32
    FILTER_VER_LUMA_PS 32,  8
    FILTER_VER_LUMA_PS  8, 32
; largest dimension 64
    FILTER_VER_LUMA_PS 64, 64
    FILTER_VER_LUMA_PS 64, 32
    FILTER_VER_LUMA_PS 32, 64
    FILTER_VER_LUMA_PS 64, 48
    FILTER_VER_LUMA_PS 48, 64
    FILTER_VER_LUMA_PS 64, 16
    FILTER_VER_LUMA_PS 16, 64
2656
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_VER_LUMA_SP width, height
;   Emits one SSE4 vertical 8-tap luma filter (int16_t in, pixel out) for a fixed block size.
;   The block is walked as 4x4 tiles: width/4 tiles per strip, height/4 strips.
;   Each sum gets 524800 added, is shifted right by 10 and clipped to the pixel range.
;
; Register roles:
;   r0 = src (pre-biased by -3 rows)    r1 = srcStride in bytes
;   r2 = dst                            r3 = dstStride in bytes
;   r4 = coeffIdx, then tile counter    r5 = scratch
;   r6 = address of the selected tab_LumaCoeffV entry
;   m7 = offset constant                [rsp] = strips left
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8, 0-gprsize

    ; The strides are intptr_t. Double them (samples are 2 bytes wide) at full
    ; register width: a 32-bit add would zero-extend on x86-64 and discard the
    ; upper half, breaking negative or very large strides.
    add       r1, r1
    add       r3, r3
    lea       r5, [r1 + 2 * r1]
    sub       r0, r5                            ; src -= 3 * srcStride
    shl       r4d, 6                            ; coeffIdx * 64: byte offset into tab_LumaCoeffV

%ifdef PIC
    lea       r5, [tab_LumaCoeffV]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffV + r4]
%endif

    ; 524800 = (1 << 19) + (1 << 9). Presumably the rounding term for the >> 10
    ; plus removal of the intermediate-format bias - TODO confirm against the C reference.
    mova      m7, [tab_c_524800]

    mov       dword [rsp], %2/4                 ; number of 4-row strips
.loopH:
    mov       r4d, (%1/4)                       ; number of 4-column tiles in this strip
.loopW:
    ; Defined elsewhere in this file. Judging by the code below it leaves four
    ; rows of 32-bit sums in m0..m3 and r0 advanced by 8 source rows.
    PROCESS_LUMA_VER_W4_4R

    paddd     m0, m7
    paddd     m1, m7
    paddd     m2, m7
    paddd     m3, m7

    psrad     m0, 10
    psrad     m1, 10
    psrad     m2, 10
    psrad     m3, 10

    packssdw  m0, m1                            ; rows 0 and 1 as 16-bit samples
    packssdw  m2, m3                            ; rows 2 and 3 as 16-bit samples

    pxor      m1, m1                            ; zero operand for the clip
    CLIPW2    m0, m2, m1, [pw_pixel_max]

    movh      [r2], m0
    movhps    [r2 + r3], m0
    lea       r5, [r2 + 2 * r3]
    movh      [r5], m2
    movhps    [r5 + r3], m2

    ; Rewind src by 8 rows and step both pointers 4 samples to the right.
    lea       r5, [8 * r1 - 2 * 4]
    sub       r0, r5
    add       r2, 2 * 4

    dec       r4d
    jnz       .loopW

    ; Back to the left edge of the block, 4 rows further down.
    lea       r0, [r0 + 4 * r1 - 2 * %1]
    lea       r2, [r2 + 4 * r3 - 2 * %1]

    dec       dword [rsp]
    jnz       .loopH

    RET
%endmacro
2722
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; One function is emitted per line below; the arguments are width, height.
;--------------------------------------------------------------------------------------------------------------
; largest dimension 8
    FILTER_VER_LUMA_SP  4,  4
    FILTER_VER_LUMA_SP  8,  8
    FILTER_VER_LUMA_SP  8,  4
    FILTER_VER_LUMA_SP  4,  8
; largest dimension 16
    FILTER_VER_LUMA_SP 16, 16
    FILTER_VER_LUMA_SP 16,  8
    FILTER_VER_LUMA_SP  8, 16
    FILTER_VER_LUMA_SP 16, 12
    FILTER_VER_LUMA_SP 12, 16
    FILTER_VER_LUMA_SP 16,  4
    FILTER_VER_LUMA_SP  4, 16
; largest dimension 32
    FILTER_VER_LUMA_SP 32, 32
    FILTER_VER_LUMA_SP 32, 16
    FILTER_VER_LUMA_SP 16, 32
    FILTER_VER_LUMA_SP 32, 24
    FILTER_VER_LUMA_SP 24, 32
    FILTER_VER_LUMA_SP 32,  8
    FILTER_VER_LUMA_SP  8, 32
; largest dimension 64
    FILTER_VER_LUMA_SP 64, 64
    FILTER_VER_LUMA_SP 64, 32
    FILTER_VER_LUMA_SP 32, 64
    FILTER_VER_LUMA_SP 64, 48
    FILTER_VER_LUMA_SP 48, 64
    FILTER_VER_LUMA_SP 64, 16
    FILTER_VER_LUMA_SP 16, 64
2751
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_VER_LUMA_SS width, height
;   Emits one SSE2 vertical 8-tap luma filter (int16_t in, int16_t out) for a fixed block size.
;   The block is walked as 4x4 tiles: width/4 tiles per strip, height/4 strips.
;   Each sum is shifted right by 6 with no offset and no clipping.
;
; Register roles:
;   r0 = src (pre-biased by -3 rows)    r1 = srcStride in bytes
;   r2 = dst                            r3 = dstStride in bytes
;   r4 = coeffIdx, then tile counter    r5 = scratch
;   r6 = address of the selected tab_LumaCoeffV entry
;   [rsp] = strips left
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SS 2
INIT_XMM sse2
cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7, 0-gprsize

    ; The strides are intptr_t. Double them (samples are 2 bytes wide) at full
    ; register width: a 32-bit add would zero-extend on x86-64 and discard the
    ; upper half, breaking negative or very large strides.
    add       r1, r1
    add       r3, r3
    lea       r5, [3 * r1]
    sub       r0, r5                            ; src -= 3 * srcStride
    shl       r4d, 6                            ; coeffIdx * 64: byte offset into tab_LumaCoeffV

%ifdef PIC
    lea       r5, [tab_LumaCoeffV]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffV + r4]
%endif

    mov       dword [rsp], %2/4                 ; number of 4-row strips
.loopH:
    mov       r4d, (%1/4)                       ; number of 4-column tiles in this strip
.loopW:
    ; Defined elsewhere in this file. Judging by the code below it leaves four
    ; rows of 32-bit sums in m0..m3 and r0 advanced by 8 source rows.
    PROCESS_LUMA_VER_W4_4R

    ; rows 0 and 1
    psrad     m0, 6
    psrad     m1, 6
    packssdw  m0, m1
    movlps    [r2], m0
    movhps    [r2 + r3], m0

    ; rows 2 and 3
    psrad     m2, 6
    psrad     m3, 6
    packssdw  m2, m3
    movlps    [r2 + 2 * r3], m2
    lea       r5, [3 * r3]
    movhps    [r2 + r5], m2

    ; Rewind src by 8 rows and step both pointers 4 samples to the right.
    lea       r5, [8 * r1 - 2 * 4]
    sub       r0, r5
    add       r2, 2 * 4

    dec       r4d
    jnz       .loopW

    ; Back to the left edge of the block, 4 rows further down.
    lea       r0, [r0 + 4 * r1 - 2 * %1]
    lea       r2, [r2 + 4 * r3 - 2 * %1]

    dec       dword [rsp]
    jnz       .loopH

    RET
%endmacro
2806
; Emit interp_8tap_vert_ss_WxH once per line below; the arguments are width, height.
; largest dimension 8
    FILTER_VER_LUMA_SS  4,  4
    FILTER_VER_LUMA_SS  8,  8
    FILTER_VER_LUMA_SS  8,  4
    FILTER_VER_LUMA_SS  4,  8
; largest dimension 16
    FILTER_VER_LUMA_SS 16, 16
    FILTER_VER_LUMA_SS 16,  8
    FILTER_VER_LUMA_SS  8, 16
    FILTER_VER_LUMA_SS 16, 12
    FILTER_VER_LUMA_SS 12, 16
    FILTER_VER_LUMA_SS 16,  4
    FILTER_VER_LUMA_SS  4, 16
; largest dimension 32
    FILTER_VER_LUMA_SS 32, 32
    FILTER_VER_LUMA_SS 32, 16
    FILTER_VER_LUMA_SS 16, 32
    FILTER_VER_LUMA_SS 32, 24
    FILTER_VER_LUMA_SS 24, 32
    FILTER_VER_LUMA_SS 32,  8
    FILTER_VER_LUMA_SS  8, 32
; largest dimension 64
    FILTER_VER_LUMA_SS 64, 64
    FILTER_VER_LUMA_SS 64, 32
    FILTER_VER_LUMA_SS 32, 64
    FILTER_VER_LUMA_SS 64, 48
    FILTER_VER_LUMA_SS 48, 64
    FILTER_VER_LUMA_SS 64, 16
    FILTER_VER_LUMA_SS 16, 64
2832
;--------------------------------------------------------------------------------------------------
; void luma_p2s(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
;   (labelled filterConvertPelToShort in the original banner)
;
; Converts a block of 16-bit pixels to the int16_t intermediate format:
;     dst[x] = (src[x] << 4) - 8192
; Destination rows are FENC_STRIDE samples apart (no dst stride argument).
; Four rows are handled per outer pass, so height must be a positive multiple of 4.
; Columns go in steps of 8 with a 4-sample tail; the 16-byte loads are
; unconditional, so the tail pass reads 4 samples past the last stored column.
;
;   r0 = src row base      r1 = srcStride in bytes     r2 = dst row base
;   r3 = width             r4 = rows left              r5 = column index (samples)
;   r6 = scratch pointer   m4 = bias (-8192 per word)
;--------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal luma_p2s, 3, 7, 5

    add       r1, r1                            ; pixel stride -> byte stride

    ; width and height are fetched by hand (only three arguments are auto-loaded)
    mov       r3d, r3m
    mov       r4d, r4m

    mova      m4, [tab_c_n8192]

.rowLoop:
    xor       r5d, r5d                          ; restart at column 0

.colLoop:
    ; fetch 8 samples from each of four consecutive rows
    lea       r6, [r0 + r5 * 2]
    movu      m0, [r6]
    movu      m1, [r6 + r1]
    movu      m2, [r6 + r1 * 2]
    lea       r6, [r6 + r1 * 2]
    movu      m3, [r6 + r1]

    ; scale up by 16 ...
    psllw     m0, 4
    psllw     m1, 4
    psllw     m2, 4
    psllw     m3, 4

    ; ... and subtract 8192
    paddw     m0, m4
    paddw     m1, m4
    paddw     m2, m4
    paddw     m3, m4

    add       r5, 8
    cmp       r5, r3
    jg        .tail4                            ; fewer than 8 columns left

    ; r5 already points past this group, hence the -16 byte correction
    movu      [r2 + r5 * 2 + 2 * FENC_STRIDE * 0 - 16], m0
    movu      [r2 + r5 * 2 + 2 * FENC_STRIDE * 1 - 16], m1
    movu      [r2 + r5 * 2 + 2 * FENC_STRIDE * 2 - 16], m2
    movu      [r2 + r5 * 2 + 2 * FENC_STRIDE * 3 - 16], m3

    ; flags from the cmp above are still valid: the stores leave them untouched
    jl        .colLoop                          ; more columns to go
    jmp       .rowDone                          ; width reached exactly

.tail4:
    ; only the low 4 samples of each row belong to the block
    movh      [r2 + r5 * 2 + 2 * FENC_STRIDE * 0 - 16], m0
    movh      [r2 + r5 * 2 + 2 * FENC_STRIDE * 1 - 16], m1
    movh      [r2 + r5 * 2 + 2 * FENC_STRIDE * 2 - 16], m2
    movh      [r2 + r5 * 2 + 2 * FENC_STRIDE * 3 - 16], m3

.rowDone:
    lea       r0, [r0 + r1 * 4]                 ; src += 4 rows
    add       r2, FENC_STRIDE * 8               ; dst += 4 rows of FENC_STRIDE int16_t

    sub       r4d, 4
    jnz       .rowLoop

    RET