; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_8:           times 8 dw (1 << 9)
pw_10:          times 8 dw (1 << 11)
pw_12:          times 8 dw (1 << 13)
pw_bi_8:        times 8 dw (1 << 8)
pw_bi_10:       times 8 dw (1 << 10)
pw_bi_12:       times 8 dw (1 << 12)
max_pixels_10:  times 8 dw ((1 << 10)-1)
max_pixels_12:  times 8 dw ((1 << 12)-1)
zero:           times 4 dd 0
one_per_32:     times 4 dd 1
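
; The pw_* constants above are pmulhrsw multipliers used for the final
; rounding shift: pmulhrsw(x, 1 << k) is roughly (x + (1 << (14 - k))) >> (15 - k),
; so 1 << 9/11/13 implements the unidirectional shift (14 - bitdepth) and
; 1 << 8/10/12 the bidirectional shift (15 - bitdepth) for 8/10/12 bit
; content. max_pixels_* are the clamping values for each bit depth.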

SECTION .text
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro

EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4
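
; Each EPEL_TABLE instance holds the seven 4-tap HEVC chroma (epel) filters,
; one per fractional position 1..7. The four taps of each filter are stored
; as two interleaved pairs (16 bytes per row) so they can be fed directly to
; pmaddubsw (8 bit) or pmaddwd (10/12 bit); EPEL_FILTER below indexes the
; table with (mx-1)*32, i.e. 32 bytes per fractional position.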

%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
                        times %2 d%3 -10, 58
                        times %2 d%3 17, -5
                        times %2 d%3 1, 0
                        times %2 d%3 -1, 4
                        times %2 d%3 -11, 40
                        times %2 d%3 40, -11
                        times %2 d%3 4, -1
                        times %2 d%3 0, 1
                        times %2 d%3 -5, 17
                        times %2 d%3 58, -10
                        times %2 d%3 4, -1
%endmacro

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4
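
; Likewise, each QPEL_TABLE instance holds the three 8-tap HEVC luma (qpel)
; filters for fractional positions 1..3, stored as four interleaved tap pairs
; per position (64 bytes per position, see QPEL_FILTER below).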

%define MAX_PB_SIZE 64

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
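; MAX_PB_SIZE is the row stride, in int16_t elements, of the intermediate
; prediction buffer, so dst/src2 pointers advance by 2*MAX_PB_SIZE bytes per
; row. The alias above lets the second (vertical) pass of the hv functions,
; which runs on 14 bit intermediates, reuse the 10 bit word-sized filters.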

%if ARCH_X86_64

%macro SIMPLE_BILOAD 4   ; width, tab, r1, r2
%if %1 <= 4
    movq      %3, [%2]                        ; load data from source2
%elif %1 <= 8
    movdqa    %3, [%2]                        ; load data from source2
%elif %1 <= 12
    movdqa    %3, [%2]                        ; load data from source2
    movq      %4, [%2+16]                     ; load data from source2
%else
    movdqa    %3, [%2]                        ; load data from source2
    movdqa    %4, [%2+16]                     ; load data from source2
%endif
%endmacro

%macro SIMPLE_LOAD 4     ; width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd      %4, [%3]                        ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq      %4, [%3]                        ; load data from source
%else
    movdqu    %4, [%3]                        ; load data from source
%endif
%endmacro

%macro SIMPLE_8LOAD 5    ; width, bitd, tab, r1, r2
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movq      %4, [%3]                        ; load data from source2
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movdqa    %4, [%3]                        ; load data from source2
%elif %1 <= 12
    movdqa    %4, [%3]                        ; load data from source2
    movq      %5, [%3+16]                     ; load data from source2
%else
    movdqa    %4, [%3]                        ; load data from source2
    movdqa    %5, [%3+16]                     ; load data from source2
%endif
%endmacro
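
; The three helpers above pick the narrowest load for a given block width and
; bit depth; for example, SIMPLE_LOAD 4, 8, srcq, m0 expands to
; "movd m0, [srcq]" while SIMPLE_LOAD 8, 10, srcq, m0 expands to
; "movdqu m0, [srcq]".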

%macro EPEL_FILTER 2-4   ; bit depth, filter index
%ifdef PIC
    lea       rfilterq, [hevc_epel_filters_sse4_%1]
%else
    %define rfilterq hevc_epel_filters_sse4_%1
%endif
    sub       %2q, 1
    shl       %2q, 5                          ; multiply by 32
%if %0 == 2
    movdqa    m14, [rfilterq + %2q]           ; first 2 filter coefficients
    movdqa    m15, [rfilterq + %2q+16]        ; last 2 filter coefficients
%else
    movdqa    %3, [rfilterq + %2q]            ; first 2 filter coefficients
    movdqa    %4, [rfilterq + %2q+16]         ; last 2 filter coefficients
%endif
%endmacro

%macro EPEL_HV_FILTER 1
%ifdef PIC
    lea       rfilterq, [hevc_epel_filters_sse4_%1]
%else
    %define rfilterq hevc_epel_filters_sse4_%1
%endif
    sub       mxq, 1
    sub       myq, 1
    shl       mxq, 5                          ; multiply by 32
    shl       myq, 5                          ; multiply by 32
    movdqa    m14, [rfilterq + mxq]           ; first 2 filter coefficients (h)
    movdqa    m15, [rfilterq + mxq+16]        ; last 2 filter coefficients (h)
    lea       r3srcq, [srcstrideq*3]

%ifdef PIC
    lea       rfilterq, [hevc_epel_filters_sse4_10]
%else
    %define rfilterq hevc_epel_filters_sse4_10
%endif
    movdqa    m12, [rfilterq + myq]           ; first 2 filter coefficients (v)
    movdqa    m13, [rfilterq + myq+16]        ; last 2 filter coefficients (v)
%endmacro

%macro QPEL_FILTER 2
%ifdef PIC
    lea       rfilterq, [hevc_qpel_filters_sse4_%1]
%else
    %define rfilterq hevc_qpel_filters_sse4_%1
%endif
    lea       %2q, [%2q*8-8]
    movdqa    m12, [rfilterq + %2q*8]         ; filter coefficients 1-2
    movdqa    m13, [rfilterq + %2q*8 + 16]    ; filter coefficients 3-4
    movdqa    m14, [rfilterq + %2q*8 + 32]    ; filter coefficients 5-6
    movdqa    m15, [rfilterq + %2q*8 + 48]    ; filter coefficients 7-8
%endmacro
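
; QPEL_FILTER loads the four coefficient-pair rows for luma fractional
; position mx/my (1..3) into m12..m15. The lea above rescales the index so
; that rfilterq + %2q*8 points at (mx-1)*64, matching the 64 bytes that
; QPEL_TABLE emits per position.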

%macro EPEL_LOAD 4
%ifdef PIC
    lea       rfilterq, [%2]
%else
    %define rfilterq %2
%endif
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load    m0, [rfilterq]
%ifnum %3
    %%load    m1, [rfilterq+%3]
    %%load    m2, [rfilterq+2*%3]
    %%load    m3, [rfilterq+3*%3]
%else
    %%load    m1, [rfilterq+%3q]
    %%load    m2, [rfilterq+2*%3q]
    %%load    m3, [rfilterq+r3srcq]
%endif

%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 10
    SBUTTERFLY bw, 2, 3, 10
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 10
    SBUTTERFLY wd, 2, 3, 10
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
%endif
%endif
%endmacro


%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movdqu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movdqu
%endif
%endif
    %%load    m0, [%2-3*%%stride]             ; load data from source
    %%load    m1, [%2-2*%%stride]
    %%load    m2, [%2-%%stride]
    %%load    m3, [%2]
    %%load    m4, [%2+%%stride]
    %%load    m5, [%2+2*%%stride]
    %%load    m6, [%2+3*%%stride]
    %%load    m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY wd, 0, 1, %4
    SBUTTERFLY wd, 2, 3, %4
    SBUTTERFLY wd, 4, 5, %4
    SBUTTERFLY wd, 6, 7, %4
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY dq, 0, 1, %4
    SBUTTERFLY dq, 2, 3, %4
    SBUTTERFLY dq, 4, 5, %4
    SBUTTERFLY dq, 6, 7, %4
%else
    punpckldq m0, m1
    punpckldq m2, m3
    punpckldq m4, m5
    punpckldq m6, m7
%endif
%endif
%endmacro

%macro QPEL_V_LOAD 5
    lea       %5q, [%2]
    sub       %5q, r3srcq
    movdqu    m0, [%5q]                       ; load x - 3*srcstride
    movdqu    m1, [%5q+%3q]                   ; load x - 2*srcstride
    movdqu    m2, [%5q+2*%3q]                 ; load x - srcstride
    movdqu    m3, [%2]                        ; load x
    movdqu    m4, [%2+%3q]                    ; load x + srcstride
    movdqu    m5, [%2+2*%3q]                  ; load x + 2*srcstride
    movdqu    m6, [%2+r3srcq]                 ; load x + 3*srcstride
    movdqu    m7, [%2+4*%3q]                  ; load x + 4*srcstride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 8
    SBUTTERFLY bw, 2, 3, 8
    SBUTTERFLY bw, 4, 5, 8
    SBUTTERFLY bw, 6, 7, 8
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
%endif
%endif
%endmacro

%macro PEL_12STORE2 3
    movd      [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq      [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq      [%1], %2
    psrldq    %2, 8
    movd      [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa    [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa    [%1], %2
    movq      [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8 %1, %2, %3
    movdqa    [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd      [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq      [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq      [%1], %2
    psrldq    %2, 8
    movd      [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa    [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa    [%1], %2
    movq      [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
    PEL_10STORE8 %1, %2, %3
    movdqa    [%1+16], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw    [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd      [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd      [%1], %2
    pextrw    [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq      [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq      [%1], %2
    psrldq    %2, 8
    movd      [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
    movdqa    [%1], %2
%endmacro

%macro LOOP_END 3
    add       %1q, 2*MAX_PB_SIZE              ; dst += dststride
    add       %2q, %3q                        ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
%endmacro


%macro MC_PIXEL_COMPUTE 2 ; width, bitdepth
%if %2 == 8
%if %1 > 8
    punpckhbw m1, m0, m2
    psllw     m1, 14-%2
%endif
    punpcklbw m0, m2
%endif
    psllw     m0, 14-%2
%endmacro
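
; MC_PIXEL_COMPUTE converts the loaded pixels to the 14 bit intermediate
; format of the dst buffer: 8 bit samples are zero-extended to words
; (m2 is zero) and every sample is shifted left by 14-bitdepth.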


%macro EPEL_COMPUTE 4    ; bitdepth, width, filter1, filter2
%if %1 == 8
    pmaddubsw m0, %3                          ; x1*c1 + x2*c2
    pmaddubsw m2, %4                          ; x3*c3 + x4*c4
    paddw     m0, m2
%if %2 > 8
    pmaddubsw m1, %3
    pmaddubsw m3, %4
    paddw     m1, m3
%endif
%else
    pmaddwd   m0, %3
    pmaddwd   m2, %4
    paddd     m0, m2
%if %2 > 4
    pmaddwd   m1, %3
    pmaddwd   m3, %4
    paddd     m1, m3
%endif
%if %1 != 8
    psrad     m0, %1-8
    psrad     m1, %1-8
%endif
    packssdw  m0, m1
%endif
%endmacro
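
; EPEL_COMPUTE runs one 4-tap filter pass on the pairs interleaved by
; EPEL_LOAD: pmaddubsw on bytes for 8 bit input, pmaddwd on words otherwise,
; in which case the 32 bit sums are shifted right by bitdepth-8 and packed
; back to signed words so the result stays in the 14 bit intermediate range.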

%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
%ifdef PIC
    lea       rfilterq, [hevc_qpel_filters_sse4_%2]
%else
    %define rfilterq hevc_qpel_filters_sse4_%2
%endif

%if %2 == 8
    pmaddubsw m0, [rfilterq + %3q*8]          ; x1*c1 + x2*c2
    pmaddubsw m2, [rfilterq + %3q*8+16]       ; x3*c3 + x4*c4
    pmaddubsw m4, [rfilterq + %3q*8+32]       ; x5*c5 + x6*c6
    pmaddubsw m6, [rfilterq + %3q*8+48]       ; x7*c7 + x8*c8
    paddw     m0, m2
    paddw     m4, m6
    paddw     m0, m4
%else
    pmaddwd   m0, [rfilterq + %3q*8]
    pmaddwd   m2, [rfilterq + %3q*8+16]
    pmaddwd   m4, [rfilterq + %3q*8+32]
    pmaddwd   m6, [rfilterq + %3q*8+48]
    paddd     m0, m2
    paddd     m4, m6
    paddd     m0, m4
%if %2 != 8
    psrad     m0, %2-8
%endif
%if %1 > 4
    pmaddwd   m1, [rfilterq + %3q*8]
    pmaddwd   m3, [rfilterq + %3q*8+16]
    pmaddwd   m5, [rfilterq + %3q*8+32]
    pmaddwd   m7, [rfilterq + %3q*8+48]
    paddd     m1, m3
    paddd     m5, m7
    paddd     m1, m5
%if %2 != 8
    psrad     m1, %2-8
%endif
%endif
    p%4       m0, m1
%endif
%endmacro

%macro QPEL_COMPUTE 2    ; width, bitdepth
%if %2 == 8
    pmaddubsw m0, m12                         ; x1*c1 + x2*c2
    pmaddubsw m2, m13                         ; x3*c3 + x4*c4
    pmaddubsw m4, m14                         ; x5*c5 + x6*c6
    pmaddubsw m6, m15                         ; x7*c7 + x8*c8
    paddw     m0, m2
    paddw     m4, m6
    paddw     m0, m4
%if %1 > 8
    pmaddubsw m1, m12
    pmaddubsw m3, m13
    pmaddubsw m5, m14
    pmaddubsw m7, m15
    paddw     m1, m3
    paddw     m5, m7
    paddw     m1, m5
%endif
%else
    pmaddwd   m0, m12
    pmaddwd   m2, m13
    pmaddwd   m4, m14
    pmaddwd   m6, m15
    paddd     m0, m2
    paddd     m4, m6
    paddd     m0, m4
%if %2 != 8
    psrad     m0, %2-8
%endif
%if %1 > 4
    pmaddwd   m1, m12
    pmaddwd   m3, m13
    pmaddwd   m5, m14
    pmaddwd   m7, m15
    paddd     m1, m3
    paddd     m5, m7
    paddd     m1, m5
%if %2 != 8
    psrad     m1, %2-8
%endif
%endif
%endif
%endmacro

%macro BI_COMPUTE 7      ; width, bitd, src1l, src1h, src2l, src2h, pw
    paddsw    %3, %5
%if %1 > 8
    paddsw    %4, %6
%endif
    UNI_COMPUTE %1, %2, %3, %4, %7
%endmacro

%macro UNI_COMPUTE 5
    pmulhrsw  %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw  %4, %5
%endif
%if %2 == 8
    packuswb  %3, %4
%else
    pminsw    %3, [max_pixels_%2]
    pmaxsw    %3, [zero]
%if %1 > 8
    pminsw    %4, [max_pixels_%2]
    pmaxsw    %4, [zero]
%endif
%endif
%endmacro
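
; BI_COMPUTE adds the second source (the 14 bit src2 buffer) with signed
; saturation and falls through to UNI_COMPUTE, which applies the rounding
; shift via pmulhrsw with the pw_* constant and then clamps: packuswb for
; 8 bit output, pminsw/pmaxsw against max_pixels_*/zero otherwise.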

INIT_XMM sse4            ; adds ff_ and _sse4 to function name
; ******************************
; void put_hevc_pel_pixels(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    pxor      m2, m2
.loop
    SIMPLE_LOAD %1, %2, srcq, m0
    MC_PIXEL_COMPUTE %1, %2
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
.loop
    SIMPLE_LOAD %1, %2, srcq, m0
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor      m2, m2
    movdqa    m5, [pw_bi_%2]
.loop
    SIMPLE_LOAD %1, %2, srcq, m0
    SIMPLE_BILOAD %1, src2q, m3, m4
    MC_PIXEL_COMPUTE %1, %2
    BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

%endmacro


; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my,
;                       int16_t* mcbuffer)
; ******************************


%macro HEVC_PUT_HEVC_EPEL 2
cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 6, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER %2, mx, m4, m5
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa    m6, [pw_%2]
    EPEL_FILTER %2, mx, m4, m5
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    UNI_COMPUTE %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 7, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa    m6, [pw_bi_%2]
    EPEL_FILTER %2, mx, m4, m5
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int width, int height, int mx, int my,
;                      int16_t* mcbuffer)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 6, dst, src, srcstride, height, r3src, my, rfilter
    lea       r3srcq, [srcstrideq*3]
    sub       srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop
    EPEL_LOAD %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea       r3srcq, [srcstrideq*3]
    movdqa    m6, [pw_%2]
    sub       srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop
    EPEL_LOAD %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    UNI_COMPUTE %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    lea       r3srcq, [srcstrideq*3]
    movdqa    m6, [pw_bi_%2]
    sub       srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop
    EPEL_LOAD %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub       srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m4, m0
    add       srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m5, m0
    add       srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m6, m0
    add       srcq, srcstrideq
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m7, m0
    punpcklwd m0, m4, m5
    punpcklwd m2, m6, m7
%if %1 > 4
    punpckhwd m1, m4, m5
    punpckhwd m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    PEL_10STORE%1 dstq, m0, m1
    movdqa    m4, m5
    movdqa    m5, m6
    movdqa    m6, m7
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub       srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m4, m0
    add       srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m5, m0
    add       srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m6, m0
    add       srcq, srcstrideq
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m7, m0
    punpcklwd m0, m4, m5
    punpcklwd m2, m6, m7
%if %1 > 4
    punpckhwd m1, m4, m5
    punpckhwd m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1 dstq, m0, m1
    movdqa    m4, m5
    movdqa    m5, m6
    movdqa    m6, m7
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub       srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m4, m0
    add       srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m5, m0
    add       srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m6, m0
    add       srcq, srcstrideq
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP      m7, m0
    punpcklwd m0, m4, m5
    punpcklwd m2, m6, m7
%if %1 > 4
    punpckhwd m1, m4, m5
    punpckhwd m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    SIMPLE_BILOAD %1, src2q, m8, m9
    BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1 dstq, m0, m1
    movdqa    m4, m5
    movdqa    m5, m6
    movdqa    m6, m7
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET
%endmacro
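
; The epel hv functions above filter three rows horizontally as a prologue,
; then per output row filter one new source row, interleave the last four
; horizontal results and run the vertical pass with EPEL_COMPUTE 14 using
; the my filters (m12/m13 set up by EPEL_HV_FILTER), sliding the four-row
; window down by one row per iteration.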

; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 15, dst, src, srcstride, height, mx, rfilter
    QPEL_FILTER %2, mx
.loop
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw  m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15, dst, dststride, src, srcstride, height, mx, rfilter
    movdqa    m9, [pw_%2]
    QPEL_FILTER %2, mx
.loop
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw  m0, m1
%endif
    UNI_COMPUTE %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa    m9, [pw_bi_%2]
    QPEL_FILTER %2, mx
.loop
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw  m0, m1
%endif
    SIMPLE_BILOAD %1, src2q, m10, m11
    BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET


; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 15, dst, src, srcstride, height, r3src, my, rfilter
    lea       r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop
    QPEL_V_LOAD %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw  m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movdqa    m9, [pw_%2]
    lea       r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop
    QPEL_V_LOAD %2, srcq, srcstride, %1, r8
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw  m0, m1
%endif
    UNI_COMPUTE %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movdqa    m9, [pw_bi_%2]
    lea       r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop
    SIMPLE_BILOAD %1, src2q, m10, m11
    QPEL_V_LOAD %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw  m0, m1
%endif
    BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
    lea       mxq, [mxq*8-8]
    lea       myq, [myq*8-8]
    lea       r3srcq, [srcstrideq*3]
    sub       srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m8, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m9, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m10, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m11, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m12, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m13, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m14, m0
    add       srcq, srcstrideq
.loop
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m15, m0
    punpcklwd m0, m8, m9
    punpcklwd m2, m10, m11
    punpcklwd m4, m12, m13
    punpcklwd m6, m14, m15
%if %1 > 4
    punpckhwd m1, m8, m9
    punpckhwd m3, m10, m11
    punpckhwd m5, m12, m13
    punpckhwd m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    PEL_10STORE%1 dstq, m0, m1
%if %1 <= 4
    movq      m8, m9
    movq      m9, m10
    movq      m10, m11
    movq      m11, m12
    movq      m12, m13
    movq      m13, m14
    movq      m14, m15
%else
    movdqa    m8, m9
    movdqa    m9, m10
    movdqa    m10, m11
    movdqa    m11, m12
    movdqa    m12, m13
    movdqa    m13, m14
    movdqa    m14, m15
%endif
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
    lea       mxq, [mxq*8-8]
    lea       myq, [myq*8-8]
    lea       r3srcq, [srcstrideq*3]
    sub       srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m8, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m9, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m10, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m11, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m12, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m13, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m14, m0
    add       srcq, srcstrideq
.loop
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m15, m0
    punpcklwd m0, m8, m9
    punpcklwd m2, m10, m11
    punpcklwd m4, m12, m13
    punpcklwd m6, m14, m15
%if %1 > 4
    punpckhwd m1, m8, m9
    punpckhwd m3, m10, m11
    punpckhwd m5, m12, m13
    punpckhwd m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackusdw
    UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1 dstq, m0, m1

%if %1 <= 4
    movq      m8, m9
    movq      m9, m10
    movq      m10, m11
    movq      m11, m12
    movq      m12, m13
    movq      m13, m14
    movq      m14, m15
%else
    movdqa    m8, m9
    movdqa    m9, m10
    movdqa    m10, m11
    movdqa    m11, m12
    movdqa    m12, m13
    movdqa    m13, m14
    movdqa    m14, m15
%endif
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
    lea       mxq, [mxq*8-8]
    lea       myq, [myq*8-8]
    lea       r3srcq, [srcstrideq*3]
    sub       srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m8, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m9, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m10, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m11, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m12, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m13, m0
    add       srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m14, m0
    add       srcq, srcstrideq
.loop
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP      m15, m0
    punpcklwd m0, m8, m9
    punpcklwd m2, m10, m11
    punpcklwd m4, m12, m13
    punpcklwd m6, m14, m15
%if %1 > 4
    punpckhwd m1, m8, m9
    punpckhwd m3, m10, m11
    punpckhwd m5, m12, m13
    punpckhwd m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    SIMPLE_BILOAD %1, src2q, m8, m9           ; m9 not used in this case
    BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1 dstq, m0, m1

%if %1 <= 4
    movq      m8, m9
    movq      m9, m10
    movq      m10, m11
    movq      m11, m12
    movq      m12, m13
    movq      m13, m14
    movq      m14, m15
%else
    movdqa    m8, m9
    movdqa    m9, m10
    movdqa    m10, m11
    movdqa    m11, m12
    movdqa    m12, m13
    movdqa    m13, m14
    movdqa    m14, m15
%endif
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, srcstrideq                ; src += srcstride
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET
%endmacro
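
; The qpel hv functions follow the same scheme with an 8-tap window: seven
; rows are filtered horizontally as a prologue, the loop filters one new row,
; and the vertical pass reuses QPEL_HV_COMPUTE with bit depth 14 (the
; hevc_qpel_filters_sse4_14 alias). mx/my are prescaled with lea [q*8-8]
; because QPEL_HV_COMPUTE indexes the filter table with %3q*8.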

%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
    mov       r4d, denomm
%define SHIFT r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
%define SHIFT denomd
%endif
    lea       SHIFT, [SHIFT+14-%2]            ; shift = denom + 14 - bitd
%if %1 <= 4
    pxor      m1, m1
%endif
    movd      m2, wxm                         ; WX
    movd      m4, SHIFT                       ; shift
%if %1 <= 4
    punpcklwd m2, m1
%else
    punpcklwd m2, m2
%endif
    dec       SHIFT
    movdqu    m5, [one_per_32]
    movd      m6, SHIFT
    pshufd    m2, m2, 0
    mov       SHIFT, oxm
    pslld     m5, m6
%if %2 != 8
    shl       SHIFT, %2-8                     ; ox << (bitd - 8)
%endif
    movd      m3, SHIFT                       ; OX
    pshufd    m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov       SHIFT, heightm
%endif
.loop
    SIMPLE_LOAD %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd m0, m1
    pmaddwd   m0, m2
    paddd     m0, m5
    psrad     m0, m4
    paddd     m0, m3
%else
    pmulhw    m6, m0, m2
    pmullw    m0, m2
    punpckhwd m1, m0, m6
    punpcklwd m0, m6
    paddd     m0, m5
    paddd     m1, m5
    psrad     m0, m4
    psrad     m1, m4
    paddd     m0, m3
    paddd     m1, m3
%endif
    packusdw  m0, m1
%if %2 == 8
    packuswb  m0, m0
%else
    pminsw    m0, [max_pixels_%2]
%endif
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, 2*MAX_PB_SIZE             ; src += 2*MAX_PB_SIZE
    dec       heightd                         ; height--
    jnz       .loop                           ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
    mov       r6d, denomm
%if %1 <= 4
    pxor      m1, m1
%endif
    movd      m2, wx0m                        ; WX0
    lea       r6d, [r6d+14-%2]                ; shift = denom + 14 - bitd
    movd      m3, wx1m                        ; WX1
    movd      m0, r6d                         ; shift
%if %1 <= 4
    punpcklwd m2, m1
    punpcklwd m3, m1
%else
    punpcklwd m2, m2
    punpcklwd m3, m3
%endif
    inc       r6d
    movd      m5, r6d                         ; shift+1
    pshufd    m2, m2, 0
    mov       r6d, ox0m
    pshufd    m3, m3, 0
    add       r6d, ox1m
%if %2 != 8
    shl       r6d, %2-8                       ; ox << (bitd - 8)
%endif
    inc       r6d
    movd      m4, r6d                         ; offset
    pshufd    m4, m4, 0
    mov       r6d, heightm
    pslld     m4, m0

.loop
    SIMPLE_LOAD %1, 10, srcq, m0
    SIMPLE_LOAD %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd m0, m1
    punpcklwd m8, m1
    pmaddwd   m0, m3
    pmaddwd   m8, m2
    paddd     m0, m4
    paddd     m0, m8
    psrad     m0, m5
%else
    pmulhw    m6, m0, m3
    pmullw    m0, m3
    pmulhw    m7, m8, m2
    pmullw    m8, m2
    punpckhwd m1, m0, m6
    punpcklwd m0, m6
    punpckhwd m9, m8, m7
    punpcklwd m8, m7
    paddd     m0, m8
    paddd     m1, m9
    paddd     m0, m4
    paddd     m1, m4
    psrad     m0, m5
    psrad     m1, m5
%endif
    packusdw  m0, m1
%if %2 == 8
    packuswb  m0, m0
%else
    pminsw    m0, [max_pixels_%2]
%endif
    PEL_%2STORE%1 dstq, m0, m1
    add       dstq, dststrideq                ; dst += dststride
    add       srcq, 2*MAX_PB_SIZE             ; src += 2*MAX_PB_SIZE
    add       src2q, 2*MAX_PB_SIZE            ; src2 += 2*MAX_PB_SIZE
    dec       r6d                             ; height--
    jnz       .loop                           ; height loop
    RET
%endmacro
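
; The weighting functions read the 14 bit intermediate buffers (row stride
; MAX_PB_SIZE int16_t) and, with shift = denom + 14 - bitdepth, effectively
; compute (roughly):
;   uni: dst = clip(((src * wx + (1 << (shift-1))) >> shift) + (ox << (bitdepth-8)))
;   bi:  dst = clip((src * wx1 + src2 * wx0 +
;                    ((((ox0 + ox1) << (bitdepth-8)) + 1) << shift)) >> (shift + 1))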

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL  2, 8
HEVC_PUT_HEVC_EPEL  4, 8
HEVC_PUT_HEVC_EPEL  6, 8
HEVC_PUT_HEVC_EPEL  8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL  4, 8
HEVC_PUT_HEVC_QPEL  8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%endif ; ARCH_X86_64