;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text

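; H.264 half-pel luma interpolation is a 6-tap FIR filter with coefficients
; (1, -5, 20, 20, -5, 1). Every kernel in this file computes, per pixel,
;
;   out = clip8((src[-2] - 5*src[-1] + 20*src[0] + 20*src[+1]
;                - 5*src[+2] + src[+3] + 16) >> 5)
;
; or the two-pass 2D variant of it. As a reading aid only, a C sketch of
; the per-pixel horizontal filter (the name is illustrative, not part of
; FFmpeg's C API):
;
;   static uint8_t lowpass6(const uint8_t *src)
;   {
;       int v = src[-2] - 5*src[-1] + 20*src[0] + 20*src[1]
;               - 5*src[2] + src[3];
;       v = (v + 16) >> 5;
;       return v < 0 ? 0 : v > 255 ? 255 : v;
;   }
;
; The asm evaluates it as a + 5*(4*c - b) + 16 with the symmetric tap pair
; sums a = src[-2]+src[+3], b = src[-1]+src[+2], c = src[0]+src[+1].
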
; store macros: the put variants plainly store the result, the avg variants
; first average it with the bytes already at the destination (pavgb, which
; rounds up); the *h variants store movh-sized (half-register) chunks, and
; op_avgh needs a scratch register for the destination load.
%macro op_avgh 3
    movh        %3, %2
    pavgb       %1, %3
    movh        %2, %1
%endmacro

%macro op_avg 2-3
    pavgb       %1, %2
    mova        %2, %1
%endmacro

%macro op_puth 2-3
    movh        %2, %1
%endmacro

%macro op_put 2-3
    mova        %2, %1
%endmacro

%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor        m7, m7
    mova        m4, [pw_5]
    mova        m5, [pw_16]
    mov         r4d, 4
.loop:
    movh        m1, [r1-1]
    movh        m2, [r1+0]
    movh        m3, [r1+1]
    movh        m0, [r1+2]
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m0, m7
    paddw       m1, m0          ; b = src[-1]+src[+2]
    paddw       m2, m3          ; c = src[ 0]+src[+1]
    movh        m0, [r1-2]
    movh        m3, [r1+3]
    punpcklbw   m0, m7
    punpcklbw   m3, m7
    paddw       m0, m3          ; a = src[-2]+src[+3]
    psllw       m2, 2
    psubw       m2, m1          ; 4*c - b
    pmullw      m2, m4          ; 20*c - 5*b
    paddw       m0, m5          ; a + 16
    paddw       m0, m2
    psraw       m0, 5
    packuswb    m0, m0
    op_%1h      m0, [r0], m6
    add         r0, r2
    add         r1, r3
    dec         r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg

%macro QPEL8_H_LOWPASS_OP 1
; as above, but 8 pixels per row, filtered as a low and a high 4-word half
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov         r4d, 8
    pxor        m7, m7
    mova        m6, [pw_5]
.loop:
    mova        m0, [r1]
    mova        m2, [r1+1]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m7
    punpckhbw   m1, m7
    punpcklbw   m2, m7
    punpckhbw   m3, m7
    paddw       m0, m2          ; c, low half
    paddw       m1, m3          ; c, high half
    psllw       m0, 2
    psllw       m1, 2
    mova        m2, [r1-1]
    mova        m4, [r1+2]
    mova        m3, m2
    mova        m5, m4
    punpcklbw   m2, m7
    punpckhbw   m3, m7
    punpcklbw   m4, m7
    punpckhbw   m5, m7
    paddw       m2, m4          ; b, low half
    paddw       m5, m3          ; b, high half
    psubw       m0, m2
    psubw       m1, m5
    pmullw      m0, m6          ; 20*c - 5*b, low half
    pmullw      m1, m6          ; 20*c - 5*b, high half
    movd        m2, [r1-2]
    movd        m5, [r1+7]
    punpcklbw   m2, m7
    punpcklbw   m5, m7
    paddw       m2, m3          ; a, low half
    paddw       m4, m5          ; a, high half
    mova        m5, [pw_16]
    paddw       m2, m5
    paddw       m4, m5
    paddw       m0, m2
    paddw       m1, m4
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m1
    op_%1       m0, [r0], m4
    add         r0, r2
    add         r1, r3
    dec         r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg

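; The ssse3 version loads the 16-byte window src[-2..+13] once and derives
; all six tap vectors with palignr on the unpacked words, replacing the
; extra unaligned loads of the mmxext version above.
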
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov         r4d, 8
    pxor        m7, m7
    mova        m6, [pw_5]
.loop:
    movu        m1, [r1-2]
    mova        m0, m1
    punpckhbw   m1, m7          ; words of src[+6..+13]
    punpcklbw   m0, m7          ; words of src[-2..+5]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1
    mova        m5, m1
    palignr     m4, m0, 2       ; src[-1..+6]
    palignr     m3, m0, 4       ; src[ 0..+7]
    palignr     m2, m0, 6       ; src[+1..+8]
    palignr     m1, m0, 8       ; src[+2..+9]
    palignr     m5, m0, 10      ; src[+3..+10]
    paddw       m0, m5          ; a
    paddw       m2, m3          ; c
    paddw       m1, m4          ; b
    psllw       m2, 2
    psubw       m2, m1          ; 4*c - b
    paddw       m0, [pw_16]
    pmullw      m2, m6          ; 20*c - 5*b
    paddw       m2, m0
    psraw       m2, 5
    packuswb    m2, m2
    op_%1h      m2, [r0], m4
    add         r1, r3
    add         r0, r2
    dec         r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg

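; The *_l2 variants run the same horizontal lowpass and then pavgb the
; result with a second 8-bit prediction read from src2; the quarter-pel
; positions that blend two predictions are built on these.
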
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor        m7, m7
    mova        m4, [pw_5]
    mova        m5, [pw_16]
    mov         r5d, 4
.loop:
    movh        m1, [r1-1]
    movh        m2, [r1+0]
    movh        m3, [r1+1]
    movh        m0, [r1+2]
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m0, m7
    paddw       m1, m0          ; b
    paddw       m2, m3          ; c
    movh        m0, [r1-2]
    movh        m3, [r1+3]
    punpcklbw   m0, m7
    punpcklbw   m3, m7
    paddw       m0, m3          ; a
    psllw       m2, 2
    psubw       m2, m1
    pmullw      m2, m4
    paddw       m0, m5
    paddw       m0, m2
    movh        m3, [r2]
    psraw       m0, 5
    packuswb    m0, m0
    pavgb       m0, m3          ; average with the src2 prediction
    add         r0, r3
    add         r1, r3          ; src shares dst's stride
    add         r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg

%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov         r5d, 8
    pxor        m7, m7
    mova        m6, [pw_5]
.loop:
    mova        m0, [r1]
    mova        m2, [r1+1]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m7
    punpckhbw   m1, m7
    punpcklbw   m2, m7
    punpckhbw   m3, m7
    paddw       m0, m2
    paddw       m1, m3
    psllw       m0, 2
    psllw       m1, 2
    mova        m2, [r1-1]
    mova        m4, [r1+2]
    mova        m3, m2
    mova        m5, m4
    punpcklbw   m2, m7
    punpckhbw   m3, m7
    punpcklbw   m4, m7
    punpckhbw   m5, m7
    paddw       m2, m4
    paddw       m5, m3
    psubw       m0, m2
    psubw       m1, m5
    pmullw      m0, m6
    pmullw      m1, m6
    movd        m2, [r1-2]
    movd        m5, [r1+7]
    punpcklbw   m2, m7
    punpcklbw   m5, m7
    paddw       m2, m3
    paddw       m4, m5
    mova        m5, [pw_16]
    paddw       m2, m5
    paddw       m4, m5
    paddw       m0, m2
    paddw       m1, m4
    psraw       m0, 5
    psraw       m1, 5
    mova        m4, [r2]
    packuswb    m0, m1
    pavgb       m0, m4          ; average with the src2 prediction
    op_%1       m0, [r0], m4
    add         r0, r3
    add         r1, r3
    add         r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg

%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov         r5d, 8
    pxor        m7, m7
    mova        m6, [pw_5]
.loop:
    lddqu       m1, [r1-2]
    mova        m0, m1
    punpckhbw   m1, m7
    punpcklbw   m0, m7
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1
    mova        m5, m1
    palignr     m4, m0, 2
    palignr     m3, m0, 4
    palignr     m2, m0, 6
    palignr     m1, m0, 8
    palignr     m5, m0, 10
    paddw       m0, m5
    paddw       m2, m3
    paddw       m1, m4
    psllw       m2, 2
    movh        m3, [r2]
    psubw       m2, m1
    paddw       m0, [pw_16]
    pmullw      m2, m6
    paddw       m2, m0
    psraw       m2, 5
    packuswb    m2, m2
    pavgb       m2, m3
    op_%1h      m2, [r0], m4
    add         r1, r3
    add         r0, r3
    add         r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg

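; Vertical filtering keeps a sliding window of six source rows in m0-m5:
; m0-m4 hold the previous five rows as unpacked words, FILT_V loads the
; next row into m5, emits one output row, and the SWAP rotates the window
; so the register holding the oldest row is reused for the next load.
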
; All functions that call this must take the arguments:
; dst, src, dstStride, srcStride
%macro FILT_V 1
    mova        m6, m2
    movh        m5, [r1]
    paddw       m6, m3          ; c = row2+row3
    psllw       m6, 2
    psubw       m6, m1          ; 4*c - row1
    psubw       m6, m4          ; 4*c - b
    punpcklbw   m5, m7
    pmullw      m6, [pw_5]      ; 20*c - 5*b
    paddw       m0, [pw_16]
    add         r1, r3
    paddw       m0, m5          ; a + 16
    paddw       m6, m0
    psraw       m6, 5
    packuswb    m6, m6
    op_%1h      m6, [r0], m0
    add         r0, r2
    SWAP 0, 1, 2, 3, 4, 5       ; rotate the row window
%endmacro

%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub         r1, r3
    sub         r1, r3          ; back up two rows above the first output row
    pxor        m7, m7
    movh        m0, [r1]        ; preload the first five rows
    movh        m1, [r1+r3]
    lea         r1, [r1+2*r3]
    movh        m2, [r1]
    movh        m3, [r1+r3]
    lea         r1, [r1+2*r3]
    movh        m4, [r1]
    add         r1, r3
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg


%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub         r1, r3
    sub         r1, r3
%else
; the mmxext variant leaves the two-row src adjustment to its caller,
; hence the internal _op name
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor        m7, m7
    movh        m0, [r1]
    movh        m1, [r1+r3]
    lea         r1, [r1+2*r3]
    movh        m2, [r1]
    movh        m3, [r1+r3]
    lea         r1, [r1+2*r3]
    movh        m4, [r1]
    add         r1, r3
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    cmp         r4d, 16
    jne .end
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg


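; 2D (hv) filtering is done in two passes. FILT_HV is the vertical pass:
; the same filter as FILT_V, but it stores the unshifted 16-bit sums
; (rounding bias already added) into a tmp buffer instead of packing to
; bytes. tmp rows are 24 bytes apart for the qpel4 kernels and 48 bytes
; apart for qpel8/16; a horizontal pass then filters tmp and shifts the
; combined result down by 10.
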
; All functions that use this must take the arguments:
; src, tmp, srcStride
%macro FILT_HV 1 ; offset
    mova        m6, m2
    movh        m5, [r0]
    paddw       m6, m3          ; c
    psllw       m6, 2
    paddw       m0, [pw_16]     ; rounding bias, kept in the intermediate
    psubw       m6, m1
    psubw       m6, m4          ; 4*c - b
    punpcklbw   m5, m7
    pmullw      m6, [pw_5]      ; 20*c - 5*b
    paddw       m0, m5          ; a + 16
    add         r0, r2
    paddw       m6, m0
    mova        [r1+%1], m6     ; store the unshifted 16-bit sums
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor        m7, m7
    movh        m0, [r0]
    movh        m1, [r0+r2]
    lea         r0, [r0+2*r2]
    movh        m2, [r0]
    movh        m3, [r0+r2]
    lea         r0, [r0+2*r2]
    movh        m4, [r0]
    add         r0, r2
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m4, m7
    FILT_HV 0*24                ; tmp rows are 24 bytes apart
    FILT_HV 1*24
    FILT_HV 2*24
    FILT_HV 3*24
    RET

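; The horizontal tmp pass cannot form 20*c with pmullw: the 16-bit tmp
; sums are too large for 20*c to fit in a signed word. With a, b, c the
; symmetric tap pair sums of tmp words, it instead uses the identity
; (exact up to the truncation of the intermediate shifts)
;
;   ((((a - b) >> 2) - b + c) >> 2) + c  ==  (a - 5*b + 20*c) / 16
;
; so the final >> 6 completes the combined >> 10 of the two passes. The
; +16 bias FILT_HV folded into every tmp word sums, across the taps, to
; the required +512 rounding term (the coefficients sum to 32).
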
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov         r3d, 4
.loop:
    mova        m0, [r0]
    paddw       m0, [r0+10]     ; a
    mova        m1, [r0+2]
    paddw       m1, [r0+8]      ; b
    mova        m2, [r0+4]
    paddw       m2, [r0+6]      ; c
    psubw       m0, m1
    psraw       m0, 2
    psubw       m0, m1
    paddsw      m0, m2
    psraw       m0, 2
    paddw       m0, m2
    psraw       m0, 6
    packuswb    m0, m0
    op_%1h      m0, [r1], m7
    add         r0, 24
    add         r1, r2
    dec         r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor        m7, m7
    movh        m0, [r0]
    movh        m1, [r0+r2]
    lea         r0, [r0+2*r2]
    movh        m2, [r0]
    movh        m3, [r0+r2]
    lea         r0, [r0+2*r2]
    movh        m4, [r0]
    add         r0, r2
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m4, m7
    FILT_HV 0*48                ; tmp rows are 48 bytes apart here
    FILT_HV 1*48
    FILT_HV 2*48
    FILT_HV 3*48
    FILT_HV 4*48
    FILT_HV 5*48
    FILT_HV 6*48
    FILT_HV 7*48
    cmp         r3d, 16
    jne .end
    FILT_HV 8*48
    FILT_HV 9*48
    FILT_HV 10*48
    FILT_HV 11*48
    FILT_HV 12*48
    FILT_HV 13*48
    FILT_HV 14*48
    FILT_HV 15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put


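; Horizontal pass over the 16-bit tmp buffer written by the hv1 functions:
; each iteration consumes one 48-byte tmp row and produces one row of dst.
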
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; the 4th argument is unused; it only keeps the signature in sync with the
; ssse3 version
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova        m0, [r1]
    mova        m3, [r1+8]
    mova        m1, [r1+2]
    mova        m4, [r1+10]
    paddw       m0, m4          ; a, low half
    paddw       m1, m3          ; b, low half
    paddw       m3, [r1+18]     ; a, high half
    paddw       m4, [r1+16]     ; b, high half
    mova        m2, [r1+4]
    mova        m5, [r1+12]
    paddw       m2, [r1+6]      ; c, low half
    paddw       m5, [r1+14]     ; c, high half
    psubw       m0, m1
    psubw       m3, m4
    psraw       m0, 2
    psraw       m3, 2
    psubw       m0, m1
    psubw       m3, m4
    paddsw      m0, m2
    paddsw      m3, m5
    psraw       m0, 2
    psraw       m3, 2
    paddw       m0, m2
    paddw       m3, m5
    psraw       m0, 6
    psraw       m3, 6
    packuswb    m0, m3
    op_%1       m0, [r0], m7
    add         r1, 48
    add         r0, r2
    dec         r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp         r4d, 16
    je .op16
.loop8:
    mova        m1, [r1+16]
    mova        m0, [r1]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1
    mova        m5, m1
    palignr     m5, m0, 10
    palignr     m4, m0, 8
    palignr     m3, m0, 6
    palignr     m2, m0, 4
    palignr     m1, m0, 2
    paddw       m0, m5          ; a
    paddw       m1, m4          ; b
    paddw       m2, m3          ; c
    psubw       m0, m1
    psraw       m0, 2
    psubw       m0, m1
    paddw       m0, m2
    psraw       m0, 2
    paddw       m0, m2
    psraw       m0, 6
    packuswb    m0, m0
    op_%1h      m0, [r0], m7
    add         r1, 48
    add         r0, r2
    dec         r4d
    jne .loop8
    jmp .done
.op16:
    mova        m4, [r1+32]
    mova        m5, [r1+16]
    mova        m7, [r1]
    mova        m3, m4
    mova        m2, m4
    mova        m1, m4
    mova        m0, m4
    palignr     m0, m5, 10
    palignr     m1, m5, 8
    palignr     m2, m5, 6
    palignr     m3, m5, 4
    palignr     m4, m5, 2
    paddw       m0, m5          ; a, high half
    paddw       m1, m4          ; b, high half
    paddw       m2, m3          ; c, high half
    mova        m6, m5
    mova        m4, m5
    mova        m3, m5
    palignr     m4, m7, 8
    palignr     m6, m7, 2
    palignr     m3, m7, 10
    paddw       m4, m6          ; b, low half
    mova        m6, m5
    palignr     m5, m7, 6
    palignr     m6, m7, 4
    paddw       m3, m7          ; a, low half
    paddw       m5, m6          ; c, low half
    psubw       m0, m1
    psubw       m3, m4
    psraw       m0, 2
    psraw       m3, 2
    psubw       m0, m1
    psubw       m3, m4
    paddw       m0, m2
    paddw       m3, m5
    psraw       m0, 2
    psraw       m3, 2
    paddw       m0, m2
    paddw       m3, m5
    psraw       m0, 6
    psraw       m3, 6
    packuswb    m3, m0
    op_%1       m3, [r0], m7
    add         r1, 48
    add         r0, r2
    dec         r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg


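; pixels*_l2_shift5: the src16 rows hold the unshifted first-pass filter
; sums (rounding bias included), so a plain >> 5 yields the rounded
; half-pel bytes, which are then averaged with the 8-bit src8 prediction.
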
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
; the h argument (r5) is never read: the four rows are fully unrolled
    mova        m0, [r1]
    mova        m1, [r1+24]     ; src16 rows are 24 bytes apart
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m0
    packuswb    m1, m1
    pavgb       m0, [r2]
    pavgb       m1, [r2+r4]
    op_%1h      m0, [r0], m4
    op_%1h      m1, [r0+r3], m5
    lea         r2, [r2+r4*2]
    lea         r0, [r0+r3*2]
    mova        m0, [r1+48]
    mova        m1, [r1+72]
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m0
    packuswb    m1, m1
    pavgb       m0, [r2]
    pavgb       m1, [r2+r4]
    op_%1h      m0, [r0], m4
    op_%1h      m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova        m0, [r1]
    mova        m1, [r1+8]
    mova        m2, [r1+48]     ; src16 rows are 48 bytes apart
    mova        m3, [r1+48+8]
    psraw       m0, 5
    psraw       m1, 5
    psraw       m2, 5
    psraw       m3, 5
    packuswb    m0, m1
    packuswb    m2, m3
    pavgb       m0, [r2]
    pavgb       m2, [r2+r4]
    op_%1       m0, [r0], m4
    op_%1       m2, [r0+r3], m5
    lea         r2, [r2+2*r4]
    add         r1, 48*2
    lea         r0, [r0+2*r3]
    sub         r5d, 2          ; two rows per iteration
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg


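; 16 pixels per row keep both the low and the high half of every tap live
; at once, which needs xmm8-xmm15, hence the x86-64 guard.
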
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov         r5d, 16
    pxor        m15, m15
    mova        m14, [pw_5]
    mova        m13, [pw_16]
.loop:
    lddqu       m1, [r1+6]
    lddqu       m7, [r1-2]
    mova        m0, m1
    punpckhbw   m1, m15         ; words of src[+14..+21]
    punpcklbw   m0, m15         ; words of src[+6..+13]
    punpcklbw   m7, m15         ; words of src[-2..+5]
    mova        m2, m1
    mova        m6, m0
    mova        m3, m1
    mova        m8, m0
    mova        m4, m1
    mova        m9, m0
    mova        m12, m0
    mova        m11, m1
    palignr     m11, m0, 10
    palignr     m12, m7, 10
    palignr     m4, m0, 2
    palignr     m9, m7, 2
    palignr     m3, m0, 4
    palignr     m8, m7, 4
    palignr     m2, m0, 6
    palignr     m6, m7, 6
    paddw       m11, m0         ; a, high half
    palignr     m1, m0, 8
    palignr     m0, m7, 8
    paddw       m7, m12         ; a, low half
    paddw       m2, m3          ; c, high half
    paddw       m6, m8          ; c, low half
    paddw       m1, m4          ; b, high half
    paddw       m0, m9          ; b, low half
    psllw       m2, 2
    psllw       m6, 2
    psubw       m2, m1
    psubw       m6, m0
    paddw       m11, m13
    paddw       m7, m13
    pmullw      m2, m14
    pmullw      m6, m14
    lddqu       m3, [r2]
    paddw       m2, m11
    paddw       m6, m7
    psraw       m2, 5
    psraw       m6, 5
    packuswb    m6, m2
    pavgb       m6, m3          ; average with the src2 prediction
    op_%1       m6, [r0], m11
    add         r1, r3
    add         r0, r3
    add         r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif