;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
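; pshufb masks for the SSSE3 xy2 functions at the end of this file: after
; packuswb merges the even and odd result words into the two halves of a
; register, these masks re-interleave the bytes into source order
; (16-byte form for xmm registers, 8-byte form for mmx).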

cextern pw_8192

SECTION_TEXT

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea     r4, [r2*2]
.loop:
    movu    m0, [r1+1]
    movu    m1, [r1+r2+1]
%if cpuflag(sse2)
    movu    m2, [r1]
    movu    m3, [r1+r2]
    pavgb   m0, m2
    pavgb   m1, m3
%else
    PAVGB   m0, [r1]
    PAVGB   m1, [r1+r2]
%endif
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r1, r4
    add     r0, r4
    movu    m0, [r1+1]
    movu    m1, [r1+r2+1]
%if cpuflag(sse2)
    movu    m2, [r1]
    movu    m3, [r1+r2]
    pavgb   m0, m2
    pavgb   m1, m3
%else
    PAVGB   m0, [r1]
    PAVGB   m1, [r1+r2]
%endif
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
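
; The x2 functions compute the horizontal half-pel interpolation
; dst[i] = (src[i] + src[i+1] + 1) >> 1, which is exactly what pavgb does
; per byte. A minimal C reference of the behaviour (function name
; illustrative, not the FFmpeg prototype):
;
;     static void put_pixels8_x2_c(uint8_t *block, const uint8_t *pixels,
;                                  ptrdiff_t line_size, int h)
;     {
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < 8; x++)
;                 block[x] = (pixels[x] + pixels[x + 1] + 1) >> 1;
;             block  += line_size;
;             pixels += line_size;
;         }
;     }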


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea     r4, [r2*2]
.loop:
    mova    m0, [r1]
    mova    m1, [r1+r2]
    mova    m2, [r1+8]
    mova    m3, [r1+r2+8]
    PAVGB   m0, [r1+1]
    PAVGB   m1, [r1+r2+1]
    PAVGB   m2, [r1+9]
    PAVGB   m3, [r1+r2+9]
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    [r0+8], m2
    mova    [r0+r2+8], m3
    add     r1, r4
    add     r0, r4
    mova    m0, [r1]
    mova    m1, [r1+r2]
    mova    m2, [r1+8]
    mova    m3, [r1+r2+8]
    PAVGB   m0, [r1+1]
    PAVGB   m1, [r1+r2+1]
    PAVGB   m2, [r1+9]
    PAVGB   m3, [r1+r2+9]
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    [r0+8], m2
    mova    [r0+r2+8], m3
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro can be reused here: with INIT_XMM, mmsize is 16, so its
; body already covers a full 16-pixel row per register.
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova    m6, [pb_1]
    lea     r4, [r2*2]
.loop:
    mova    m0, [r1]
    mova    m2, [r1+r2]
    mova    m1, [r1+1]
    mova    m3, [r1+r2+1]
    add     r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB   m0, m1
    PAVGB   m2, m3
    mova    [r0], m0
    mova    [r0+r2], m2
    mova    m0, [r1]
    mova    m1, [r1+1]
    mova    m2, [r1+r2]
    mova    m3, [r1+r2+1]
    add     r0, r4
    add     r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB   m0, m1
    PAVGB   m2, m3
    mova    [r0], m0
    mova    [r0+r2], m2
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
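
; The no-rnd variants want the round-down average (a + b) >> 1, but pavgb
; only provides the round-up form (a + b + 1) >> 1. Subtracting 1 from one
; operand with unsigned saturation (the psubusb by pb_1 above) converts one
; into the other for every input except a == 0:
;
;     ((a - 1) + b + 1) >> 1 == (a + b) >> 1        ; for a > 0
;
; so this version is close to, but not, bit-exact; the _exact variants
; below also handle the a == 0 case.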


; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
    lea     r4, [r2*3]
    pcmpeqb m6, m6
.loop:
    mova    m0, [r1]
    mova    m2, [r1+r2]
    mova    m1, [r1+1]
    mova    m3, [r1+r2+1]
    pxor    m0, m6
    pxor    m2, m6
    pxor    m1, m6
    pxor    m3, m6
    PAVGB   m0, m1
    PAVGB   m2, m3
    pxor    m0, m6
    pxor    m2, m6
    mova    [r0], m0
    mova    [r0+r2], m2
    mova    m0, [r1+r2*2]
    mova    m1, [r1+r2*2+1]
    mova    m2, [r1+r4]
    mova    m3, [r1+r4+1]
    pxor    m0, m6
    pxor    m1, m6
    pxor    m2, m6
    pxor    m3, m6
    PAVGB   m0, m1
    PAVGB   m2, m3
    pxor    m0, m6
    pxor    m2, m6
    mova    [r0+r2*2], m0
    mova    [r0+r4], m2
    lea     r1, [r1+r2*4]
    lea     r0, [r0+r2*4]
    sub     r3d, 4
    jg      .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
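
; The _exact variants get a true round-down average out of pavgb through the
; complement identity (m6 is all-ones from pcmpeqb):
;
;     (a + b) >> 1 == ~((~a + ~b + 1) >> 1)
;
; i.e. XOR both inputs with 0xFF, take the round-up average, and XOR the
; result back. This holds for all byte inputs, with no special case at 0.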


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea     r4, [r2*2]
    movu    m0, [r1]
    sub     r0, r2
.loop:
    movu    m1, [r1+r2]
    movu    m2, [r1+r4]
    add     r1, r4
    PAVGB   m0, m1
    PAVGB   m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu    m1, [r1+r2]
    movu    m0, [r1+r4]
    add     r0, r4
    add     r1, r4
    PAVGB   m2, m1
    PAVGB   m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2
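
; The y2 functions average each row with the row below it,
; dst[i] = (src[i] + src[i+line_size] + 1) >> 1. The loop keeps the most
; recently read row in a register across iterations, so each source row is
; loaded once instead of twice. A minimal C reference (name illustrative):
;
;     static void put_pixels8_y2_c(uint8_t *block, const uint8_t *pixels,
;                                  ptrdiff_t line_size, int h)
;     {
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < 8; x++)
;                 block[x] = (pixels[x] + pixels[x + line_size] + 1) >> 1;
;             block  += line_size;
;             pixels += line_size;
;         }
;     }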


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova    m6, [pb_1]
    lea     r4, [r2+r2]
    mova    m0, [r1]
    sub     r0, r2
.loop:
    mova    m1, [r1+r2]
    mova    m2, [r1+r4]
    add     r1, r4
    psubusb m1, m6
    PAVGB   m0, m1
    PAVGB   m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova    m1, [r1+r2]
    mova    m0, [r1+r4]
    add     r0, r4
    add     r1, r4
    psubusb m1, m6
    PAVGB   m2, m1
    PAVGB   m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
    lea     r4, [r2*3]
    mova    m0, [r1]
    pcmpeqb m6, m6
    add     r1, r2
    pxor    m0, m6
.loop:
    mova    m1, [r1]
    mova    m2, [r1+r2]
    pxor    m1, m6
    pxor    m2, m6
    PAVGB   m0, m1
    PAVGB   m1, m2
    pxor    m0, m6
    pxor    m1, m6
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    m1, [r1+r2*2]
    mova    m0, [r1+r4]
    pxor    m1, m6
    pxor    m0, m6
    PAVGB   m2, m1
    PAVGB   m1, m0
    pxor    m2, m6
    pxor    m1, m6
    mova    [r0+r2*2], m2
    mova    [r0+r4], m1
    lea     r1, [r1+r2*4]
    lea     r0, [r0+r2*4]
    sub     r3d, 4
    jg      .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT


; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea     r4, [r2*2]
.loop:
    mova    m0, [r0]
    mova    m1, [r0+r2]
    PAVGB   m0, [r1]
    PAVGB   m1, [r1+r2]
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r1, r4
    add     r0, r4
    mova    m0, [r0]
    mova    m1, [r0+r2]
    PAVGB   m0, [r1]
    PAVGB   m1, [r1+r2]
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8
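
; The avg_* functions blend the new prediction into what is already in the
; destination instead of overwriting it: dst[i] = (dst[i] + src[i] + 1) >> 1.
; A minimal C sketch of the inner loop:
;
;     for (int y = 0; y < h; y++) {
;         for (int x = 0; x < 8; x++)
;             block[x] = (block[x] + pixels[x] + 1) >> 1;
;         block  += line_size;
;         pixels += line_size;
;     }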


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea     r4, [r2*2]
%if notcpuflag(mmxext)
    pcmpeqd m5, m5
    paddb   m5, m5
%endif
.loop:
    movu    m0, [r1]
    movu    m2, [r1+r2]
%if cpuflag(sse2)
    movu    m1, [r1+1]
    movu    m3, [r1+r2+1]
    pavgb   m0, m1
    pavgb   m2, m3
%else
    PAVGB   m0, [r1+1], m3, m5
    PAVGB   m2, [r1+r2+1], m4, m5
%endif
    PAVGB   m0, [r0], m3, m5
    PAVGB   m2, [r0+r2], m4, m5
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m2
    movu    m0, [r1]
    movu    m2, [r1+r2]
%if cpuflag(sse2)
    movu    m1, [r1+1]
    movu    m3, [r1+r2+1]
    pavgb   m0, m1
    pavgb   m2, m3
%else
    PAVGB   m0, [r1+1], m3, m5
    PAVGB   m2, [r1+r2+1], m4, m5
%endif
    add     r0, r4
    add     r1, r4
    PAVGB   m0, [r0], m3, m5
    PAVGB   m2, [r0+r2], m4, m5
    mova    [r0], m0
    mova    [r0+r2], m2
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2
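
; Plain MMX lacks pavgb, so for that target the PAVGB macro falls back to an
; emulation that needs a scratch register and a constant mask, which is what
; the extra m3/m4/m5 arguments above supply. The pcmpeqd/paddb pair builds
; the mask without a memory load: all-ones added to itself wraps each byte
; to 0xFE, i.e. ~1. That mask keeps bits from leaking across byte boundaries
; when a 64-bit psrlq stands in for a per-byte shift in the identity
;
;     (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1)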


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea     r4, [r2*2]
    movu    m0, [r1]
    sub     r0, r2
.loop:
    movu    m1, [r1+r2]
    movu    m2, [r1+r4]
    add     r1, r4
    PAVGB   m0, m1
    PAVGB   m1, m2
    PAVGB   m0, [r0+r2]
    PAVGB   m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu    m1, [r1+r2]
    movu    m0, [r1+r4]
    PAVGB   m2, m1
    PAVGB   m1, m0
    add     r0, r4
    add     r1, r4
    PAVGB   m2, [r0+r2]
    PAVGB   m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note: this is not correctly rounded, so it is only used where bit-exact
; output is not required.
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova    m6, [pb_1]
    lea     r4, [r2*2]
    mova    m0, [r1]
    PAVGB   m0, [r1+1]
.loop:
    mova    m2, [r1+r4]
    mova    m1, [r1+r2]
    psubusb m2, m6
    PAVGB   m1, [r1+r2+1]
    PAVGB   m2, [r1+r4+1]
    add     r1, r4
    PAVGB   m0, m1
    PAVGB   m1, m2
    PAVGB   m0, [r0]
    PAVGB   m1, [r0+r2]
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    m1, [r1+r2]
    mova    m0, [r1+r4]
    PAVGB   m1, [r1+r2+1]
    PAVGB   m0, [r1+r4+1]
    add     r0, r4
    add     r1, r4
    PAVGB   m2, m1
    PAVGB   m1, m0
    PAVGB   m2, [r0]
    PAVGB   m1, [r0+r2]
    mova    [r0], m2
    mova    [r0+r2], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2
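
; Where the approximation comes from: the exact xy2 filter is
; (a + b + c + d + 2) >> 2 over a 2x2 neighbourhood, but chaining pavgb
; rounds up at every stage. For a = b = c = 0, d = 1 the exact result is 0,
; while avg(avg(a, b), avg(c, d)) gives 1. The psubusb by pb_1 on one row
; cancels part of that accumulated bias, leaving a result that is close to,
; but not exactly, the correctly rounded value.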


; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor    m7, m7
    mova    m6, [pw_2]
    movu    m0, [r1]
    movu    m4, [r1+1]
    mova    m1, m0
    mova    m5, m4
    punpcklbw m0, m7
    punpcklbw m4, m7
    punpckhbw m1, m7
    punpckhbw m5, m7
    paddusw m4, m0
    paddusw m5, m1
    xor     r4, r4
    add     r1, r2
.loop:
    movu    m0, [r1+r4]
    movu    m2, [r1+r4+1]
    mova    m1, m0
    mova    m3, m2
    punpcklbw m0, m7
    punpcklbw m2, m7
    punpckhbw m1, m7
    punpckhbw m3, m7
    paddusw m0, m2
    paddusw m1, m3
    paddusw m4, m6
    paddusw m5, m6
    paddusw m4, m0
    paddusw m5, m1
    psrlw   m4, 2
    psrlw   m5, 2
%ifidn %1, avg
    mova    m3, [r0+r4]
    packuswb m4, m5
    PAVGB   m4, m3
%else
    packuswb m4, m5
%endif
    mova    [r0+r4], m4
    add     r4, r2

    movu    m2, [r1+r4]
    movu    m4, [r1+r4+1]
    mova    m3, m2
    mova    m5, m4
    punpcklbw m2, m7
    punpcklbw m4, m7
    punpckhbw m3, m7
    punpckhbw m5, m7
    paddusw m4, m2
    paddusw m5, m3
    paddusw m0, m6
    paddusw m1, m6
    paddusw m0, m4
    paddusw m1, m5
    psrlw   m0, 2
    psrlw   m1, 2
%ifidn %1, avg
    mova    m3, [r0+r4]
    packuswb m0, m1
    PAVGB   m0, m3
%else
    packuswb m0, m1
%endif
    mova    [r0+r4], m0
    add     r4, r2
    sub     r3d, 2
    jnz     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
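
; Pre-SSSE3, the exact xy2 filter widens the bytes to words (punpck against
; the zeroed m7) so (a + b + c + d + 2) >> 2 can be formed without overflow,
; and each row's horizontal sum is kept in registers for reuse as the top
; pair of the following row. A C reference of the put case (name
; illustrative):
;
;     static void put_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels,
;                                   ptrdiff_t line_size, int h)
;     {
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < 8; x++)
;                 block[x] = (pixels[x] + pixels[x + 1] +
;                             pixels[x + line_size] +
;                             pixels[x + line_size + 1] + 2) >> 2;
;             block  += line_size;
;             pixels += line_size;
;         }
;     }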

%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; xmm (pixels16) version
cglobal %1_pixels16_xy2, 4,5,%2
    mova    m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova    m4, [pb_interleave8]
%endif
    mova    m5, [pb_1]
    movu    m0, [r1]
    movu    m1, [r1+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    xor     r4, r4
    add     r1, r2
.loop:
    movu    m2, [r1+r4]
    movu    m3, [r1+r4+1]
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    paddusw m0, m2
    paddusw m1, m3
    pmulhrsw m0, [pw_8192]
    pmulhrsw m1, [pw_8192]
%ifidn %1, avg
    mova    m6, [r0+r4]
    packuswb m0, m1
    pshufb  m0, m4
    pavgb   m0, m6
%else
    packuswb m0, m1
    pshufb  m0, m4
%endif
    mova    [r0+r4], m0
    add     r4, r2

    movu    m0, [r1+r4]
    movu    m1, [r1+r4+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    paddusw m2, m0
    paddusw m3, m1
    pmulhrsw m2, [pw_8192]
    pmulhrsw m3, [pw_8192]
%ifidn %1, avg
    mova    m6, [r0+r4]
    packuswb m2, m3
    pshufb  m2, m4
    pavgb   m2, m6
%else
    packuswb m2, m3
    pshufb  m2, m4
%endif
    mova    [r0+r4], m2
    add     r4, r2
    sub     r3d, 2
    jnz     .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7
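
; The SSSE3 path folds the horizontal add and the divide into two
; instructions. pmaddubsw with pb_1 sums each horizontal byte pair into a
; word (at most 510, so adding two rows still fits in 16 bits), and
; pmulhrsw with pw_8192 is the rounded divide by 4, since for x >= 0
;
;     pmulhrsw(x, 8192) == (((x * 8192) >> 14) + 1) >> 1 == (x + 2) >> 2
;
; packuswb then leaves the even and odd result pixels in separate register
; halves, which the pb_interleave masks defined at the top put back in order.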