;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

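; RV40 rounding-bias tables: one movq-sized entry (4 words) per chroma subpel
; position, indexed by the macros below as rnd_bias = ((my & 6) * 4 + mx) >> 1.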
rnd_rv40_2d_tbl: times 4 dw 0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw 0
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 0
                 times 4 dw 4
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 4
                 times 4 dw 3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

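; Copy (or, for the avg variants, PAVGB-average) an 8-pixel-wide block when
; mx == my == 0, i.e. no interpolation is needed; handles four rows per pass.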
%macro mv0_pixels_mc8 0
    lea r4, [r2*2]
.next4rows:
    movq mm0, [r1]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0], mm0
    movq [r0+r2], mm1
    add r0, r4
    movq mm0, [r1]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0], mm0
    movq [r0+r2], mm1
    add r0, r4
    sub r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   int stride, int h, int mx, int my)
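;
; Each output row is computed as
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + rnd) >> 6
; with A = (8-mx)*(8-my), B = mx*(8-my), C = (8-mx)*my, D = mx*my, falling back
; to a 1-D filter (>> 3) when mx or my is zero and to a plain copy when both are.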
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov r7, r5
    and r7, 6 ; &~1 for mx/my=[0,7]
    lea r7, [r7*4+r4]
    sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov r0, r5
    and r0, 6 ; &~1 for mx/my=[0,7]
    lea r0, [r0*4+r4]
    sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test r5d, r5d
    mov r6, 1
    je .my_is_zero
    test r4d, r4d
    mov r6, r2 ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or r4d, r5d ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif

    movd m5, r4d
    movq m4, [pw_8]
    movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd m5, m5
    punpckldq m5, m5 ; mm5 = B = x
    pxor m7, m7
    psubw m4, m5 ; mm4 = A = 8-x

.next1drow:
    movq m0, [r1] ; mm0 = src[0..7]
    movq m2, [r1+r6] ; mm2 = src[1..8]

    movq m1, m0
    movq m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
    pmullw m1, m4
    pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
    pmullw m3, m5

    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 3
    psrlw m1, 3
    packuswb m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add dest_reg, r2
    add r1, r2
    dec r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
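    ; Build the bilinear weights: A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y,
    ; D = x*y. A and D are spilled to 16 bytes of aligned stack ([rsp] and
    ; [rsp+8]) because only eight MMX registers are available; B, C and the
    ; zero constant stay in m5, m6 and m7.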
    movd m4, r4d ; x
    movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif
    mov r6, rsp ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
    sub rsp, 16 ; AA and DD

    punpcklwd m4, m4
    punpcklwd m6, m6
    punpckldq m4, m4 ; mm4 = x words
    punpckldq m6, m6 ; mm6 = y words
    movq m5, m4
    pmullw m4, m6 ; mm4 = x * y
    psllw m5, 3
    psllw m6, 3
    movq m7, m5
    paddw m7, m6
    movq [rsp+8], m4 ; DD = x * y
    psubw m5, m4 ; mm5 = B = 8x - xy
    psubw m6, m4 ; mm6 = C = 8y - xy
    paddw m4, [pw_64]
    psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
    pxor m7, m7
    movq [rsp], m4

    movq m0, [r1] ; mm0 = src[0..7]
    movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
    add r1, r2

    movq m2, m0
    movq m3, m1
    punpckhbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, [rsp]
    pmullw m2, [rsp]
    pmullw m1, m5
    pmullw m3, m5
    paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
    paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]

    movq m0, [r1]
    movq m1, m0
    punpcklbw m0, m7
    punpckhbw m1, m7
    pmullw m0, m6
    pmullw m1, m6
    paddw m2, m0
    paddw m3, m1 ; [mm2,mm3] += C * src[0..7]

    movq m1, [r1+1]
    movq m0, m1
    movq m4, m1
    punpcklbw m0, m7
    punpckhbw m4, m7
    pmullw m0, [rsp+8]
    pmullw m4, [rsp+8]
    paddw m2, m0
    paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
    movq m0, [r1]

    paddw m2, [rnd_2d_%2+rnd_bias*8]
    paddw m3, [rnd_2d_%2+rnd_bias*8]
    psrlw m2, 6
    psrlw m3, 6
    packuswb m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add dest_reg, r2
    dec r3d
    jne .next2drow
    mov rsp, r6 ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
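; Arguments as for the mc8 functions above: dst, src, stride, h, mx, my.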
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    pxor m7, m7
    movd m2, r4d ; x
    movd m3, r5d ; y
    movq m4, [pw_8]
    movq m5, [pw_8]
    punpcklwd m2, m2
    punpcklwd m3, m3
    punpcklwd m2, m2
    punpcklwd m3, m3
    psubw m4, m2
    psubw m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and r5, 6 ; &~1 for mx/my=[0,7]
    lea r5, [r5*4+r4]
    sar r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd m0, [r1]
    movd m6, [r1+1]
    add r1, r2
    punpcklbw m0, m7
    punpcklbw m6, m7
    pmullw m0, m4
    pmullw m6, m2
    paddw m6, m0

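    ; m6 carries the horizontally filtered previous row ((8-x)*src + x*src[+1]),
    ; so each loop iteration only has to filter two fresh rows and blend them
    ; vertically with (8-y)/y before rounding and shifting by 6.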
.next2rows:
    movd m0, [r1]
    movd m1, [r1+1]
    add r1, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw m0, m4
    pmullw m1, m2
    paddw m1, m0
    movq m0, m1

    pmullw m6, m5
    pmullw m1, m3
    paddw m6, [rnd_2d_%2+rnd_bias*8]
    paddw m1, m6
    psrlw m1, 6
    packuswb m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd [r0], m1
    add r0, r2

    movd m6, [r1]
    movd m1, [r1+1]
    add r1, r2
    punpcklbw m6, m7
    punpcklbw m1, m7
    pmullw m6, m4
    pmullw m1, m2
    paddw m1, m6
    movq m6, m1
    pmullw m0, m5
    pmullw m1, m3
    paddw m0, [rnd_2d_%2+rnd_bias*8]
    paddw m1, m0
    psrlw m1, 6
    packuswb m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd [r0], m1
    add r0, r2
    sub r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif

    mov r6d, r4d
    shl r4d, 16
    sub r4d, r6d
    add r4d, 8
    imul r5d, r4d ; x*y<<16 | y*(8-x)
    shl r4d, 3
    sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)

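    ; Broadcasting these packed word pairs gives m5 = {A,B,A,B} and
    ; m6 = {C,D,C,D}, so a single pmaddwd against src[0,1,1,2] produces
    ; A*src[i] + B*src[i+1] (resp. C/D on the next row) for both output
    ; pixels at once.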
    movd m5, r4d
    movd m6, r5d
    punpckldq m5, m5 ; mm5 = {A,B,A,B}
    punpckldq m6, m6 ; mm6 = {C,D,C,D}
    pxor m7, m7
    movd m2, [r1]
    punpcklbw m2, m7
    pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]

.nextrow:
    add r1, r2
    movq m1, m2
    pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
    movd m0, [r1]
    punpcklbw m0, m7
    pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
    movq m2, m0
    pmaddwd m0, m6
    paddw m1, [rnd_2d_%2]
    paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw m1, 6
    packssdw m1, m7
    packuswb m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd r5d, m1
    mov [r0], r5w
    add r0, r2
    sub r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

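; Rounding constants: H.264 rounds with +4 (1-D) / +32 (2-D); the VC-1 _nornd
; variants use +3 / +28 instead.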
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1 pw_3
%define rnd_2d_vc1 pw_28

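; CHROMAMC_AVG / CHROMAMC_AVG4 resolve to NOTHING for the put variants and to
; a PAVGB average against the existing destination for the avg variants.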
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB %1, %2
%endmacro
%macro COPY_AVG 3
    movd %2, %3
    PAVGB %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
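; Same arguments and register usage as chroma_mc8_mmx_func above; this version
; processes two source rows per iteration using pmaddubsw.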
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test r5d, r5d
    je .my_is_zero
    test r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov r6d, r4d
    shl r4d, 8
    sub r4, r6
    mov r6, 8
    add r4, 8 ; x*255+8 = x<<8 | (8-x)
    sub r6d, r5d
    imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)

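    ; The weights are packed as byte pairs ({A,B} broadcast into m7, {C,D} into
    ; m6) and each source row is interleaved with its right neighbour, so one
    ; pmaddubsw yields A*src[i] + B*src[i+1] (or C/D for the row below) for
    ; eight pixels at once; adding the two results gives the full 2-D filter.
    ; All weights fit in a byte since none exceeds 64.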
    movd m7, r6d
    movd m6, r4d
    movdqa m5, [rnd_2d_%2]
    movq m0, [r1]
    movq m1, [r1+1]
    pshuflw m7, m7, 0
    pshuflw m6, m6, 0
    punpcklbw m0, m1
    movlhps m7, m7
    movlhps m6, m6

.next2rows:
    movq m1, [r1+r2*1]
    movq m2, [r1+r2*1+1]
    movq m3, [r1+r2*2]
    movq m4, [r1+r2*2+1]
    lea r1, [r1+r2*2]
    punpcklbw m1, m2
    movdqa m2, m1
    punpcklbw m3, m4
    movdqa m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw m0, m5
    paddw m2, m5
    paddw m1, m0
    paddw m3, m2
    psrlw m1, 6
    movdqa m0, m4
    psrlw m3, 6
%ifidn %1, avg
    movq m2, [r0]
    movhps m2, [r0+r2]
%endif
    packuswb m1, m3
    CHROMAMC_AVG m1, m2
    movq [r0], m1
    movhps [r0+r2], m1
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov r5d, r4d
    shl r4d, 8
    add r4, 8
    sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
    movd m7, r4d
    movdqa m6, [rnd_1d_%2]
    pshuflw m7, m7, 0
    movlhps m7, m7

.next2xrows:
    movq m0, [r1]
    movq m1, [r1+1]
    movq m2, [r1+r2]
    movq m3, [r1+r2+1]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq m4, [r0]
    movhps m4, [r0+r2]
%endif
    paddw m0, m6
    paddw m2, m6
    psrlw m0, 3
    psrlw m2, 3
    packuswb m0, m2
    CHROMAMC_AVG m0, m4
    movq [r0], m0
    movhps [r0+r2], m0
    sub r3d, 2
    lea r0, [r0+r2*2]
    lea r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov r4d, r5d
    shl r5d, 8
    add r5, 8
    sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
    movd m7, r5d
    movdqa m6, [rnd_1d_%2]
    pshuflw m7, m7, 0
    movlhps m7, m7

.next2yrows:
    movq m0, [r1]
    movq m1, [r1+r2]
    movdqa m2, m1
    movq m3, [r1+r2*2]
    lea r1, [r1+r2*2]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq m4, [r0]
    movhps m4, [r0+r2]
%endif
    paddw m0, m6
    paddw m2, m6
    psrlw m0, 3
    psrlw m2, 3
    packuswb m0, m2
    CHROMAMC_AVG m0, m4
    movq [r0], m0
    movhps [r0+r2], m0
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6, r4
    shl r4d, 8
    sub r4d, r6d
    mov r6, 8
    add r4d, 8 ; x*255+8
    sub r6d, r5d
    imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd m7, r6d
    movd m6, r4d
    movq m5, [pw_32]
    movd m0, [r1]
    pshufw m7, m7, 0
    punpcklbw m0, [r1+1]
    pshufw m6, m6, 0

.next2rows:
    movd m1, [r1+r2*1]
    movd m3, [r1+r2*2]
    punpcklbw m1, [r1+r2*1+1]
    punpcklbw m3, [r1+r2*2+1]
    lea r1, [r1+r2*2]
    movq m2, m1
    movq m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw m0, m5
    paddw m2, m5
    paddw m1, m0
    paddw m3, m2
    psrlw m1, 6
    movq m0, m4
    psrlw m3, 6
    packuswb m1, m1
    packuswb m3, m3
    CHROMAMC_AVG m1, [r0]
    CHROMAMC_AVG m3, [r0+r2]
    movd [r0], m1
    movd [r0+r2], m3
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264