;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
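; For orientation: with base3 = base+3*stride and stride3 = 3*stride,
; PASS8ROWS(pix, pix+3*stride, stride, stride3) expands to the eight row
; addresses [pix], [pix+stride], ..., [pix+7*stride], which is how the
; transpose macros below address an 8-row block.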

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh m0, %4
    movh m2, %5
    movh m1, %6
    movh m3, %7
    punpckl%1 m0, m2
    punpckl%1 m1, m3
    mova m2, m0
    punpckl%2 m0, m1
    punpckh%2 m2, m1

    movh m4, %8
    movh m6, %9
    movh m5, %10
    movh m7, %11
    punpckl%1 m4, m6
    punpckl%1 m5, m7
    mova m6, m4
    punpckl%2 m4, m5
    punpckh%2 m6, m5

    punpckh%3 m1, m0, m4
    punpckh%3 m3, m2, m6
    punpckl%3 m0, m4
    punpckl%3 m2, m6
%endmacro
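; Rough outline of the interleave-based transposes used here and in
; TRANSPOSE8x4B_STORE below: each punpckl/punpckh pass doubles the element
; width (bytes -> words -> dwords -> qwords), so three passes turn 8 rows of
; 4 bytes into 4 rows of 8 bytes, and the store macro runs the same pattern
; in the opposite direction to write 8 rows of 4 bytes back out.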

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq m4, m0, m0
    punpckhdq m5, m1, m1
    punpckhdq m6, m2, m2

    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    movh %1, m1
    punpckhdq m1, m1
    movh %2, m1
    movh %3, m0
    punpckhdq m0, m0
    movh %4, m0

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    movh %5, m5
    punpckhdq m5, m5
    movh %6, m5
    movh %7, m4
    punpckhdq m4, m4
    movh %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1 %4, %2, %3
    punpckl%1 %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq [%9+0x10], m2
    movq [%9+0x20], m6
    movq [%9+0x30], m1
    movq [%9+0x40], m5
    movq [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq %9, m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq %11, m6
    movq m6, %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq %9, m0
    movq %10, m4
    movq %13, m1
    movq %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq %11, m2
    movq %12, m0
    movq %15, m3
    movq %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
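; Approximate scalar sketch of the two helpers above (for orientation only):
; both compare |a-b| against a threshold using unsigned saturating subtracts,
; avoiding any widening. DIFF_GT leaves a nonzero byte where |a-b| > c, while
; DIFF_GT2 produces a full 0xFF/0x00 byte mask that appears to be set where
; |a-b| <= c; the callers pass beta-1 (or alpha/4+1), so the mask is set
; exactly where the spec condition |a-b| < beta holds. Per byte, roughly:
;
;   static uint8_t diff_gt(uint8_t a, uint8_t b, uint8_t c)
;   {
;       int d = a > b ? a - b : b - a;   /* |a-b| */
;       return d > c ? d - c : 0;        /* nonzero iff |a-b| > c */
;   }
;   static uint8_t diff_gt2(uint8_t a, uint8_t b, uint8_t c)
;   {
;       int d = a > b ? a - b : b - a;
;       return d <= c ? 0xFF : 0x00;     /* full mask iff |a-b| <= c */
;   }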

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd m4, %1
    movd m5, %2
    SPLATW m4, m4
    SPLATW m5, m5
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
%if %0>2
    mova %3, m4
%endif
    DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por m7, m4
    DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por m7, m4
    pxor m6, m6
    pcmpeqb m7, m6
%endmacro
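; Per-pel scalar equivalent of the mask computed above (a sketch, assuming the
; callers pass alpha-1 and beta-1 as they do below); m7 ends up 0xFF where the
; edge should be filtered and 0 where it is left untouched:
;
;   static int filter_mask(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
;                          int alpha, int beta)
;   {
;       return abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta;
;   }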

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor m5, m1, m2 ; p0^q0
    pxor m3, m4
    pand m5, [pb_1] ; (p0^q0)&1
    pavgb m3, m0 ; (p1 - q1 + 256)>>1
    pxor m4, m1
    pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb m4, m2 ; (q0 - p0 + 256)>>1
    pavgb m3, m5
    mova m6, [pb_A1]
    paddusb m3, m4 ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub m6, m7
    pminub m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
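; Scalar sketch of what the pavgb/saturating-add tricks above compute per pel
; (with m7 holding tc&mask); this is the normal-strength p0/q0 update from the
; H.264 spec, cf. the C loop filter in h264dsp:
;
;   int delta = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0 = av_clip_uint8(p0 + delta);
;   q0 = av_clip_uint8(q0 - delta);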

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb %6, m1, m2
    pavgb %2, %6 ; avg(p2,avg(p0,q0))
    pxor %6, %3
    pand %6, [pb_1] ; (p2^avg(p0,q0))&1
    psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub %2, %6
    pminub %2, %5
    mova %4, %2
%endmacro
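; Scalar form of the update performed above (restating the "out:" comment);
; tc0 is the per-edge clip value:
;
;   q1 = av_clip((q2 + ((p0 + q0 + 1) >> 1)) >> 1, q1 - tc0, q1 + tc0);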

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
    movd m8, [r4] ; tc0
    lea r4, [r1*3]
    dec r2d ; alpha-1
    neg r4
    dec r3d ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn m9, m7
    pand m8, m9

    movdqa m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m9
    psubb m7, m8, m6
    pand m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, m9
    pand m8, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
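; Overview of the horizontal variant below (and the 32-bit one further down):
; the edge is vertical in memory, so the rows are first transposed into a
; scratch buffer on the stack, the vertical filter above is run on that buffer
; with a stride of 16, and only the middle columns the filter may have changed
; are transposed back into the picture.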
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7, r1d
    lea r8, [r7+r7*2]
    lea r6, [r0-4]
    lea r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea r6, [r6+r7*8]
    lea r5, [r5+r7*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea r0, [pix_tmp+0x30]
    mov r1d, 0x10
%if WIN64
    mov [rsp+0x20], r4
%endif
    call deblock_v_luma_8

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add r6, 2
    add r5, 2
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl r7, 3
    sub r6, r7
    sub r5, r7
    shr r7, 3
    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea r4, [r1*3]
    dec r2 ; alpha-1
    neg r4
    dec r3 ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2, r3

    mov r3, r4mp
    pcmpeqb m3, m3
    movd m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova m3, [r4] ; p2
    pand m4, m7
    mova [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m4
    pand m4, [esp+%2] ; tc
    psubb m7, m4, m6
    pand m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, [esp] ; mask
    mova m5, [esp+%2] ; tc
    psubb m7, m6
    pand m5, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
    mov r0, r0mp
    mov r3, r1m
    lea r4, [r3*3]
    sub r0, 4
    lea r1, [r0+r4]
%define pix_tmp esp+12*HAVE_ALIGNED_STACK

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea r0, [pix_tmp+0x30]
    PUSH dword r4m
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH dword r0
    call deblock_%1_luma_8
%ifidn %1, v8
    add dword [esp], 8 ; pix_tmp+0x38
    add dword [esp+16], 2 ; tc0+2
    call deblock_%1_luma_8
%endif
    ADD esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov r0, r0mp
    sub r0, 2

    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    lea r1, [r0+r4]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova t0, p2
    mova t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova t2, p2
    mova t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova t3, t2
    mova t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor t2, t0
    pand t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova t1, p2
    mova t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor t3, t1
    pand t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor t3, p0, q1
    pavgb t2, p0, q1
    pand t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor t1, t2
    pxor t2, p0
    pand t1, mask1p
    pand t2, mask0
    pxor t1, t2
    pxor t1, p0
    mova %1, t1 ; store p0

    mova t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor t2, t1
    pand t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor t0, p1
    pxor t1, p2
    pand t0, mask1p
    pand t1, mask1p
    pxor t0, p1
    pxor t1, p2
    mova %2, t0 ; store p1
    mova %3, t1 ; store p2
%endmacro
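; Scalar sketch of the strong (intra, bS=4) filtering of the p side evaluated
; above with pavgb/paddb tricks (the q side reuses it via LUMA_INTRA_SWAP_PQ);
; the mask0/mask1p blends keep p1/p2 unchanged and fall back to the weak p0
; formula where the extra |p0-q0| < alpha/4+2 and |p2-p0| < beta conditions
; do not hold:
;
;   p0_strong = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
;   p1_strong = (p2 + p1 + p0 + q0 + 2) >> 2;
;   p2_strong = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;   p0_weak   = (2*p1 + p0 + q1 + 2) >> 2;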

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    dec r2d ; alpha-1
    jl .end
    neg r4
    dec r3d ; beta-1
    jl .end
    add r4, r0 ; pix-4*stride
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
%if ARCH_X86_64
    pxor mpb_0, mpb_0
    mova mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP 7, 12 ; m12=mask0
    pavgb t5, mpb_0
    pavgb t5, mpb_1 ; alpha/4+1
    movdqa p2, [r4+r1]
    movdqa q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand t0, mask0
    pand t4, t0
    pand t2, t0
    mova mask1q, t4
    mova mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova m4, t5
    mova mask0, m7
    pavgb m4, [pb_0]
    pavgb m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand m4, m6
    mova mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand m4, m6
    mova mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7, r1d
    lea r8, [r7*3]
    lea r6, [r0-4]
    lea r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r6, [r6+r7*8]
    lea r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    mov r1, 0x10
    call deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl r7, 3
    sub r6, r7
    sub r5, r7
    shr r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH r0
    call deblock_%1_luma_intra_8
%ifidn %1, v8
    add dword [rsp], 8 ; pix_tmp+8
    call deblock_%1_luma_intra_8
%endif
    ADD esp, 16

    mov r1, r1m
    mov r0, r0mp
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

%macro CHROMA_V_START 0
    dec r2d ; alpha-1
    dec r3d ; beta-1
    mov t5, r0
    sub t5, r1
    sub t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec r2d
    dec r3d
    sub r0, 2
    lea t6, [r1*3]
    mov t5, r0
    add r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq m0, [t5]
    movq m1, [t5+r1]
    movq m2, [r0]
    movq m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq [t5+r1], m1
    movq [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq buf0, m0
    movq buf1, m3
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    movq m0, buf0
    movq m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq m4, %1
    pxor m4, %3
    pand m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb %1, %3
    psubusb %1, m4
    pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
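; Scalar form of the chroma intra update done by CHROMA_INTRA_P0 (applied to
; both edge pels and masked by the LOAD_MASK result in
; ff_chroma_intra_body_mmxext below):
;
;   p0 = (2*p1 + p0 + q1 + 2) >> 2;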

%define t5 r4
%define t6 r5

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq m0, [t5]
    movq m1, [t5+r1]
    movq m2, [r0]
    movq m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq [t5+r1], m1
    movq [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq m5, m1
    movq m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb m1, m5
    psubb m2, m6
    pand m1, m7
    pand m2, m7
    paddb m1, m5
    paddb m2, m6
    ret

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
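; Rough scalar sketch of the boundary-strength rule the macro below evaluates
; for each 4x4 block edge (names are illustrative, not the exact C fallback);
; the motion vector limit is 4 quarter-pel units, or 2 for the vertical
; component when field is set (hence the pb_3 / pb_3_1 constants):
;
;   if (nnz[b] || nnz[bn])
;       bs = 2;                              /* coded residual on either side */
;   else if (ref[b] != ref[bn] ||
;            abs(mv[b][0] - mv[bn][0]) >= 4 ||
;            abs(mv[b][1] - mv[bn][1]) >= limit)
;       bs = 1;                              /* different ref or large mv delta */
;   else
;       bs = 0;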
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd   %1
%define stepd    %2
%define mask_mvd %3
%define dir      %4
%define d_idx    %5
%define mask_dir %6
%define bidir    %7
    xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor m0, m0
%endif
    test b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
    pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
    pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
    psubb m0, m2 ; { ref0[b] != ref0[bn],
                 ;   ref0[b] != ref1[bn] }
    psubb m1, m3 ; { ref1[b] != ref1[bn],
                 ;   ref1[b] != ref0[bn] }

    por m0, m1
    mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova m3, m1
    mova m4, m2
    psubw m1, [mvq+b_idxq*4+12*4]
    psubw m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m3, [mvq+b_idxq*4+52*4]
    psubw m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb m1, m2
    packsswb m3, m4
    paddb m1, m6
    paddb m3, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb m3, m5
    packsswb m1, m3

    por m0, m1
    mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova m3, m1
    mova m4, m2
    psubw m1, [mvq+b_idxq*4+12*4]
    psubw m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m3, [mvq+b_idxq*4+52*4]
    psubw m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb m1, m2
    packsswb m3, m4
    paddb m1, m6
    paddb m3, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb m3, m5
    packsswb m1, m3

    pshufw m1, m1, 0x4E
    por m0, m1
    pshufw m1, m0, 0x4E
    pminub m0, m1
%else ; bidir == 0
    movd m0, [refq+b_idxq+12]
    psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova m1, [mvq+b_idxq*4+12*4]
    mova m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb m1, m2
    paddb m1, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb m1, m1
    por m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd m1, [nnzq+b_idxq+12]
    por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub m1, m7
    pminub m0, m7
    psllw m1, 1
    pxor m2, m2
    pmaxub m1, m0
    punpcklbw m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add b_idxd, dword stepd
    cmp b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp dword fieldm, 0
    mova m7, [pb_1]
    mova m5, [pb_3]
    je .nofield
    mova m5, [pb_3_1]
.nofield:
    mova m6, m5
    paddb m5, m5

    shl dword stepd, 3
    shl dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0

    mova m0, [bsq+mmsize*0]
    mova m1, [bsq+mmsize*1]
    mova m2, [bsq+mmsize*2]
    mova m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova [bsq+mmsize*0], m0
    mova [bsq+mmsize*1], m1
    mova [bsq+mmsize*2], m2
    mova [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1

    mova m0, [bsq+mmsize*0]
    mova m1, [bsq+mmsize*1]
    mova m2, [bsq+mmsize*2]
    mova m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova [bsq+mmsize*0], m0
    mova [bsq+mmsize*1], m1
    mova [bsq+mmsize*2], m2
    mova [bsq+mmsize*3], m3
    RET