;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw    %5, %2, %1
    psubusw    %4, %1, %2
    por        %4, %5
    psubw      %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw    %4, %2, %1
    psubusw    %5, %1, %2
    por        %5, %4 ; |%1-%2|
    pxor       %4, %4
    psubw      %5, %3 ; |%1-%2|-%3
    pcmpgtw    %4, %5 ; 0 > |%1-%2|-%3
%endmacro

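; in:  %3=alpha, %4=beta (GPRs)
; out: %1=alpha, %2=beta broadcast to every word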
%macro LOAD_AB 4
    movd       %1, %3
    movd       %2, %4
    SPLATW     %1, %1
    SPLATW     %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
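;      each tc0 byte is broadcast over the 4 words (pixels) it applies to;
;      punpcklbw makes the word (b<<8)|b, and the arithmetic >>6 scales it
;      to the 10-bit range (tc0<<2) while keeping tc0=-1 negative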
%macro LOAD_TC 2
    movd       %1, [%2]
    punpcklbw  %1, %1
%if mmsize == 8
    pshufw     %1, %1, 0
%else
    pshuflw    %1, %1, 01010000b
    pshufd     %1, %1, 01010000b
%endif
    psraw      %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB    %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB    %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand       %8, %9
    ABS_SUB    %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor       %7, %7
    pand       %8, %9
    pcmpgtw    %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
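;      %5 doubles as the clip bound: delta = clip((((q0-p0)<<2) + p1-q1 + 4)>>3, -%5, %5),
;      then p0' = clip(p0+delta, 0, pixel_max) and q0' = clip(q0-delta, 0, pixel_max)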
%macro DEBLOCK_P0_Q0 7
    psubw      %3, %4
    pxor       %7, %7
    paddw      %3, [pw_4]
    psubw      %7, %5
    psubw      %6, %2, %1
    psllw      %6, 2
    paddw      %3, %6
    psraw      %3, 3
    mova       %6, [pw_pixel_max]
    CLIPW      %3, %7, %5
    pxor       %7, %7
    paddw      %1, %3
    psubw      %2, %3
    CLIPW      %1, %7, %6
    CLIPW      %2, %7, %6
%endmacro

; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
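; out: %1 = x1 + clip(((x2 + ((p0+q0+1)>>1))>>1) - x1, -%5, %5)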
%macro LUMA_Q1 6
    pavgw      %6, %3, %4 ; (p0+q0+1)>>1
    paddw      %1, %6
    pxor       %6, %6
    psraw      %1, 1
    psubw      %6, %5
    psubw      %1, %2
    CLIPW      %1, %6, %5
    paddw      %1, %2
%endmacro

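; in: %1=p0 (resp. q0) reg, %2=p1 (resp. q1) reg, m5=p2 (resp. q2),
;     m1=p0, m2=q0, m7=filter mask, tcm=tc, bm=beta
; out: m5=p1' (resp. q1'), %3 = |p2-p0|<beta mask (used later to adjust tc)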
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT    m5, %1, bm, m4, m6
    pxor       m6, m6
    mova       %3, m4
    pcmpgtw    m6, tcm
    pand       m4, tcm
    pandn      m6, m7
    pand       m4, m6
    LUMA_Q1    m5, %2, m1, m2, m4, m6
%endmacro

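; in: m0..m3 = transposed p1 p0 q0 q1 columns to be written back as rows
;     %1 = pix+3*stride (sse2/avx only), %2 = 3*stride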
%macro LUMA_H_STORE 2
%if mmsize == 8
    movq       [r0-4], m0
    movq       [r0+r1-4], m1
    movq       [r0+r1*2-4], m2
    movq       [r0+%2-4], m3
%else
    movq       [r0-4], m0
    movhps     [r0+r1-4], m0
    movq       [r0+r1*2-4], m1
    movhps     [%1-4], m1
    movq       [%1+r1-4], m2
    movhps     [%1+r1*2-4], m2
    movq       [%1+%2-4], m3
    movhps     [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m4, m5, r2d, r3d
    mov        r3, 32/mmsize
    mov        r2, r0
    sub        r0, r1
    mova       am, m4
    sub        r0, r1
    mova       bm, m5
    sub        r0, r1
.loop:
    mova       m0, [r0+r1]
    mova       m1, [r0+r1*2]
    mova       m2, [r2]
    mova       m3, [r2+r1]

    LOAD_MASK  m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC    m6, r4
    mova       tcm, m6

    mova       m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       [r0+r1], m5

    mova       m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       [r2+r1], m5

    pxor       m5, m5
    mova       m6, tcm
    pcmpgtw    m5, tcm
    psubw      m6, ms1
    pandn      m5, m7
    psubw      m6, ms2
    pand       m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova       [r0+r1*2], m1
    mova       [r2], m2

    add        r0, mmsize
    add        r2, mmsize
    add        r4, mmsize/8
    dec        r3
    jg .loop
    ADD        rsp, pad
    RET

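;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------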
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m4, m5, r2d, r3d
    mov        r3, r1
    mova       am, m4
    add        r3, r1
    mov        r5, 32/mmsize
    mova       bm, m5
    add        r3, r1
%if mmsize == 16
    mov        r2, r0
    add        r2, r3
%endif
.loop:
%if mmsize == 8
    movq       m2, [r0-8] ; y q2 q1 q0
    movq       m7, [r0+0]
    movq       m5, [r0+r1-8]
    movq       m3, [r0+r1+0]
    movq       m0, [r0+r1*2-8]
    movq       m6, [r0+r1*2+0]
    movq       m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP       2, 7
    movq       m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu       m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu       m0, [r0+r1-8]
    movu       m2, [r0+r1*2-8]
    movu       m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu       m4, [r2+r1-8]
    movu       m1, [r2+r1*2-8]
    movu       m3, [r2+r3-8]
    movu       m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova       m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK  m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC    m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova       m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor       m5, m5
    mova       m6, tcm
    pcmpgtw    m5, tcm
    psubw      m6, ms1
    pandn      m5, m7
    psubw      m6, ms2
    pand       m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova       m0, p1m
    mova       m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add        r4, mmsize/8
    lea        r0, [r0+r1*(mmsize/2)]
    lea        r2, [r2+r1*(mmsize/2)]
    dec        r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK  m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC    m6, r4
    DIFF_LT    m8, m1, m13, m10, m4
    DIFF_LT    m9, m2, m13, m11, m4
    pand       m6, m7

    mova       m14, m6
    pxor       m4, m4
    pcmpgtw    m6, m4
    pand       m6, m14

    mova       m5, m10
    pand       m5, m6
    LUMA_Q1    m8, m0, m1, m2, m5, m4

    mova       m5, m11
    pand       m5, m6
    LUMA_Q1    m9, m3, m1, m2, m5, m4

    pxor       m4, m4
    psubw      m6, m10
    pcmpgtw    m4, m14
    pandn      m4, m7
    psubw      m6, m11
    pand       m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP       0, 8
    SWAP       3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
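;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------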
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2d, r3d
    mov        r2, r0
    sub        r0, r1
    sub        r0, r1
    sub        r0, r1
    mov        r3, 2
.loop:
    mova       p2, [r0]
    mova       p1, [r0+r1]
    mova       p0, [r0+r1*2]
    mova       q0, [r2]
    mova       q1, [r2+r1]
    mova       q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova       [r0+r1], p1
    mova       [r0+r1*2], p0
    mova       [r2], q0
    mova       [r2+r1], q1
    add        r0, mmsize
    add        r2, mmsize
    add        r4, 2
    dec        r3
    jg .loop
    REP_RET

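;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------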
cglobal deblock_h_luma_10, 5,7,15
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2d, r3d
    mov        r2, r1
    add        r2, r1
    add        r2, r1
    mov        r5, r0
    add        r5, r2
    mov        r6, 2
.loop:
    movu       m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu       m0, [r0+r1-8]
    movu       m2, [r0+r1*2-8]
    movu       m9, [r5-8]
    movu       m5, [r5+r1-8]
    movu       m1, [r5+r1*2-8]
    movu       m3, [r5+r2-8]
    movu       m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add        r4, 2
    lea        r0, [r0+r1*8]
    lea        r5, [r5+r1*8]
    dec        r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

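; SWAPMOVA dst, src: SWAP the registers when dst is a bare mmreg identifier,
; otherwise (memory operand) store with a plain mova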
%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw      t0, %3, %2
    mova       t2, %4
    paddw      t2, %3
%else
    mova       t0, %3
    mova       t2, %4
    paddw      t0, %2
    paddw      t2, %3
%endif
    paddw      t0, %1
    paddw      t2, t2
    paddw      t0, %5
    paddw      t2, %9
    paddw      t0, %9 ; (p2 + p1 + p0 + q0 + 2)
    paddw      t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw      t2, 3
    psrlw      t1, t0, 2
    psubw      t2, %3
    psubw      t1, %2
    pand       t2, %8
    pand       t1, %8
    paddw      t2, %3
    paddw      t1, %2
    SWAPMOVA   %11, t1

    psubw      t1, t0, %3
    paddw      t0, t0
    psubw      t1, %5
    psubw      t0, %3
    paddw      t1, %6
    paddw      t1, %2
    paddw      t0, %6
    psrlw      t1, 2 ; (2*p1 + p0 + q1 + 2)/4
    psrlw      t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor       t0, t1
    pxor       t1, %1
    pand       t0, %8
    pand       t1, %7
    pxor       t0, t1
    pxor       t0, %1
    SWAPMOVA   %10, t0
    SWAPMOVA   %12, t2
%endmacro

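; in: %1 = number of spill slots to reserve on the stack
; t0-t3 alias m4-m7; t4 and upwards become [rsp]-based spill slots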
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
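; out: %2=mask0, %1=mask1p, %3=mask1q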
%macro LUMA_INTRA_INTER 5
    LOAD_AB    t0, t1, r2d, r3d
    mova       %1, t0
    LOAD_MASK  m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova       %2, t0 ; mask0
    psrlw      t3, %1, 2
%else
    mova       t3, %1
    mova       %2, t0 ; mask0
    psrlw      t3, 2
%endif
    paddw      t3, [pw_2] ; alpha/4+2
    DIFF_LT    m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand       t2, %2
    mova       t3, %5 ; q2
    mova       %1, t2 ; mask1
    DIFF_LT    t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand       t2, %1
    mova       t3, %4 ; p2
    mova       %3, t2 ; mask1q
    DIFF_LT    t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand       t2, %1
    mova       %1, t2 ; mask1p
%endmacro

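; load and transpose the 8 pixels straddling a vertical edge (mmsize/2 rows)
; out: m0=p1, m1=p0, m2=q0, m3=q1, t4=p3, t5=p2, t6=q2, t7=q3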
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu       t0, [r0-8]
    movu       t1, [r0+r1-8]
    movu       m0, [r0+r1*2-8]
    movu       m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova       t4, t0 ; p3
    mova       t5, t1 ; p2

    movu       m2, [r0]
    movu       m3, [r0+r1]
    movu       t0, [r0+r1*2]
    movu       t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova       t6, t0 ; q2
    mova       t7, t1 ; q3
%else
    movu       t0, [r0-8]
    movu       t1, [r0+r1-8]
    movu       m0, [r0+r1*2-8]
    movu       m1, [r0+r5-8]
    movu       m2, [r4-8]
    movu       m3, [r4+r1-8]
    movu       t2, [r4+r1*2-8]
    movu       t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova       t4, t0 ; p3
    mova       t5, t1 ; p2
    mova       t6, t2 ; q2
    mova       t7, t3 ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea        r4, [r1*4]
    lea        r5, [r1*3] ; 3*stride
    neg        r4
    add        r4, r0 ; pix-4*stride
    mov        r6, 2
    mova       m0, [pw_2]
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    aa, bb, r2d, r3d
.loop:
    mova       p2, [r4+r1]
    mova       p1, [r4+2*r1]
    mova       p0, [r4+r5]
    mova       q0, [r0]
    mova       q1, [r0+r1]
    mova       q2, [r0+2*r1]

    LOAD_MASK  p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova       t2, aa
    psrlw      t2, 2
    paddw      t2, m0 ; alpha/4+2
    DIFF_LT    p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT    p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT    q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand       m6, m3
    pand       m7, m6
    pand       m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add        r0, mmsize
    add        r4, mmsize
    dec        r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB        rsp, pad
    lea        r4, [r1*4]
    lea        r5, [r1*3] ; 3*stride
    add        r4, r0 ; pix+4*stride
    mov        r6, 2
    mova       m0, [pw_2]
    shl        r2d, 2
    shl        r3d, 2
.loop:
    movu       q3, [r0-8]
    movu       q2, [r0+r1-8]
    movu       q1, [r0+r1*2-8]
    movu       q0, [r0+r5-8]
    movu       p0, [r4-8]
    movu       p1, [r4+r1-8]
    movu       p2, [r4+r1*2-8]
    movu       p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB    m1, m2, r2d, r3d
    LOAD_MASK  q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw      m1, 2
    paddw      m1, m0 ; alpha/4+2
    DIFF_LT    p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT    q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT    p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand       m6, m3
    pand       m7, m6
    pand       m6, t1

    mova       spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova       m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea        r0, [r0+r1*8]
    lea        r4, [r4+r1*8]
    dec        r6
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea        r4, [r1*4]
    lea        r5, [r1*3]
    neg        r4
    add        r4, r0
    mov        r6, 32/mmsize
    shl        r2d, 2
    shl        r3d, 2
.loop:
    mova       m0, [r4+r1*2] ; p1
    mova       m1, [r4+r5] ; p0
    mova       m2, [r0] ; q0
    mova       m3, [r0+r1] ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova       t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add        r0, mmsize
    add        r4, mmsize
    dec        r6
    jg .loop
    ADD        rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea        r4, [r1*3]
    mov        r5, 32/mmsize
%else
    lea        r4, [r1*4]
    lea        r5, [r1*3] ; 3*stride
    add        r4, r0 ; pix+4*stride
    mov        r6, 32/mmsize
%endif
    shl        r2d, 2
    shl        r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova       t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova       m2, t4
    mova       m0, t11
    mova       m1, t5
    mova       m3, t8
    mova       m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea        r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec        r5
%else
    lea        r4, [r4+r1*(mmsize/2)]
    dec        r6
%endif
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
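;      p0' = (2*p1 + p0 + q1 + 2) >> 2,  q0' = (2*q1 + q0 + p1 + 2) >> 2
;      (applied only where %5 is set)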
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova       %6, [pw_2]
    paddw      %6, %3
    paddw      %6, %4
    paddw      %7, %6, %2
    paddw      %6, %1
    paddw      %6, %3
    paddw      %7, %4
    psraw      %6, 2
    psraw      %7, 2
    psubw      %6, %1
    psubw      %7, %2
    pand       %6, %5
    pand       %7, %5
    paddw      %1, %6
    paddw      %2, %7
%endmacro

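; in: r0 = pix-2*stride, %1 = pix
; out: m0=p1, m1=p0, m2=q0, m3=q1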
%macro CHROMA_V_LOAD 1
    mova       m0, [r0]    ; p1
    mova       m1, [r0+r1] ; p0
    mova       m2, [%1]    ; q0
    mova       m3, [%1+r1] ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova       [r0+1*r1], m1
    mova       [r0+2*r1], m2
%endmacro

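; in: %2 = tc0 pointer
; out: %1 = tc0 scaled to the 10-bit range (tc0<<2), each tc0 byte
;      replicated to 2 adjacent words (2 pixels)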
%macro CHROMA_V_LOAD_TC 2
    movd       %1, [%2]
    punpcklbw  %1, %1
    punpcklwd  %1, %1
    psraw      %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov        r5, r0
    sub        r0, r1
    sub        r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov        r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB    m4, m5, r2d, r3d
    LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor       m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw      m6, [pw_3]
    pmaxsw     m6, m4
    pand       m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add        r0, mmsize
    add        r5, mmsize
    add        r4, mmsize/4
    dec        r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov        r4, r0
    sub        r0, r1
    sub        r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov        r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB    m4, m5, r2d, r3d
    LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add        r0, mmsize
    add        r4, mmsize
    dec        r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif