;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_pixel_max_10: times 8 dw ((1 << 10)-1)
pw_m2:           times 8 dw -2
pd_1:            times 4 dd 1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; expands to [base], ..., [base+7*stride]
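; (callers pass base3 = base + 3*stride, so the last entry,
; [base3+stride*4], is [base+7*stride])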
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4

    punpcklbw  m0, m2
    punpcklbw  m1, m3
    punpcklwd  m0, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m3, %8

    punpcklbw  m4, m6
    punpcklbw  m5, m3
    punpcklwd  m4, m5

    punpckhdq  m2, m0, m4
    punpckldq  m0, m4

    pxor       m5, m5
    punpckhbw  m1, m0, m5
    punpcklbw  m0, m5
    punpckhbw  m3, m2, m5
    punpcklbw  m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb   m0, m2
    packuswb   m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd       %1, m0
    pshufd     m0, m0, 0x39; rotate dwords to bring the next row into the low lane
    movd       %2, m0
    pshufd     m0, m0, 0x39
    movd       %3, m0
    pshufd     m0, m0, 0x39
    movd       %4, m0

    movd       %5, m1
    pshufd     m1, m1, 0x39
    movd       %6, m1
    pshufd     m1, m1, 0x39
    movd       %7, m1
    pshufd     m1, m1, 0x39
    movd       %8, m1
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq       m0, %1
    movq       m2, %2
    movq       m1, %3
    movq       m3, %4

    punpcklwd  m0, m2
    punpcklwd  m1, m3
    punpckhdq  m2, m0, m1
    punpckldq  m0, m1

    movq       m4, %5
    movq       m6, %6
    movq       m5, %7
    movq       m3, %8

    punpcklwd  m4, m6
    punpcklwd  m5, m3
    punpckhdq  m6, m4, m5
    punpckldq  m4, m5

    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    punpckhqdq m3, m2, m6
    punpcklqdq m2, m6
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8, clipped to [0, %9]
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    pxor       m5, m5; zeros reg
    CLIPW      m0, m5, %9
    CLIPW      m1, m5, %9
    CLIPW      m2, m5, %9
    CLIPW      m3, m5, %9

    movq       %1, m0
    movhps     %2, m0
    movq       %3, m1
    movhps     %4, m1
    movq       %5, m2
    movhps     %6, m2
    movq       %7, m3
    movhps     %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq       m7, %1
    movq       m2, %2
    movq       m1, %3
    movq       m3, %4

    punpcklbw  m7, m2
    punpcklbw  m1, m3
    punpcklwd  m3, m7, m1
    punpckhwd  m7, m1

    movq       m4, %5
    movq       m6, %6
    movq       m5, %7
    movq       m15, %8

    punpcklbw  m4, m6
    punpcklbw  m5, m15
    punpcklwd  m9, m4, m5
    punpckhwd  m4, m5

    punpckldq  m1, m3, m9; 0, 1
    punpckhdq  m3, m9; 2, 3

    punpckldq  m5, m7, m4; 4, 5
    punpckhdq  m7, m4; 6, 7

    pxor       m13, m13

    punpcklbw  m0, m1, m13; 0 in 16 bit
    punpckhbw  m1, m13; 1 in 16 bit

    punpcklbw  m2, m3, m13; 2
    punpckhbw  m3, m13; 3

    punpcklbw  m4, m5, m13; 4
    punpckhbw  m5, m13; 5

    punpcklbw  m6, m7, m13; 6
    punpckhbw  m7, m13; 7
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb   m0, m4
    packuswb   m1, m5
    packuswb   m2, m6
    packuswb   m3, m7
    TRANSPOSE2x4x4B 0, 1, 2, 3, 4

    movq       %1, m0
    movhps     %2, m0
    movq       %3, m1
    movhps     %4, m1
    movq       %5, m2
    movhps     %6, m2
    movq       %7, m3
    movhps     %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu     m0, %1
    movdqu     m1, %2
    movdqu     m2, %3
    movdqu     m3, %4
    movdqu     m4, %5
    movdqu     m5, %6
    movdqu     m6, %7
    movdqu     m7, %8
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8, clipped to [0, %9]
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor       m8, m8
    CLIPW      m0, m8, %9
    CLIPW      m1, m8, %9
    CLIPW      m2, m8, %9
    CLIPW      m3, m8, %9
    CLIPW      m4, m8, %9
    CLIPW      m5, m8, %9
    CLIPW      m6, m8, %9
    CLIPW      m7, m8, %9

    movdqu     %1, m0
    movdqu     %2, m1
    movdqu     %3, m2
    movdqu     %4, m3
    movdqu     %5, m4
    movdqu     %6, m5
    movdqu     %7, m6
    movdqu     %8, m7
%endmacro

; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
    pand       %2, m11; and mask
    pandn      m10, m11, %1; and -mask
    por        %2, m10
    mova       %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand       %2, %3; and mask
    pandn      %3, %1; and -mask
    por        %2, %3
    mova       %1, %2
%endmacro
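; Both copy macros implement a branchless select over word lanes:
;     %1 = (%2 & mask) | (%1 & ~mask)
; lanes whose mask bits are all ones receive the filtered value from %2,
; all other lanes keep the original pixels in %1.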

ALIGN 16
; input in m0..m3 and tcs in r2 (tcq); output in m1 and m2
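; For reference, the per-pixel arithmetic below corresponds to the HEVC
; chroma deblocking filter (a C-style sketch of the same math):
;     delta = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
;     p0' = p0 + delta;
;     q0' = q0 - delta;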
%macro CHROMA_DEBLOCK_BODY 1
    psubw      m4, m2, m1; q0 - p0
    psubw      m5, m0, m3; p1 - q1
    psllw      m4, 2; << 2
    paddw      m5, m4

    ;tc calculations
    movq       m6, [tcq]; tc0
    punpcklwd  m6, m6
    pshufd     m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw     m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw     m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw      m5, [pw_4]; +4
    psraw      m5, 3; >> 3

%if %1 > 8
    psllw      m4, %1-8; << (BIT_DEPTH - 8)
    psllw      m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw     m5, m4
    pminsw     m5, m6
    paddw      m1, m5; p0 + delta0
    psubw      m2, m5; q0 - delta0
%endmacro

; input in m0..m7, beta in r2 (betaq), tcs in r3 (tcq); output in m1..m6
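; For reference, the decision variables computed below are
;     dp = |p2 - 2*p1 + p0|,  dq = |q2 - 2*q1 + q0|   (per line)
;     d  = dp + dq
; and the block is bypassed entirely unless d0 + d3 < beta for at least
; one of the two 4-line halves.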
%macro LUMA_DEBLOCK_BODY 2
    psllw      m9, m2, 1; *2
    psubw      m10, m1, m9
    paddw      m10, m3
    ABS1       m10, m11; 0dp0, 0dp3, 1dp0, 1dp3

    psllw      m9, m5, 1; *2
    psubw      m11, m6, m9
    paddw      m11, m4
    ABS1       m11, m13; 0dq0, 0dq3, 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl        betaq, %1 - 8
%endif
    movd       m13, betad
    SPLATW     m13, m13, 0
    ;end beta calculations

    paddw      m9, m10, m11; 0d0, 0d3, 1d0, 1d3

    pshufhw    m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
    pshuflw    m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low

    pshufhw    m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw    m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw      m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw    m15, m13, m14
    movmskps   r13, m15; filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3), 1d0 + 1d3 < beta1 (bit 0 or 1)
    test       r13, r13
    je         .bypassluma

    ;weak / strong decision compare to beta_2
    psraw      m15, m13, 2; beta >> 2
    psllw      m8, m9, 1
    pcmpgtw    m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps   r6, m15
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd     m8, m10, 0x31
    psrld      m8, 16
    paddw      m8, m10
    movd       r7d, m8
    pshufd     m8, m8, 0x4E
    movd       r8d, m8

    pshufd     m8, m11, 0x31
    psrld      m8, 16
    paddw      m8, m11
    movd       r9d, m8
    pshufd     m8, m8, 0x4E
    movd       r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov        r11, r13
    shr        r11, 3
    movd       m15, r11d
    and        r13, 1
    movd       m11, r13d
    shufps     m11, m15, 0
    shl        r11, 1
    or         r13, r11

    pcmpeqd    m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov        r11d, [tcq]
%if %1 > 8
    shl        r11, %1 - 8
%endif
    movd       m8, r11d; tc0
    mov        r3d, [tcq+4]
%if %1 > 8
    shl        r3, %1 - 8
%endif
    add        r11d, r3d; tc0 + tc1
    jz         .bypassluma
    movd       m9, r3d; tc1
    punpcklwd  m8, m8
    punpcklwd  m9, m9
    shufps     m8, m9, 0; tc0, tc1
    mova       m9, m8
    psllw      m8, 2; tc << 2
    pavgw      m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations
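
; A half of the block is filtered strongly only if all three conditions
; hold for it (the movmskps results below are ANDed together in r6):
;     (d << 1)              < beta >> 2   (computed above)
;     |p3 - p0| + |q3 - q0| < beta >> 3
;     |p0 - q0|             < tc25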

    ;----beta_3 comparison-----
    psubw      m12, m0, m3; p3 - p0
    ABS1       m12, m14; abs(p3 - p0)

    psubw      m15, m7, m4; q3 - q0
    ABS1       m15, m14; abs(q3 - q0)

    paddw      m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw    m12, m12, 0xf0 ;0b11110000
    pshuflw    m12, m12, 0xf0 ;0b11110000

    psraw      m13, 3; beta >> 3
    pcmpgtw    m13, m12
    movmskps   r11, m13
    and        r6, r11; strong mask, beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----

    ;----tc25 comparison---
    psubw      m12, m3, m4; p0 - q0
    ABS1       m12, m14; abs(p0 - q0)

    pshufhw    m12, m12, 0xf0 ;0b11110000
    pshuflw    m12, m12, 0xf0 ;0b11110000

    pcmpgtw    m8, m12; tc25 comparisons
    movmskps   r11, m8
    and        r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---

    mov        r11, r6
    shr        r11, 1
    and        r6, r11; strong mask, bits 2 and 0

    pmullw     m14, m9, [pw_m2]; -tc * 2
    paddw      m9, m9; tc * 2

    and        r6, 5; 0b101
    mov        r11, r6; strong mask
    shr        r6, 2
    movd       m12, r6d; store to xmm for mask generation
    shl        r6, 1
    and        r11, 1
    movd       m10, r11d; store to xmm for mask generation
    or         r6, r11; final strong mask, bits 1 and 0
    jz         .weakfilter

    shufps     m10, m12, 0
    pcmpeqd    m10, [pd_1]; strong mask

    mova       m13, [pw_4]; 4 in every cell
    pand       m11, m10; combine filtering mask and strong mask
    paddw      m12, m2, m3; p1 + p0
    paddw      m12, m4; p1 + p0 + q0
    mova       m10, m12; copy
    paddw      m12, m12; 2*p1 + 2*p0 + 2*q0
    paddw      m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
    paddw      m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw      m12, m13; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw      m12, 3; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    psubw      m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw     m12, m14
    pminsw     m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw      m12, m3; p0'

    paddw      m15, m1, m10; p2 + p1 + p0 + q0
    psrlw      m13, 1; 2 in every cell
    paddw      m15, m13; p2 + p1 + p0 + q0 + 2
    psraw      m15, 2; (p2 + p1 + p0 + q0 + 2) >> 2
    psubw      m15, m2; ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw     m15, m14
    pminsw     m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw      m15, m2; p1'

    paddw      m8, m1, m0; p3 + p2
    paddw      m8, m8; 2*p3 + 2*p2
    paddw      m8, m1; 2*p3 + 3*p2
    paddw      m8, m10; 2*p3 + 3*p2 + p1 + p0 + q0
    paddw      m13, m13; 4 in every cell
    paddw      m8, m13; 2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw      m8, 3; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw      m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw     m8, m14
    pminsw     m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw      m8, m1; p2'
    MASKED_COPY m1, m8

    paddw      m8, m3, m4; p0 + q0
    paddw      m8, m5; p0 + q0 + q1
    paddw      m8, m8; 2*p0 + 2*q0 + 2*q1
    paddw      m8, m2; p1 + 2*p0 + 2*q0 + 2*q1
    paddw      m8, m6; p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw      m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw      m8, 3; (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    psubw      m8, m4; ((p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3) - q0
    pmaxsw     m8, m14
    pminsw     m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw      m8, m4; q0'
    MASKED_COPY m2, m15

    paddw      m15, m3, m4; p0 + q0
    paddw      m15, m5; p0 + q0 + q1
    mova       m10, m15
    paddw      m15, m6; p0 + q0 + q1 + q2
    psrlw      m13, 1; 2 in every cell
    paddw      m15, m13; p0 + q0 + q1 + q2 + 2
    psraw      m15, 2; (p0 + q0 + q1 + q2 + 2) >> 2
    psubw      m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw     m15, m14
    pminsw     m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw      m15, m5; q1'

    paddw      m13, m7; q3 + 2
    paddw      m13, m6; q3 + q2 + 2
    paddw      m13, m13; 2*q3 + 2*q2 + 4
    paddw      m13, m6; 2*q3 + 3*q2 + 4
    paddw      m13, m10; 2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw      m13, 3; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw      m13, m6; ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw     m13, m14
    pminsw     m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw      m13, m6; q2'

    MASKED_COPY m6, m13
    MASKED_COPY m5, m15
    MASKED_COPY m4, m8
    MASKED_COPY m3, m12

.weakfilter:
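; For reference, the weak filter below computes (C-style sketch):
;     delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
; a line is left unchanged unless abs(delta0) < 10 * tc; otherwise
;     p0' = p0 + av_clip(delta0, -tc, tc);
;     q0' = q0 - av_clip(delta0, -tc, tc);
; and p1/q1 get a half-strength correction clipped to +/- tc/2 when the
; corresponding side satisfies dp/dq < (beta + (beta >> 1)) >> 3.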
    not        r6; strong mask -> weak mask
    and        r6, r13; final weak filtering mask, bits 0 and 1
    jz         .store

    ; weak filtering mask
    mov        r11, r6
    shr        r11, 1
    movd       m12, r11d
    and        r6, 1
    movd       m11, r6d
    shufps     m11, m12, 0
    pcmpeqd    m11, [pd_1]; filtering mask

    mov        r13, betaq
    shr        r13, 1
    add        betaq, r13
    shr        betaq, 3; (beta + (beta >> 1)) >> 3

    mova       m13, [pw_8]
    psubw      m12, m4, m3; q0 - p0
    psllw      m10, m12, 3; 8 * (q0 - p0)
    paddw      m12, m10; 9 * (q0 - p0)

    psubw      m10, m5, m2; q1 - p1
    psllw      m8, m10, 1; 2 * (q1 - p1)
    paddw      m10, m8; 3 * (q1 - p1)
    psubw      m12, m10; 9 * (q0 - p0) - 3 * (q1 - p1)
    paddw      m12, m13; + 8
    psraw      m12, 4; >> 4, delta0
    PABSW      m13, m12; abs(delta0)

    psllw      m10, m9, 2; 8 * tc
    paddw      m10, m9; 10 * tc
    pcmpgtw    m10, m13
    pand       m11, m10

    psraw      m9, 1; tc * 2 -> tc
    psraw      m14, 1; -tc * 2 -> -tc

    pmaxsw     m12, m14
    pminsw     m12, m9; av_clip(delta0, -tc, tc)

    psraw      m9, 1; tc -> tc / 2
%if cpuflag(ssse3)
    psignw     m14, m9, [pw_m1]; -tc / 2
%else
    pmullw     m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw      m15, m1, m3; (p2 + p0 + 1) >> 1
    psubw      m15, m2; ((p2 + p0 + 1) >> 1) - p1
    paddw      m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw      m15, 1; (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw     m15, m14
    pminsw     m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw      m15, m2; p1'

    ;beta calculations
    movd       m10, betad
    SPLATW     m10, m10, 0

    movd       m13, r7d; 1dp0 + 1dp3
    movd       m8, r8d; 0dp0 + 0dp3
    punpcklwd  m8, m8
    punpcklwd  m13, m13
    shufps     m13, m8, 0
    pcmpgtw    m8, m10, m13
    pand       m8, m11
    ;end beta calculations
    MASKED_COPY2 m2, m15, m8; write p1'

    pavgw      m8, m6, m4; (q2 + q0 + 1) >> 1
    psubw      m8, m5; ((q2 + q0 + 1) >> 1) - q1
    psubw      m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0
    psraw      m8, 1; (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw     m8, m14
    pminsw     m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw      m8, m5; q1'

    movd       m13, r9d; 1dq0 + 1dq3
    movd       m15, r10d; 0dq0 + 0dq3
    punpcklwd  m15, m15
    punpcklwd  m13, m13
    shufps     m13, m15, 0; dq0 + dq3

    pcmpgtw    m10, m13; compare to (beta + (beta >> 1)) >> 3
    pand       m10, m11
    MASKED_COPY2 m5, m8, m10; write q1'

    paddw      m15, m3, m12; p0 + delta0
    MASKED_COPY m3, m15

    psubw      m8, m4, m12; q0 - delta0
    MASKED_COPY m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
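; The vertical filters load the block with a transpose so that the shared
; deblock body (written for rows) can filter across the vertical edge,
; then transpose back on store.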
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub        pixq, 2
    lea        r3strideq, [3*strideq]
    mov        pix0q, pixq
    add        pixq, r3strideq
    TRANSPOSE4x8B_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub        pixq, 4
    lea        r3strideq, [3*strideq]
    mov        pix0q, pixq
    add        pixq, r3strideq
    TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub        pixq, 4
    lea        r3strideq, [3*strideq]
    mov        pix0q, pixq
    add        pixq, r3strideq
    TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov        pix0q, pixq
    sub        pix0q, strideq
    sub        pix0q, strideq
    movq       m0, [pix0q]; p1
    movq       m1, [pix0q+strideq]; p0
    movq       m2, [pixq]; q0
    movq       m3, [pixq+strideq]; q1
    pxor       m5, m5; zeros reg
    punpcklbw  m0, m5
    punpcklbw  m1, m5
    punpcklbw  m2, m5
    punpcklbw  m3, m5
    CHROMA_DEBLOCK_BODY 8
    packuswb   m1, m2
    movh       [pix0q+strideq], m1
    movhps     [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov        pix0q, pixq
    sub        pix0q, strideq
    sub        pix0q, strideq
    movu       m0, [pix0q]; p1
    movu       m1, [pix0q+strideq]; p0
    movu       m2, [pixq]; q0
    movu       m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor       m5, m5; zeros reg
    CLIPW      m1, m5, [pw_pixel_max_10]
    CLIPW      m2, m5, [pw_pixel_max_10]
    movu       [pix0q+strideq], m1
    movu       [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov        pix0q, pixq
    sub        pix0q, strideq
    sub        pix0q, strideq
    movu       m0, [pix0q]; p1
    movu       m1, [pix0q+strideq]; p0
    movu       m2, [pixq]; q0
    movu       m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor       m5, m5; zeros reg
    CLIPW      m1, m5, [pw_pixel_max_12]
    CLIPW      m2, m5, [pw_pixel_max_12]
    movu       [pix0q+strideq], m1
    movu       [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
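; The luma body uses xmm8-xmm15 and several scratch GPRs (up to r13),
; so these functions are built only on x86-64.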
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub        pixq, 4
    lea        pix0q, [3 * strideq]
    mov        src3strideq, pixq
    add        pixq, pix0q
    TRANSPOSE8x8B_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub        pixq, 8
    lea        pix0q, [3 * strideq]
    mov        src3strideq, pixq
    add        pixq, pix0q
    TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub        pixq, 8
    lea        pix0q, [3 * strideq]
    mov        src3strideq, pixq
    add        pixq, pix0q
    TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea        src3strideq, [3 * strideq]
    mov        pix0q, pixq
    sub        pix0q, src3strideq
    sub        pix0q, strideq
    movq       m0, [pix0q]; p3
    movq       m1, [pix0q + strideq]; p2
    movq       m2, [pix0q + 2 * strideq]; p1
    movq       m3, [pix0q + src3strideq]; p0
    movq       m4, [pixq]; q0
    movq       m5, [pixq + strideq]; q1
    movq       m6, [pixq + 2 * strideq]; q2
    movq       m7, [pixq + src3strideq]; q3
    pxor       m8, m8
    punpcklbw  m0, m8
    punpcklbw  m1, m8
    punpcklbw  m2, m8
    punpcklbw  m3, m8
    punpcklbw  m4, m8
    punpcklbw  m5, m8
    punpcklbw  m6, m8
    punpcklbw  m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb   m1, m2
    packuswb   m3, m4
    packuswb   m5, m6
    movh       [pix0q + strideq], m1
    movhps     [pix0q + 2 * strideq], m1
    movh       [pix0q + src3strideq], m3
    movhps     [pixq], m3
    movh       [pixq + strideq], m5
    movhps     [pixq + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea        src3strideq, [3 * strideq]
    mov        pix0q, pixq
    sub        pix0q, src3strideq
    sub        pix0q, strideq
    movdqu     m0, [pix0q]; p3
    movdqu     m1, [pix0q + strideq]; p2
    movdqu     m2, [pix0q + 2 * strideq]; p1
    movdqu     m3, [pix0q + src3strideq]; p0
    movdqu     m4, [pixq]; q0
    movdqu     m5, [pixq + strideq]; q1
    movdqu     m6, [pixq + 2 * strideq]; q2
    movdqu     m7, [pixq + src3strideq]; q3
    LUMA_DEBLOCK_BODY 10, h
.store:
    pxor       m8, m8; zeros reg
    CLIPW      m1, m8, [pw_pixel_max_10]
    CLIPW      m2, m8, [pw_pixel_max_10]
    CLIPW      m3, m8, [pw_pixel_max_10]
    CLIPW      m4, m8, [pw_pixel_max_10]
    CLIPW      m5, m8, [pw_pixel_max_10]
    CLIPW      m6, m8, [pw_pixel_max_10]
    movdqu     [pix0q + strideq], m1; p2
    movdqu     [pix0q + 2 * strideq], m2; p1
    movdqu     [pix0q + src3strideq], m3; p0
    movdqu     [pixq], m4; q0
    movdqu     [pixq + strideq], m5; q1
    movdqu     [pixq + 2 * strideq], m6; q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea        src3strideq, [3 * strideq]
    mov        pix0q, pixq
    sub        pix0q, src3strideq
    sub        pix0q, strideq
    movdqu     m0, [pix0q]; p3
    movdqu     m1, [pix0q + strideq]; p2
    movdqu     m2, [pix0q + 2 * strideq]; p1
    movdqu     m3, [pix0q + src3strideq]; p0
    movdqu     m4, [pixq]; q0
    movdqu     m5, [pixq + strideq]; q1
    movdqu     m6, [pixq + 2 * strideq]; q2
    movdqu     m7, [pixq + src3strideq]; q3
    LUMA_DEBLOCK_BODY 12, h
.store:
    pxor       m8, m8; zeros reg
    CLIPW      m1, m8, [pw_pixel_max_12]
    CLIPW      m2, m8, [pw_pixel_max_12]
    CLIPW      m3, m8, [pw_pixel_max_12]
    CLIPW      m4, m8, [pw_pixel_max_12]
    CLIPW      m5, m8, [pw_pixel_max_12]
    CLIPW      m6, m8, [pw_pixel_max_12]
    movdqu     [pix0q + strideq], m1; p2
    movdqu     [pix0q + 2 * strideq], m2; p1
    movdqu     [pix0q + src3strideq], m3; p0
    movdqu     [pixq], m4; q0
    movdqu     [pixq + strideq], m5; q1
    movdqu     [pixq + 2 * strideq], m6; q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif