;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro
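
; Illustrative scalar sketch (comments only, not assembled): with unsigned
; saturating subtracts, one of max(a-b,0) and max(b-a,0) is always zero, so
; their OR is |a-b|. Per 16-bit lane this computes, in C-like terms:
;   out = (a > b ? a - b : b - a) - c;  // signed, may go negative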
47 | ||
48 | ; out: %4 = |%1-%2|<%3 | |
49 | %macro DIFF_LT 5 | |
50 | psubusw %4, %2, %1 | |
51 | psubusw %5, %1, %2 | |
52 | por %5, %4 ; |%1-%2| | |
53 | pxor %4, %4 | |
54 | psubw %5, %3 ; |%1-%2|-%3 | |
55 | pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 | |
56 | %endmacro | |
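
; Illustrative per-lane equivalent (comments only): the saturated |%1-%2| is
; never negative, so the signed compare 0 > |%1-%2|-%3 holds exactly when
; |%1-%2| < %3, giving:
;   out = (abs(a - b) < c) ? 0xFFFF : 0;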
57 | ||
58 | %macro LOAD_AB 4 | |
59 | movd %1, %3 | |
60 | movd %2, %4 | |
61 | SPLATW %1, %1 | |
62 | SPLATW %2, %2 | |
63 | %endmacro | |
64 | ||
65 | ; in: %2=tc reg | |
66 | ; out: %1=splatted tc | |
67 | %macro LOAD_TC 2 | |
68 | movd %1, [%2] | |
69 | punpcklbw %1, %1 | |
70 | %if mmsize == 8 | |
71 | pshufw %1, %1, 0 | |
72 | %else | |
73 | pshuflw %1, %1, 01010000b | |
74 | pshufd %1, %1, 01010000b | |
75 | %endif | |
76 | psraw %1, 6 | |
77 | %endmacro | |
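
; Informal note: punpcklbw %1,%1 turns each int8 tc0 value t into the word
; (t << 8) | (t & 0xff), so the arithmetic shift by 6 yields t << 2 for
; t in [0,63], i.e. the 10-bit scaling tc = tc0 << (bit_depth - 8), while a
; tc0 of -1 stays negative, which is all later sign tests rely on.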
78 | ||
79 | ; in: %1=p1, %2=p0, %3=q0, %4=q1 | |
80 | ; %5=alpha, %6=beta, %7-%9=tmp | |
81 | ; out: %7=mask | |
82 | %macro LOAD_MASK 9 | |
83 | ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha | |
84 | ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta | |
85 | pand %8, %9 | |
86 | ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta | |
87 | pxor %7, %7 | |
88 | pand %8, %9 | |
89 | pcmpgtw %7, %8 | |
90 | %endmacro | |
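
; Combined, this evaluates the standard H.264 filter condition per lane:
;   mask = (|p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta) ? ~0 : 0;
; ANDing the ABS_SUB results works because each one is negative exactly when
; its condition holds, and pand keeps the sign bit set only if all are set.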
91 | ||
92 | ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp | |
93 | ; out: %1=p0', m2=q0' | |
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro
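
; Per-lane sketch of the normal (bS < 4) edge filter computed above, with %5
; holding tc already ANDed with the filter mask (av_clip-style clamps):
;   delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;
;   delta = av_clip(delta, -tc, tc);
;   p0    = av_clip(p0 + delta, 0, pixel_max);
;   q0    = av_clip(q0 - delta, 0, pixel_max);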
111 | ||
112 | ; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp | |
%macro LUMA_Q1 6
    pavgw   %6, %3, %4 ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro
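
; Per-lane sketch, writing %1/%2 as p2/p1 (the q side is symmetric) and with
; %5 holding tc0 pre-masked:
;   p1 += av_clip(((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0);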
123 | ||
124 | %macro LUMA_DEBLOCK_ONE 3 | |
125 | DIFF_LT m5, %1, bm, m4, m6 | |
126 | pxor m6, m6 | |
127 | mova %3, m4 | |
128 | pcmpgtw m6, tcm | |
129 | pand m4, tcm | |
130 | pandn m6, m7 | |
131 | pand m4, m6 | |
132 | LUMA_Q1 m5, %2, m1, m2, m4, m6 | |
133 | %endmacro | |
134 | ||
135 | %macro LUMA_H_STORE 2 | |
136 | %if mmsize == 8 | |
137 | movq [r0-4], m0 | |
138 | movq [r0+r1-4], m1 | |
139 | movq [r0+r1*2-4], m2 | |
140 | movq [r0+%2-4], m3 | |
141 | %else | |
142 | movq [r0-4], m0 | |
143 | movhps [r0+r1-4], m0 | |
144 | movq [r0+r1*2-4], m1 | |
145 | movhps [%1-4], m1 | |
146 | movq [%1+r1-4], m2 | |
147 | movhps [%1+r1*2-4], m2 | |
148 | movq [%1+%2-4], m3 | |
149 | movhps [%1+r1*4-4], m3 | |
150 | %endif | |
151 | %endmacro | |
152 | ||
153 | %macro DEBLOCK_LUMA 0 | |
154 | ;----------------------------------------------------------------------------- | |
155 | ; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta, | |
156 | ; int8_t *tc0) | |
157 | ;----------------------------------------------------------------------------- | |
158 | cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) | |
159 | %assign pad 5*mmsize+12-(stack_offset&15) | |
160 | %define tcm [rsp] | |
161 | %define ms1 [rsp+mmsize] | |
162 | %define ms2 [rsp+mmsize*2] | |
163 | %define am [rsp+mmsize*3] | |
164 | %define bm [rsp+mmsize*4] | |
165 | SUB rsp, pad | |
166 | shl r2d, 2 | |
167 | shl r3d, 2 | |
168 | LOAD_AB m4, m5, r2d, r3d | |
169 | mov r3, 32/mmsize | |
170 | mov r2, r0 | |
171 | sub r0, r1 | |
172 | mova am, m4 | |
173 | sub r0, r1 | |
174 | mova bm, m5 | |
175 | sub r0, r1 | |
176 | .loop: | |
177 | mova m0, [r0+r1] | |
178 | mova m1, [r0+r1*2] | |
179 | mova m2, [r2] | |
180 | mova m3, [r2+r1] | |
181 | ||
182 | LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 | |
183 | LOAD_TC m6, r4 | |
184 | mova tcm, m6 | |
185 | ||
186 | mova m5, [r0] | |
187 | LUMA_DEBLOCK_ONE m1, m0, ms1 | |
188 | mova [r0+r1], m5 | |
189 | ||
190 | mova m5, [r2+r1*2] | |
191 | LUMA_DEBLOCK_ONE m2, m3, ms2 | |
192 | mova [r2+r1], m5 | |
193 | ||
194 | pxor m5, m5 | |
195 | mova m6, tcm | |
196 | pcmpgtw m5, tcm | |
197 | psubw m6, ms1 | |
198 | pandn m5, m7 | |
199 | psubw m6, ms2 | |
200 | pand m5, m6 | |
201 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 | |
202 | mova [r0+r1*2], m1 | |
203 | mova [r2], m2 | |
204 | ||
205 | add r0, mmsize | |
206 | add r2, mmsize | |
207 | add r4, mmsize/8 | |
208 | dec r3 | |
209 | jg .loop | |
210 | ADD rsp, pad | |
211 | RET | |
212 | ||
213 | cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) | |
214 | %assign pad 7*mmsize+12-(stack_offset&15) | |
215 | %define tcm [rsp] | |
216 | %define ms1 [rsp+mmsize] | |
217 | %define ms2 [rsp+mmsize*2] | |
218 | %define p1m [rsp+mmsize*3] | |
219 | %define p2m [rsp+mmsize*4] | |
220 | %define am [rsp+mmsize*5] | |
221 | %define bm [rsp+mmsize*6] | |
222 | SUB rsp, pad | |
223 | shl r2d, 2 | |
224 | shl r3d, 2 | |
225 | LOAD_AB m4, m5, r2d, r3d | |
226 | mov r3, r1 | |
227 | mova am, m4 | |
228 | add r3, r1 | |
229 | mov r5, 32/mmsize | |
230 | mova bm, m5 | |
231 | add r3, r1 | |
232 | %if mmsize == 16 | |
233 | mov r2, r0 | |
234 | add r2, r3 | |
235 | %endif | |
236 | .loop: | |
237 | %if mmsize == 8 | |
238 | movq m2, [r0-8] ; y q2 q1 q0 | |
239 | movq m7, [r0+0] | |
240 | movq m5, [r0+r1-8] | |
241 | movq m3, [r0+r1+0] | |
242 | movq m0, [r0+r1*2-8] | |
243 | movq m6, [r0+r1*2+0] | |
244 | movq m1, [r0+r3-8] | |
245 | TRANSPOSE4x4W 2, 5, 0, 1, 4 | |
246 | SWAP 2, 7 | |
247 | movq m7, [r0+r3] | |
248 | TRANSPOSE4x4W 2, 3, 6, 7, 4 | |
249 | %else | |
250 | movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x | |
251 | movu m0, [r0+r1-8] | |
252 | movu m2, [r0+r1*2-8] | |
253 | movu m3, [r2-8] | |
254 | TRANSPOSE4x4W 5, 0, 2, 3, 6 | |
255 | mova tcm, m3 | |
256 | ||
257 | movu m4, [r2+r1-8] | |
258 | movu m1, [r2+r1*2-8] | |
259 | movu m3, [r2+r3-8] | |
260 | movu m7, [r2+r1*4-8] | |
261 | TRANSPOSE4x4W 4, 1, 3, 7, 6 | |
262 | ||
263 | mova m6, tcm | |
264 | punpcklqdq m6, m7 | |
265 | punpckhqdq m5, m4 | |
266 | SBUTTERFLY qdq, 0, 1, 7 | |
267 | SBUTTERFLY qdq, 2, 3, 7 | |
268 | %endif | |
269 | ||
270 | mova p2m, m6 | |
271 | LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 | |
272 | LOAD_TC m6, r4 | |
273 | mova tcm, m6 | |
274 | ||
275 | LUMA_DEBLOCK_ONE m1, m0, ms1 | |
276 | mova p1m, m5 | |
277 | ||
278 | mova m5, p2m | |
279 | LUMA_DEBLOCK_ONE m2, m3, ms2 | |
280 | mova p2m, m5 | |
281 | ||
282 | pxor m5, m5 | |
283 | mova m6, tcm | |
284 | pcmpgtw m5, tcm | |
285 | psubw m6, ms1 | |
286 | pandn m5, m7 | |
287 | psubw m6, ms2 | |
288 | pand m5, m6 | |
289 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 | |
290 | mova m0, p1m | |
291 | mova m3, p2m | |
292 | TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
293 | LUMA_H_STORE r2, r3 | |
294 | ||
295 | add r4, mmsize/8 | |
296 | lea r0, [r0+r1*(mmsize/2)] | |
297 | lea r2, [r2+r1*(mmsize/2)] | |
298 | dec r5 | |
299 | jg .loop | |
300 | ADD rsp, pad | |
301 | RET | |
302 | %endmacro | |
303 | ||
304 | %if ARCH_X86_64 | |
305 | ; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 | |
306 | ; m12=alpha, m13=beta | |
307 | ; out: m0=p1', m3=q1', m1=p0', m2=q0' | |
308 | ; clobbers: m4, m5, m6, m7, m10, m11, m14 | |
309 | %macro DEBLOCK_LUMA_INTER_SSE2 0 | |
310 | LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 | |
311 | LOAD_TC m6, r4 | |
312 | DIFF_LT m8, m1, m13, m10, m4 | |
313 | DIFF_LT m9, m2, m13, m11, m4 | |
314 | pand m6, m7 | |
315 | ||
316 | mova m14, m6 | |
317 | pxor m4, m4 | |
318 | pcmpgtw m6, m4 | |
319 | pand m6, m14 | |
320 | ||
321 | mova m5, m10 | |
322 | pand m5, m6 | |
323 | LUMA_Q1 m8, m0, m1, m2, m5, m4 | |
324 | ||
325 | mova m5, m11 | |
326 | pand m5, m6 | |
327 | LUMA_Q1 m9, m3, m1, m2, m5, m4 | |
328 | ||
329 | pxor m4, m4 | |
330 | psubw m6, m10 | |
331 | pcmpgtw m4, m14 | |
332 | pandn m4, m7 | |
333 | psubw m6, m11 | |
334 | pand m4, m6 | |
335 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 | |
336 | ||
337 | SWAP 0, 8 | |
338 | SWAP 3, 9 | |
339 | %endmacro | |
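
; Note: the two psubw with the all-ones DIFF_LT masks implement the spec's
; tc = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta), since subtracting -1 adds 1
; per lane; the same trick is used with ms1/ms2 in DEBLOCK_LUMA above.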
340 | ||
341 | %macro DEBLOCK_LUMA_64 0 | |
342 | cglobal deblock_v_luma_10, 5,5,15 | |
343 | %define p2 m8 | |
344 | %define p1 m0 | |
345 | %define p0 m1 | |
346 | %define q0 m2 | |
347 | %define q1 m3 | |
348 | %define q2 m9 | |
349 | %define mask0 m7 | |
350 | %define mask1 m10 | |
351 | %define mask2 m11 | |
352 | shl r2d, 2 | |
353 | shl r3d, 2 | |
354 | LOAD_AB m12, m13, r2d, r3d | |
355 | mov r2, r0 | |
356 | sub r0, r1 | |
357 | sub r0, r1 | |
358 | sub r0, r1 | |
359 | mov r3, 2 | |
360 | .loop: | |
361 | mova p2, [r0] | |
362 | mova p1, [r0+r1] | |
363 | mova p0, [r0+r1*2] | |
364 | mova q0, [r2] | |
365 | mova q1, [r2+r1] | |
366 | mova q2, [r2+r1*2] | |
367 | DEBLOCK_LUMA_INTER_SSE2 | |
368 | mova [r0+r1], p1 | |
369 | mova [r0+r1*2], p0 | |
370 | mova [r2], q0 | |
371 | mova [r2+r1], q1 | |
372 | add r0, mmsize | |
373 | add r2, mmsize | |
374 | add r4, 2 | |
375 | dec r3 | |
376 | jg .loop | |
377 | REP_RET | |
378 | ||
379 | cglobal deblock_h_luma_10, 5,7,15 | |
380 | shl r2d, 2 | |
381 | shl r3d, 2 | |
382 | LOAD_AB m12, m13, r2d, r3d | |
383 | mov r2, r1 | |
384 | add r2, r1 | |
385 | add r2, r1 | |
386 | mov r5, r0 | |
387 | add r5, r2 | |
388 | mov r6, 2 | |
389 | .loop: | |
390 | movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x | |
391 | movu m0, [r0+r1-8] | |
392 | movu m2, [r0+r1*2-8] | |
393 | movu m9, [r5-8] | |
394 | movu m5, [r5+r1-8] | |
395 | movu m1, [r5+r1*2-8] | |
396 | movu m3, [r5+r2-8] | |
397 | movu m7, [r5+r1*4-8] | |
398 | ||
399 | TRANSPOSE4x4W 8, 0, 2, 9, 10 | |
400 | TRANSPOSE4x4W 5, 1, 3, 7, 10 | |
401 | ||
402 | punpckhqdq m8, m5 | |
403 | SBUTTERFLY qdq, 0, 1, 10 | |
404 | SBUTTERFLY qdq, 2, 3, 10 | |
405 | punpcklqdq m9, m7 | |
406 | ||
407 | DEBLOCK_LUMA_INTER_SSE2 | |
408 | ||
409 | TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
410 | LUMA_H_STORE r5, r2 | |
411 | add r4, 2 | |
412 | lea r0, [r0+r1*8] | |
413 | lea r5, [r5+r1*8] | |
414 | dec r6 | |
415 | jg .loop | |
416 | REP_RET | |
417 | %endmacro | |
418 | ||
419 | INIT_XMM sse2 | |
420 | DEBLOCK_LUMA_64 | |
421 | %if HAVE_AVX_EXTERNAL | |
422 | INIT_XMM avx | |
423 | DEBLOCK_LUMA_64 | |
424 | %endif | |
425 | %endif | |
426 | ||
427 | %macro SWAPMOVA 2 | |
428 | %ifid %1 | |
429 | SWAP %1, %2 | |
430 | %else | |
431 | mova %1, %2 | |
432 | %endif | |
433 | %endmacro | |
434 | ||
435 | ; in: t0-t2: tmp registers | |
436 | ; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 | |
437 | ; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' | |
438 | %macro LUMA_INTRA_P012 12 ; p0..p3 in memory | |
439 | %if ARCH_X86_64 | |
440 | paddw t0, %3, %2 | |
441 | mova t2, %4 | |
442 | paddw t2, %3 | |
443 | %else | |
444 | mova t0, %3 | |
445 | mova t2, %4 | |
446 | paddw t0, %2 | |
447 | paddw t2, %3 | |
448 | %endif | |
449 | paddw t0, %1 | |
450 | paddw t2, t2 | |
451 | paddw t0, %5 | |
452 | paddw t2, %9 | |
453 | paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) | |
454 | paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) | |
455 | ||
456 | psrlw t2, 3 | |
457 | psrlw t1, t0, 2 | |
458 | psubw t2, %3 | |
459 | psubw t1, %2 | |
460 | pand t2, %8 | |
461 | pand t1, %8 | |
462 | paddw t2, %3 | |
463 | paddw t1, %2 | |
464 | SWAPMOVA %11, t1 | |
465 | ||
466 | psubw t1, t0, %3 | |
467 | paddw t0, t0 | |
468 | psubw t1, %5 | |
469 | psubw t0, %3 | |
470 | paddw t1, %6 | |
471 | paddw t1, %2 | |
472 | paddw t0, %6 | |
473 | psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 | |
474 | psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 | |
475 | ||
476 | pxor t0, t1 | |
477 | pxor t1, %1 | |
478 | pand t0, %8 | |
479 | pand t1, %7 | |
480 | pxor t0, t1 | |
481 | pxor t0, %1 | |
482 | SWAPMOVA %10, t0 | |
483 | SWAPMOVA %12, t2 | |
484 | %endmacro | |
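
; Scalar reference for the strong (bS = 4) intra filter above, p side shown
; (the macro is also invoked with p/q swapped); the final pxor/pand sequence
; bit-selects between p0' and the weak fallback under mask0/mask1p:
;   p0'  = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1'  = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2'  = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;   p0'' = (2*p1 + p0 + q1 + 2) >> 2   ; used where only mask0 holds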
485 | ||
486 | %macro LUMA_INTRA_INIT 1 | |
487 | %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) | |
488 | %define t0 m4 | |
489 | %define t1 m5 | |
490 | %define t2 m6 | |
491 | %define t3 m7 | |
492 | %assign i 4 | |
493 | %rep %1 | |
494 | CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] | |
495 | %assign i i+1 | |
496 | %endrep | |
497 | SUB rsp, pad | |
498 | %endmacro | |
499 | ||
500 | ; in: %1-%3=tmp, %4=p2, %5=q2 | |
501 | %macro LUMA_INTRA_INTER 5 | |
502 | LOAD_AB t0, t1, r2d, r3d | |
503 | mova %1, t0 | |
504 | LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 | |
505 | %if ARCH_X86_64 | |
506 | mova %2, t0 ; mask0 | |
507 | psrlw t3, %1, 2 | |
508 | %else | |
509 | mova t3, %1 | |
510 | mova %2, t0 ; mask0 | |
511 | psrlw t3, 2 | |
512 | %endif | |
513 | paddw t3, [pw_2] ; alpha/4+2 | |
514 | DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 | |
515 | pand t2, %2 | |
516 | mova t3, %5 ; q2 | |
517 | mova %1, t2 ; mask1 | |
518 | DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta | |
519 | pand t2, %1 | |
520 | mova t3, %4 ; p2 | |
521 | mova %3, t2 ; mask1q | |
522 | DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta | |
523 | pand t2, %1 | |
524 | mova %1, t2 ; mask1p | |
525 | %endmacro | |
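
; mask0 is the basic filter condition from LOAD_MASK; mask1p and mask1q
; additionally require |p0-q0| < alpha/4+2 plus |p2-p0| < beta (respectively
; |q2-q0| < beta), selecting the strong intra path independently per side.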
526 | ||
527 | %macro LUMA_H_INTRA_LOAD 0 | |
528 | %if mmsize == 8 | |
529 | movu t0, [r0-8] | |
530 | movu t1, [r0+r1-8] | |
531 | movu m0, [r0+r1*2-8] | |
532 | movu m1, [r0+r4-8] | |
533 | TRANSPOSE4x4W 4, 5, 0, 1, 2 | |
534 | mova t4, t0 ; p3 | |
535 | mova t5, t1 ; p2 | |
536 | ||
537 | movu m2, [r0] | |
538 | movu m3, [r0+r1] | |
539 | movu t0, [r0+r1*2] | |
540 | movu t1, [r0+r4] | |
541 | TRANSPOSE4x4W 2, 3, 4, 5, 6 | |
542 | mova t6, t0 ; q2 | |
543 | mova t7, t1 ; q3 | |
544 | %else | |
545 | movu t0, [r0-8] | |
546 | movu t1, [r0+r1-8] | |
547 | movu m0, [r0+r1*2-8] | |
548 | movu m1, [r0+r5-8] | |
549 | movu m2, [r4-8] | |
550 | movu m3, [r4+r1-8] | |
551 | movu t2, [r4+r1*2-8] | |
552 | movu t3, [r4+r5-8] | |
553 | TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 | |
554 | mova t4, t0 ; p3 | |
555 | mova t5, t1 ; p2 | |
556 | mova t6, t2 ; q2 | |
557 | mova t7, t3 ; q3 | |
558 | %endif | |
559 | %endmacro | |
560 | ||
561 | ; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp | |
562 | %macro LUMA_H_INTRA_STORE 9 | |
563 | %if mmsize == 8 | |
564 | TRANSPOSE4x4W %1, %2, %3, %4, %9 | |
565 | movq [r0-8], m%1 | |
566 | movq [r0+r1-8], m%2 | |
567 | movq [r0+r1*2-8], m%3 | |
568 | movq [r0+r4-8], m%4 | |
569 | movq m%1, %8 | |
570 | TRANSPOSE4x4W %5, %6, %7, %1, %9 | |
571 | movq [r0], m%5 | |
572 | movq [r0+r1], m%6 | |
573 | movq [r0+r1*2], m%7 | |
574 | movq [r0+r4], m%1 | |
575 | %else | |
576 | TRANSPOSE2x4x4W %1, %2, %3, %4, %9 | |
577 | movq [r0-8], m%1 | |
578 | movq [r0+r1-8], m%2 | |
579 | movq [r0+r1*2-8], m%3 | |
580 | movq [r0+r5-8], m%4 | |
581 | movhps [r4-8], m%1 | |
582 | movhps [r4+r1-8], m%2 | |
583 | movhps [r4+r1*2-8], m%3 | |
584 | movhps [r4+r5-8], m%4 | |
585 | %ifnum %8 | |
586 | SWAP %1, %8 | |
587 | %else | |
588 | mova m%1, %8 | |
589 | %endif | |
590 | TRANSPOSE2x4x4W %5, %6, %7, %1, %9 | |
591 | movq [r0], m%5 | |
592 | movq [r0+r1], m%6 | |
593 | movq [r0+r1*2], m%7 | |
594 | movq [r0+r5], m%1 | |
595 | movhps [r4], m%5 | |
596 | movhps [r4+r1], m%6 | |
597 | movhps [r4+r1*2], m%7 | |
598 | movhps [r4+r5], m%1 | |
599 | %endif | |
600 | %endmacro | |
601 | ||
602 | %if ARCH_X86_64 | |
603 | ;----------------------------------------------------------------------------- | |
604 | ; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha, | |
605 | ; int beta) | |
606 | ;----------------------------------------------------------------------------- | |
607 | %macro DEBLOCK_LUMA_INTRA_64 0 | |
608 | cglobal deblock_v_luma_intra_10, 4,7,16 | |
609 | %define t0 m1 | |
610 | %define t1 m2 | |
611 | %define t2 m4 | |
612 | %define p2 m8 | |
613 | %define p1 m9 | |
614 | %define p0 m10 | |
615 | %define q0 m11 | |
616 | %define q1 m12 | |
617 | %define q2 m13 | |
618 | %define aa m5 | |
619 | %define bb m14 | |
620 | lea r4, [r1*4] | |
621 | lea r5, [r1*3] ; 3*stride | |
622 | neg r4 | |
623 | add r4, r0 ; pix-4*stride | |
624 | mov r6, 2 | |
625 | mova m0, [pw_2] | |
626 | shl r2d, 2 | |
627 | shl r3d, 2 | |
628 | LOAD_AB aa, bb, r2d, r3d | |
629 | .loop: | |
630 | mova p2, [r4+r1] | |
631 | mova p1, [r4+2*r1] | |
632 | mova p0, [r4+r5] | |
633 | mova q0, [r0] | |
634 | mova q1, [r0+r1] | |
635 | mova q2, [r0+2*r1] | |
636 | ||
637 | LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 | |
638 | mova t2, aa | |
639 | psrlw t2, 2 | |
640 | paddw t2, m0 ; alpha/4+2 | |
641 | DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 | |
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova    spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl     r2d, 2
    shl     r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl     r2d, 2
    shl     r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro
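
; Per-lane sketch of the chroma intra filter above (updates gated by %5):
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2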
835 | ||
836 | %macro CHROMA_V_LOAD 1 | |
837 | mova m0, [r0] ; p1 | |
838 | mova m1, [r0+r1] ; p0 | |
839 | mova m2, [%1] ; q0 | |
840 | mova m3, [%1+r1] ; q1 | |
841 | %endmacro | |
842 | ||
843 | %macro CHROMA_V_STORE 0 | |
844 | mova [r0+1*r1], m1 | |
845 | mova [r0+2*r1], m2 | |
846 | %endmacro | |
847 | ||
848 | %macro CHROMA_V_LOAD_TC 2 | |
849 | movd %1, [%2] | |
850 | punpcklbw %1, %1 | |
851 | punpcklwd %1, %1 | |
852 | psraw %1, 6 | |
853 | %endmacro | |
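
; Same int8-to-word expansion as LOAD_TC, but punpcklwd duplicates each tc0
; value across 2 lanes rather than 4, matching chroma's 2 pixels per tc0
; entry (luma uses 4).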
854 | ||
855 | %macro DEBLOCK_CHROMA 0 | |
856 | ;----------------------------------------------------------------------------- | |
857 | ; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta, | |
858 | ; int8_t *tc0) | |
859 | ;----------------------------------------------------------------------------- | |
860 | cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) | |
861 | mov r5, r0 | |
862 | sub r0, r1 | |
863 | sub r0, r1 | |
864 | shl r2d, 2 | |
865 | shl r3d, 2 | |
866 | %if mmsize < 16 | |
867 | mov r6, 16/mmsize | |
868 | .loop: | |
869 | %endif | |
870 | CHROMA_V_LOAD r5 | |
871 | LOAD_AB m4, m5, r2d, r3d | |
872 | LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 | |
873 | pxor m4, m4 | |
874 | CHROMA_V_LOAD_TC m6, r4 | |
875 | psubw m6, [pw_3] | |
876 | pmaxsw m6, m4 | |
877 | pand m7, m6 | |
878 | DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 | |
879 | CHROMA_V_STORE | |
880 | %if mmsize < 16 | |
881 | add r0, mmsize | |
882 | add r5, mmsize | |
883 | add r4, mmsize/4 | |
884 | dec r6 | |
885 | jg .loop | |
886 | REP_RET | |
887 | %else | |
888 | RET | |
889 | %endif | |
890 | ||
891 | ;----------------------------------------------------------------------------- | |
892 | ; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha, | |
893 | ; int beta) | |
894 | ;----------------------------------------------------------------------------- | |
895 | cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) | |
896 | mov r4, r0 | |
897 | sub r0, r1 | |
898 | sub r0, r1 | |
899 | shl r2d, 2 | |
900 | shl r3d, 2 | |
901 | %if mmsize < 16 | |
902 | mov r5, 16/mmsize | |
903 | .loop: | |
904 | %endif | |
905 | CHROMA_V_LOAD r4 | |
906 | LOAD_AB m4, m5, r2d, r3d | |
907 | LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 | |
908 | CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 | |
909 | CHROMA_V_STORE | |
910 | %if mmsize < 16 | |
911 | add r0, mmsize | |
912 | add r4, mmsize | |
913 | dec r5 | |
914 | jg .loop | |
915 | REP_RET | |
916 | %else | |
917 | RET | |
918 | %endif | |
919 | %endmacro | |
920 | ||
921 | %if ARCH_X86_64 == 0 | |
922 | INIT_MMX mmxext | |
923 | DEBLOCK_CHROMA | |
924 | %endif | |
925 | INIT_XMM sse2 | |
926 | DEBLOCK_CHROMA | |
927 | %if HAVE_AVX_EXTERNAL | |
928 | INIT_XMM avx | |
929 | DEBLOCK_CHROMA | |
930 | %endif |