;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_27:    times 8 dw 27
pw_63:    times 8 dw 63

pb_4:     times 16 db 4
pb_F8:    times 16 db 0xF8
pb_FE:    times 16 db 0xFE
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db 9, 63

cextern pb_1
cextern pb_3
cextern pw_9
cextern pw_18
cextern pb_80

SECTION .text

;-----------------------------------------------------------------------------
; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
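
; Overview (summarized from the code below; cf. RFC 6386): the simple filter
; is applied wherever 2*|p0-q0| + |p1-q1|/2 <= flim; there it computes, on
; 0x80-biased signed bytes, a = clamp(3*(q0-p0) + (p1-q1)), then
; f1 = clamp(a+4)>>3 and f2 = clamp(a+3)>>3, and stores q0-f1 and p0+f2.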
48 | ||
49 | ; macro called with 7 mm register indexes as argument, and 4 regular registers | |
50 | ; | |
51 | ; first 4 mm registers will carry the transposed pixel data | |
52 | ; the other three are scratchspace (one would be sufficient, but this allows | |
53 | ; for more spreading/pipelining and thus faster execution on OOE CPUs) | |
54 | ; | |
55 | ; first two regular registers are buf+4*stride and buf+5*stride | |
56 | ; third is -stride, fourth is +stride | |
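;
; (example invocation, as used further below:
;  READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq)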
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd m%1, [%8+%10*4]   ; A0-3
    movd m%5, [%9+%10*4]   ; B0-3
    movd m%2, [%8+%10*2]   ; C0-3
    movd m%6, [%8+%10]     ; D0-3
    movd m%3, [%8]         ; E0-3
    movd m%7, [%9]         ; F0-3
    movd m%4, [%9+%11]     ; G0-3
    punpcklbw m%1, m%5     ; A/B interleaved
    movd m%5, [%9+%11*2]   ; H0-3
    punpcklbw m%2, m%6     ; C/D interleaved
    punpcklbw m%3, m%7     ; E/F interleaved
    punpcklbw m%4, m%5     ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd m%1, [%8+%10*4]   ; A0-3
    movd m%3, [%12+%10*4]  ; I0-3
    movd m%2, [%8+%10*2]   ; C0-3
    movd m%4, [%12+%10*2]  ; K0-3
    movd m%6, [%8+%10]     ; D0-3
    movd m%5, [%12+%10]    ; L0-3
    movd m%7, [%12]        ; M0-3
    add %12, %11
    punpcklbw m%1, m%3     ; A/I
    movd m%3, [%8]         ; E0-3
    punpcklbw m%2, m%4     ; C/K
    punpcklbw m%6, m%5     ; D/L
    punpcklbw m%3, m%7     ; E/M
    punpcklbw m%2, m%6     ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd m%5, [%9+%10*4]   ; B0-3
    movd m%4, [%12+%10*4]  ; J0-3
    movd m%7, [%9]         ; F0-3
    movd m%6, [%12]        ; N0-3
    punpcklbw m%5, m%4     ; B/J
    punpcklbw m%7, m%6     ; F/N
    punpcklbw m%1, m%5     ; A/B/I/J interleaved
    punpcklbw m%3, m%7     ; E/F/M/N interleaved
    movd m%4, [%9+%11]     ; G0-3
    movd m%6, [%12+%11]    ; O0-3
    movd m%5, [%9+%11*2]   ; H0-3
    movd m%7, [%12+%11*2]  ; P0-3
    punpcklbw m%4, m%6     ; G/O
    punpcklbw m%5, m%7     ; H/P
    punpcklbw m%4, m%5     ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
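;
; (example invocation, as used further below:
;  WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq)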
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd [%5+%7*4], m%1
    movd [%5+%7*2], m%2
    movd [%5], m%3
    movd [%6+%8], m%4
    punpckhdq m%1, m%1
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd [%6+%7*4], m%1
    movd [%5+%7], m%2
    movd [%6], m%3
    movd [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd [%5+%8*4], m%1
    movd [%5], m%2
    movd [%7+%8*4], m%3
    movd [%7], m%4

    ; store dwords 1
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
    movd [%6+%8*4], m%1
    movd [%6], m%2
%if %10 == 16
    movd [%6+%9*4], m%3
%endif
    movd [%7+%9], m%4

    ; write dwords 2
    psrldq m%1, 4
    psrldq m%2, 4
%if %10 == 8
    movd [%5+%8*2], m%1
    movd %5d, m%3
%endif
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 16
    movd [%5+%8*2], m%1
%endif
    movd [%6+%9], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
    add %7, %9

    ; store dwords 3
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 8
    mov [%7+%8*4], %5d
    movd [%6+%8*2], m%1
%else
    movd [%5+%8], m%1
%endif
    movd [%6+%9*2], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
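;
; (example invocations, as used further below:
;  WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
;  WRITE_8W   m6, dst2q, dst1q, mstrideq, strideq)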
%macro WRITE_2x4W 6
    movd %3d, %1
    punpckhdq %1, %1
    mov [%4+%5*4], %3w
    shr %3, 16
    add %4, %6
    mov [%4+%5*4], %3w

    movd %3d, %1
    add %4, %5
    mov [%4+%5*2], %3w
    shr %3, 16
    mov [%4+%5 ], %3w

    movd %3d, %2
    punpckhdq %2, %2
    mov [%4 ], %3w
    shr %3, 16
    mov [%4+%6 ], %3w

    movd %3d, %2
    add %4, %6
    mov [%4+%6 ], %3w
    shr %3, 16
    mov [%4+%6*2], %3w
    add %4, %5
%endmacro

%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw [%3+%4 ], %1, 3
    pextrw [%3 ], %1, 4
    pextrw [%2 ], %1, 5
    pextrw [%2+%5 ], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    movd %2d, %1
    psrldq %1, 4
    mov [%3+%4*4], %2w
    shr %2, 16
    add %3, %5
    mov [%3+%4*4], %2w

    movd %2d, %1
    psrldq %1, 4
    add %3, %4
    mov [%3+%4*2], %2w
    shr %2, 16
    mov [%3+%4 ], %2w

    movd %2d, %1
    psrldq %1, 4
    mov [%3 ], %2w
    shr %2, 16
    mov [%3+%5 ], %2w

    movd %2d, %1
    add %3, %5
    mov [%3+%5 ], %2w
    shr %2, 16
    mov [%3+%5*2], %2w
%endif
%endmacro

%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor m0, m0
%endif
    SPLATB_REG m7, flim, m0 ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova m0, [dst1q+mstrideq*2]  ; p1
    mova m1, [dst1q+mstrideq]    ; p0
    mova m2, [dst1q]             ; q0
    mova m3, [dst1q+ strideq]    ; q1
%else ; h
    lea dst2q, [dst1q+ strideq]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova m5, m2      ; m5=backup of q0
    mova m6, m1      ; m6=backup of p0
    psubusb m1, m2   ; p0-q0
    psubusb m2, m6   ; q0-p0
    por m1, m2       ; FFABS(p0-q0)
    paddusb m1, m1   ; m1=FFABS(p0-q0)*2

    mova m4, m3
    mova m2, m0
    psubusb m3, m0   ; q1-p1
    psubusb m0, m4   ; p1-q1
    por m3, m0       ; FFABS(p1-q1)
    mova m0, [pb_80]
    pxor m2, m0
    pxor m4, m0
    psubsb m2, m4    ; m2=p1-q1 (signed) backup for below
    pand m3, [pb_FE]
    psrlq m3, 1      ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb m3, m1
    psubusb m3, m7
    pxor m1, m1
    pcmpeqb m3, m1   ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova m4, m5
    pxor m5, m0
    pxor m0, m6
    psubsb m5, m0    ; q0-p0 (signed)
    paddsb m2, m5
    paddsb m2, m5
    paddsb m2, m5    ; a=(p1-q1) + 3*(q0-p0)
    pand m2, m3      ; apply filter mask (m3)

    mova m3, [pb_F8]
    mova m1, m2
    paddsb m2, [pb_4] ; f1<<3=a+4
    paddsb m1, [pb_3] ; f2<<3=a+3
    pand m2, m3
    pand m1, m3      ; cache f2<<3

    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m2   ; which values are <0?
    psubb m3, m2     ; -f1<<3
    psrlq m2, 3      ; +f1
    psrlq m3, 3      ; -f1
    pand m3, m0
    pandn m0, m2
    psubusb m4, m0
    paddusb m4, m3   ; q0-f1

    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m1   ; which values are <0?
    psubb m3, m1     ; -f2<<3
    psrlq m1, 3      ; +f2
    psrlq m3, 3      ; -f2
    pand m3, m0
    pandn m0, m1
    paddusb m6, m0
    psubusb m6, m3   ; p0+f2

    ; store
%ifidn %1, v
    mova [dst1q], m4
    mova [dst1q+mstrideq], m6
%else ; h
    inc dst1q
    SBUTTERFLY bw, 6, 4, 0

%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc dst2q
%endif
    WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
    lea dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc dst3q
%endif
    WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add dst1q, 8                    ; advance 8 cols = pixels
%else ; h
    lea dst1q, [dst1q+strideq*8-1]  ; advance 8 rows = lines
%endif
    dec cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif

INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5

;-----------------------------------------------------------------------------
; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                               int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
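
; Overview (summarized from the code below): the inner filter runs where all
; of |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2| <= I and
; 2*|p0-q0| + |p1-q1|/2 <= E (normal_limit). Where hev
; (|p1-p0| > hev_thr || |q1-q0| > hev_thr) holds, only p0/q0 are adjusted and
; the (p1-q1) outer-tap term is included; elsewhere p1/q1 are additionally
; adjusted by a = (f1+1)>>1.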
435 | ||
436 | %macro INNER_LOOPFILTER 2 | |
437 | %define stack_size 0 | |
438 | %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | |
439 | %ifidn %1, v ; [3]=hev() result | |
440 | %define stack_size mmsize * -4 | |
441 | %else ; h ; extra storage space for transposes | |
442 | %define stack_size mmsize * -5 | |
443 | %endif | |
444 | %endif | |
445 | ||
446 | %if %2 == 8 ; chroma | |
447 | cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr | |
448 | %else ; luma | |
449 | cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr | |
450 | %endif | |
451 | ||
452 | %if cpuflag(ssse3) | |
453 | pxor m7, m7 | |
454 | %endif | |
455 | ||
456 | %ifndef m8 | |
457 | ; splat function arguments | |
458 | SPLATB_REG m0, flimEq, m7 ; E | |
459 | SPLATB_REG m1, flimIq, m7 ; I | |
460 | SPLATB_REG m2, hevthrq, m7 ; hev_thresh | |
461 | ||
462 | %define m_flimE [rsp] | |
463 | %define m_flimI [rsp+mmsize] | |
464 | %define m_hevthr [rsp+mmsize*2] | |
465 | %define m_maskres [rsp+mmsize*3] | |
466 | %define m_p0backup [rsp+mmsize*3] | |
467 | %define m_q0backup [rsp+mmsize*4] | |
468 | ||
469 | mova m_flimE, m0 | |
470 | mova m_flimI, m1 | |
471 | mova m_hevthr, m2 | |
472 | %else | |
473 | %define m_flimE m9 | |
474 | %define m_flimI m10 | |
475 | %define m_hevthr m11 | |
476 | %define m_maskres m12 | |
477 | %define m_p0backup m12 | |
478 | %define m_q0backup m8 | |
479 | ||
480 | ; splat function arguments | |
481 | SPLATB_REG m_flimE, flimEq, m7 ; E | |
482 | SPLATB_REG m_flimI, flimIq, m7 ; I | |
483 | SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh | |
484 | %endif | |
485 | ||
486 | %if %2 == 8 ; chroma | |
487 | DEFINE_ARGS dst1, dst8, mstride, stride, dst2 | |
488 | %elif mmsize == 8 | |
489 | DEFINE_ARGS dst1, mstride, stride, dst2, cntr | |
490 | mov cntrq, 2 | |
491 | %else | |
492 | DEFINE_ARGS dst1, mstride, stride, dst2, dst8 | |
493 | %endif | |
494 | mov strideq, mstrideq | |
495 | neg mstrideq | |
496 | %ifidn %1, h | |
497 | lea dst1q, [dst1q+strideq*4-4] | |
498 | %if %2 == 8 ; chroma | |
499 | lea dst8q, [dst8q+strideq*4-4] | |
500 | %endif | |
501 | %endif | |
502 | ||
503 | %if mmsize == 8 | |
504 | .next8px: | |
505 | %endif | |
506 | ; read | |
507 | lea dst2q, [dst1q+strideq] | |
508 | %ifidn %1, v | |
509 | %if %2 == 8 && mmsize == 16 | |
510 | %define movrow movh | |
511 | %else | |
512 | %define movrow mova | |
513 | %endif | |
514 | movrow m0, [dst1q+mstrideq*4] ; p3 | |
515 | movrow m1, [dst2q+mstrideq*4] ; p2 | |
516 | movrow m2, [dst1q+mstrideq*2] ; p1 | |
517 | movrow m5, [dst2q] ; q1 | |
518 | movrow m6, [dst2q+ strideq*1] ; q2 | |
519 | movrow m7, [dst2q+ strideq*2] ; q3 | |
520 | %if mmsize == 16 && %2 == 8 | |
521 | movhps m0, [dst8q+mstrideq*4] | |
522 | movhps m2, [dst8q+mstrideq*2] | |
523 | add dst8q, strideq | |
524 | movhps m1, [dst8q+mstrideq*4] | |
525 | movhps m5, [dst8q] | |
526 | movhps m6, [dst8q+ strideq ] | |
527 | movhps m7, [dst8q+ strideq*2] | |
528 | add dst8q, mstrideq | |
529 | %endif | |
530 | %elif mmsize == 8 ; mmx/mmxext (h) | |
531 | ; read 8 rows of 8px each | |
532 | movu m0, [dst1q+mstrideq*4] | |
533 | movu m1, [dst2q+mstrideq*4] | |
534 | movu m2, [dst1q+mstrideq*2] | |
535 | movu m3, [dst1q+mstrideq ] | |
536 | movu m4, [dst1q] | |
537 | movu m5, [dst2q] | |
538 | movu m6, [dst2q+ strideq ] | |
539 | ||
540 | ; 8x8 transpose | |
541 | TRANSPOSE4x4B 0, 1, 2, 3, 7 | |
542 | mova m_q0backup, m1 | |
543 | movu m7, [dst2q+ strideq*2] | |
544 | TRANSPOSE4x4B 4, 5, 6, 7, 1 | |
545 | SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | |
546 | SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | |
547 | SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | |
548 | mova m1, m_q0backup | |
549 | mova m_q0backup, m2 ; store q0 | |
550 | SBUTTERFLY dq, 1, 5, 2 ; p1/p0 | |
551 | mova m_p0backup, m5 ; store p0 | |
552 | SWAP 1, 4 | |
553 | SWAP 2, 4 | |
554 | SWAP 6, 3 | |
555 | SWAP 5, 3 | |
556 | %else ; sse2 (h) | |
557 | %if %2 == 16 | |
558 | lea dst8q, [dst1q+ strideq*8] | |
559 | %endif | |
560 | ||
561 | ; read 16 rows of 8px each, interleave | |
562 | movh m0, [dst1q+mstrideq*4] | |
563 | movh m1, [dst8q+mstrideq*4] | |
564 | movh m2, [dst1q+mstrideq*2] | |
565 | movh m5, [dst8q+mstrideq*2] | |
566 | movh m3, [dst1q+mstrideq ] | |
567 | movh m6, [dst8q+mstrideq ] | |
568 | movh m4, [dst1q] | |
569 | movh m7, [dst8q] | |
570 | punpcklbw m0, m1 ; A/I | |
571 | punpcklbw m2, m5 ; C/K | |
572 | punpcklbw m3, m6 ; D/L | |
573 | punpcklbw m4, m7 ; E/M | |
574 | ||
575 | add dst8q, strideq | |
576 | movh m1, [dst2q+mstrideq*4] | |
577 | movh m6, [dst8q+mstrideq*4] | |
578 | movh m5, [dst2q] | |
579 | movh m7, [dst8q] | |
580 | punpcklbw m1, m6 ; B/J | |
581 | punpcklbw m5, m7 ; F/N | |
582 | movh m6, [dst2q+ strideq ] | |
583 | movh m7, [dst8q+ strideq ] | |
584 | punpcklbw m6, m7 ; G/O | |
585 | ||
586 | ; 8x16 transpose | |
587 | TRANSPOSE4x4B 0, 1, 2, 3, 7 | |
588 | %ifdef m8 | |
589 | SWAP 1, 8 | |
590 | %else | |
591 | mova m_q0backup, m1 | |
592 | %endif | |
593 | movh m7, [dst2q+ strideq*2] | |
594 | movh m1, [dst8q+ strideq*2] | |
595 | punpcklbw m7, m1 ; H/P | |
596 | TRANSPOSE4x4B 4, 5, 6, 7, 1 | |
597 | SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | |
598 | SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | |
599 | SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | |
600 | %ifdef m8 | |
601 | SWAP 1, 8 | |
602 | SWAP 2, 8 | |
603 | %else | |
604 | mova m1, m_q0backup | |
605 | mova m_q0backup, m2 ; store q0 | |
606 | %endif | |
607 | SBUTTERFLY dq, 1, 5, 2 ; p1/p0 | |
608 | %ifdef m12 | |
609 | SWAP 5, 12 | |
610 | %else | |
611 | mova m_p0backup, m5 ; store p0 | |
612 | %endif | |
613 | SWAP 1, 4 | |
614 | SWAP 2, 4 | |
615 | SWAP 6, 3 | |
616 | SWAP 5, 3 | |
617 | %endif | |
618 | ||
619 | ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 | |
620 | mova m4, m1 | |
621 | SWAP 4, 1 | |
622 | psubusb m4, m0 ; p2-p3 | |
623 | psubusb m0, m1 ; p3-p2 | |
624 | por m0, m4 ; abs(p3-p2) | |
625 | ||
626 | mova m4, m2 | |
627 | SWAP 4, 2 | |
628 | psubusb m4, m1 ; p1-p2 | |
629 | psubusb m1, m2 ; p2-p1 | |
630 | por m1, m4 ; abs(p2-p1) | |
631 | ||
632 | mova m4, m6 | |
633 | SWAP 4, 6 | |
634 | psubusb m4, m7 ; q2-q3 | |
635 | psubusb m7, m6 ; q3-q2 | |
636 | por m7, m4 ; abs(q3-q2) | |
637 | ||
638 | mova m4, m5 | |
639 | SWAP 4, 5 | |
640 | psubusb m4, m6 ; q1-q2 | |
641 | psubusb m6, m5 ; q2-q1 | |
642 | por m6, m4 ; abs(q2-q1) | |
643 | ||
644 | %if notcpuflag(mmxext) | |
645 | mova m4, m_flimI | |
646 | pxor m3, m3 | |
647 | psubusb m0, m4 | |
648 | psubusb m1, m4 | |
649 | psubusb m7, m4 | |
650 | psubusb m6, m4 | |
651 | pcmpeqb m0, m3 ; abs(p3-p2) <= I | |
652 | pcmpeqb m1, m3 ; abs(p2-p1) <= I | |
653 | pcmpeqb m7, m3 ; abs(q3-q2) <= I | |
654 | pcmpeqb m6, m3 ; abs(q2-q1) <= I | |
655 | pand m0, m1 | |
656 | pand m7, m6 | |
657 | pand m0, m7 | |
658 | %else ; mmxext/sse2 | |
659 | pmaxub m0, m1 | |
660 | pmaxub m6, m7 | |
661 | pmaxub m0, m6 | |
662 | %endif | |
663 | ||
664 | ; normal_limit and high_edge_variance for p1-p0, q1-q0 | |
665 | SWAP 7, 3 ; now m7 is zero | |
666 | %ifidn %1, v | |
667 | movrow m3, [dst1q+mstrideq ] ; p0 | |
668 | %if mmsize == 16 && %2 == 8 | |
669 | movhps m3, [dst8q+mstrideq ] | |
670 | %endif | |
671 | %elifdef m12 | |
672 | SWAP 3, 12 | |
673 | %else | |
674 | mova m3, m_p0backup | |
675 | %endif | |
676 | ||
677 | mova m1, m2 | |
678 | SWAP 1, 2 | |
679 | mova m6, m3 | |
680 | SWAP 3, 6 | |
681 | psubusb m1, m3 ; p1-p0 | |
682 | psubusb m6, m2 ; p0-p1 | |
683 | por m1, m6 ; abs(p1-p0) | |
684 | %if notcpuflag(mmxext) | |
685 | mova m6, m1 | |
686 | psubusb m1, m4 | |
687 | psubusb m6, m_hevthr | |
688 | pcmpeqb m1, m7 ; abs(p1-p0) <= I | |
689 | pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh | |
690 | pand m0, m1 | |
691 | mova m_maskres, m6 | |
692 | %else ; mmxext/sse2 | |
693 | pmaxub m0, m1 ; max_I | |
694 | SWAP 1, 4 ; max_hev_thresh | |
695 | %endif | |
696 | ||
697 | SWAP 6, 4 ; now m6 is I | |
698 | %ifidn %1, v | |
699 | movrow m4, [dst1q] ; q0 | |
700 | %if mmsize == 16 && %2 == 8 | |
701 | movhps m4, [dst8q] | |
702 | %endif | |
703 | %elifdef m8 | |
704 | SWAP 4, 8 | |
705 | %else | |
706 | mova m4, m_q0backup | |
707 | %endif | |
708 | mova m1, m4 | |
709 | SWAP 1, 4 | |
710 | mova m7, m5 | |
711 | SWAP 7, 5 | |
712 | psubusb m1, m5 ; q0-q1 | |
713 | psubusb m7, m4 ; q1-q0 | |
714 | por m1, m7 ; abs(q1-q0) | |
715 | %if notcpuflag(mmxext) | |
716 | mova m7, m1 | |
717 | psubusb m1, m6 | |
718 | psubusb m7, m_hevthr | |
719 | pxor m6, m6 | |
720 | pcmpeqb m1, m6 ; abs(q1-q0) <= I | |
721 | pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh | |
722 | mova m6, m_maskres | |
723 | pand m0, m1 ; abs([pq][321]-[pq][210]) <= I | |
724 | pand m6, m7 | |
725 | %else ; mmxext/sse2 | |
726 | pxor m7, m7 | |
727 | pmaxub m0, m1 | |
728 | pmaxub m6, m1 | |
729 | psubusb m0, m_flimI | |
730 | psubusb m6, m_hevthr | |
731 | pcmpeqb m0, m7 ; max(abs(..)) <= I | |
732 | pcmpeqb m6, m7 ; !(max(abs..) > thresh) | |
733 | %endif | |
734 | %ifdef m12 | |
735 | SWAP 6, 12 | |
736 | %else | |
737 | mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) | |
738 | %endif | |
739 | ||
740 | ; simple_limit | |
741 | mova m1, m3 | |
742 | SWAP 1, 3 | |
743 | mova m6, m4 ; keep copies of p0/q0 around for later use | |
744 | SWAP 6, 4 | |
745 | psubusb m1, m4 ; p0-q0 | |
746 | psubusb m6, m3 ; q0-p0 | |
747 | por m1, m6 ; abs(q0-p0) | |
748 | paddusb m1, m1 ; m1=2*abs(q0-p0) | |
749 | ||
750 | mova m7, m2 | |
751 | SWAP 7, 2 | |
752 | mova m6, m5 | |
753 | SWAP 6, 5 | |
754 | psubusb m7, m5 ; p1-q1 | |
755 | psubusb m6, m2 ; q1-p1 | |
756 | por m7, m6 ; abs(q1-p1) | |
757 | pxor m6, m6 | |
758 | pand m7, [pb_FE] | |
759 | psrlq m7, 1 ; abs(q1-p1)/2 | |
760 | paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 | |
761 | psubusb m7, m_flimE | |
762 | pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E | |
763 | pand m0, m7 ; normal_limit result | |
764 | ||
765 | ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask | |
766 | %ifdef m8 ; x86-64 && sse2 | |
767 | mova m8, [pb_80] | |
768 | %define m_pb_80 m8 | |
769 | %else ; x86-32 or mmx/mmxext | |
770 | %define m_pb_80 [pb_80] | |
771 | %endif | |
772 | mova m1, m4 | |
773 | mova m7, m3 | |
774 | pxor m1, m_pb_80 | |
775 | pxor m7, m_pb_80 | |
776 | psubsb m1, m7 ; (signed) q0-p0 | |
777 | mova m6, m2 | |
778 | mova m7, m5 | |
779 | pxor m6, m_pb_80 | |
780 | pxor m7, m_pb_80 | |
781 | psubsb m6, m7 ; (signed) p1-q1 | |
782 | mova m7, m_maskres | |
783 | pandn m7, m6 | |
784 | paddsb m7, m1 | |
785 | paddsb m7, m1 | |
786 | paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) | |
787 | ||
788 | pand m7, m0 | |
789 | mova m1, [pb_F8] | |
790 | mova m6, m7 | |
791 | paddsb m7, [pb_3] | |
792 | paddsb m6, [pb_4] | |
793 | pand m7, m1 | |
794 | pand m6, m1 | |
795 | ||
796 | pxor m1, m1 | |
797 | pxor m0, m0 | |
798 | pcmpgtb m1, m7 | |
799 | psubb m0, m7 | |
800 | psrlq m7, 3 ; +f2 | |
801 | psrlq m0, 3 ; -f2 | |
802 | pand m0, m1 | |
803 | pandn m1, m7 | |
804 | psubusb m3, m0 | |
805 | paddusb m3, m1 ; p0+f2 | |
806 | ||
807 | pxor m1, m1 | |
808 | pxor m0, m0 | |
809 | pcmpgtb m0, m6 | |
810 | psubb m1, m6 | |
811 | psrlq m6, 3 ; +f1 | |
812 | psrlq m1, 3 ; -f1 | |
813 | pand m1, m0 | |
814 | pandn m0, m6 | |
815 | psubusb m4, m0 | |
816 | paddusb m4, m1 ; q0-f1 | |
817 | ||
818 | %ifdef m12 | |
819 | SWAP 6, 12 | |
820 | %else | |
821 | mova m6, m_maskres | |
822 | %endif | |
823 | %if notcpuflag(mmxext) | |
824 | mova m7, [pb_1] | |
825 | %else ; mmxext/sse2 | |
826 | pxor m7, m7 | |
827 | %endif | |
828 | pand m0, m6 | |
829 | pand m1, m6 | |
830 | %if notcpuflag(mmxext) | |
831 | paddusb m0, m7 | |
832 | pand m1, [pb_FE] | |
833 | pandn m7, m0 | |
834 | psrlq m1, 1 | |
835 | psrlq m7, 1 | |
836 | SWAP 0, 7 | |
837 | %else ; mmxext/sse2 | |
838 | psubusb m1, [pb_1] | |
839 | pavgb m0, m7 ; a | |
840 | pavgb m1, m7 ; -a | |
841 | %endif | |
842 | psubusb m5, m0 | |
843 | psubusb m2, m1 | |
844 | paddusb m5, m1 ; q1-a | |
845 | paddusb m2, m0 ; p1+a | |
846 | ||
847 | ; store | |
848 | %ifidn %1, v | |
849 | movrow [dst1q+mstrideq*2], m2 | |
850 | movrow [dst1q+mstrideq ], m3 | |
851 | movrow [dst1q], m4 | |
852 | movrow [dst1q+ strideq ], m5 | |
853 | %if mmsize == 16 && %2 == 8 | |
854 | movhps [dst8q+mstrideq*2], m2 | |
855 | movhps [dst8q+mstrideq ], m3 | |
856 | movhps [dst8q], m4 | |
857 | movhps [dst8q+ strideq ], m5 | |
858 | %endif | |
859 | %else ; h | |
860 | add dst1q, 2 | |
861 | add dst2q, 2 | |
862 | ||
863 | ; 4x8/16 transpose | |
864 | TRANSPOSE4x4B 2, 3, 4, 5, 6 | |
865 | ||
866 | %if mmsize == 8 ; mmx/mmxext (h) | |
867 | WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq | |
868 | %else ; sse2 (h) | |
869 | lea dst8q, [dst8q+mstrideq +2] | |
870 | WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 | |
871 | %endif | |
872 | %endif | |
873 | ||
874 | %if mmsize == 8 | |
875 | %if %2 == 8 ; chroma | |
876 | %ifidn %1, h | |
877 | sub dst1q, 2 | |
878 | %endif | |
879 | cmp dst1q, dst8q | |
880 | mov dst1q, dst8q | |
881 | jnz .next8px | |
882 | %else | |
883 | %ifidn %1, h | |
884 | lea dst1q, [dst1q+ strideq*8-2] | |
885 | %else ; v | |
886 | add dst1q, 8 | |
887 | %endif | |
888 | dec cntrq | |
889 | jg .next8px | |
890 | %endif | |
891 | REP_RET | |
892 | %else ; mmsize == 16 | |
893 | RET | |
894 | %endif | |
895 | %endmacro | |
896 | ||
897 | %if ARCH_X86_32 | |
898 | INIT_MMX mmx | |
899 | INNER_LOOPFILTER v, 16 | |
900 | INNER_LOOPFILTER h, 16 | |
901 | INNER_LOOPFILTER v, 8 | |
902 | INNER_LOOPFILTER h, 8 | |
903 | ||
904 | INIT_MMX mmxext | |
905 | INNER_LOOPFILTER v, 16 | |
906 | INNER_LOOPFILTER h, 16 | |
907 | INNER_LOOPFILTER v, 8 | |
908 | INNER_LOOPFILTER h, 8 | |
909 | %endif | |
910 | ||
911 | INIT_XMM sse2 | |
912 | INNER_LOOPFILTER v, 16 | |
913 | INNER_LOOPFILTER h, 16 | |
914 | INNER_LOOPFILTER v, 8 | |
915 | INNER_LOOPFILTER h, 8 | |
916 | ||
917 | INIT_XMM ssse3 | |
918 | INNER_LOOPFILTER v, 16 | |
919 | INNER_LOOPFILTER h, 16 | |
920 | INNER_LOOPFILTER v, 8 | |
921 | INNER_LOOPFILTER h, 8 | |
922 | ||
923 | ;----------------------------------------------------------------------------- | |
924 | ; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | |
925 | ; int flimE, int flimI, int hev_thr); | |
926 | ;----------------------------------------------------------------------------- | |
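
; Overview (summarized from the code below; cf. RFC 6386): the macroblock-edge
; filter uses the same normal_limit/hev masks as the inner filter. Where hev
; holds it applies the common f1/f2 adjustment to p0/q0 only; elsewhere it
; takes w = clamp(3*(q0-p0) + (p1-q1)) and applies
;   a0 = (27*w + 63) >> 7 to p0/q0,
;   a1 = (18*w + 63) >> 7 to p1/q1,
;   a2 = ( 9*w + 63) >> 7 to p2/q2
; (the pw_27/pw_18/pw_9 and pb_*_63 constants above).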
927 | ||
928 | %macro MBEDGE_LOOPFILTER 2 | |
929 | %define stack_size 0 | |
930 | %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | |
931 | %if mmsize == 16 ; [3]=hev() result | |
932 | ; [4]=filter tmp result | |
933 | ; [5]/[6] = p2/q2 backup | |
934 | ; [7]=lim_res sign result | |
935 | %define stack_size mmsize * -7 | |
936 | %else ; 8 ; extra storage space for transposes | |
937 | %define stack_size mmsize * -8 | |
938 | %endif | |
939 | %endif | |
940 | ||
941 | %if %2 == 8 ; chroma | |
942 | cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr | |
943 | %else ; luma | |
944 | cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr | |
945 | %endif | |
946 | ||
947 | %if cpuflag(ssse3) | |
948 | pxor m7, m7 | |
949 | %endif | |
950 | ||
951 | %ifndef m8 | |
952 | ; splat function arguments | |
953 | SPLATB_REG m0, flimEq, m7 ; E | |
954 | SPLATB_REG m1, flimIq, m7 ; I | |
955 | SPLATB_REG m2, hevthrq, m7 ; hev_thresh | |
956 | ||
957 | %define m_flimE [rsp] | |
958 | %define m_flimI [rsp+mmsize] | |
959 | %define m_hevthr [rsp+mmsize*2] | |
960 | %define m_maskres [rsp+mmsize*3] | |
961 | %define m_limres [rsp+mmsize*4] | |
962 | %define m_p0backup [rsp+mmsize*3] | |
963 | %define m_q0backup [rsp+mmsize*4] | |
964 | %define m_p2backup [rsp+mmsize*5] | |
965 | %define m_q2backup [rsp+mmsize*6] | |
966 | %if mmsize == 16 | |
967 | %define m_limsign [rsp] | |
968 | %else | |
969 | %define m_limsign [rsp+mmsize*7] | |
970 | %endif | |
971 | ||
972 | mova m_flimE, m0 | |
973 | mova m_flimI, m1 | |
974 | mova m_hevthr, m2 | |
975 | %else ; sse2 on x86-64 | |
976 | %define m_flimE m9 | |
977 | %define m_flimI m10 | |
978 | %define m_hevthr m11 | |
979 | %define m_maskres m12 | |
980 | %define m_limres m8 | |
981 | %define m_p0backup m12 | |
982 | %define m_q0backup m8 | |
983 | %define m_p2backup m13 | |
984 | %define m_q2backup m14 | |
985 | %define m_limsign m9 | |
986 | ||
987 | ; splat function arguments | |
988 | SPLATB_REG m_flimE, flimEq, m7 ; E | |
989 | SPLATB_REG m_flimI, flimIq, m7 ; I | |
990 | SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh | |
991 | %endif | |
992 | ||
993 | %if %2 == 8 ; chroma | |
994 | DEFINE_ARGS dst1, dst8, mstride, stride, dst2 | |
995 | %elif mmsize == 8 | |
996 | DEFINE_ARGS dst1, mstride, stride, dst2, cntr | |
997 | mov cntrq, 2 | |
998 | %else | |
999 | DEFINE_ARGS dst1, mstride, stride, dst2, dst8 | |
1000 | %endif | |
1001 | mov strideq, mstrideq | |
1002 | neg mstrideq | |
1003 | %ifidn %1, h | |
1004 | lea dst1q, [dst1q+strideq*4-4] | |
1005 | %if %2 == 8 ; chroma | |
1006 | lea dst8q, [dst8q+strideq*4-4] | |
1007 | %endif | |
1008 | %endif | |
1009 | ||
1010 | %if mmsize == 8 | |
1011 | .next8px: | |
1012 | %endif | |
1013 | ; read | |
1014 | lea dst2q, [dst1q+ strideq ] | |
1015 | %ifidn %1, v | |
1016 | %if %2 == 8 && mmsize == 16 | |
1017 | %define movrow movh | |
1018 | %else | |
1019 | %define movrow mova | |
1020 | %endif | |
1021 | movrow m0, [dst1q+mstrideq*4] ; p3 | |
1022 | movrow m1, [dst2q+mstrideq*4] ; p2 | |
1023 | movrow m2, [dst1q+mstrideq*2] ; p1 | |
1024 | movrow m5, [dst2q] ; q1 | |
1025 | movrow m6, [dst2q+ strideq ] ; q2 | |
1026 | movrow m7, [dst2q+ strideq*2] ; q3 | |
1027 | %if mmsize == 16 && %2 == 8 | |
1028 | movhps m0, [dst8q+mstrideq*4] | |
1029 | movhps m2, [dst8q+mstrideq*2] | |
1030 | add dst8q, strideq | |
1031 | movhps m1, [dst8q+mstrideq*4] | |
1032 | movhps m5, [dst8q] | |
1033 | movhps m6, [dst8q+ strideq ] | |
1034 | movhps m7, [dst8q+ strideq*2] | |
1035 | add dst8q, mstrideq | |
1036 | %endif | |
1037 | %elif mmsize == 8 ; mmx/mmxext (h) | |
1038 | ; read 8 rows of 8px each | |
1039 | movu m0, [dst1q+mstrideq*4] | |
1040 | movu m1, [dst2q+mstrideq*4] | |
1041 | movu m2, [dst1q+mstrideq*2] | |
1042 | movu m3, [dst1q+mstrideq ] | |
1043 | movu m4, [dst1q] | |
1044 | movu m5, [dst2q] | |
1045 | movu m6, [dst2q+ strideq ] | |
1046 | ||
1047 | ; 8x8 transpose | |
1048 | TRANSPOSE4x4B 0, 1, 2, 3, 7 | |
1049 | mova m_q0backup, m1 | |
1050 | movu m7, [dst2q+ strideq*2] | |
1051 | TRANSPOSE4x4B 4, 5, 6, 7, 1 | |
1052 | SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | |
1053 | SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | |
1054 | SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | |
1055 | mova m1, m_q0backup | |
1056 | mova m_q0backup, m2 ; store q0 | |
1057 | SBUTTERFLY dq, 1, 5, 2 ; p1/p0 | |
1058 | mova m_p0backup, m5 ; store p0 | |
1059 | SWAP 1, 4 | |
1060 | SWAP 2, 4 | |
1061 | SWAP 6, 3 | |
1062 | SWAP 5, 3 | |
1063 | %else ; sse2 (h) | |
1064 | %if %2 == 16 | |
1065 | lea dst8q, [dst1q+ strideq*8 ] | |
1066 | %endif | |
1067 | ||
1068 | ; read 16 rows of 8px each, interleave | |
1069 | movh m0, [dst1q+mstrideq*4] | |
1070 | movh m1, [dst8q+mstrideq*4] | |
1071 | movh m2, [dst1q+mstrideq*2] | |
1072 | movh m5, [dst8q+mstrideq*2] | |
1073 | movh m3, [dst1q+mstrideq ] | |
1074 | movh m6, [dst8q+mstrideq ] | |
1075 | movh m4, [dst1q] | |
1076 | movh m7, [dst8q] | |
1077 | punpcklbw m0, m1 ; A/I | |
1078 | punpcklbw m2, m5 ; C/K | |
1079 | punpcklbw m3, m6 ; D/L | |
1080 | punpcklbw m4, m7 ; E/M | |
1081 | ||
1082 | add dst8q, strideq | |
1083 | movh m1, [dst2q+mstrideq*4] | |
1084 | movh m6, [dst8q+mstrideq*4] | |
1085 | movh m5, [dst2q] | |
1086 | movh m7, [dst8q] | |
1087 | punpcklbw m1, m6 ; B/J | |
1088 | punpcklbw m5, m7 ; F/N | |
1089 | movh m6, [dst2q+ strideq ] | |
1090 | movh m7, [dst8q+ strideq ] | |
1091 | punpcklbw m6, m7 ; G/O | |
1092 | ||
1093 | ; 8x16 transpose | |
1094 | TRANSPOSE4x4B 0, 1, 2, 3, 7 | |
1095 | %ifdef m8 | |
1096 | SWAP 1, 8 | |
1097 | %else | |
1098 | mova m_q0backup, m1 | |
1099 | %endif | |
1100 | movh m7, [dst2q+ strideq*2] | |
1101 | movh m1, [dst8q+ strideq*2] | |
1102 | punpcklbw m7, m1 ; H/P | |
1103 | TRANSPOSE4x4B 4, 5, 6, 7, 1 | |
1104 | SBUTTERFLY dq, 0, 4, 1 ; p3/p2 | |
1105 | SBUTTERFLY dq, 2, 6, 1 ; q0/q1 | |
1106 | SBUTTERFLY dq, 3, 7, 1 ; q2/q3 | |
1107 | %ifdef m8 | |
1108 | SWAP 1, 8 | |
1109 | SWAP 2, 8 | |
1110 | %else | |
1111 | mova m1, m_q0backup | |
1112 | mova m_q0backup, m2 ; store q0 | |
1113 | %endif | |
1114 | SBUTTERFLY dq, 1, 5, 2 ; p1/p0 | |
1115 | %ifdef m12 | |
1116 | SWAP 5, 12 | |
1117 | %else | |
1118 | mova m_p0backup, m5 ; store p0 | |
1119 | %endif | |
1120 | SWAP 1, 4 | |
1121 | SWAP 2, 4 | |
1122 | SWAP 6, 3 | |
1123 | SWAP 5, 3 | |
1124 | %endif | |
1125 | ||
1126 | ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 | |
1127 | mova m4, m1 | |
1128 | SWAP 4, 1 | |
1129 | psubusb m4, m0 ; p2-p3 | |
1130 | psubusb m0, m1 ; p3-p2 | |
1131 | por m0, m4 ; abs(p3-p2) | |
1132 | ||
1133 | mova m4, m2 | |
1134 | SWAP 4, 2 | |
1135 | psubusb m4, m1 ; p1-p2 | |
1136 | mova m_p2backup, m1 | |
1137 | psubusb m1, m2 ; p2-p1 | |
1138 | por m1, m4 ; abs(p2-p1) | |
1139 | ||
1140 | mova m4, m6 | |
1141 | SWAP 4, 6 | |
1142 | psubusb m4, m7 ; q2-q3 | |
1143 | psubusb m7, m6 ; q3-q2 | |
1144 | por m7, m4 ; abs(q3-q2) | |
1145 | ||
1146 | mova m4, m5 | |
1147 | SWAP 4, 5 | |
1148 | psubusb m4, m6 ; q1-q2 | |
1149 | mova m_q2backup, m6 | |
1150 | psubusb m6, m5 ; q2-q1 | |
1151 | por m6, m4 ; abs(q2-q1) | |
1152 | ||
1153 | %if notcpuflag(mmxext) | |
1154 | mova m4, m_flimI | |
1155 | pxor m3, m3 | |
1156 | psubusb m0, m4 | |
1157 | psubusb m1, m4 | |
1158 | psubusb m7, m4 | |
1159 | psubusb m6, m4 | |
1160 | pcmpeqb m0, m3 ; abs(p3-p2) <= I | |
1161 | pcmpeqb m1, m3 ; abs(p2-p1) <= I | |
1162 | pcmpeqb m7, m3 ; abs(q3-q2) <= I | |
1163 | pcmpeqb m6, m3 ; abs(q2-q1) <= I | |
1164 | pand m0, m1 | |
1165 | pand m7, m6 | |
1166 | pand m0, m7 | |
1167 | %else ; mmxext/sse2 | |
1168 | pmaxub m0, m1 | |
1169 | pmaxub m6, m7 | |
1170 | pmaxub m0, m6 | |
1171 | %endif | |
1172 | ||
1173 | ; normal_limit and high_edge_variance for p1-p0, q1-q0 | |
1174 | SWAP 7, 3 ; now m7 is zero | |
1175 | %ifidn %1, v | |
1176 | movrow m3, [dst1q+mstrideq ] ; p0 | |
1177 | %if mmsize == 16 && %2 == 8 | |
1178 | movhps m3, [dst8q+mstrideq ] | |
1179 | %endif | |
1180 | %elifdef m12 | |
1181 | SWAP 3, 12 | |
1182 | %else | |
1183 | mova m3, m_p0backup | |
1184 | %endif | |
1185 | ||
1186 | mova m1, m2 | |
1187 | SWAP 1, 2 | |
1188 | mova m6, m3 | |
1189 | SWAP 3, 6 | |
1190 | psubusb m1, m3 ; p1-p0 | |
1191 | psubusb m6, m2 ; p0-p1 | |
1192 | por m1, m6 ; abs(p1-p0) | |
1193 | %if notcpuflag(mmxext) | |
1194 | mova m6, m1 | |
1195 | psubusb m1, m4 | |
1196 | psubusb m6, m_hevthr | |
1197 | pcmpeqb m1, m7 ; abs(p1-p0) <= I | |
1198 | pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh | |
1199 | pand m0, m1 | |
1200 | mova m_maskres, m6 | |
1201 | %else ; mmxext/sse2 | |
1202 | pmaxub m0, m1 ; max_I | |
1203 | SWAP 1, 4 ; max_hev_thresh | |
1204 | %endif | |
1205 | ||
1206 | SWAP 6, 4 ; now m6 is I | |
1207 | %ifidn %1, v | |
1208 | movrow m4, [dst1q] ; q0 | |
1209 | %if mmsize == 16 && %2 == 8 | |
1210 | movhps m4, [dst8q] | |
1211 | %endif | |
1212 | %elifdef m8 | |
1213 | SWAP 4, 8 | |
1214 | %else | |
1215 | mova m4, m_q0backup | |
1216 | %endif | |
1217 | mova m1, m4 | |
1218 | SWAP 1, 4 | |
1219 | mova m7, m5 | |
1220 | SWAP 7, 5 | |
1221 | psubusb m1, m5 ; q0-q1 | |
1222 | psubusb m7, m4 ; q1-q0 | |
1223 | por m1, m7 ; abs(q1-q0) | |
1224 | %if notcpuflag(mmxext) | |
1225 | mova m7, m1 | |
1226 | psubusb m1, m6 | |
1227 | psubusb m7, m_hevthr | |
1228 | pxor m6, m6 | |
1229 | pcmpeqb m1, m6 ; abs(q1-q0) <= I | |
1230 | pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh | |
1231 | mova m6, m_maskres | |
1232 | pand m0, m1 ; abs([pq][321]-[pq][210]) <= I | |
1233 | pand m6, m7 | |
1234 | %else ; mmxext/sse2 | |
1235 | pxor m7, m7 | |
1236 | pmaxub m0, m1 | |
1237 | pmaxub m6, m1 | |
1238 | psubusb m0, m_flimI | |
1239 | psubusb m6, m_hevthr | |
1240 | pcmpeqb m0, m7 ; max(abs(..)) <= I | |
1241 | pcmpeqb m6, m7 ; !(max(abs..) > thresh) | |
1242 | %endif | |
1243 | %ifdef m12 | |
1244 | SWAP 6, 12 | |
1245 | %else | |
1246 | mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) | |
1247 | %endif | |
1248 | ||
1249 | ; simple_limit | |
1250 | mova m1, m3 | |
1251 | SWAP 1, 3 | |
1252 | mova m6, m4 ; keep copies of p0/q0 around for later use | |
1253 | SWAP 6, 4 | |
1254 | psubusb m1, m4 ; p0-q0 | |
1255 | psubusb m6, m3 ; q0-p0 | |
1256 | por m1, m6 ; abs(q0-p0) | |
1257 | paddusb m1, m1 ; m1=2*abs(q0-p0) | |
1258 | ||
1259 | mova m7, m2 | |
1260 | SWAP 7, 2 | |
1261 | mova m6, m5 | |
1262 | SWAP 6, 5 | |
1263 | psubusb m7, m5 ; p1-q1 | |
1264 | psubusb m6, m2 ; q1-p1 | |
1265 | por m7, m6 ; abs(q1-p1) | |
1266 | pxor m6, m6 | |
1267 | pand m7, [pb_FE] | |
1268 | psrlq m7, 1 ; abs(q1-p1)/2 | |
1269 | paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 | |
1270 | psubusb m7, m_flimE | |
1271 | pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E | |
1272 | pand m0, m7 ; normal_limit result | |
1273 | ||
1274 | ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask | |
1275 | %ifdef m8 ; x86-64 && sse2 | |
1276 | mova m8, [pb_80] | |
1277 | %define m_pb_80 m8 | |
1278 | %else ; x86-32 or mmx/mmxext | |
1279 | %define m_pb_80 [pb_80] | |
1280 | %endif | |
1281 | mova m1, m4 | |
1282 | mova m7, m3 | |
1283 | pxor m1, m_pb_80 | |
1284 | pxor m7, m_pb_80 | |
1285 | psubsb m1, m7 ; (signed) q0-p0 | |
1286 | mova m6, m2 | |
1287 | mova m7, m5 | |
1288 | pxor m6, m_pb_80 | |
1289 | pxor m7, m_pb_80 | |
1290 | psubsb m6, m7 ; (signed) p1-q1 | |
1291 | mova m7, m_maskres | |
1292 | paddsb m6, m1 | |
1293 | paddsb m6, m1 | |
1294 | paddsb m6, m1 | |
1295 | pand m6, m0 | |
1296 | %ifdef m8 | |
    mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand m_limres, m7
%else
    mova m0, m6
    pand m0, m7
    mova m_limres, m0
%endif
    pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova m1, [pb_F8]
    mova m6, m7
    paddsb m7, [pb_3]
    paddsb m6, [pb_4]
    pand m7, m1
    pand m6, m1

    pxor m1, m1
    pxor m0, m0
    pcmpgtb m1, m7
    psubb m0, m7
    psrlq m7, 3    ; +f2
    psrlq m0, 3    ; -f2
    pand m0, m1
    pandn m1, m7
    psubusb m3, m0
    paddusb m3, m1 ; p0+f2

    pxor m1, m1
    pxor m0, m0
    pcmpgtb m0, m6
    psubb m1, m6
    psrlq m6, 3    ; +f1
    psrlq m1, 3    ; -f1
    pand m1, m0
    pandn m0, m6
    psubusb m4, m0
    paddusb m4, m1 ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
%if cpuflag(ssse3)
    mova m7, [pb_1]
%else
    mova m7, [pw_63]
%endif
%ifdef m8
    SWAP 1, 8
%else
    mova m1, m_limres
%endif
    pxor m0, m0
    mova m6, m1
    pcmpgtb m0, m1 ; which are negative
%if cpuflag(ssse3)
    punpcklbw m6, m7 ; interleave with "1" for rounding
    punpckhbw m1, m7
%else
    punpcklbw m6, m0 ; signed byte->word
    punpckhbw m1, m0
%endif
    mova m_limsign, m0
%if cpuflag(ssse3)
    mova m7, [pb_27_63]
%ifndef m8
    mova m_limres, m1
%endif
%ifdef m10
    SWAP 0, 10 ; don't lose lim_sign copy
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%ifdef m10
    SWAP 0, 10
%else
    mova m0, m_limsign
%endif
%else
    mova m_maskres, m6 ; backup for later in filter
    mova m_limres, m1
    pmullw m6, [pw_27]
    pmullw m1, [pw_27]
    paddw m6, m7
    paddw m1, m7
%endif
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a0
    pxor m1, m1
    psubb m1, m6
    pand m1, m0     ; -a0
    pandn m0, m6    ; +a0
%if cpuflag(ssse3)
    mova m6, [pb_18_63] ; pipelining
%endif
    psubusb m3, m1
    paddusb m4, m1
    paddusb m3, m0  ; p0+a0
    psubusb m4, m0  ; q0-a0

%if cpuflag(ssse3)
    SWAP 6, 7
%ifdef m10
    SWAP 1, 10
%else
    mova m1, m_limres
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%ifdef m10
    SWAP 0, 10
%endif
    mova m0, m_limsign
%else
    mova m6, m_maskres
    mova m1, m_limres
    pmullw m6, [pw_18]
    pmullw m1, [pw_18]
    paddw m6, m7
    paddw m1, m7
%endif
    mova m0, m_limsign
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a1
    pxor m1, m1
    psubb m1, m6
    pand m1, m0     ; -a1
    pandn m0, m6    ; +a1
%if cpuflag(ssse3)
    mova m6, [pb_9_63]
%endif
    psubusb m2, m1
    paddusb m5, m1
    paddusb m2, m0  ; p1+a1
    psubusb m5, m0  ; q1-a1

%if cpuflag(ssse3)
    SWAP 6, 7
%ifdef m10
    SWAP 1, 10
%else
    mova m1, m_limres
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%else
%ifdef m8
    SWAP 6, 12
    SWAP 1, 8
%else
    mova m6, m_maskres
    mova m1, m_limres
%endif
    pmullw m6, [pw_9]
    pmullw m1, [pw_9]
    paddw m6, m7
    paddw m1, m7
%endif
%ifdef m9
    SWAP 7, 9
%else
    mova m7, m_limsign
%endif
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a2
    pxor m0, m0
    psubb m0, m6
    pand m0, m7     ; -a2
    pandn m7, m6    ; +a2
%ifdef m8
    SWAP 1, 13
    SWAP 6, 14
%else
    mova m1, m_p2backup
    mova m6, m_q2backup
%endif
    psubusb m1, m0
    paddusb m6, m0
    paddusb m1, m7  ; p2+a2
    psubusb m6, m7  ; q2-a2
1486 | ||
1487 | ; store | |
1488 | %ifidn %1, v | |
1489 | movrow [dst2q+mstrideq*4], m1 | |
1490 | movrow [dst1q+mstrideq*2], m2 | |
1491 | movrow [dst1q+mstrideq ], m3 | |
1492 | movrow [dst1q], m4 | |
1493 | movrow [dst2q], m5 | |
1494 | movrow [dst2q+ strideq ], m6 | |
1495 | %if mmsize == 16 && %2 == 8 | |
1496 | add dst8q, mstrideq | |
1497 | movhps [dst8q+mstrideq*2], m1 | |
1498 | movhps [dst8q+mstrideq ], m2 | |
1499 | movhps [dst8q], m3 | |
1500 | add dst8q, strideq | |
1501 | movhps [dst8q], m4 | |
1502 | movhps [dst8q+ strideq ], m5 | |
1503 | movhps [dst8q+ strideq*2], m6 | |
1504 | %endif | |
1505 | %else ; h | |
1506 | inc dst1q | |
1507 | inc dst2q | |
1508 | ||
1509 | ; 4x8/16 transpose | |
1510 | TRANSPOSE4x4B 1, 2, 3, 4, 0 | |
1511 | SBUTTERFLY bw, 5, 6, 0 | |
1512 | ||
1513 | %if mmsize == 8 ; mmx/mmxext (h) | |
1514 | WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq | |
1515 | add dst1q, 4 | |
1516 | WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq | |
1517 | %else ; sse2 (h) | |
1518 | lea dst8q, [dst8q+mstrideq+1] | |
1519 | WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 | |
1520 | lea dst1q, [dst2q+mstrideq+4] | |
1521 | lea dst8q, [dst8q+mstrideq+4] | |
1522 | %if cpuflag(sse4) | |
1523 | add dst2q, 4 | |
1524 | %endif | |
1525 | WRITE_8W m5, dst2q, dst1q, mstrideq, strideq | |
1526 | %if cpuflag(sse4) | |
1527 | lea dst2q, [dst8q+ strideq ] | |
1528 | %endif | |
1529 | WRITE_8W m6, dst2q, dst8q, mstrideq, strideq | |
1530 | %endif | |
1531 | %endif | |
1532 | ||
1533 | %if mmsize == 8 | |
1534 | %if %2 == 8 ; chroma | |
1535 | %ifidn %1, h | |
1536 | sub dst1q, 5 | |
1537 | %endif | |
1538 | cmp dst1q, dst8q | |
1539 | mov dst1q, dst8q | |
1540 | jnz .next8px | |
1541 | %else | |
1542 | %ifidn %1, h | |
1543 | lea dst1q, [dst1q+ strideq*8-5] | |
1544 | %else ; v | |
1545 | add dst1q, 8 | |
1546 | %endif | |
1547 | dec cntrq | |
1548 | jg .next8px | |
1549 | %endif | |
1550 | REP_RET | |
1551 | %else ; mmsize == 16 | |
1552 | RET | |
1553 | %endif | |
1554 | %endmacro | |
1555 | ||
1556 | %if ARCH_X86_32 | |
1557 | INIT_MMX mmx | |
1558 | MBEDGE_LOOPFILTER v, 16 | |
1559 | MBEDGE_LOOPFILTER h, 16 | |
1560 | MBEDGE_LOOPFILTER v, 8 | |
1561 | MBEDGE_LOOPFILTER h, 8 | |
1562 | ||
1563 | INIT_MMX mmxext | |
1564 | MBEDGE_LOOPFILTER v, 16 | |
1565 | MBEDGE_LOOPFILTER h, 16 | |
1566 | MBEDGE_LOOPFILTER v, 8 | |
1567 | MBEDGE_LOOPFILTER h, 8 | |
1568 | %endif | |
1569 | ||
1570 | INIT_XMM sse2 | |
1571 | MBEDGE_LOOPFILTER v, 16 | |
1572 | MBEDGE_LOOPFILTER h, 16 | |
1573 | MBEDGE_LOOPFILTER v, 8 | |
1574 | MBEDGE_LOOPFILTER h, 8 | |
1575 | ||
1576 | INIT_XMM ssse3 | |
1577 | MBEDGE_LOOPFILTER v, 16 | |
1578 | MBEDGE_LOOPFILTER h, 16 | |
1579 | MBEDGE_LOOPFILTER v, 8 | |
1580 | MBEDGE_LOOPFILTER h, 8 | |
1581 | ||
1582 | INIT_XMM sse4 | |
1583 | MBEDGE_LOOPFILTER h, 16 | |
1584 | MBEDGE_LOOPFILTER h, 8 |