1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5 ;*
6 ;* This file is part of FFmpeg.
7 ;*
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
22
23 %include "libavutil/x86/x86util.asm"
24
25 SECTION_RODATA
26
27 pw_27: times 8 dw 27
28 pw_63: times 8 dw 63
29
30 pb_4: times 16 db 4
31 pb_F8: times 16 db 0xF8
32 pb_FE: times 16 db 0xFE
33 pb_27_63: times 8 db 27, 63
34 pb_18_63: times 8 db 18, 63
35 pb_9_63: times 8 db 9, 63
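; note: pb_27_63, pb_18_63 and pb_9_63 pair each mbedge filter weight (27/18/9)
; with the rounding constant 63; in the ssse3 mbedge path below, the clipped
; filter value w is interleaved with pb_1 and fed to pmaddubsw against these,
; giving the word terms 27*w+63, 18*w+63 and 9*w+63, which are then shifted
; right by 7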
36
37 cextern pb_1
38 cextern pb_3
39 cextern pw_9
40 cextern pw_18
41 cextern pb_80
42
43 SECTION .text
44
45 ;-----------------------------------------------------------------------------
46 ; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
47 ;-----------------------------------------------------------------------------
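; roughly, the simple filter implemented below computes, per pixel column:
;   mask = (|p0-q0|*2 + |p1-q1|/2) <= flim
;   a    = clamp((p1-q1) + 3*(q0-p0))              (signed saturation)
;   f1   = clamp(a+4) >> 3,   f2 = clamp(a+3) >> 3
;   q0  -= f1,  p0 += f2                           (only where mask is set)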
48
49 ; macro called with 7 mm register indexes as arguments, and 4 regular registers
50 ;
51 ; the first 4 mm registers will carry the transposed pixel data
52 ; the other three are scratch space (one would be sufficient, but this allows
53 ; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
54 ;
55 ; first two regular registers are buf+4*stride and buf+5*stride
56 ; third is -stride, fourth is +stride
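;
; e.g. after this macro, m%1..m%4 hold the 8 rows pairwise byte-interleaved
; (A/B, C/D, E/F, G/H); the caller's TRANSPOSE4x4W then turns these into the
; four columns p1/p0/q0/q1 used by the horizontal filter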
57 %macro READ_8x4_INTERLEAVED 11
58 ; interleave 8 (A-H) rows of 4 pixels each
59 movd m%1, [%8+%10*4] ; A0-3
60 movd m%5, [%9+%10*4] ; B0-3
61 movd m%2, [%8+%10*2] ; C0-3
62 movd m%6, [%8+%10] ; D0-3
63 movd m%3, [%8] ; E0-3
64 movd m%7, [%9] ; F0-3
65 movd m%4, [%9+%11] ; G0-3
66 punpcklbw m%1, m%5 ; A/B interleaved
67 movd m%5, [%9+%11*2] ; H0-3
68 punpcklbw m%2, m%6 ; C/D interleaved
69 punpcklbw m%3, m%7 ; E/F interleaved
70 punpcklbw m%4, m%5 ; G/H interleaved
71 %endmacro
72
73 ; macro called with 7 mm register indexes as arguments, and 5 regular registers
74 ; the first 11 mean the same as in READ_8x4_INTERLEAVED above
75 ; the fifth regular register is scratch space to reach the bottom 8 rows; it
76 ; will be set to second regular register + 8*stride at the end
77 %macro READ_16x4_INTERLEAVED 12
78 ; transpose 16 (A-P) rows of 4 pixels each
79 lea %12, [r0+8*r2]
80
81 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
82 movd m%1, [%8+%10*4] ; A0-3
83 movd m%3, [%12+%10*4] ; I0-3
84 movd m%2, [%8+%10*2] ; C0-3
85 movd m%4, [%12+%10*2] ; K0-3
86 movd m%6, [%8+%10] ; D0-3
87 movd m%5, [%12+%10] ; L0-3
88 movd m%7, [%12] ; M0-3
89 add %12, %11
90 punpcklbw m%1, m%3 ; A/I
91 movd m%3, [%8] ; E0-3
92 punpcklbw m%2, m%4 ; C/K
93 punpcklbw m%6, m%5 ; D/L
94 punpcklbw m%3, m%7 ; E/M
95 punpcklbw m%2, m%6 ; C/D/K/L interleaved
96
97 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
98 movd m%5, [%9+%10*4] ; B0-3
99 movd m%4, [%12+%10*4] ; J0-3
100 movd m%7, [%9] ; F0-3
101 movd m%6, [%12] ; N0-3
102 punpcklbw m%5, m%4 ; B/J
103 punpcklbw m%7, m%6 ; F/N
104 punpcklbw m%1, m%5 ; A/B/I/J interleaved
105 punpcklbw m%3, m%7 ; E/F/M/N interleaved
106 movd m%4, [%9+%11] ; G0-3
107 movd m%6, [%12+%11] ; O0-3
108 movd m%5, [%9+%11*2] ; H0-3
109 movd m%7, [%12+%11*2] ; P0-3
110 punpcklbw m%4, m%6 ; G/O
111 punpcklbw m%5, m%7 ; H/P
112 punpcklbw m%4, m%5 ; G/H/O/P interleaved
113 %endmacro
114
115 ; write 4 mm registers of 2 dwords each
116 ; first four arguments are mm register indexes containing source data
117 ; last four are registers containing buf+4*stride, buf+5*stride,
118 ; -stride and +stride
119 %macro WRITE_4x2D 8
120 ; write out (2 dwords per register)
121 movd [%5+%7*4], m%1
122 movd [%5+%7*2], m%2
123 movd [%5], m%3
124 movd [%6+%8], m%4
125 punpckhdq m%1, m%1
126 punpckhdq m%2, m%2
127 punpckhdq m%3, m%3
128 punpckhdq m%4, m%4
129 movd [%6+%7*4], m%1
130 movd [%5+%7], m%2
131 movd [%6], m%3
132 movd [%6+%8*2], m%4
133 %endmacro
134
135 ; write 4 xmm registers of 4 dwords each
136 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
137 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
138 ; we add 1*stride to the third regular register in the process
139 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
140 ; same memory region), or 8 if they cover two separate buffers (the third one
141 ; points to a different memory region than the first two), allowing for more
142 ; optimal code for the 16-width case
143 %macro WRITE_4x4D 10
144 ; write out (4 dwords per register), start with dwords zero
145 movd [%5+%8*4], m%1
146 movd [%5], m%2
147 movd [%7+%8*4], m%3
148 movd [%7], m%4
149
150 ; store dwords 1
151 psrldq m%1, 4
152 psrldq m%2, 4
153 psrldq m%3, 4
154 psrldq m%4, 4
155 movd [%6+%8*4], m%1
156 movd [%6], m%2
157 %if %10 == 16
158 movd [%6+%9*4], m%3
159 %endif
160 movd [%7+%9], m%4
161
162 ; write dwords 2
163 psrldq m%1, 4
164 psrldq m%2, 4
165 %if %10 == 8
166 movd [%5+%8*2], m%1
167 movd %5d, m%3
168 %endif
169 psrldq m%3, 4
170 psrldq m%4, 4
171 %if %10 == 16
172 movd [%5+%8*2], m%1
173 %endif
174 movd [%6+%9], m%2
175 movd [%7+%8*2], m%3
176 movd [%7+%9*2], m%4
177 add %7, %9
178
179 ; store dwords 3
180 psrldq m%1, 4
181 psrldq m%2, 4
182 psrldq m%3, 4
183 psrldq m%4, 4
184 %if %10 == 8
185 mov [%7+%8*4], %5d
186 movd [%6+%8*2], m%1
187 %else
188 movd [%5+%8], m%1
189 %endif
190 movd [%6+%9*2], m%2
191 movd [%7+%8*2], m%3
192 movd [%7+%9*2], m%4
193 %endmacro
194
195 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
196 ; 1 and 2 are the registers to write; these can be the same (for SSE2)
197 ; for pre-SSE4:
198 ; 3 is a general-purpose register that we will clobber
199 ; for SSE4:
200 ; 3 is a pointer to the destination's 5th line
201 ; 4 is a pointer to the destination's 4th line
202 ; 5/6 are -stride and +stride
203 %macro WRITE_2x4W 6
204 movd %3d, %1
205 punpckhdq %1, %1
206 mov [%4+%5*4], %3w
207 shr %3, 16
208 add %4, %6
209 mov [%4+%5*4], %3w
210
211 movd %3d, %1
212 add %4, %5
213 mov [%4+%5*2], %3w
214 shr %3, 16
215 mov [%4+%5 ], %3w
216
217 movd %3d, %2
218 punpckhdq %2, %2
219 mov [%4 ], %3w
220 shr %3, 16
221 mov [%4+%6 ], %3w
222
223 movd %3d, %2
224 add %4, %6
225 mov [%4+%6 ], %3w
226 shr %3, 16
227 mov [%4+%6*2], %3w
228 add %4, %5
229 %endmacro
230
231 %macro WRITE_8W 5
232 %if cpuflag(sse4)
233 pextrw [%3+%4*4], %1, 0
234 pextrw [%2+%4*4], %1, 1
235 pextrw [%3+%4*2], %1, 2
236 pextrw [%3+%4 ], %1, 3
237 pextrw [%3 ], %1, 4
238 pextrw [%2 ], %1, 5
239 pextrw [%2+%5 ], %1, 6
240 pextrw [%2+%5*2], %1, 7
241 %else
242 movd %2d, %1
243 psrldq %1, 4
244 mov [%3+%4*4], %2w
245 shr %2, 16
246 add %3, %5
247 mov [%3+%4*4], %2w
248
249 movd %2d, %1
250 psrldq %1, 4
251 add %3, %4
252 mov [%3+%4*2], %2w
253 shr %2, 16
254 mov [%3+%4 ], %2w
255
256 movd %2d, %1
257 psrldq %1, 4
258 mov [%3 ], %2w
259 shr %2, 16
260 mov [%3+%5 ], %2w
261
262 movd %2d, %1
263 add %3, %5
264 mov [%3+%5 ], %2w
265 shr %2, 16
266 mov [%3+%5*2], %2w
267 %endif
268 %endmacro
269
270 %macro SIMPLE_LOOPFILTER 2
271 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
272 %if mmsize == 8 ; mmx/mmxext
273 mov cntrq, 2
274 %endif
275 %if cpuflag(ssse3)
276 pxor m0, m0
277 %endif
278 SPLATB_REG m7, flim, m0 ; splat "flim" into register
279
280 ; set up indexes to address 4 rows
281 %if mmsize == 8
282 DEFINE_ARGS dst1, mstride, stride, cntr, dst2
283 %else
284 DEFINE_ARGS dst1, mstride, stride, dst3, dst2
285 %endif
286 mov strideq, mstrideq
287 neg mstrideq
288 %ifidn %1, h
289 lea dst1q, [dst1q+4*strideq-2]
290 %endif
291
292 %if mmsize == 8 ; mmx / mmxext
293 .next8px:
294 %endif
295 %ifidn %1, v
296 ; read 4 half/full rows of pixels
297 mova m0, [dst1q+mstrideq*2] ; p1
298 mova m1, [dst1q+mstrideq] ; p0
299 mova m2, [dst1q] ; q0
300 mova m3, [dst1q+ strideq] ; q1
301 %else ; h
302 lea dst2q, [dst1q+ strideq]
303
304 %if mmsize == 8 ; mmx/mmxext
305 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
306 %else ; sse2
307 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
308 %endif
309 TRANSPOSE4x4W 0, 1, 2, 3, 4
310 %endif
311
312 ; simple_limit
313 mova m5, m2 ; m5=backup of q0
314 mova m6, m1 ; m6=backup of p0
315 psubusb m1, m2 ; p0-q0
316 psubusb m2, m6 ; q0-p0
317 por m1, m2 ; FFABS(p0-q0)
318 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
319
320 mova m4, m3
321 mova m2, m0
322 psubusb m3, m0 ; q1-p1
323 psubusb m0, m4 ; p1-q1
324 por m3, m0 ; FFABS(p1-q1)
325 mova m0, [pb_80]
326 pxor m2, m0
327 pxor m4, m0
328 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
329 pand m3, [pb_FE]
330 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
331 paddusb m3, m1
332 psubusb m3, m7
333 pxor m1, m1
334 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
335
336     ; filter_common (using m2 = p1-q1, m4 = q0, m6 = p0, m5 = q0-p0 and m3 = mask)
337 mova m4, m5
338 pxor m5, m0
339 pxor m0, m6
340 psubsb m5, m0 ; q0-p0 (signed)
341 paddsb m2, m5
342 paddsb m2, m5
343 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
344 pand m2, m3 ; apply filter mask (m3)
345
346 mova m3, [pb_F8]
347 mova m1, m2
348 paddsb m2, [pb_4] ; f1<<3=a+4
349 paddsb m1, [pb_3] ; f2<<3=a+3
350 pand m2, m3
351 pand m1, m3 ; cache f2<<3
352
353 pxor m0, m0
354 pxor m3, m3
355 pcmpgtb m0, m2 ; which values are <0?
356 psubb m3, m2 ; -f1<<3
357 psrlq m2, 3 ; +f1
358 psrlq m3, 3 ; -f1
359 pand m3, m0
360 pandn m0, m2
361 psubusb m4, m0
362 paddusb m4, m3 ; q0-f1
363
364 pxor m0, m0
365 pxor m3, m3
366 pcmpgtb m0, m1 ; which values are <0?
367 psubb m3, m1 ; -f2<<3
368 psrlq m1, 3 ; +f2
369 psrlq m3, 3 ; -f2
370 pand m3, m0
371 pandn m0, m1
372 paddusb m6, m0
373 psubusb m6, m3 ; p0+f2
374
375 ; store
376 %ifidn %1, v
377 mova [dst1q], m4
378 mova [dst1q+mstrideq], m6
379 %else ; h
380 inc dst1q
381 SBUTTERFLY bw, 6, 4, 0
382
383 %if mmsize == 16 ; sse2
384 %if cpuflag(sse4)
385 inc dst2q
386 %endif
387 WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
388 lea dst2q, [dst3q+mstrideq+1]
389 %if cpuflag(sse4)
390 inc dst3q
391 %endif
392 WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
393 %else ; mmx/mmxext
394 WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
395 %endif
396 %endif
397
398 %if mmsize == 8 ; mmx/mmxext
399 ; next 8 pixels
400 %ifidn %1, v
401 add dst1q, 8 ; advance 8 cols = pixels
402 %else ; h
403 lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
404 %endif
405 dec cntrq
406 jg .next8px
407 REP_RET
408 %else ; sse2
409 RET
410 %endif
411 %endmacro
412
413 %if ARCH_X86_32
414 INIT_MMX mmx
415 SIMPLE_LOOPFILTER v, 4
416 SIMPLE_LOOPFILTER h, 5
417 INIT_MMX mmxext
418 SIMPLE_LOOPFILTER v, 4
419 SIMPLE_LOOPFILTER h, 5
420 %endif
421
422 INIT_XMM sse2
423 SIMPLE_LOOPFILTER v, 3
424 SIMPLE_LOOPFILTER h, 5
425 INIT_XMM ssse3
426 SIMPLE_LOOPFILTER v, 3
427 SIMPLE_LOOPFILTER h, 5
428 INIT_XMM sse4
429 SIMPLE_LOOPFILTER h, 5
430
431 ;-----------------------------------------------------------------------------
432 ; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
433 ; int flimE, int flimI, int hev_thr);
434 ;-----------------------------------------------------------------------------
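; in rough terms, the inner filter implemented below computes, per pixel column:
;   normal_limit: |p3-p2|, |p2-p1|, |q3-q2|, |q2-q1|, |p1-p0|, |q1-q0| all <= I
;                 and |p0-q0|*2 + |p1-q1|/2 <= E
;   hev         : |p1-p0| > hev_thr  or  |q1-q0| > hev_thr
;   w  = clamp(3*(q0-p0) + (hev ? p1-q1 : 0))      (signed saturation)
;   f1 = clamp(w+4) >> 3,   f2 = clamp(w+3) >> 3
;   q0 -= f1, p0 += f2; where !hev, also p1 += a and q1 -= a with a = (f1+1) >> 1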
435
436 %macro INNER_LOOPFILTER 2
437 %define stack_size 0
438 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
439 %ifidn %1, v ; [3]=hev() result
440 %define stack_size mmsize * -4
441 %else ; h ; extra storage space for transposes
442 %define stack_size mmsize * -5
443 %endif
444 %endif
445
446 %if %2 == 8 ; chroma
447 cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
448 %else ; luma
449 cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
450 %endif
451
452 %if cpuflag(ssse3)
453 pxor m7, m7
454 %endif
455
456 %ifndef m8
457 ; splat function arguments
458 SPLATB_REG m0, flimEq, m7 ; E
459 SPLATB_REG m1, flimIq, m7 ; I
460 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
461
462 %define m_flimE [rsp]
463 %define m_flimI [rsp+mmsize]
464 %define m_hevthr [rsp+mmsize*2]
465 %define m_maskres [rsp+mmsize*3]
466 %define m_p0backup [rsp+mmsize*3]
467 %define m_q0backup [rsp+mmsize*4]
468
469 mova m_flimE, m0
470 mova m_flimI, m1
471 mova m_hevthr, m2
472 %else
473 %define m_flimE m9
474 %define m_flimI m10
475 %define m_hevthr m11
476 %define m_maskres m12
477 %define m_p0backup m12
478 %define m_q0backup m8
479
480 ; splat function arguments
481 SPLATB_REG m_flimE, flimEq, m7 ; E
482 SPLATB_REG m_flimI, flimIq, m7 ; I
483 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
484 %endif
485
486 %if %2 == 8 ; chroma
487 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
488 %elif mmsize == 8
489 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
490 mov cntrq, 2
491 %else
492 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
493 %endif
494 mov strideq, mstrideq
495 neg mstrideq
496 %ifidn %1, h
497 lea dst1q, [dst1q+strideq*4-4]
498 %if %2 == 8 ; chroma
499 lea dst8q, [dst8q+strideq*4-4]
500 %endif
501 %endif
502
503 %if mmsize == 8
504 .next8px:
505 %endif
506 ; read
507 lea dst2q, [dst1q+strideq]
508 %ifidn %1, v
509 %if %2 == 8 && mmsize == 16
510 %define movrow movh
511 %else
512 %define movrow mova
513 %endif
514 movrow m0, [dst1q+mstrideq*4] ; p3
515 movrow m1, [dst2q+mstrideq*4] ; p2
516 movrow m2, [dst1q+mstrideq*2] ; p1
517 movrow m5, [dst2q] ; q1
518 movrow m6, [dst2q+ strideq*1] ; q2
519 movrow m7, [dst2q+ strideq*2] ; q3
520 %if mmsize == 16 && %2 == 8
521 movhps m0, [dst8q+mstrideq*4]
522 movhps m2, [dst8q+mstrideq*2]
523 add dst8q, strideq
524 movhps m1, [dst8q+mstrideq*4]
525 movhps m5, [dst8q]
526 movhps m6, [dst8q+ strideq ]
527 movhps m7, [dst8q+ strideq*2]
528 add dst8q, mstrideq
529 %endif
530 %elif mmsize == 8 ; mmx/mmxext (h)
531 ; read 8 rows of 8px each
532 movu m0, [dst1q+mstrideq*4]
533 movu m1, [dst2q+mstrideq*4]
534 movu m2, [dst1q+mstrideq*2]
535 movu m3, [dst1q+mstrideq ]
536 movu m4, [dst1q]
537 movu m5, [dst2q]
538 movu m6, [dst2q+ strideq ]
539
540 ; 8x8 transpose
541 TRANSPOSE4x4B 0, 1, 2, 3, 7
542 mova m_q0backup, m1
543 movu m7, [dst2q+ strideq*2]
544 TRANSPOSE4x4B 4, 5, 6, 7, 1
545 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
546 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
547 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
548 mova m1, m_q0backup
549 mova m_q0backup, m2 ; store q0
550 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
551 mova m_p0backup, m5 ; store p0
552 SWAP 1, 4
553 SWAP 2, 4
554 SWAP 6, 3
555 SWAP 5, 3
556 %else ; sse2 (h)
557 %if %2 == 16
558 lea dst8q, [dst1q+ strideq*8]
559 %endif
560
561 ; read 16 rows of 8px each, interleave
562 movh m0, [dst1q+mstrideq*4]
563 movh m1, [dst8q+mstrideq*4]
564 movh m2, [dst1q+mstrideq*2]
565 movh m5, [dst8q+mstrideq*2]
566 movh m3, [dst1q+mstrideq ]
567 movh m6, [dst8q+mstrideq ]
568 movh m4, [dst1q]
569 movh m7, [dst8q]
570 punpcklbw m0, m1 ; A/I
571 punpcklbw m2, m5 ; C/K
572 punpcklbw m3, m6 ; D/L
573 punpcklbw m4, m7 ; E/M
574
575 add dst8q, strideq
576 movh m1, [dst2q+mstrideq*4]
577 movh m6, [dst8q+mstrideq*4]
578 movh m5, [dst2q]
579 movh m7, [dst8q]
580 punpcklbw m1, m6 ; B/J
581 punpcklbw m5, m7 ; F/N
582 movh m6, [dst2q+ strideq ]
583 movh m7, [dst8q+ strideq ]
584 punpcklbw m6, m7 ; G/O
585
586 ; 8x16 transpose
587 TRANSPOSE4x4B 0, 1, 2, 3, 7
588 %ifdef m8
589 SWAP 1, 8
590 %else
591 mova m_q0backup, m1
592 %endif
593 movh m7, [dst2q+ strideq*2]
594 movh m1, [dst8q+ strideq*2]
595 punpcklbw m7, m1 ; H/P
596 TRANSPOSE4x4B 4, 5, 6, 7, 1
597 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
598 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
599 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
600 %ifdef m8
601 SWAP 1, 8
602 SWAP 2, 8
603 %else
604 mova m1, m_q0backup
605 mova m_q0backup, m2 ; store q0
606 %endif
607 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
608 %ifdef m12
609 SWAP 5, 12
610 %else
611 mova m_p0backup, m5 ; store p0
612 %endif
613 SWAP 1, 4
614 SWAP 2, 4
615 SWAP 6, 3
616 SWAP 5, 3
617 %endif
618
619 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
620 mova m4, m1
621 SWAP 4, 1
622 psubusb m4, m0 ; p2-p3
623 psubusb m0, m1 ; p3-p2
624 por m0, m4 ; abs(p3-p2)
625
626 mova m4, m2
627 SWAP 4, 2
628 psubusb m4, m1 ; p1-p2
629 psubusb m1, m2 ; p2-p1
630 por m1, m4 ; abs(p2-p1)
631
632 mova m4, m6
633 SWAP 4, 6
634 psubusb m4, m7 ; q2-q3
635 psubusb m7, m6 ; q3-q2
636 por m7, m4 ; abs(q3-q2)
637
638 mova m4, m5
639 SWAP 4, 5
640 psubusb m4, m6 ; q1-q2
641 psubusb m6, m5 ; q2-q1
642 por m6, m4 ; abs(q2-q1)
643
644 %if notcpuflag(mmxext)
645 mova m4, m_flimI
646 pxor m3, m3
647 psubusb m0, m4
648 psubusb m1, m4
649 psubusb m7, m4
650 psubusb m6, m4
651 pcmpeqb m0, m3 ; abs(p3-p2) <= I
652 pcmpeqb m1, m3 ; abs(p2-p1) <= I
653 pcmpeqb m7, m3 ; abs(q3-q2) <= I
654 pcmpeqb m6, m3 ; abs(q2-q1) <= I
655 pand m0, m1
656 pand m7, m6
657 pand m0, m7
658 %else ; mmxext/sse2
659 pmaxub m0, m1
660 pmaxub m6, m7
661 pmaxub m0, m6
662 %endif
663
664 ; normal_limit and high_edge_variance for p1-p0, q1-q0
665 SWAP 7, 3 ; now m7 is zero
666 %ifidn %1, v
667 movrow m3, [dst1q+mstrideq ] ; p0
668 %if mmsize == 16 && %2 == 8
669 movhps m3, [dst8q+mstrideq ]
670 %endif
671 %elifdef m12
672 SWAP 3, 12
673 %else
674 mova m3, m_p0backup
675 %endif
676
677 mova m1, m2
678 SWAP 1, 2
679 mova m6, m3
680 SWAP 3, 6
681 psubusb m1, m3 ; p1-p0
682 psubusb m6, m2 ; p0-p1
683 por m1, m6 ; abs(p1-p0)
684 %if notcpuflag(mmxext)
685 mova m6, m1
686 psubusb m1, m4
687 psubusb m6, m_hevthr
688 pcmpeqb m1, m7 ; abs(p1-p0) <= I
689 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
690 pand m0, m1
691 mova m_maskres, m6
692 %else ; mmxext/sse2
693 pmaxub m0, m1 ; max_I
694 SWAP 1, 4 ; max_hev_thresh
695 %endif
696
697 SWAP 6, 4 ; now m6 is I
698 %ifidn %1, v
699 movrow m4, [dst1q] ; q0
700 %if mmsize == 16 && %2 == 8
701 movhps m4, [dst8q]
702 %endif
703 %elifdef m8
704 SWAP 4, 8
705 %else
706 mova m4, m_q0backup
707 %endif
708 mova m1, m4
709 SWAP 1, 4
710 mova m7, m5
711 SWAP 7, 5
712 psubusb m1, m5 ; q0-q1
713 psubusb m7, m4 ; q1-q0
714 por m1, m7 ; abs(q1-q0)
715 %if notcpuflag(mmxext)
716 mova m7, m1
717 psubusb m1, m6
718 psubusb m7, m_hevthr
719 pxor m6, m6
720 pcmpeqb m1, m6 ; abs(q1-q0) <= I
721 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
722 mova m6, m_maskres
723 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
724 pand m6, m7
725 %else ; mmxext/sse2
726 pxor m7, m7
727 pmaxub m0, m1
728 pmaxub m6, m1
729 psubusb m0, m_flimI
730 psubusb m6, m_hevthr
731 pcmpeqb m0, m7 ; max(abs(..)) <= I
732 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
733 %endif
734 %ifdef m12
735 SWAP 6, 12
736 %else
737 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
738 %endif
739
740 ; simple_limit
741 mova m1, m3
742 SWAP 1, 3
743 mova m6, m4 ; keep copies of p0/q0 around for later use
744 SWAP 6, 4
745 psubusb m1, m4 ; p0-q0
746 psubusb m6, m3 ; q0-p0
747 por m1, m6 ; abs(q0-p0)
748 paddusb m1, m1 ; m1=2*abs(q0-p0)
749
750 mova m7, m2
751 SWAP 7, 2
752 mova m6, m5
753 SWAP 6, 5
754 psubusb m7, m5 ; p1-q1
755 psubusb m6, m2 ; q1-p1
756 por m7, m6 ; abs(q1-p1)
757 pxor m6, m6
758 pand m7, [pb_FE]
759 psrlq m7, 1 ; abs(q1-p1)/2
760 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
761 psubusb m7, m_flimE
762 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
763 pand m0, m7 ; normal_limit result
764
765     ; filter_common; at this point, m2-m5 hold p1/p0/q0/q1 and m0 is the filter mask
766 %ifdef m8 ; x86-64 && sse2
767 mova m8, [pb_80]
768 %define m_pb_80 m8
769 %else ; x86-32 or mmx/mmxext
770 %define m_pb_80 [pb_80]
771 %endif
772 mova m1, m4
773 mova m7, m3
774 pxor m1, m_pb_80
775 pxor m7, m_pb_80
776 psubsb m1, m7 ; (signed) q0-p0
777 mova m6, m2
778 mova m7, m5
779 pxor m6, m_pb_80
780 pxor m7, m_pb_80
781 psubsb m6, m7 ; (signed) p1-q1
782 mova m7, m_maskres
783 pandn m7, m6
784 paddsb m7, m1
785 paddsb m7, m1
786 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
787
788 pand m7, m0
789 mova m1, [pb_F8]
790 mova m6, m7
791 paddsb m7, [pb_3]
792 paddsb m6, [pb_4]
793 pand m7, m1
794 pand m6, m1
795
796 pxor m1, m1
797 pxor m0, m0
798 pcmpgtb m1, m7
799 psubb m0, m7
800 psrlq m7, 3 ; +f2
801 psrlq m0, 3 ; -f2
802 pand m0, m1
803 pandn m1, m7
804 psubusb m3, m0
805 paddusb m3, m1 ; p0+f2
806
807 pxor m1, m1
808 pxor m0, m0
809 pcmpgtb m0, m6
810 psubb m1, m6
811 psrlq m6, 3 ; +f1
812 psrlq m1, 3 ; -f1
813 pand m1, m0
814 pandn m0, m6
815 psubusb m4, m0
816 paddusb m4, m1 ; q0-f1
817
818 %ifdef m12
819 SWAP 6, 12
820 %else
821 mova m6, m_maskres
822 %endif
823 %if notcpuflag(mmxext)
824 mova m7, [pb_1]
825 %else ; mmxext/sse2
826 pxor m7, m7
827 %endif
828 pand m0, m6
829 pand m1, m6
830 %if notcpuflag(mmxext)
831 paddusb m0, m7
832 pand m1, [pb_FE]
833 pandn m7, m0
834 psrlq m1, 1
835 psrlq m7, 1
836 SWAP 0, 7
837 %else ; mmxext/sse2
838 psubusb m1, [pb_1]
839 pavgb m0, m7 ; a
840 pavgb m1, m7 ; -a
841 %endif
842 psubusb m5, m0
843 psubusb m2, m1
844 paddusb m5, m1 ; q1-a
845 paddusb m2, m0 ; p1+a
846
847 ; store
848 %ifidn %1, v
849 movrow [dst1q+mstrideq*2], m2
850 movrow [dst1q+mstrideq ], m3
851 movrow [dst1q], m4
852 movrow [dst1q+ strideq ], m5
853 %if mmsize == 16 && %2 == 8
854 movhps [dst8q+mstrideq*2], m2
855 movhps [dst8q+mstrideq ], m3
856 movhps [dst8q], m4
857 movhps [dst8q+ strideq ], m5
858 %endif
859 %else ; h
860 add dst1q, 2
861 add dst2q, 2
862
863 ; 4x8/16 transpose
864 TRANSPOSE4x4B 2, 3, 4, 5, 6
865
866 %if mmsize == 8 ; mmx/mmxext (h)
867 WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
868 %else ; sse2 (h)
869 lea dst8q, [dst8q+mstrideq +2]
870 WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
871 %endif
872 %endif
873
874 %if mmsize == 8
875 %if %2 == 8 ; chroma
876 %ifidn %1, h
877 sub dst1q, 2
878 %endif
879 cmp dst1q, dst8q
880 mov dst1q, dst8q
881 jnz .next8px
882 %else
883 %ifidn %1, h
884 lea dst1q, [dst1q+ strideq*8-2]
885 %else ; v
886 add dst1q, 8
887 %endif
888 dec cntrq
889 jg .next8px
890 %endif
891 REP_RET
892 %else ; mmsize == 16
893 RET
894 %endif
895 %endmacro
896
897 %if ARCH_X86_32
898 INIT_MMX mmx
899 INNER_LOOPFILTER v, 16
900 INNER_LOOPFILTER h, 16
901 INNER_LOOPFILTER v, 8
902 INNER_LOOPFILTER h, 8
903
904 INIT_MMX mmxext
905 INNER_LOOPFILTER v, 16
906 INNER_LOOPFILTER h, 16
907 INNER_LOOPFILTER v, 8
908 INNER_LOOPFILTER h, 8
909 %endif
910
911 INIT_XMM sse2
912 INNER_LOOPFILTER v, 16
913 INNER_LOOPFILTER h, 16
914 INNER_LOOPFILTER v, 8
915 INNER_LOOPFILTER h, 8
916
917 INIT_XMM ssse3
918 INNER_LOOPFILTER v, 16
919 INNER_LOOPFILTER h, 16
920 INNER_LOOPFILTER v, 8
921 INNER_LOOPFILTER h, 8
922
923 ;-----------------------------------------------------------------------------
924 ; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
925 ; int flimE, int flimI, int hev_thr);
926 ;-----------------------------------------------------------------------------
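; in rough terms, the macroblock-edge filter below uses the same normal_limit/hev
; tests as the inner filter, then computes w = clamp((p1-q1) + 3*(q0-p0)) and:
;   where hev:  the common 4-tap filter (f1/f2, as above) adjusts p0/q0
;   where !hev: a0 = clamp((27*w+63) >> 7)  ->  p0 += a0, q0 -= a0
;               a1 = clamp((18*w+63) >> 7)  ->  p1 += a1, q1 -= a1
;               a2 = clamp(( 9*w+63) >> 7)  ->  p2 += a2, q2 -= a2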
927
928 %macro MBEDGE_LOOPFILTER 2
929 %define stack_size 0
930 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
931 %if mmsize == 16 ; [3]=hev() result
932 ; [4]=filter tmp result
933 ; [5]/[6] = p2/q2 backup
934 ; [7]=lim_res sign result
935 %define stack_size mmsize * -7
936 %else ; 8 ; extra storage space for transposes
937 %define stack_size mmsize * -8
938 %endif
939 %endif
940
941 %if %2 == 8 ; chroma
942 cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
943 %else ; luma
944 cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
945 %endif
946
947 %if cpuflag(ssse3)
948 pxor m7, m7
949 %endif
950
951 %ifndef m8
952 ; splat function arguments
953 SPLATB_REG m0, flimEq, m7 ; E
954 SPLATB_REG m1, flimIq, m7 ; I
955 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
956
957 %define m_flimE [rsp]
958 %define m_flimI [rsp+mmsize]
959 %define m_hevthr [rsp+mmsize*2]
960 %define m_maskres [rsp+mmsize*3]
961 %define m_limres [rsp+mmsize*4]
962 %define m_p0backup [rsp+mmsize*3]
963 %define m_q0backup [rsp+mmsize*4]
964 %define m_p2backup [rsp+mmsize*5]
965 %define m_q2backup [rsp+mmsize*6]
966 %if mmsize == 16
967 %define m_limsign [rsp]
968 %else
969 %define m_limsign [rsp+mmsize*7]
970 %endif
971
972 mova m_flimE, m0
973 mova m_flimI, m1
974 mova m_hevthr, m2
975 %else ; sse2 on x86-64
976 %define m_flimE m9
977 %define m_flimI m10
978 %define m_hevthr m11
979 %define m_maskres m12
980 %define m_limres m8
981 %define m_p0backup m12
982 %define m_q0backup m8
983 %define m_p2backup m13
984 %define m_q2backup m14
985 %define m_limsign m9
986
987 ; splat function arguments
988 SPLATB_REG m_flimE, flimEq, m7 ; E
989 SPLATB_REG m_flimI, flimIq, m7 ; I
990 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
991 %endif
992
993 %if %2 == 8 ; chroma
994 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
995 %elif mmsize == 8
996 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
997 mov cntrq, 2
998 %else
999 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
1000 %endif
1001 mov strideq, mstrideq
1002 neg mstrideq
1003 %ifidn %1, h
1004 lea dst1q, [dst1q+strideq*4-4]
1005 %if %2 == 8 ; chroma
1006 lea dst8q, [dst8q+strideq*4-4]
1007 %endif
1008 %endif
1009
1010 %if mmsize == 8
1011 .next8px:
1012 %endif
1013 ; read
1014 lea dst2q, [dst1q+ strideq ]
1015 %ifidn %1, v
1016 %if %2 == 8 && mmsize == 16
1017 %define movrow movh
1018 %else
1019 %define movrow mova
1020 %endif
1021 movrow m0, [dst1q+mstrideq*4] ; p3
1022 movrow m1, [dst2q+mstrideq*4] ; p2
1023 movrow m2, [dst1q+mstrideq*2] ; p1
1024 movrow m5, [dst2q] ; q1
1025 movrow m6, [dst2q+ strideq ] ; q2
1026 movrow m7, [dst2q+ strideq*2] ; q3
1027 %if mmsize == 16 && %2 == 8
1028 movhps m0, [dst8q+mstrideq*4]
1029 movhps m2, [dst8q+mstrideq*2]
1030 add dst8q, strideq
1031 movhps m1, [dst8q+mstrideq*4]
1032 movhps m5, [dst8q]
1033 movhps m6, [dst8q+ strideq ]
1034 movhps m7, [dst8q+ strideq*2]
1035 add dst8q, mstrideq
1036 %endif
1037 %elif mmsize == 8 ; mmx/mmxext (h)
1038 ; read 8 rows of 8px each
1039 movu m0, [dst1q+mstrideq*4]
1040 movu m1, [dst2q+mstrideq*4]
1041 movu m2, [dst1q+mstrideq*2]
1042 movu m3, [dst1q+mstrideq ]
1043 movu m4, [dst1q]
1044 movu m5, [dst2q]
1045 movu m6, [dst2q+ strideq ]
1046
1047 ; 8x8 transpose
1048 TRANSPOSE4x4B 0, 1, 2, 3, 7
1049 mova m_q0backup, m1
1050 movu m7, [dst2q+ strideq*2]
1051 TRANSPOSE4x4B 4, 5, 6, 7, 1
1052 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1053 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1054 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1055 mova m1, m_q0backup
1056 mova m_q0backup, m2 ; store q0
1057 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1058 mova m_p0backup, m5 ; store p0
1059 SWAP 1, 4
1060 SWAP 2, 4
1061 SWAP 6, 3
1062 SWAP 5, 3
1063 %else ; sse2 (h)
1064 %if %2 == 16
1065 lea dst8q, [dst1q+ strideq*8 ]
1066 %endif
1067
1068 ; read 16 rows of 8px each, interleave
1069 movh m0, [dst1q+mstrideq*4]
1070 movh m1, [dst8q+mstrideq*4]
1071 movh m2, [dst1q+mstrideq*2]
1072 movh m5, [dst8q+mstrideq*2]
1073 movh m3, [dst1q+mstrideq ]
1074 movh m6, [dst8q+mstrideq ]
1075 movh m4, [dst1q]
1076 movh m7, [dst8q]
1077 punpcklbw m0, m1 ; A/I
1078 punpcklbw m2, m5 ; C/K
1079 punpcklbw m3, m6 ; D/L
1080 punpcklbw m4, m7 ; E/M
1081
1082 add dst8q, strideq
1083 movh m1, [dst2q+mstrideq*4]
1084 movh m6, [dst8q+mstrideq*4]
1085 movh m5, [dst2q]
1086 movh m7, [dst8q]
1087 punpcklbw m1, m6 ; B/J
1088 punpcklbw m5, m7 ; F/N
1089 movh m6, [dst2q+ strideq ]
1090 movh m7, [dst8q+ strideq ]
1091 punpcklbw m6, m7 ; G/O
1092
1093 ; 8x16 transpose
1094 TRANSPOSE4x4B 0, 1, 2, 3, 7
1095 %ifdef m8
1096 SWAP 1, 8
1097 %else
1098 mova m_q0backup, m1
1099 %endif
1100 movh m7, [dst2q+ strideq*2]
1101 movh m1, [dst8q+ strideq*2]
1102 punpcklbw m7, m1 ; H/P
1103 TRANSPOSE4x4B 4, 5, 6, 7, 1
1104 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1105 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1106 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1107 %ifdef m8
1108 SWAP 1, 8
1109 SWAP 2, 8
1110 %else
1111 mova m1, m_q0backup
1112 mova m_q0backup, m2 ; store q0
1113 %endif
1114 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1115 %ifdef m12
1116 SWAP 5, 12
1117 %else
1118 mova m_p0backup, m5 ; store p0
1119 %endif
1120 SWAP 1, 4
1121 SWAP 2, 4
1122 SWAP 6, 3
1123 SWAP 5, 3
1124 %endif
1125
1126 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1127 mova m4, m1
1128 SWAP 4, 1
1129 psubusb m4, m0 ; p2-p3
1130 psubusb m0, m1 ; p3-p2
1131 por m0, m4 ; abs(p3-p2)
1132
1133 mova m4, m2
1134 SWAP 4, 2
1135 psubusb m4, m1 ; p1-p2
1136 mova m_p2backup, m1
1137 psubusb m1, m2 ; p2-p1
1138 por m1, m4 ; abs(p2-p1)
1139
1140 mova m4, m6
1141 SWAP 4, 6
1142 psubusb m4, m7 ; q2-q3
1143 psubusb m7, m6 ; q3-q2
1144 por m7, m4 ; abs(q3-q2)
1145
1146 mova m4, m5
1147 SWAP 4, 5
1148 psubusb m4, m6 ; q1-q2
1149 mova m_q2backup, m6
1150 psubusb m6, m5 ; q2-q1
1151 por m6, m4 ; abs(q2-q1)
1152
1153 %if notcpuflag(mmxext)
1154 mova m4, m_flimI
1155 pxor m3, m3
1156 psubusb m0, m4
1157 psubusb m1, m4
1158 psubusb m7, m4
1159 psubusb m6, m4
1160 pcmpeqb m0, m3 ; abs(p3-p2) <= I
1161 pcmpeqb m1, m3 ; abs(p2-p1) <= I
1162 pcmpeqb m7, m3 ; abs(q3-q2) <= I
1163 pcmpeqb m6, m3 ; abs(q2-q1) <= I
1164 pand m0, m1
1165 pand m7, m6
1166 pand m0, m7
1167 %else ; mmxext/sse2
1168 pmaxub m0, m1
1169 pmaxub m6, m7
1170 pmaxub m0, m6
1171 %endif
1172
1173 ; normal_limit and high_edge_variance for p1-p0, q1-q0
1174 SWAP 7, 3 ; now m7 is zero
1175 %ifidn %1, v
1176 movrow m3, [dst1q+mstrideq ] ; p0
1177 %if mmsize == 16 && %2 == 8
1178 movhps m3, [dst8q+mstrideq ]
1179 %endif
1180 %elifdef m12
1181 SWAP 3, 12
1182 %else
1183 mova m3, m_p0backup
1184 %endif
1185
1186 mova m1, m2
1187 SWAP 1, 2
1188 mova m6, m3
1189 SWAP 3, 6
1190 psubusb m1, m3 ; p1-p0
1191 psubusb m6, m2 ; p0-p1
1192 por m1, m6 ; abs(p1-p0)
1193 %if notcpuflag(mmxext)
1194 mova m6, m1
1195 psubusb m1, m4
1196 psubusb m6, m_hevthr
1197 pcmpeqb m1, m7 ; abs(p1-p0) <= I
1198 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
1199 pand m0, m1
1200 mova m_maskres, m6
1201 %else ; mmxext/sse2
1202 pmaxub m0, m1 ; max_I
1203 SWAP 1, 4 ; max_hev_thresh
1204 %endif
1205
1206 SWAP 6, 4 ; now m6 is I
1207 %ifidn %1, v
1208 movrow m4, [dst1q] ; q0
1209 %if mmsize == 16 && %2 == 8
1210 movhps m4, [dst8q]
1211 %endif
1212 %elifdef m8
1213 SWAP 4, 8
1214 %else
1215 mova m4, m_q0backup
1216 %endif
1217 mova m1, m4
1218 SWAP 1, 4
1219 mova m7, m5
1220 SWAP 7, 5
1221 psubusb m1, m5 ; q0-q1
1222 psubusb m7, m4 ; q1-q0
1223 por m1, m7 ; abs(q1-q0)
1224 %if notcpuflag(mmxext)
1225 mova m7, m1
1226 psubusb m1, m6
1227 psubusb m7, m_hevthr
1228 pxor m6, m6
1229 pcmpeqb m1, m6 ; abs(q1-q0) <= I
1230 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
1231 mova m6, m_maskres
1232 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
1233 pand m6, m7
1234 %else ; mmxext/sse2
1235 pxor m7, m7
1236 pmaxub m0, m1
1237 pmaxub m6, m1
1238 psubusb m0, m_flimI
1239 psubusb m6, m_hevthr
1240 pcmpeqb m0, m7 ; max(abs(..)) <= I
1241 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
1242 %endif
1243 %ifdef m12
1244 SWAP 6, 12
1245 %else
1246 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1247 %endif
1248
1249 ; simple_limit
1250 mova m1, m3
1251 SWAP 1, 3
1252 mova m6, m4 ; keep copies of p0/q0 around for later use
1253 SWAP 6, 4
1254 psubusb m1, m4 ; p0-q0
1255 psubusb m6, m3 ; q0-p0
1256 por m1, m6 ; abs(q0-p0)
1257 paddusb m1, m1 ; m1=2*abs(q0-p0)
1258
1259 mova m7, m2
1260 SWAP 7, 2
1261 mova m6, m5
1262 SWAP 6, 5
1263 psubusb m7, m5 ; p1-q1
1264 psubusb m6, m2 ; q1-p1
1265 por m7, m6 ; abs(q1-p1)
1266 pxor m6, m6
1267 pand m7, [pb_FE]
1268 psrlq m7, 1 ; abs(q1-p1)/2
1269 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
1270 psubusb m7, m_flimE
1271 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1272 pand m0, m7 ; normal_limit result
1273
1274     ; filter_common; at this point, m2-m5 hold p1/p0/q0/q1 and m0 is the filter mask
1275 %ifdef m8 ; x86-64 && sse2
1276 mova m8, [pb_80]
1277 %define m_pb_80 m8
1278 %else ; x86-32 or mmx/mmxext
1279 %define m_pb_80 [pb_80]
1280 %endif
1281 mova m1, m4
1282 mova m7, m3
1283 pxor m1, m_pb_80
1284 pxor m7, m_pb_80
1285 psubsb m1, m7 ; (signed) q0-p0
1286 mova m6, m2
1287 mova m7, m5
1288 pxor m6, m_pb_80
1289 pxor m7, m_pb_80
1290 psubsb m6, m7 ; (signed) p1-q1
1291 mova m7, m_maskres
1292 paddsb m6, m1
1293 paddsb m6, m1
1294 paddsb m6, m1
1295 pand m6, m0
1296 %ifdef m8
1297     mova            m_limres, m6  ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
1298 pand m_limres, m7
1299 %else
1300 mova m0, m6
1301 pand m0, m7
1302 mova m_limres, m0
1303 %endif
1304 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
1305
1306 mova m1, [pb_F8]
1307 mova m6, m7
1308 paddsb m7, [pb_3]
1309 paddsb m6, [pb_4]
1310 pand m7, m1
1311 pand m6, m1
1312
1313 pxor m1, m1
1314 pxor m0, m0
1315 pcmpgtb m1, m7
1316 psubb m0, m7
1317 psrlq m7, 3 ; +f2
1318 psrlq m0, 3 ; -f2
1319 pand m0, m1
1320 pandn m1, m7
1321 psubusb m3, m0
1322 paddusb m3, m1 ; p0+f2
1323
1324 pxor m1, m1
1325 pxor m0, m0
1326 pcmpgtb m0, m6
1327 psubb m1, m6
1328 psrlq m6, 3 ; +f1
1329 psrlq m1, 3 ; -f1
1330 pand m1, m0
1331 pandn m0, m6
1332 psubusb m4, m0
1333 paddusb m4, m1 ; q0-f1
1334
1335     ; filter_mbedge (m2-m5 hold p1/p0/q0/q1; lim_res carries w)
1336 %if cpuflag(ssse3)
1337 mova m7, [pb_1]
1338 %else
1339 mova m7, [pw_63]
1340 %endif
1341 %ifdef m8
1342 SWAP 1, 8
1343 %else
1344 mova m1, m_limres
1345 %endif
1346 pxor m0, m0
1347 mova m6, m1
1348 pcmpgtb m0, m1 ; which are negative
1349 %if cpuflag(ssse3)
1350 punpcklbw m6, m7 ; interleave with "1" for rounding
1351 punpckhbw m1, m7
1352 %else
1353 punpcklbw m6, m0 ; signed byte->word
1354 punpckhbw m1, m0
1355 %endif
1356 mova m_limsign, m0
1357 %if cpuflag(ssse3)
1358 mova m7, [pb_27_63]
1359 %ifndef m8
1360 mova m_limres, m1
1361 %endif
1362 %ifdef m10
1363 SWAP 0, 10 ; don't lose lim_sign copy
1364 %endif
1365 mova m0, m7
1366 pmaddubsw m7, m6
1367 SWAP 6, 7
1368 pmaddubsw m0, m1
1369 SWAP 1, 0
1370 %ifdef m10
1371 SWAP 0, 10
1372 %else
1373 mova m0, m_limsign
1374 %endif
1375 %else
1376 mova m_maskres, m6 ; backup for later in filter
1377 mova m_limres, m1
1378 pmullw m6, [pw_27]
1379 pmullw m1, [pw_27]
1380 paddw m6, m7
1381 paddw m1, m7
1382 %endif
1383 psraw m6, 7
1384 psraw m1, 7
1385 packsswb m6, m1 ; a0
1386 pxor m1, m1
1387 psubb m1, m6
1388 pand m1, m0 ; -a0
1389 pandn m0, m6 ; +a0
1390 %if cpuflag(ssse3)
1391 mova m6, [pb_18_63] ; pipelining
1392 %endif
1393 psubusb m3, m1
1394 paddusb m4, m1
1395 paddusb m3, m0 ; p0+a0
1396 psubusb m4, m0 ; q0-a0
1397
1398 %if cpuflag(ssse3)
1399 SWAP 6, 7
1400 %ifdef m10
1401 SWAP 1, 10
1402 %else
1403 mova m1, m_limres
1404 %endif
1405 mova m0, m7
1406 pmaddubsw m7, m6
1407 SWAP 6, 7
1408 pmaddubsw m0, m1
1409 SWAP 1, 0
1410 %ifdef m10
1411 SWAP 0, 10
1412 %endif
1413 mova m0, m_limsign
1414 %else
1415 mova m6, m_maskres
1416 mova m1, m_limres
1417 pmullw m6, [pw_18]
1418 pmullw m1, [pw_18]
1419 paddw m6, m7
1420 paddw m1, m7
1421 %endif
1422 mova m0, m_limsign
1423 psraw m6, 7
1424 psraw m1, 7
1425 packsswb m6, m1 ; a1
1426 pxor m1, m1
1427 psubb m1, m6
1428 pand m1, m0 ; -a1
1429 pandn m0, m6 ; +a1
1430 %if cpuflag(ssse3)
1431 mova m6, [pb_9_63]
1432 %endif
1433 psubusb m2, m1
1434 paddusb m5, m1
1435 paddusb m2, m0 ; p1+a1
1436 psubusb m5, m0 ; q1-a1
1437
1438 %if cpuflag(ssse3)
1439 SWAP 6, 7
1440 %ifdef m10
1441 SWAP 1, 10
1442 %else
1443 mova m1, m_limres
1444 %endif
1445 mova m0, m7
1446 pmaddubsw m7, m6
1447 SWAP 6, 7
1448 pmaddubsw m0, m1
1449 SWAP 1, 0
1450 %else
1451 %ifdef m8
1452 SWAP 6, 12
1453 SWAP 1, 8
1454 %else
1455 mova m6, m_maskres
1456 mova m1, m_limres
1457 %endif
1458 pmullw m6, [pw_9]
1459 pmullw m1, [pw_9]
1460 paddw m6, m7
1461 paddw m1, m7
1462 %endif
1463 %ifdef m9
1464 SWAP 7, 9
1465 %else
1466 mova m7, m_limsign
1467 %endif
1468 psraw m6, 7
1469 psraw m1, 7
1470     packsswb        m6, m1        ; a2
1471 pxor m0, m0
1472 psubb m0, m6
1473     pand            m0, m7        ; -a2
1474     pandn           m7, m6        ; +a2
1475 %ifdef m8
1476 SWAP 1, 13
1477 SWAP 6, 14
1478 %else
1479 mova m1, m_p2backup
1480 mova m6, m_q2backup
1481 %endif
1482 psubusb m1, m0
1483 paddusb m6, m0
1484     paddusb         m1, m7        ; p2+a2
1485     psubusb         m6, m7        ; q2-a2
1486
1487 ; store
1488 %ifidn %1, v
1489 movrow [dst2q+mstrideq*4], m1
1490 movrow [dst1q+mstrideq*2], m2
1491 movrow [dst1q+mstrideq ], m3
1492 movrow [dst1q], m4
1493 movrow [dst2q], m5
1494 movrow [dst2q+ strideq ], m6
1495 %if mmsize == 16 && %2 == 8
1496 add dst8q, mstrideq
1497 movhps [dst8q+mstrideq*2], m1
1498 movhps [dst8q+mstrideq ], m2
1499 movhps [dst8q], m3
1500 add dst8q, strideq
1501 movhps [dst8q], m4
1502 movhps [dst8q+ strideq ], m5
1503 movhps [dst8q+ strideq*2], m6
1504 %endif
1505 %else ; h
1506 inc dst1q
1507 inc dst2q
1508
1509 ; 4x8/16 transpose
1510 TRANSPOSE4x4B 1, 2, 3, 4, 0
1511 SBUTTERFLY bw, 5, 6, 0
1512
1513 %if mmsize == 8 ; mmx/mmxext (h)
1514 WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
1515 add dst1q, 4
1516 WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
1517 %else ; sse2 (h)
1518 lea dst8q, [dst8q+mstrideq+1]
1519 WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
1520 lea dst1q, [dst2q+mstrideq+4]
1521 lea dst8q, [dst8q+mstrideq+4]
1522 %if cpuflag(sse4)
1523 add dst2q, 4
1524 %endif
1525 WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
1526 %if cpuflag(sse4)
1527 lea dst2q, [dst8q+ strideq ]
1528 %endif
1529 WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
1530 %endif
1531 %endif
1532
1533 %if mmsize == 8
1534 %if %2 == 8 ; chroma
1535 %ifidn %1, h
1536 sub dst1q, 5
1537 %endif
1538 cmp dst1q, dst8q
1539 mov dst1q, dst8q
1540 jnz .next8px
1541 %else
1542 %ifidn %1, h
1543 lea dst1q, [dst1q+ strideq*8-5]
1544 %else ; v
1545 add dst1q, 8
1546 %endif
1547 dec cntrq
1548 jg .next8px
1549 %endif
1550 REP_RET
1551 %else ; mmsize == 16
1552 RET
1553 %endif
1554 %endmacro
1555
1556 %if ARCH_X86_32
1557 INIT_MMX mmx
1558 MBEDGE_LOOPFILTER v, 16
1559 MBEDGE_LOOPFILTER h, 16
1560 MBEDGE_LOOPFILTER v, 8
1561 MBEDGE_LOOPFILTER h, 8
1562
1563 INIT_MMX mmxext
1564 MBEDGE_LOOPFILTER v, 16
1565 MBEDGE_LOOPFILTER h, 16
1566 MBEDGE_LOOPFILTER v, 8
1567 MBEDGE_LOOPFILTER h, 8
1568 %endif
1569
1570 INIT_XMM sse2
1571 MBEDGE_LOOPFILTER v, 16
1572 MBEDGE_LOOPFILTER h, 16
1573 MBEDGE_LOOPFILTER v, 8
1574 MBEDGE_LOOPFILTER h, 8
1575
1576 INIT_XMM ssse3
1577 MBEDGE_LOOPFILTER v, 16
1578 MBEDGE_LOOPFILTER h, 16
1579 MBEDGE_LOOPFILTER v, 8
1580 MBEDGE_LOOPFILTER h, 8
1581
1582 INIT_XMM sse4
1583 MBEDGE_LOOPFILTER h, 16
1584 MBEDGE_LOOPFILTER h, 8