;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_27:    times 8 dw 27
pw_63:    times 8 dw 63

pb_4:     times 16 db 4
pb_F8:    times 16 db 0xF8
pb_FE:    times 16 db 0xFE
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db 9, 63

cextern pb_1
cextern pb_3
cextern pw_9
cextern pw_18
cextern pb_80

SECTION .text

;-----------------------------------------------------------------------------
; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
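
; A reference sketch (comments only, nothing here is assembled): per pixel,
; the simple filter below implements roughly the following scalar logic,
; where clamp_s8()/clamp_u8() stand for the saturation that the packed
; instructions perform implicitly:
;
;   if (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= flim) {
;       int a  = clamp_s8(p1 - q1) + 3 * (q0 - p0);   // "a=..." below
;       int f1 = clamp_s8(a + 4) >> 3;
;       int f2 = clamp_s8(a + 3) >> 3;
;       q0 = clamp_u8(q0 - f1);
;       p0 = clamp_u8(p0 + f2);
;   }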

; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratch space (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
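;
; e.g. the first punpcklbw below leaves m%1 = A0 B0 A1 B1 A2 B2 A3 B3, so the
; four interleaved registers are exactly the input the caller's TRANSPOSE4x4W
; pass expects to finish the transpose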
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd m%1, [%8+%10*4]   ; A0-3
    movd m%5, [%9+%10*4]   ; B0-3
    movd m%2, [%8+%10*2]   ; C0-3
    movd m%6, [%8+%10]     ; D0-3
    movd m%3, [%8]         ; E0-3
    movd m%7, [%9]         ; F0-3
    movd m%4, [%9+%11]     ; G0-3
    punpcklbw m%1, m%5     ; A/B interleaved
    movd m%5, [%9+%11*2]   ; H0-3
    punpcklbw m%2, m%6     ; C/D interleaved
    punpcklbw m%3, m%7     ; E/F interleaved
    punpcklbw m%4, m%5     ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratch space to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd m%1, [%8+%10*4]   ; A0-3
    movd m%3, [%12+%10*4]  ; I0-3
    movd m%2, [%8+%10*2]   ; C0-3
    movd m%4, [%12+%10*2]  ; K0-3
    movd m%6, [%8+%10]     ; D0-3
    movd m%5, [%12+%10]    ; L0-3
    movd m%7, [%12]        ; M0-3
    add %12, %11
    punpcklbw m%1, m%3     ; A/I
    movd m%3, [%8]         ; E0-3
    punpcklbw m%2, m%4     ; C/K
    punpcklbw m%6, m%5     ; D/L
    punpcklbw m%3, m%7     ; E/M
    punpcklbw m%2, m%6     ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd m%5, [%9+%10*4]   ; B0-3
    movd m%4, [%12+%10*4]  ; J0-3
    movd m%7, [%9]         ; F0-3
    movd m%6, [%12]        ; N0-3
    punpcklbw m%5, m%4     ; B/J
    punpcklbw m%7, m%6     ; F/N
    punpcklbw m%1, m%5     ; A/B/I/J interleaved
    punpcklbw m%3, m%7     ; E/F/M/N interleaved
    movd m%4, [%9+%11]     ; G0-3
    movd m%6, [%12+%11]    ; O0-3
    movd m%5, [%9+%11*2]   ; H0-3
    movd m%7, [%12+%11*2]  ; P0-3
    punpcklbw m%4, m%6     ; G/O
    punpcklbw m%5, m%7     ; H/P
    punpcklbw m%4, m%5     ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
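;
; note the row order: each register's low dword is written to an even row
; (0/2/4/6) and its high dword to the following odd row, so the caller is
; expected to pass transposed row pairs in m%1..m%4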
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd [%5+%7*4], m%1
    movd [%5+%7*2], m%2
    movd [%5], m%3
    movd [%6+%8], m%4
    punpckhdq m%1, m%1
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd [%6+%7*4], m%1
    movd [%5+%7], m%2
    movd [%6], m%3
    movd [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd [%5+%8*4], m%1
    movd [%5], m%2
    movd [%7+%8*4], m%3
    movd [%7], m%4

    ; store dwords 1
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
    movd [%6+%8*4], m%1
    movd [%6], m%2
%if %10 == 16
    movd [%6+%9*4], m%3
%endif
    movd [%7+%9], m%4

    ; write dwords 2
    psrldq m%1, 4
    psrldq m%2, 4
%if %10 == 8
    movd [%5+%8*2], m%1
    movd %5d, m%3
%endif
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 16
    movd [%5+%8*2], m%1
%endif
    movd [%6+%9], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
    add %7, %9

    ; store dwords 3
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 8
    mov [%7+%8*4], %5d
    movd [%6+%8*2], m%1
%else
    movd [%5+%8], m%1
%endif
    movd [%6+%9*2], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd %3d, %1
    punpckhdq %1, %1
    mov [%4+%5*4], %3w
    shr %3, 16
    add %4, %6
    mov [%4+%5*4], %3w

    movd %3d, %1
    add %4, %5
    mov [%4+%5*2], %3w
    shr %3, 16
    mov [%4+%5 ], %3w

    movd %3d, %2
    punpckhdq %2, %2
    mov [%4 ], %3w
    shr %3, 16
    mov [%4+%6 ], %3w

    movd %3d, %2
    add %4, %6
    mov [%4+%6 ], %3w
    shr %3, 16
    mov [%4+%6*2], %3w
    add %4, %5
%endmacro

%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw [%3+%4 ], %1, 3
    pextrw [%3 ], %1, 4
    pextrw [%2 ], %1, 5
    pextrw [%2+%5 ], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    movd %2d, %1
    psrldq %1, 4
    mov [%3+%4*4], %2w
    shr %2, 16
    add %3, %5
    mov [%3+%4*4], %2w

    movd %2d, %1
    psrldq %1, 4
    add %3, %4
    mov [%3+%4*2], %2w
    shr %2, 16
    mov [%3+%4 ], %2w

    movd %2d, %1
    psrldq %1, 4
    mov [%3 ], %2w
    shr %2, 16
    mov [%3+%5 ], %2w

    movd %2d, %1
    add %3, %5
    mov [%3+%5 ], %2w
    shr %2, 16
    mov [%3+%5*2], %2w
%endif
%endmacro

%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor m0, m0
%endif
    SPLATB_REG m7, flim, m0 ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova m0, [dst1q+mstrideq*2] ; p1
    mova m1, [dst1q+mstrideq]   ; p0
    mova m2, [dst1q]            ; q0
    mova m3, [dst1q+ strideq]   ; q1
%else ; h
    lea dst2q, [dst1q+ strideq]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova m5, m2    ; m5=backup of q0
    mova m6, m1    ; m6=backup of p0
    psubusb m1, m2 ; p0-q0
    psubusb m2, m6 ; q0-p0
    por m1, m2     ; FFABS(p0-q0)
    paddusb m1, m1 ; m1=FFABS(p0-q0)*2

    mova m4, m3
    mova m2, m0
    psubusb m3, m0 ; q1-p1
    psubusb m0, m4 ; p1-q1
    por m3, m0     ; FFABS(p1-q1)
    mova m0, [pb_80]
    pxor m2, m0
    pxor m4, m0
    psubsb m2, m4  ; m2=p1-q1 (signed) backup for below
    pand m3, [pb_FE]
    psrlq m3, 1    ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb m3, m1
    psubusb m3, m7
    pxor m1, m1
    pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova m4, m5
    pxor m5, m0
    pxor m0, m6
    psubsb m5, m0 ; q0-p0 (signed)
    paddsb m2, m5
    paddsb m2, m5
    paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
    pand m2, m3   ; apply filter mask (m3)

    mova m3, [pb_F8]
    mova m1, m2
    paddsb m2, [pb_4] ; f1<<3=a+4
    paddsb m1, [pb_3] ; f2<<3=a+3
    pand m2, m3
    pand m1, m3       ; cache f2<<3

    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m2 ; which values are <0?
    psubb m3, m2   ; -f1<<3
    psrlq m2, 3    ; +f1
    psrlq m3, 3    ; -f1
    pand m3, m0
    pandn m0, m2
    psubusb m4, m0
    paddusb m4, m3 ; q0-f1

    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m1 ; which values are <0?
    psubb m3, m1   ; -f2<<3
    psrlq m1, 3    ; +f2
    psrlq m3, 3    ; -f2
    pand m3, m0
    pandn m0, m1
    paddusb m6, m0
    psubusb m6, m3 ; p0+f2

    ; store
%ifidn %1, v
    mova [dst1q], m4
    mova [dst1q+mstrideq], m6
%else ; h
    inc dst1q
    SBUTTERFLY bw, 6, 4, 0

%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc dst2q
%endif
    WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
    lea dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc dst3q
%endif
    WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add dst1q, 8                   ; advance 8 cols = pixels
%else ; h
    lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
    dec cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif

INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5

;-----------------------------------------------------------------------------
; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                               int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
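
; A reference sketch (comments only), restating what the comments in the
; macro below compute per pixel:
;
;   filter_mask = abs(p3-p2) <= I && abs(p2-p1) <= I && abs(p1-p0) <= I &&
;                 abs(q3-q2) <= I && abs(q2-q1) <= I && abs(q1-q0) <= I &&
;                 2 * abs(p0-q0) + abs(p1-q1) / 2 <= E;
;   hev = abs(p1-p0) > hev_thresh || abs(q1-q0) > hev_thresh;
;
; hev pixels take the 4-tap filter_common path (only p0/q0 move); non-hev
; pixels additionally adjust p1/q1 by a = (f1 + 1) >> 1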

%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ;             [3]=hev() result
%define stack_size mmsize * -4
%else ; h ;                extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor m7, m7
%endif

%ifndef m8
    ; splat function arguments
    SPLATB_REG m0, flimEq, m7  ; E
    SPLATB_REG m1, flimIq, m7  ; I
    SPLATB_REG m2, hevthrq, m7 ; hev_thresh

%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
%define m_maskres [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]

    mova m_flimE, m0
    mova m_flimI, m1
    mova m_hevthr, m2
%else
%define m_flimE m9
%define m_flimI m10
%define m_hevthr m11
%define m_maskres m12
%define m_p0backup m12
%define m_q0backup m8

    ; splat function arguments
    SPLATB_REG m_flimE, flimEq, m7   ; E
    SPLATB_REG m_flimI, flimIq, m7   ; I
    SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif

%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow m0, [dst1q+mstrideq*4] ; p3
    movrow m1, [dst2q+mstrideq*4] ; p2
    movrow m2, [dst1q+mstrideq*2] ; p1
    movrow m5, [dst2q]            ; q1
    movrow m6, [dst2q+ strideq*1] ; q2
    movrow m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps m0, [dst8q+mstrideq*4]
    movhps m2, [dst8q+mstrideq*2]
    add dst8q, strideq
    movhps m1, [dst8q+mstrideq*4]
    movhps m5, [dst8q]
    movhps m6, [dst8q+ strideq ]
    movhps m7, [dst8q+ strideq*2]
    add dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu m0, [dst1q+mstrideq*4]
    movu m1, [dst2q+mstrideq*4]
    movu m2, [dst1q+mstrideq*2]
    movu m3, [dst1q+mstrideq ]
    movu m4, [dst1q]
    movu m5, [dst2q]
    movu m6, [dst2q+ strideq ]

    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova m_q0backup, m1
    movu m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
    mova m1, m_q0backup
    mova m_q0backup, m2    ; store q0
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
    mova m_p0backup, m5    ; store p0
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%else ; sse2 (h)
%if %2 == 16
    lea dst8q, [dst1q+ strideq*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh m0, [dst1q+mstrideq*4]
    movh m1, [dst8q+mstrideq*4]
    movh m2, [dst1q+mstrideq*2]
    movh m5, [dst8q+mstrideq*2]
    movh m3, [dst1q+mstrideq ]
    movh m6, [dst8q+mstrideq ]
    movh m4, [dst1q]
    movh m7, [dst8q]
    punpcklbw m0, m1 ; A/I
    punpcklbw m2, m5 ; C/K
    punpcklbw m3, m6 ; D/L
    punpcklbw m4, m7 ; E/M

    add dst8q, strideq
    movh m1, [dst2q+mstrideq*4]
    movh m6, [dst8q+mstrideq*4]
    movh m5, [dst2q]
    movh m7, [dst8q]
    punpcklbw m1, m6 ; B/J
    punpcklbw m5, m7 ; F/N
    movh m6, [dst2q+ strideq ]
    movh m7, [dst8q+ strideq ]
    punpcklbw m6, m7 ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP 1, 8
%else
    mova m_q0backup, m1
%endif
    movh m7, [dst2q+ strideq*2]
    movh m1, [dst8q+ strideq*2]
    punpcklbw m7, m1 ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
    SWAP 1, 8
    SWAP 2, 8
%else
    mova m1, m_q0backup
    mova m_q0backup, m2 ; store q0
%endif
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
    SWAP 5, 12
%else
    mova m_p0backup, m5 ; store p0
%endif
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova m4, m1
    SWAP 4, 1
    psubusb m4, m0 ; p2-p3
    psubusb m0, m1 ; p3-p2
    por m0, m4     ; abs(p3-p2)

    mova m4, m2
    SWAP 4, 2
    psubusb m4, m1 ; p1-p2
    psubusb m1, m2 ; p2-p1
    por m1, m4     ; abs(p2-p1)

    mova m4, m6
    SWAP 4, 6
    psubusb m4, m7 ; q2-q3
    psubusb m7, m6 ; q3-q2
    por m7, m4     ; abs(q3-q2)

    mova m4, m5
    SWAP 4, 5
    psubusb m4, m6 ; q1-q2
    psubusb m6, m5 ; q2-q1
    por m6, m4     ; abs(q2-q1)

%if notcpuflag(mmxext)
    mova m4, m_flimI
    pxor m3, m3
    psubusb m0, m4
    psubusb m1, m4
    psubusb m7, m4
    psubusb m6, m4
    pcmpeqb m0, m3 ; abs(p3-p2) <= I
    pcmpeqb m1, m3 ; abs(p2-p1) <= I
    pcmpeqb m7, m3 ; abs(q3-q2) <= I
    pcmpeqb m6, m3 ; abs(q2-q1) <= I
    pand m0, m1
    pand m7, m6
    pand m0, m7
%else ; mmxext/sse2
    pmaxub m0, m1
    pmaxub m6, m7
    pmaxub m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP 7, 3 ; now m7 is zero
%ifidn %1, v
    movrow m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP 3, 12
%else
    mova m3, m_p0backup
%endif

    mova m1, m2
    SWAP 1, 2
    mova m6, m3
    SWAP 3, 6
    psubusb m1, m3 ; p1-p0
    psubusb m6, m2 ; p0-p1
    por m1, m6     ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova m6, m1
    psubusb m1, m4
    psubusb m6, m_hevthr
    pcmpeqb m1, m7 ; abs(p1-p0) <= I
    pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
    pand m0, m1
    mova m_maskres, m6
%else ; mmxext/sse2
    pmaxub m0, m1 ; max_I
    SWAP 1, 4     ; max_hev_thresh
%endif

    SWAP 6, 4 ; now m6 is I
%ifidn %1, v
    movrow m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
    movhps m4, [dst8q]
%endif
%elifdef m8
    SWAP 4, 8
%else
    mova m4, m_q0backup
%endif
    mova m1, m4
    SWAP 1, 4
    mova m7, m5
    SWAP 7, 5
    psubusb m1, m5 ; q0-q1
    psubusb m7, m4 ; q1-q0
    por m1, m7     ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova m7, m1
    psubusb m1, m6
    psubusb m7, m_hevthr
    pxor m6, m6
    pcmpeqb m1, m6 ; abs(q1-q0) <= I
    pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
    mova m6, m_maskres
    pand m0, m1    ; abs([pq][321]-[pq][210]) <= I
    pand m6, m7
%else ; mmxext/sse2
    pxor m7, m7
    pmaxub m0, m1
    pmaxub m6, m1
    psubusb m0, m_flimI
    psubusb m6, m_hevthr
    pcmpeqb m0, m7 ; max(abs(..)) <= I
    pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP 6, 12
%else
    mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova m1, m3
    SWAP 1, 3
    mova m6, m4 ; keep copies of p0/q0 around for later use
    SWAP 6, 4
    psubusb m1, m4 ; p0-q0
    psubusb m6, m3 ; q0-p0
    por m1, m6     ; abs(q0-p0)
    paddusb m1, m1 ; m1=2*abs(q0-p0)

    mova m7, m2
    SWAP 7, 2
    mova m6, m5
    SWAP 6, 5
    psubusb m7, m5 ; p1-q1
    psubusb m6, m2 ; q1-p1
    por m7, m6     ; abs(q1-p1)
    pxor m6, m6
    pand m7, [pb_FE]
    psrlq m7, 1    ; abs(q1-p1)/2
    paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb m7, m_flimE
    pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand m0, m7    ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova m1, m4
    mova m7, m3
    pxor m1, m_pb_80
    pxor m7, m_pb_80
    psubsb m1, m7 ; (signed) q0-p0
    mova m6, m2
    mova m7, m5
    pxor m6, m_pb_80
    pxor m7, m_pb_80
    psubsb m6, m7 ; (signed) p1-q1
    mova m7, m_maskres
    pandn m7, m6
    paddsb m7, m1
    paddsb m7, m1
    paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)

    pand m7, m0
    mova m1, [pb_F8]
    mova m6, m7
    paddsb m7, [pb_3]
    paddsb m6, [pb_4]
    pand m7, m1
    pand m6, m1

    pxor m1, m1
    pxor m0, m0
    pcmpgtb m1, m7
    psubb m0, m7
    psrlq m7, 3 ; +f2
    psrlq m0, 3 ; -f2
    pand m0, m1
    pandn m1, m7
    psubusb m3, m0
    paddusb m3, m1 ; p0+f2

    pxor m1, m1
    pxor m0, m0
    pcmpgtb m0, m6
    psubb m1, m6
    psrlq m6, 3 ; +f1
    psrlq m1, 3 ; -f1
    pand m1, m0
    pandn m0, m6
    psubusb m4, m0
    paddusb m4, m1 ; q0-f1

%ifdef m12
    SWAP 6, 12
%else
    mova m6, m_maskres
%endif
%if notcpuflag(mmxext)
    mova m7, [pb_1]
%else ; mmxext/sse2
    pxor m7, m7
%endif
    pand m0, m6
    pand m1, m6
%if notcpuflag(mmxext)
    paddusb m0, m7
    pand m1, [pb_FE]
    pandn m7, m0
    psrlq m1, 1
    psrlq m7, 1
    SWAP 0, 7
%else ; mmxext/sse2
    psubusb m1, [pb_1]
    pavgb m0, m7 ; a
    pavgb m1, m7 ; -a
%endif
    psubusb m5, m0
    psubusb m2, m1
    paddusb m5, m1 ; q1-a
    paddusb m2, m0 ; p1+a

    ; store
%ifidn %1, v
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq ], m3
    movrow [dst1q], m4
    movrow [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
    movhps [dst8q+mstrideq*2], m2
    movhps [dst8q+mstrideq ], m3
    movhps [dst8q], m4
    movhps [dst8q+ strideq ], m5
%endif
%else ; h
    add dst1q, 2
    add dst2q, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B 2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
    lea dst8q, [dst8q+mstrideq +2]
    WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub dst1q, 2
%endif
    cmp dst1q, dst8q
    mov dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea dst1q, [dst1q+ strideq*8-2]
%else ; v
    add dst1q, 8
%endif
    dec cntrq
    jg .next8px
%endif
    REP_RET
%else ; mmsize == 16
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

INIT_MMX mmxext
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
%endif

INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

;-----------------------------------------------------------------------------
; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                                int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
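
; A reference sketch (comments only): after the same filter_mask/hev logic
; as the inner filter, non-hev pixels get a three-pair correction driven by
; w = clamp_s8(3 * (q0 - p0) + clamp_s8(p1 - q1)), roughly
;
;   a0 = clamp_s8((27 * w + 63) >> 7);   p0 += a0;   q0 -= a0;
;   a1 = clamp_s8((18 * w + 63) >> 7);   p1 += a1;   q1 -= a1;
;   a2 = clamp_s8(( 9 * w + 63) >> 7);   p2 += a2;   q2 -= a2;
;
; while hev pixels take the plain filter_common path, as above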

%macro MBEDGE_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ;         [3]=hev() result
                 ;         [4]=filter tmp result
                 ;         [5]/[6] = p2/q2 backup
                 ;         [7]=lim_res sign result
%define stack_size mmsize * -7
%else ; 8 ;                extra storage space for transposes
%define stack_size mmsize * -8
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor m7, m7
%endif

%ifndef m8
    ; splat function arguments
    SPLATB_REG m0, flimEq, m7  ; E
    SPLATB_REG m1, flimIq, m7  ; I
    SPLATB_REG m2, hevthrq, m7 ; hev_thresh

%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
%define m_maskres [rsp+mmsize*3]
%define m_limres [rsp+mmsize*4]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
%define m_p2backup [rsp+mmsize*5]
%define m_q2backup [rsp+mmsize*6]
%if mmsize == 16
%define m_limsign [rsp]
%else
%define m_limsign [rsp+mmsize*7]
%endif

    mova m_flimE, m0
    mova m_flimI, m1
    mova m_hevthr, m2
%else ; sse2 on x86-64
%define m_flimE m9
%define m_flimI m10
%define m_hevthr m11
%define m_maskres m12
%define m_limres m8
%define m_p0backup m12
%define m_q0backup m8
%define m_p2backup m13
%define m_q2backup m14
%define m_limsign m9

    ; splat function arguments
    SPLATB_REG m_flimE, flimEq, m7   ; E
    SPLATB_REG m_flimI, flimIq, m7   ; I
    SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif

%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea dst2q, [dst1q+ strideq ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow m0, [dst1q+mstrideq*4] ; p3
    movrow m1, [dst2q+mstrideq*4] ; p2
    movrow m2, [dst1q+mstrideq*2] ; p1
    movrow m5, [dst2q]            ; q1
    movrow m6, [dst2q+ strideq ]  ; q2
    movrow m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps m0, [dst8q+mstrideq*4]
    movhps m2, [dst8q+mstrideq*2]
    add dst8q, strideq
    movhps m1, [dst8q+mstrideq*4]
    movhps m5, [dst8q]
    movhps m6, [dst8q+ strideq ]
    movhps m7, [dst8q+ strideq*2]
    add dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu m0, [dst1q+mstrideq*4]
    movu m1, [dst2q+mstrideq*4]
    movu m2, [dst1q+mstrideq*2]
    movu m3, [dst1q+mstrideq ]
    movu m4, [dst1q]
    movu m5, [dst2q]
    movu m6, [dst2q+ strideq ]

    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova m_q0backup, m1
    movu m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
    mova m1, m_q0backup
    mova m_q0backup, m2    ; store q0
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
    mova m_p0backup, m5    ; store p0
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%else ; sse2 (h)
%if %2 == 16
    lea dst8q, [dst1q+ strideq*8 ]
%endif

    ; read 16 rows of 8px each, interleave
    movh m0, [dst1q+mstrideq*4]
    movh m1, [dst8q+mstrideq*4]
    movh m2, [dst1q+mstrideq*2]
    movh m5, [dst8q+mstrideq*2]
    movh m3, [dst1q+mstrideq ]
    movh m6, [dst8q+mstrideq ]
    movh m4, [dst1q]
    movh m7, [dst8q]
    punpcklbw m0, m1 ; A/I
    punpcklbw m2, m5 ; C/K
    punpcklbw m3, m6 ; D/L
    punpcklbw m4, m7 ; E/M

    add dst8q, strideq
    movh m1, [dst2q+mstrideq*4]
    movh m6, [dst8q+mstrideq*4]
    movh m5, [dst2q]
    movh m7, [dst8q]
    punpcklbw m1, m6 ; B/J
    punpcklbw m5, m7 ; F/N
    movh m6, [dst2q+ strideq ]
    movh m7, [dst8q+ strideq ]
    punpcklbw m6, m7 ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP 1, 8
%else
    mova m_q0backup, m1
%endif
    movh m7, [dst2q+ strideq*2]
    movh m1, [dst8q+ strideq*2]
    punpcklbw m7, m1 ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
    SWAP 1, 8
    SWAP 2, 8
%else
    mova m1, m_q0backup
    mova m_q0backup, m2 ; store q0
%endif
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
    SWAP 5, 12
%else
    mova m_p0backup, m5 ; store p0
%endif
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova m4, m1
    SWAP 4, 1
    psubusb m4, m0 ; p2-p3
    psubusb m0, m1 ; p3-p2
    por m0, m4     ; abs(p3-p2)

    mova m4, m2
    SWAP 4, 2
    psubusb m4, m1 ; p1-p2
    mova m_p2backup, m1
    psubusb m1, m2 ; p2-p1
    por m1, m4     ; abs(p2-p1)

    mova m4, m6
    SWAP 4, 6
    psubusb m4, m7 ; q2-q3
    psubusb m7, m6 ; q3-q2
    por m7, m4     ; abs(q3-q2)

    mova m4, m5
    SWAP 4, 5
    psubusb m4, m6 ; q1-q2
    mova m_q2backup, m6
    psubusb m6, m5 ; q2-q1
    por m6, m4     ; abs(q2-q1)

%if notcpuflag(mmxext)
    mova m4, m_flimI
    pxor m3, m3
    psubusb m0, m4
    psubusb m1, m4
    psubusb m7, m4
    psubusb m6, m4
    pcmpeqb m0, m3 ; abs(p3-p2) <= I
    pcmpeqb m1, m3 ; abs(p2-p1) <= I
    pcmpeqb m7, m3 ; abs(q3-q2) <= I
    pcmpeqb m6, m3 ; abs(q2-q1) <= I
    pand m0, m1
    pand m7, m6
    pand m0, m7
%else ; mmxext/sse2
    pmaxub m0, m1
    pmaxub m6, m7
    pmaxub m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP 7, 3 ; now m7 is zero
%ifidn %1, v
    movrow m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP 3, 12
%else
    mova m3, m_p0backup
%endif

    mova m1, m2
    SWAP 1, 2
    mova m6, m3
    SWAP 3, 6
    psubusb m1, m3 ; p1-p0
    psubusb m6, m2 ; p0-p1
    por m1, m6     ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova m6, m1
    psubusb m1, m4
    psubusb m6, m_hevthr
    pcmpeqb m1, m7 ; abs(p1-p0) <= I
    pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
    pand m0, m1
    mova m_maskres, m6
%else ; mmxext/sse2
    pmaxub m0, m1 ; max_I
    SWAP 1, 4     ; max_hev_thresh
%endif

    SWAP 6, 4 ; now m6 is I
%ifidn %1, v
    movrow m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
    movhps m4, [dst8q]
%endif
%elifdef m8
    SWAP 4, 8
%else
    mova m4, m_q0backup
%endif
    mova m1, m4
    SWAP 1, 4
    mova m7, m5
    SWAP 7, 5
    psubusb m1, m5 ; q0-q1
    psubusb m7, m4 ; q1-q0
    por m1, m7     ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova m7, m1
    psubusb m1, m6
    psubusb m7, m_hevthr
    pxor m6, m6
    pcmpeqb m1, m6 ; abs(q1-q0) <= I
    pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
    mova m6, m_maskres
    pand m0, m1    ; abs([pq][321]-[pq][210]) <= I
    pand m6, m7
%else ; mmxext/sse2
    pxor m7, m7
    pmaxub m0, m1
    pmaxub m6, m1
    psubusb m0, m_flimI
    psubusb m6, m_hevthr
    pcmpeqb m0, m7 ; max(abs(..)) <= I
    pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP 6, 12
%else
    mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova m1, m3
    SWAP 1, 3
    mova m6, m4 ; keep copies of p0/q0 around for later use
    SWAP 6, 4
    psubusb m1, m4 ; p0-q0
    psubusb m6, m3 ; q0-p0
    por m1, m6     ; abs(q0-p0)
    paddusb m1, m1 ; m1=2*abs(q0-p0)

    mova m7, m2
    SWAP 7, 2
    mova m6, m5
    SWAP 6, 5
    psubusb m7, m5 ; p1-q1
    psubusb m6, m2 ; q1-p1
    por m7, m6     ; abs(q1-p1)
    pxor m6, m6
    pand m7, [pb_FE]
    psrlq m7, 1    ; abs(q1-p1)/2
    paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb m7, m_flimE
    pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand m0, m7    ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova m1, m4
    mova m7, m3
    pxor m1, m_pb_80
    pxor m7, m_pb_80
    psubsb m1, m7 ; (signed) q0-p0
    mova m6, m2
    mova m7, m5
    pxor m6, m_pb_80
    pxor m7, m_pb_80
    psubsb m6, m7 ; (signed) p1-q1
    mova m7, m_maskres
    paddsb m6, m1
    paddsb m6, m1
    paddsb m6, m1
    pand m6, m0
%ifdef m8
    mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand m_limres, m7
%else
    mova m0, m6
    pand m0, m7
    mova m_limres, m0
%endif
    pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova m1, [pb_F8]
    mova m6, m7
    paddsb m7, [pb_3]
    paddsb m6, [pb_4]
    pand m7, m1
    pand m6, m1

    pxor m1, m1
    pxor m0, m0
    pcmpgtb m1, m7
    psubb m0, m7
    psrlq m7, 3 ; +f2
    psrlq m0, 3 ; -f2
    pand m0, m1
    pandn m1, m7
    psubusb m3, m0
    paddusb m3, m1 ; p0+f2

    pxor m1, m1
    pxor m0, m0
    pcmpgtb m0, m6
    psubb m1, m6
    psrlq m6, 3 ; +f1
    psrlq m1, 3 ; -f1
    pand m1, m0
    pandn m0, m6
    psubusb m4, m0
    paddusb m4, m1 ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
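    ; the weighted taps below compute (27*w+63)>>7, then the 18/9 variants:
    ; pre-SSSE3 on sign-extended words via pmullw/paddw, SSSE3 by
    ; interleaving w with the constant 1 and applying pmaddubsw against the
    ; packed (weight, 63) pairs, which fuses the multiply and the rounding
    ; add into a single instruction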
%if cpuflag(ssse3)
    mova m7, [pb_1]
%else
    mova m7, [pw_63]
%endif
%ifdef m8
    SWAP 1, 8
%else
    mova m1, m_limres
%endif
    pxor m0, m0
    mova m6, m1
    pcmpgtb m0, m1 ; which are negative
%if cpuflag(ssse3)
    punpcklbw m6, m7 ; interleave with "1" for rounding
    punpckhbw m1, m7
%else
    punpcklbw m6, m0 ; signed byte->word
    punpckhbw m1, m0
%endif
    mova m_limsign, m0
%if cpuflag(ssse3)
    mova m7, [pb_27_63]
%ifndef m8
    mova m_limres, m1
%endif
%ifdef m10
    SWAP 0, 10 ; don't lose lim_sign copy
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%ifdef m10
    SWAP 0, 10
%else
    mova m0, m_limsign
%endif
%else
    mova m_maskres, m6 ; backup for later in filter
    mova m_limres, m1
    pmullw m6, [pw_27]
    pmullw m1, [pw_27]
    paddw m6, m7
    paddw m1, m7
%endif
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a0
    pxor m1, m1
    psubb m1, m6
    pand m1, m0  ; -a0
    pandn m0, m6 ; +a0
%if cpuflag(ssse3)
    mova m6, [pb_18_63] ; pipelining
%endif
    psubusb m3, m1
    paddusb m4, m1
    paddusb m3, m0 ; p0+a0
    psubusb m4, m0 ; q0-a0

%if cpuflag(ssse3)
    SWAP 6, 7
%ifdef m10
    SWAP 1, 10
%else
    mova m1, m_limres
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%ifdef m10
    SWAP 0, 10
%endif
    mova m0, m_limsign
%else
    mova m6, m_maskres
    mova m1, m_limres
    pmullw m6, [pw_18]
    pmullw m1, [pw_18]
    paddw m6, m7
    paddw m1, m7
%endif
    mova m0, m_limsign
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a1
    pxor m1, m1
    psubb m1, m6
    pand m1, m0  ; -a1
    pandn m0, m6 ; +a1
%if cpuflag(ssse3)
    mova m6, [pb_9_63]
%endif
    psubusb m2, m1
    paddusb m5, m1
    paddusb m2, m0 ; p1+a1
    psubusb m5, m0 ; q1-a1

%if cpuflag(ssse3)
    SWAP 6, 7
%ifdef m10
    SWAP 1, 10
%else
    mova m1, m_limres
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%else
%ifdef m8
    SWAP 6, 12
    SWAP 1, 8
%else
    mova m6, m_maskres
    mova m1, m_limres
%endif
    pmullw m6, [pw_9]
    pmullw m1, [pw_9]
    paddw m6, m7
    paddw m1, m7
%endif
%ifdef m9
    SWAP 7, 9
%else
    mova m7, m_limsign
%endif
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a2
    pxor m0, m0
    psubb m0, m6
    pand m0, m7  ; -a2
    pandn m7, m6 ; +a2
%ifdef m8
    SWAP 1, 13
    SWAP 6, 14
%else
    mova m1, m_p2backup
    mova m6, m_q2backup
%endif
    psubusb m1, m0
    paddusb m6, m0
    paddusb m1, m7 ; p2+a2
    psubusb m6, m7 ; q2-a2

    ; store
%ifidn %1, v
    movrow [dst2q+mstrideq*4], m1
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq ], m3
    movrow [dst1q], m4
    movrow [dst2q], m5
    movrow [dst2q+ strideq ], m6
%if mmsize == 16 && %2 == 8
    add dst8q, mstrideq
    movhps [dst8q+mstrideq*2], m1
    movhps [dst8q+mstrideq ], m2
    movhps [dst8q], m3
    add dst8q, strideq
    movhps [dst8q], m4
    movhps [dst8q+ strideq ], m5
    movhps [dst8q+ strideq*2], m6
%endif
%else ; h
    inc dst1q
    inc dst2q

    ; 4x8/16 transpose
    TRANSPOSE4x4B 1, 2, 3, 4, 0
    SBUTTERFLY bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
    add dst1q, 4
    WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
    lea dst8q, [dst8q+mstrideq+1]
    WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
    lea dst1q, [dst2q+mstrideq+4]
    lea dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
    add dst2q, 4
%endif
    WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
%if cpuflag(sse4)
    lea dst2q, [dst8q+ strideq ]
%endif
    WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub dst1q, 5
%endif
    cmp dst1q, dst8q
    mov dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea dst1q, [dst1q+ strideq*8-5]
%else ; v
    add dst1q, 8
%endif
    dec cntrq
    jg .next8px
%endif
    REP_RET
%else ; mmsize == 16
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_MMX mmxext
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
%endif

INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8