;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text

;------------------------------------------------------------------------------
; DIAG4  base, off0, off1, off2, off3, dst
; 4-tap filter of 8 pixels: loads the four 8-byte sources [base+offN],
; multiplies each by its weight, sums, rounds ((sum + 64) >> 7), saturates
; back to bytes and stores 8 bytes at [dst].
; Weights: the MMX version reads them from the stack scratch slots filled by
; SPLAT4REGS ([rsp+8*11..8*14]); the SSE2 version keeps them in m4/m5/m6/m3.
; Assumes m7 == 0 (zero reg for byte->word unpacks); MMX also assumes
; m6 == pw_64 (rounding constant).
;------------------------------------------------------------------------------
%macro DIAG4 6
%if mmsize == 8
    ; MMX: 8 pixels need both the low and the high unpacked halves
    movq        m0, [%1+%2]
    movq        m1, [%1+%3]
    movq        m3, m0
    movq        m4, m1
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpckhbw   m3, m7
    punpckhbw   m4, m7
    pmullw      m0, [rsp+8*11]  ; src[x-8 ] * biweight [0]
    pmullw      m1, [rsp+8*12]  ; src[x   ] * biweight [1]
    pmullw      m3, [rsp+8*11]  ; src[x-8 ] * biweight [0]
    pmullw      m4, [rsp+8*12]  ; src[x   ] * biweight [1]
    paddw       m0, m1
    paddw       m3, m4
    movq        m1, [%1+%4]
    movq        m2, [%1+%5]
    movq        m4, m1
    movq        m5, m2
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpckhbw   m4, m7
    punpckhbw   m5, m7
    pmullw      m1, [rsp+8*13]  ; src[x+8 ] * biweight [2]
    pmullw      m2, [rsp+8*14]  ; src[x+16] * biweight [3]
    pmullw      m4, [rsp+8*13]  ; src[x+8 ] * biweight [2]
    pmullw      m5, [rsp+8*14]  ; src[x+16] * biweight [3]
    paddw       m1, m2
    paddw       m4, m5
    paddsw      m0, m1
    paddsw      m3, m4
    paddsw      m0, m6          ; add 64 (round before the >>7)
    paddsw      m3, m6          ; add 64
    psraw       m0, 7
    psraw       m3, 7
    packuswb    m0, m3          ; saturate words back to 8 bytes
    movq        [%6], m0
%else ; mmsize == 16
    ; SSE2: all 8 pixels fit in the low half of one xmm register,
    ; so a single (low) unpack per source suffices
    movq        m0, [%1+%2]
    movq        m1, [%1+%3]
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    pmullw      m0, m4          ; src[x-8 ] * biweight [0]
    pmullw      m1, m5          ; src[x   ] * biweight [1]
    paddw       m0, m1
    movq        m1, [%1+%4]
    movq        m2, [%1+%5]
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    pmullw      m1, m6          ; src[x+8 ] * biweight [2]
    pmullw      m2, m3          ; src[x+16] * biweight [3]
    paddw       m1, m2
    paddsw      m0, m1
    paddsw      m0, [pw_64]     ; add 64 (round before the >>7)
    psraw       m0, 7
    packuswb    m0, m0          ; saturate words back to 8 bytes
    movq        [%6], m0
%endif ; mmsize == 8/16
%endmacro

;------------------------------------------------------------------------------
; SPLAT4REGS
; Broadcast each of the four 16-bit weights held in m3 (w0 w1 w2 w3) across
; a full register. MMX stores the four splatted qwords into the stack
; scratch slots [rsp+8*11..8*14] (read back by DIAG4); SSE2 leaves them in
; registers: m4 = w0, m5 = w1, m6 = w2, m3 = w3.
; Clobbers m2-m5 (MMX) / m3-m6 (SSE2).
;------------------------------------------------------------------------------
%macro SPLAT4REGS 0
%if mmsize == 8
    movq        m5, m3
    punpcklwd   m3, m3          ; m3 = w0 w0 w1 w1
    movq        m4, m3
    punpckldq   m3, m3          ; m3 = w0 w0 w0 w0
    punpckhdq   m4, m4          ; m4 = w1 w1 w1 w1
    punpckhwd   m5, m5          ; m5 = w2 w2 w3 w3
    movq        m2, m5
    punpckhdq   m2, m2          ; m2 = w3 w3 w3 w3
    punpckldq   m5, m5          ; m5 = w2 w2 w2 w2
    movq        [rsp+8*11], m3  ; slot read by DIAG4 as biweight [0]
    movq        [rsp+8*12], m4  ; biweight [1]
    movq        [rsp+8*13], m5  ; biweight [2]
    movq        [rsp+8*14], m2  ; biweight [3]
%else ; mmsize == 16
    pshuflw     m4, m3, 0x0     ; low qword of m4 = w0 w0 w0 w0
    pshuflw     m5, m3, 0x55    ; w1
    pshuflw     m6, m3, 0xAA    ; w2
    pshuflw     m3, m3, 0xFF    ; w3
    punpcklqdq  m4, m4          ; duplicate low qword into high qword
    punpcklqdq  m5, m5
    punpcklqdq  m6, m6
    punpcklqdq  m3, m3
%endif ; mmsize == 8/16
%endmacro

;------------------------------------------------------------------------------
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weights[4],
;                                const int16_t v_weights[4])
; Separable two-pass 4-tap filter of an 8x8 block:
;   pass 1 filters 11 rows of src horizontally (taps at x-1..x+2) into an
;          8-byte-stride temp buffer on the aligned stack;
;   pass 2 filters that buffer vertically (taps one row apart, i.e. 8 bytes
;          apart) into dst, producing 8 output rows.
;------------------------------------------------------------------------------
%macro vp6_filter_diag4 0
cglobal vp6_filter_diag4, 5, 7, 8
    mov         r5, rsp         ; backup stack pointer
    and         rsp, ~(mmsize-1) ; align stack
%if mmsize == 16
    sub         rsp, 8*11       ; 11 qwords: temp rows only
%else
    sub         rsp, 8*15       ; 11 temp rows + 4 splatted-weight slots
    movq        m6, [pw_64]     ; rounding constant used by MMX DIAG4
%endif
%if ARCH_X86_64
    movsxd      r2, r2d         ; stride arrives as 32-bit int; sign-extend
%endif

    sub         r1, r2          ; step back one row: filter needs src row -1

    pxor        m7, m7          ; zero reg for DIAG4's byte->word unpacks
    movq        m3, [r3]        ; load the 4 horizontal weights
    SPLAT4REGS

    ; pass 1: 11 rows of 8 pixels into the temp buffer at rsp
    mov         r3, rsp
    mov         r6, 11
.nextrow:
    DIAG4       r1, -1, 0, 1, 2, r3
    add         r3, 8
    add         r1, r2
    dec         r6
    jnz         .nextrow

    movq        m3, [r4]        ; load the 4 vertical weights
    SPLAT4REGS

    ; pass 2: vertical filter (temp row stride = 8 bytes), 8 rows into dst
    lea         r3, [rsp+8]     ; start at temp row 1 so tap -8 hits row 0
    mov         r6, 8
.nextcol:
    DIAG4       r3, -8, 0, 8, 16, r0
    add         r3, 8
    add         r0, r2
    dec         r6
    jnz         .nextcol

    mov         rsp, r5         ; restore stack pointer
    RET
%endmacro

; Instantiate the templates: the MMX version is only built for x86_32;
; the SSE2 version is built everywhere.
%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4                ; ff_vp6_filter_diag4_mmx
%endif

INIT_XMM sse2
vp6_filter_diag4                ; ff_vp6_filter_diag4_sse2