Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* Copyright (c) 2010 David Conrad | |
3 | ;* | |
4 | ;* This file is part of FFmpeg. | |
5 | ;* | |
6 | ;* FFmpeg is free software; you can redistribute it and/or | |
7 | ;* modify it under the terms of the GNU Lesser General Public | |
8 | ;* License as published by the Free Software Foundation; either | |
9 | ;* version 2.1 of the License, or (at your option) any later version. | |
10 | ;* | |
11 | ;* FFmpeg is distributed in the hope that it will be useful, | |
12 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | ;* Lesser General Public License for more details. | |
15 | ;* | |
16 | ;* You should have received a copy of the GNU Lesser General Public | |
17 | ;* License along with FFmpeg; if not, write to the Free Software | |
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 | ;****************************************************************************** | |
20 | ||
21 | %include "libavutil/x86/x86util.asm" | |
22 | ||
23 | SECTION_RODATA | |
24 | pw_7: times 8 dw 7 | |
25 | ||
26 | cextern pw_3 | |
27 | cextern pw_16 | |
28 | cextern pw_32 | |
29 | cextern pb_80 | |
30 | ||
31 | section .text | |
32 | ||
33 | %macro UNPACK_ADD 6 | |
34 | mov%5 %1, %3 | |
35 | mov%6 m5, %4 | |
36 | mova m4, %1 | |
37 | mova %2, m5 | |
38 | punpcklbw %1, m7 | |
39 | punpcklbw m5, m7 | |
40 | punpckhbw m4, m7 | |
41 | punpckhbw %2, m7 | |
42 | paddw %1, m5 | |
43 | paddw %2, m4 | |
44 | %endmacro | |
45 | ||
46 | %macro HPEL_FILTER 1 | |
47 | ; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width); | |
48 | cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3 | |
49 | mov src0q, srcq | |
50 | lea stridex3q, [3*strideq] | |
51 | sub src0q, stridex3q | |
52 | pxor m7, m7 | |
53 | .loop: | |
54 | ; 7*(src[0] + src[1]) | |
55 | UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a | |
56 | pmullw m0, [pw_7] | |
57 | pmullw m1, [pw_7] | |
58 | ||
59 | ; 3*( ... + src[-2] + src[3]) | |
60 | UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a | |
61 | paddw m0, m2 | |
62 | paddw m1, m3 | |
63 | pmullw m0, [pw_3] | |
64 | pmullw m1, [pw_3] | |
65 | ||
66 | ; ... - 7*(src[-1] + src[2]) | |
67 | UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a | |
68 | pmullw m2, [pw_7] | |
69 | pmullw m3, [pw_7] | |
70 | psubw m0, m2 | |
71 | psubw m1, m3 | |
72 | ||
73 | ; ... - (src[-3] + src[4]) | |
74 | UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a | |
75 | psubw m0, m2 | |
76 | psubw m1, m3 | |
77 | ||
78 | paddw m0, [pw_16] | |
79 | paddw m1, [pw_16] | |
80 | psraw m0, 5 | |
81 | psraw m1, 5 | |
82 | packuswb m0, m1 | |
83 | mova [dstq], m0 | |
84 | add dstq, mmsize | |
85 | add srcq, mmsize | |
86 | add src0q, mmsize | |
87 | sub widthd, mmsize | |
88 | jg .loop | |
89 | RET | |
90 | ||
91 | ; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width); | |
92 | cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width | |
93 | dec widthd | |
94 | pxor m7, m7 | |
95 | and widthd, ~(mmsize-1) | |
96 | .loop: | |
97 | ; 7*(src[0] + src[1]) | |
98 | UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u | |
99 | pmullw m0, [pw_7] | |
100 | pmullw m1, [pw_7] | |
101 | ||
102 | ; 3*( ... + src[-2] + src[3]) | |
103 | UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u | |
104 | paddw m0, m2 | |
105 | paddw m1, m3 | |
106 | pmullw m0, [pw_3] | |
107 | pmullw m1, [pw_3] | |
108 | ||
109 | ; ... - 7*(src[-1] + src[2]) | |
110 | UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u | |
111 | pmullw m2, [pw_7] | |
112 | pmullw m3, [pw_7] | |
113 | psubw m0, m2 | |
114 | psubw m1, m3 | |
115 | ||
116 | ; ... - (src[-3] + src[4]) | |
117 | UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u | |
118 | psubw m0, m2 | |
119 | psubw m1, m3 | |
120 | ||
121 | paddw m0, [pw_16] | |
122 | paddw m1, [pw_16] | |
123 | psraw m0, 5 | |
124 | psraw m1, 5 | |
125 | packuswb m0, m1 | |
126 | mova [dstq + widthq], m0 | |
127 | sub widthd, mmsize | |
128 | jge .loop | |
129 | RET | |
130 | %endmacro | |
131 | ||
132 | %macro PUT_RECT 1 | |
133 | ; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height) | |
134 | cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2 | |
135 | mova m0, [pb_80] | |
136 | add wd, (mmsize-1) | |
137 | and wd, ~(mmsize-1) | |
138 | ||
139 | %if ARCH_X86_64 | |
140 | movsxd dst_strideq, dst_strided | |
141 | movsxd src_strideq, src_strided | |
142 | mov r7d, r5m | |
143 | mov r8d, wd | |
144 | %define wspill r8d | |
145 | %define hd r7d | |
146 | %else | |
147 | mov r4m, wd | |
148 | %define wspill r4m | |
149 | %define hd r5mp | |
150 | %endif | |
151 | ||
152 | .loopy | |
153 | lea src2q, [srcq+src_strideq*2] | |
154 | lea dst2q, [dstq+dst_strideq] | |
155 | .loopx: | |
156 | sub wd, mmsize | |
157 | mova m1, [srcq +2*wq] | |
158 | mova m2, [src2q+2*wq] | |
159 | packsswb m1, [srcq +2*wq+mmsize] | |
160 | packsswb m2, [src2q+2*wq+mmsize] | |
161 | paddb m1, m0 | |
162 | paddb m2, m0 | |
163 | mova [dstq +wq], m1 | |
164 | mova [dst2q+wq], m2 | |
165 | jg .loopx | |
166 | ||
167 | lea srcq, [srcq+src_strideq*4] | |
168 | lea dstq, [dstq+dst_strideq*2] | |
169 | sub hd, 2 | |
170 | mov wd, wspill | |
171 | jg .loopy | |
172 | RET | |
173 | %endm | |
174 | ||
175 | %macro ADD_RECT 1 | |
176 | ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) | |
177 | cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h | |
178 | mova m0, [pw_32] | |
179 | add wd, (mmsize-1) | |
180 | and wd, ~(mmsize-1) | |
181 | ||
182 | %if ARCH_X86_64 | |
183 | movsxd strideq, strided | |
184 | movsxd idwt_strideq, idwt_strided | |
185 | mov r8d, wd | |
186 | %define wspill r8d | |
187 | %else | |
188 | mov r5m, wd | |
189 | %define wspill r5m | |
190 | %endif | |
191 | ||
192 | .loop: | |
193 | sub wd, mmsize | |
194 | movu m1, [srcq +2*wq] ; FIXME: ensure alignment | |
195 | paddw m1, m0 | |
196 | psraw m1, 6 | |
197 | movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment | |
198 | paddw m2, m0 | |
199 | psraw m2, 6 | |
200 | paddw m1, [idwtq+2*wq] | |
201 | paddw m2, [idwtq+2*wq+mmsize] | |
202 | packuswb m1, m2 | |
203 | mova [dstq +wq], m1 | |
204 | jg .loop | |
205 | ||
206 | lea srcq, [srcq + 2*strideq] | |
207 | add dstq, strideq | |
208 | lea idwtq, [idwtq+ 2*idwt_strideq] | |
209 | sub hd, 1 | |
210 | mov wd, wspill | |
211 | jg .loop | |
212 | RET | |
213 | %endm | |
214 | ||
215 | %macro ADD_OBMC 2 | |
216 | ; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) | |
217 | cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen | |
218 | pxor m4, m4 | |
219 | .loop: | |
220 | %assign i 0 | |
221 | %rep %1 / mmsize | |
222 | mova m0, [srcq+i] | |
223 | mova m1, m0 | |
224 | punpcklbw m0, m4 | |
225 | punpckhbw m1, m4 | |
226 | mova m2, [obmcq+i] | |
227 | mova m3, m2 | |
228 | punpcklbw m2, m4 | |
229 | punpckhbw m3, m4 | |
230 | pmullw m0, m2 | |
231 | pmullw m1, m3 | |
232 | movu m2, [dstq+2*i] | |
233 | movu m3, [dstq+2*i+mmsize] | |
234 | paddw m0, m2 | |
235 | paddw m1, m3 | |
236 | movu [dstq+2*i], m0 | |
237 | movu [dstq+2*i+mmsize], m1 | |
238 | %assign i i+mmsize | |
239 | %endrep | |
240 | lea srcq, [srcq+strideq] | |
241 | lea dstq, [dstq+2*strideq] | |
242 | add obmcq, 32 | |
243 | sub yblend, 1 | |
244 | jg .loop | |
245 | RET | |
246 | %endm | |
247 | ||
248 | INIT_MMX | |
249 | %if ARCH_X86_64 == 0 | |
250 | PUT_RECT mmx | |
251 | ADD_RECT mmx | |
252 | ||
253 | HPEL_FILTER mmx | |
254 | ADD_OBMC 32, mmx | |
255 | ADD_OBMC 16, mmx | |
256 | %endif | |
257 | ADD_OBMC 8, mmx | |
258 | ||
259 | INIT_XMM | |
260 | PUT_RECT sse2 | |
261 | ADD_RECT sse2 | |
262 | ||
263 | HPEL_FILTER sse2 | |
264 | ADD_OBMC 32, sse2 | |
265 | ADD_OBMC 16, sse2 |