; libavcodec/x86/lossless_videodsp.asm (commit 2ba45a60) — blame-view table
; header removed during extraction cleanup.
;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22 | ||
23 | %include "libavutil/x86/x86util.asm" | |
24 | ||
25 | SECTION_RODATA | |
26 | ||
27 | pb_ef: times 8 db 14,15 | |
28 | pb_67: times 8 db 6, 7 | |
29 | pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11 | |
30 | pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7 | |
31 | ||
32 | SECTION_TEXT | |
33 | ||
34 | %macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub | |
35 | movd m4, maskd | |
36 | SPLATW m4, m4 | |
37 | add wd, wd | |
38 | test wq, 2*mmsize - 1 | |
39 | jz %%.tomainloop | |
40 | push tmpq | |
41 | %%.wordloop: | |
42 | sub wq, 2 | |
43 | %ifidn %2, add | |
44 | mov tmpw, [srcq+wq] | |
45 | add tmpw, [dstq+wq] | |
46 | %else | |
47 | mov tmpw, [src1q+wq] | |
48 | sub tmpw, [src2q+wq] | |
49 | %endif | |
50 | and tmpw, maskw | |
51 | mov [dstq+wq], tmpw | |
52 | test wq, 2*mmsize - 1 | |
53 | jnz %%.wordloop | |
54 | pop tmpq | |
55 | %%.tomainloop: | |
56 | %ifidn %2, add | |
57 | add srcq, wq | |
58 | %else | |
59 | add src1q, wq | |
60 | add src2q, wq | |
61 | %endif | |
62 | add dstq, wq | |
63 | neg wq | |
64 | jz %%.end | |
65 | %%.loop: | |
66 | %ifidn %2, add | |
67 | mov%1 m0, [srcq+wq] | |
68 | mov%1 m1, [dstq+wq] | |
69 | mov%1 m2, [srcq+wq+mmsize] | |
70 | mov%1 m3, [dstq+wq+mmsize] | |
71 | %else | |
72 | mov%1 m0, [src1q+wq] | |
73 | mov%1 m1, [src2q+wq] | |
74 | mov%1 m2, [src1q+wq+mmsize] | |
75 | mov%1 m3, [src2q+wq+mmsize] | |
76 | %endif | |
77 | p%2w m0, m1 | |
78 | p%2w m2, m3 | |
79 | pand m0, m4 | |
80 | pand m2, m4 | |
81 | mov%1 [dstq+wq] , m0 | |
82 | mov%1 [dstq+wq+mmsize], m2 | |
83 | add wq, 2*mmsize | |
84 | jl %%.loop | |
85 | %%.end: | |
86 | RET | |
87 | %endmacro | |
88 | ||
89 | INIT_MMX mmx | |
90 | cglobal add_int16, 4,4,5, dst, src, mask, w, tmp | |
91 | INT16_LOOP a, add | |
92 | ||
93 | INIT_XMM sse2 | |
94 | cglobal add_int16, 4,4,5, dst, src, mask, w, tmp | |
95 | test srcq, mmsize-1 | |
96 | jnz .unaligned | |
97 | test dstq, mmsize-1 | |
98 | jnz .unaligned | |
99 | INT16_LOOP a, add | |
100 | .unaligned: | |
101 | INT16_LOOP u, add | |
102 | ||
103 | INIT_MMX mmx | |
104 | cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp | |
105 | INT16_LOOP a, sub | |
106 | ||
107 | INIT_XMM sse2 | |
108 | cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp | |
109 | test src1q, mmsize-1 | |
110 | jnz .unaligned | |
111 | test src2q, mmsize-1 | |
112 | jnz .unaligned | |
113 | test dstq, mmsize-1 | |
114 | jnz .unaligned | |
115 | INT16_LOOP a, sub | |
116 | .unaligned: | |
117 | INT16_LOOP u, sub | |
118 | ||
119 | ||
120 | %macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u) | |
121 | add wd, wd | |
122 | add srcq, wq | |
123 | add dstq, wq | |
124 | neg wq | |
125 | %%.loop: | |
126 | mov%2 m1, [srcq+wq] | |
127 | mova m2, m1 | |
128 | pslld m1, 16 | |
129 | paddw m1, m2 | |
130 | mova m2, m1 | |
131 | ||
132 | pshufb m1, m3 | |
133 | paddw m1, m2 | |
134 | pshufb m0, m5 | |
135 | %if mmsize == 16 | |
136 | mova m2, m1 | |
137 | pshufb m1, m4 | |
138 | paddw m1, m2 | |
139 | %endif | |
140 | paddw m0, m1 | |
141 | pand m0, m7 | |
142 | %ifidn %1, a | |
143 | mova [dstq+wq], m0 | |
144 | %else | |
145 | movq [dstq+wq], m0 | |
146 | movhps [dstq+wq+8], m0 | |
147 | %endif | |
148 | add wq, mmsize | |
149 | jl %%.loop | |
150 | mov eax, mmsize-1 | |
151 | sub eax, wd | |
152 | mov wd, eax | |
153 | shl wd, 8 | |
154 | lea eax, [wd+eax-1] | |
155 | movd m1, eax | |
156 | pshufb m0, m1 | |
157 | movd eax, m0 | |
158 | RET | |
159 | %endmacro | |
160 | ||
161 | ; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left) | |
162 | INIT_MMX ssse3 | |
163 | cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left | |
164 | .skip_prologue: | |
165 | mova m5, [pb_67] | |
166 | mova m3, [pb_zzzz2323zzzzabab] | |
167 | movd m0, leftm | |
168 | psllq m0, 48 | |
169 | movd m7, maskm | |
170 | SPLATW m7 ,m7 | |
171 | ADD_HFYU_LEFT_LOOP_INT16 a, a | |
172 | ||
173 | INIT_XMM sse4 | |
174 | cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left | |
175 | mova m5, [pb_ef] | |
176 | mova m4, [pb_zzzzzzzz67676767] | |
177 | mova m3, [pb_zzzz2323zzzzabab] | |
178 | movd m0, leftm | |
179 | pslldq m0, 14 | |
180 | movd m7, maskm | |
181 | SPLATW m7 ,m7 | |
182 | test srcq, 15 | |
183 | jnz .src_unaligned | |
184 | test dstq, 15 | |
185 | jnz .dst_unaligned | |
186 | ADD_HFYU_LEFT_LOOP_INT16 a, a | |
187 | .dst_unaligned: | |
188 | ADD_HFYU_LEFT_LOOP_INT16 u, a | |
189 | .src_unaligned: | |
190 | ADD_HFYU_LEFT_LOOP_INT16 u, u | |
191 | ||
192 | ; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top) | |
193 | INIT_MMX mmxext | |
194 | cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top | |
195 | add wd, wd | |
196 | movd mm6, maskd | |
197 | SPLATW mm6, mm6 | |
198 | movq mm0, [topq] | |
199 | movq mm2, mm0 | |
200 | movd mm4, [left_topq] | |
201 | psllq mm2, 16 | |
202 | movq mm1, mm0 | |
203 | por mm4, mm2 | |
204 | movd mm3, [leftq] | |
205 | psubw mm0, mm4 ; t-tl | |
206 | add dstq, wq | |
207 | add topq, wq | |
208 | add diffq, wq | |
209 | neg wq | |
210 | jmp .skip | |
211 | .loop: | |
212 | movq mm4, [topq+wq] | |
213 | movq mm0, mm4 | |
214 | psllq mm4, 16 | |
215 | por mm4, mm1 | |
216 | movq mm1, mm0 ; t | |
217 | psubw mm0, mm4 ; t-tl | |
218 | .skip: | |
219 | movq mm2, [diffq+wq] | |
220 | %assign i 0 | |
221 | %rep 4 | |
222 | movq mm4, mm0 | |
223 | paddw mm4, mm3 ; t-tl+l | |
224 | pand mm4, mm6 | |
225 | movq mm5, mm3 | |
226 | pmaxsw mm3, mm1 | |
227 | pminsw mm5, mm1 | |
228 | pminsw mm3, mm4 | |
229 | pmaxsw mm3, mm5 ; median | |
230 | paddw mm3, mm2 ; +residual | |
231 | pand mm3, mm6 | |
232 | %if i==0 | |
233 | movq mm7, mm3 | |
234 | psllq mm7, 48 | |
235 | %else | |
236 | movq mm4, mm3 | |
237 | psrlq mm7, 16 | |
238 | psllq mm4, 48 | |
239 | por mm7, mm4 | |
240 | %endif | |
241 | %if i<3 | |
242 | psrlq mm0, 16 | |
243 | psrlq mm1, 16 | |
244 | psrlq mm2, 16 | |
245 | %endif | |
246 | %assign i i+1 | |
247 | %endrep | |
248 | movq [dstq+wq], mm7 | |
249 | add wq, 8 | |
250 | jl .loop | |
251 | movzx r2d, word [dstq-2] | |
252 | mov [leftq], r2d | |
253 | movzx r2d, word [topq-2] | |
254 | mov [left_topq], r2d | |
255 | RET | |
256 | ||
257 | cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top | |
258 | add wd, wd | |
259 | movd mm7, maskd | |
260 | SPLATW mm7, mm7 | |
261 | movq mm0, [src1q] | |
262 | movq mm2, [src2q] | |
263 | psllq mm0, 16 | |
264 | psllq mm2, 16 | |
265 | movd mm6, [left_topq] | |
266 | por mm0, mm6 | |
267 | movd mm6, [leftq] | |
268 | por mm2, mm6 | |
269 | xor maskq, maskq | |
270 | .loop: | |
271 | movq mm1, [src1q + maskq] | |
272 | movq mm3, [src2q + maskq] | |
273 | movq mm4, mm2 | |
274 | psubw mm2, mm0 | |
275 | paddw mm2, mm1 | |
276 | pand mm2, mm7 | |
277 | movq mm5, mm4 | |
278 | pmaxsw mm4, mm1 | |
279 | pminsw mm1, mm5 | |
280 | pminsw mm4, mm2 | |
281 | pmaxsw mm4, mm1 | |
282 | psubw mm3, mm4 | |
283 | pand mm3, mm7 | |
284 | movq [dstq + maskq], mm3 | |
285 | add maskq, 8 | |
286 | movq mm0, [src1q + maskq - 2] | |
287 | movq mm2, [src2q + maskq - 2] | |
288 | cmp maskq, wq | |
289 | jb .loop | |
290 | movzx maskd, word [src1q + wq - 2] | |
291 | mov [left_topq], maskd | |
292 | movzx maskd, word [src2q + wq - 2] | |
293 | mov [leftq], maskd | |
294 | RET |