; NOTE(review): the following line is a git-blame/table header left over from
; extraction, preserved as a comment so the file remains assemblable:
;   Commit 2ba45a60 DM
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NASM syntax; uses the x86inc.asm abstraction layer (cglobal, INIT_MMX/XMM,
; mmsize, rNmp stack-arg accessors) so each macro below assembles for both
; x86-32 and x86-64 and for multiple SIMD instruction sets.
%include "libavutil/x86/x86util.asm"

SECTION .text
; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data

; V_COPY_ROW type, h
; Copy `h` rows of `w` bytes (w passed on the stack as r7mp) from srcq to
; dstq. Caller invariants (set up in vvar_fn): srcq/dstq have been biased by
; +w and r7mp negated, so wq runs from -w up towards 0 and [srcq+wq] walks
; the row left to right; the final unaligned mmsize-sized store at
; [dstq-mmsize] covers the (possibly partial) tail of the row.
; %1 = top/body/bottom: src only advances per-row for "body"; top/bottom
; re-copy the same (edge) source row. Clobbers m0, wq, flags.
%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w -= $mmsize (wq is a negative offset counting up)
    cmp              wq, -mmsize                ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize)      (row tail, may overlap previous store)
    movu [dstq-mmsize], m0                      ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro
47 | ||
; void emu_edge_vvar(uint8_t *dst, ptrdiff_t dst_stride,
;                    const uint8_t *src, ptrdiff_t src_stride,
;                    int start_y, int end_y, int bh, int w)
; Vertical edge extension for variable width: replicates the first body row
; upwards ([0, start_y)), copies the body verbatim ([start_y, end_y)), and
; replicates the last body row downwards ([end_y, bh)).
%macro vvar_fn 0
; .----. <- zero
; |    | <- top is copied from first line in body of source
; |----| <- start_y
; |    | <- body is copied verbatim (line-by-line) from source
; |----| <- end_y
; |    | <- bottom is copied from last line in body of source
; '----' <- bh
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
; on x86-32 the strides stay on the stack; only 6 GPRs are available
%define src_strideq r3mp
%define dst_strideq r1mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_q   (rows below the body)
    sub          end_yq, start_yq               ; end_q -= start_q (rows in the body)
    ; bias src/dst by +w and negate the stacked w so V_COPY_ROW can index
    ; rows with a negative offset counting up to 0
    add            srcq, r7mp                   ; (r7mp = wmp)
    add            dstq, r7mp                   ; (r7mp = wmp)
    neg            r7mp                         ; (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_q) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_yq)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_yq)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride (back to last body row)
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET
%endmacro
85 | ||
; instantiate emu_edge_vvar: MMX (8-byte regs) only for x86-32; SSE (16-byte
; regs, movups) for everyone
%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif

INIT_XMM sse
vvar_fn
93 | ||
; void emu_edge_hvar(uint8_t *dst, ptrdiff_t dst_stride,
;                    int start_x, int n_words, int h)
; Horizontal edge extension for variable width: for each of h rows, reads the
; edge byte at dst[start_x], splats it across a SIMD register (byte *
; 0x01010101, then dword splat) and stores n_words*2 bytes of it to the left
; of the original dst. Same negative-offset trick as the vertical version:
; dst/start_x are biased by +n_words*2 and wq counts up from -n_words.
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]      ; bias dst to the end of the fill area
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]  ; rebase start_x against the biased dst
.y_loop:                                        ; do {
    ; FIXME also write a ssse3 version using pshufb
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)        (edge pixel of this row)
    imul             wd, 0x01010101             ;   w *= 0x01010101    (byte -> 4 identical bytes)
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w (negative word count)
%if cpuflag(sse2)
    pshufd           m0, m0, q0000              ;   splat dword across xmm
%else ; mmx
    punpckldq        m0, m0                     ;   splat dword across mm
%endif ; mmx/sse
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w -= $mmsize/2
    cmp              wq, -mmsize/2              ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize)   (unaligned tail store)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (h--)
    jnz .y_loop
    RET
%endmacro
121 | ||
; instantiate emu_edge_hvar: MMX only for x86-32; SSE2 (needed for pshufd)
; for everyone
%if ARCH_X86_32
INIT_MMX mmx
hvar_fn
%endif

INIT_XMM sse2
hvar_fn
129 | ||
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse,  - fills xmm0-15 for consecutive sets of 16 pixels
;          - if (%2 & 8)  fills 8 bytes into xmm$next
;          - if (%2 & 4)  fills 4 bytes into xmm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; on mmx,  - fills mm0-7 for consecutive sets of 8 pixels
;          - if (%2 & 4)  fills 4 bytes into mm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; writing data out is in the same way

; READ_NUM_BYTES type, n
; Compile-time unrolled: loads exactly %2 bytes from [srcq] into a fixed set
; of registers; WRITE_NUM_BYTES with the same arguments stores them back.
; %1 (top/body/bottom) only matters for the (%2 % 4) == 3 tail: the "body"
; case may over-read via a dword in eax (safe there), the edge cases use an
; mm register so the 3-byte store can avoid writing a 4th byte.
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

; full mmsize-wide loads into consecutive xmm (sse) / mm (mmx) registers
%rep %2/mmsize
%if mmsize == 16
    movu     xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu      mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

; 8-byte tail (sse only): either one overlapping 16-byte load ending at %2,
; or a movq into the next mm register
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu     xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq      mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

; 4-byte tail: overlapping movq ending at %2, or a movd
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq      mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
    movd      mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

; 1-3 byte tail: overlapping movd if %2 >= 4, otherwise val register;
; 3 leftover bytes load as an overlapping dword ("body") or mm reg (edges)
%if (%2-%%off) >= 1
%if %2 >= 4
    movd      mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
    mov            valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov            valw, [srcq+%2-2]
%elifidn %1, body
    mov            vald, [srcq+%2-3]
%else
    movd      mm %+ %%mmx_idx, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
194 | ||
; WRITE_NUM_BYTES type, n
; Mirror of READ_NUM_BYTES: stores exactly %2 bytes from the same register
; assignment (same %%xmm_idx/%%mmx_idx progression) to [dstq]. The (%2 % 4)
; == 3 tail writes word + shifted byte so no 4th byte beyond dst+%2 is
; touched.
%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

; full mmsize-wide stores from consecutive xmm (sse) / mm (mmx) registers
%rep %2/mmsize
%if mmsize == 16
    movu   [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

; 8-byte tail (sse only), matching the overlapping-load choice in the reader
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

; 4-byte tail
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
    movd   [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

; 1-3 byte tail; the 3-byte cases store word+byte to stay within bounds
%if (%2-%%off) >= 1
%if %2 >= 4
    movd    [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
    mov     [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov     [dstq+%2-2], valw
%elifidn %1, body
    mov     [dstq+%2-3], valw
    shr            vald, 16
    mov     [dstq+%2-1], valb
%else
    movd           vald, mm %+ %%mmx_idx
    mov     [dstq+%2-3], valw
    shr            vald, 16
    mov     [dstq+%2-1], valb
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
255 | ||
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer

; VERTICAL_EXTEND min_w, max_w
; Emits one emu_edge_vfixN function per width N in [min_w, max_w]. Each has
; the same top/body/bottom structure as emu_edge_vvar but with the row copy
; fully unrolled via READ_NUM_BYTES/WRITE_NUM_BYTES. Widths <= 3 need the
; val GPR for the byte/word tail, so their register layout differs.
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
; N <= 3: tail uses valb/valw/vald, so reserve a GPR named "val"
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
; N >= 4: tails go through mm/xmm registers, no val GPR needed
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh     -= end_y
    sub          end_yq, start_yq               ; end_y  -= start_y

    ; extend pixels above body: read the first body row once, write it
    ; start_y times
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels: read the last body row once, write it bh times
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    sub            srcq, src_strideq            ;   src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND
331 | ||
; widths 1..15 via MMX; widths 16..22 via SSE (or MMX on x86-32 as fallback)
INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif

INIT_XMM sse
VERTICAL_EXTEND 16, 22
340 | ||
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.

; READ_V_PIXEL n, src-operand
; Load the single edge byte %2 and replicate it: always into vald (4 copies
; via the 0x01010101 multiply); additionally splatted across m0 when the
; store width %1 needs SIMD registers (>= 8 bytes).
%macro READ_V_PIXEL 2
    movzx          vald, byte %2
    imul           vald, 0x01010101
%if %1 >= 8
    movd             m0, vald
%if mmsize == 16
    pshufd           m0, m0, q0000
%else
    punpckldq        m0, m0
%endif ; mmsize == 16
%endif ; %1 > 16
%endmacro ; READ_V_PIXEL
358 | ||
; WRITE_V_PIXEL n, dst-operand
; Store %1 bytes of the replicated pixel (m0 for wide stores, vald/valw for
; narrow tails) starting at %2, unrolled at assembly time with the same
; overlapping-tail-store scheme as WRITE_NUM_BYTES.
%macro WRITE_V_PIXEL 2
%assign %%off 0

%if %1 >= 8

; full mmsize-wide stores of the splatted register
%rep %1/mmsize
    movu     [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

; 8-byte tail (sse): overlapping 16-byte store ending at %1, or a movq
%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu     [%2+%1-16], m0
%assign %%off %1
%else
    movq     [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16

; 4-byte tail: overlapping movq ending at %1, or a movd
%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq      [%2+%1-8], m0
%assign %%off %1
%else
    movd     [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%else ; %1 < 8

; narrow widths: dword stores straight from the GPR
%rep %1/4
    mov      [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4

%endif ; %1 >=/< 8

; H_EXTEND widths are even, so the remaining tail is at most one word
%if %1-%%off == 2
    mov      [%2+%%off], valw
%endif ; (%1-%%off)/2
%endmacro ; WRITE_V_PIXEL
404 | ||
; H_EXTEND min_w, max_w
; Emits one emu_edge_hfixN per even width N in [min_w, max_w] (step 2):
; void emu_edge_hfixN(uint8_t *dst, ptrdiff_t dst_stride, int start_x, int bh)
; For each of bh rows, splat the edge byte dst[start_x] and write N copies
; of it at dst.
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND
419 | ||
; even widths 2..14 via MMX; 16..22 via SSE2 (or MMX on x86-32 as fallback)
INIT_MMX mmx
H_EXTEND 2, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif

INIT_XMM sse2
H_EXTEND 16, 22
428 | ||
; void prefetch(const uint8_t *buf, ptrdiff_t stride, int h)
; Issue a prefetch hint (%1 = prefetcht0 or 3dnow prefetch) for one address
; per row of the buffer.
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    %1 [bufq]
    add            bufq, strideq
    dec              hd
    jg .loop                                    ; jg: h may enter as <= 0
    REP_RET
%endmacro
438 | ||
; mmxext version uses SSE prefetcht0; 3dnow variant only exists on x86-32
INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif