Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* Core video DSP functions | |
3 | ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION .text | |
25 | ||
26 | ; slow vertical extension function (emu_edge_vvar). Works with variable width, | |
27 | ; copying each row with its own read/write loop (V_COPY_ROW below) | |
28 | ||
29 | %macro V_COPY_ROW 2 ; type (top/body/bottom), h | |
30 | .%1_y_loop: ; do { | |
31 | mov wq, r7mp ; w = wmp (r7mp); vvar_fn negated it: w = -width | |
32 | .%1_x_loop: ; do { | |
33 | movu m0, [srcq+wq] ; m0 = read($mmsize) | |
34 | movu [dstq+wq], m0 ; write(m0, $mmsize) | |
35 | add wq, mmsize ; w += $mmsize (w is negative, counts up) | |
36 | cmp wq, -mmsize ; } while (w < -$mmsize); | |
37 | jl .%1_x_loop | |
38 | movu m0, [srcq-mmsize] ; m0 = read(last $mmsize bytes of the row) | |
39 | movu [dstq-mmsize], m0 ; write(m0), may overlap the previous chunk | |
40 | %ifidn %1, body ; if ($type == body) { | |
41 | add srcq, src_strideq ; src += src_stride | |
42 | %endif ; } (top/bottom keep re-reading the same row) | |
43 | add dstq, dst_strideq ; dst += dst_stride | |
44 | dec %2 ; } while (--$h); | |
45 | jnz .%1_y_loop | |
46 | %endmacro | |
47 | ||
48 | %macro vvar_fn 0 | |
49 | ; .----. <- zero | |
50 | ; | | <- top is copied from first line in body of source | |
51 | ; |----| <- start_y | |
52 | ; | | <- body is copied verbatim (line-by-line) from source | |
53 | ; |----| <- end_y | |
54 | ; | | <- bottom is copied from last line in body of source | |
55 | ; '----' <- bh | |
56 | %if ARCH_X86_64 | |
57 | cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ | |
58 | start_y, end_y, bh, w | |
59 | %else ; x86-32 | |
60 | cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w | |
61 | %define src_strideq r3mp | |
62 | %define dst_strideq r1mp | |
63 | mov srcq, r2mp | |
64 | mov start_yq, r4mp | |
65 | mov end_yq, r5mp | |
66 | mov bhq, r6mp | |
67 | %endif | |
68 | sub bhq, end_yq ; bh -= end_y (rows below the body) | |
69 | sub end_yq, start_yq ; end_y -= start_y (rows in the body) | |
70 | add srcq, r7mp ; src += w (r7mp = wmp): point at the row end | |
71 | add dstq, r7mp ; dst += w (r7mp = wmp): point at the row end | |
72 | neg r7mp ; w = -w (r7mp = wmp), rows are indexed [ptr+w] | |
73 | test start_yq, start_yq ; if (start_y) { | |
74 | jz .body | |
75 | V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) | |
76 | .body: ; } | |
77 | V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) | |
78 | test bhq, bhq ; if (bh) { | |
79 | jz .end | |
80 | sub srcq, src_strideq ; src -= src_stride (back to the last body row) | |
81 | V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) | |
82 | .end: ; } | |
83 | RET | |
84 | %endmacro | |
85 | ||
86 | %if ARCH_X86_32 | |
87 | INIT_MMX mmx | |
88 | vvar_fn | |
89 | %endif | |
90 | ||
91 | INIT_XMM sse | |
92 | vvar_fn | |
93 | ||
94 | %macro hvar_fn 0 | |
; emu_edge_hvar: for each of h rows, replicate the byte at dst[start_x] over
; the first n_words*2 bytes of that row, $mmsize bytes per store
95 | cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w | |
; bias dst to the end of the filled span and negate n_words so the x loop can
; count a negative index up; start_x is rebased so [dst+start_x] is unchanged
96 | lea dstq, [dstq+n_wordsq*2] | |
97 | neg n_wordsq | |
98 | lea start_xq, [start_xq+n_wordsq*2] | |
99 | .y_loop: ; do { | |
f6fa7814 DM |
100 | %if cpuflag(avx2) |
101 | vpbroadcastb m0, [dstq+start_xq] | |
102 | mov wq, n_wordsq ; w = -n_words | |
103 | %else | |
2ba45a60 DM |
104 | movzx wd, byte [dstq+start_xq] ; w = read(1) |
105 | imul wd, 0x01010101 ; w *= 0x01010101 (byte copied to all 4 bytes) | |
106 | movd m0, wd | |
107 | mov wq, n_wordsq ; w = -n_words | |
108 | %if cpuflag(sse2) | |
109 | pshufd m0, m0, q0000 ; splat low dword to all 4 dwords | |
110 | %else ; mmx | |
111 | punpckldq m0, m0 ; splat low dword to both dwords | |
112 | %endif ; mmx/sse | |
f6fa7814 | 113 | %endif ; avx2 |
2ba45a60 DM |
114 | .x_loop: ; do { |
115 | movu [dstq+wq*2], m0 ; write($reg, $mmsize) | |
116 | add wq, mmsize/2 ; w += $mmsize/2 (w counts 2-byte words, < 0) | |
117 | cmp wq, -mmsize/2 ; } while (w < -$mmsize/2) | |
118 | jl .x_loop | |
119 | movu [dstq-mmsize], m0 ; write($reg) to the last $mmsize bytes | |
120 | add dstq, dst_strideq ; dst += dst_stride | |
121 | dec hq ; } while (--h) | |
122 | jnz .y_loop | |
123 | RET | |
124 | %endmacro | |
125 | ||
126 | %if ARCH_X86_32 | |
127 | INIT_MMX mmx | |
128 | hvar_fn | |
129 | %endif | |
130 | ||
131 | INIT_XMM sse2 | |
132 | hvar_fn | |
133 | ||
f6fa7814 DM |
134 | %if HAVE_AVX2_EXTERNAL |
135 | INIT_XMM avx2 | |
136 | hvar_fn | |
137 | %endif | |
138 | ||
2ba45a60 DM |
139 | ; macro to read/write a horizontal number of pixels (%2) to/from registers |
140 | ; on sse, - fills xmm0, xmm1, ... for consecutive sets of 16 pixels | |
141 | ; - a remainder > 8 (when %2 > 16) is one overlapping 16-byte xmm$next load | |
142 | ; - any other remainder >= 8 fills 8 bytes into mm$next | |
143 | ; on mmx, - fills mm0-7 for consecutive sets of 8 pixels | |
144 | ; both, - a remainder >= 4 fills 4 (or 8 overlapping) bytes into mm$next | |
145 | ; - a remainder of 1-3 fills mm$next if %2 >= 4, else val (1, 2 or 4 bytes) | |
146 | ; (for %2 == 3 only "body" uses val, top/bottom load into mm$next) | |
147 | ; writing data out is in the same way | |
148 | %macro READ_NUM_BYTES 2 | |
149 | %assign %%off 0 ; offset in source buffer | |
150 | %assign %%mmx_idx 0 ; mmx register index | |
151 | %assign %%xmm_idx 0 ; xmm register index | |
152 | ||
153 | %rep %2/mmsize | |
154 | %if mmsize == 16 | |
155 | movu xmm %+ %%xmm_idx, [srcq+%%off] | |
156 | %assign %%xmm_idx %%xmm_idx+1 | |
157 | %else ; mmx | |
158 | movu mm %+ %%mmx_idx, [srcq+%%off] | |
159 | %assign %%mmx_idx %%mmx_idx+1 | |
160 | %endif | |
161 | %assign %%off %%off+mmsize | |
162 | %endrep ; %2/mmsize | |
163 | ||
164 | %if mmsize == 16 | |
165 | %if (%2-%%off) >= 8 | |
166 | %if %2 > 16 && (%2-%%off) > 8 | |
167 | movu xmm %+ %%xmm_idx, [srcq+%2-16] | |
168 | %assign %%xmm_idx %%xmm_idx+1 | |
169 | %assign %%off %2 | |
170 | %else | |
171 | movq mm %+ %%mmx_idx, [srcq+%%off] | |
172 | %assign %%mmx_idx %%mmx_idx+1 | |
173 | %assign %%off %%off+8 | |
174 | %endif | |
175 | %endif ; (%2-%%off) >= 8 | |
176 | %endif | |
177 | ||
178 | %if (%2-%%off) >= 4 | |
179 | %if %2 > 8 && (%2-%%off) > 4 | |
180 | movq mm %+ %%mmx_idx, [srcq+%2-8] | |
181 | %assign %%off %2 | |
182 | %else | |
183 | movd mm %+ %%mmx_idx, [srcq+%%off] | |
184 | %assign %%off %%off+4 | |
185 | %endif | |
186 | %assign %%mmx_idx %%mmx_idx+1 | |
187 | %endif ; (%2-%%off) >= 4 | |
188 | ||
189 | %if (%2-%%off) >= 1 | |
190 | %if %2 >= 4 | |
191 | movd mm %+ %%mmx_idx, [srcq+%2-4] | |
192 | %elif (%2-%%off) == 1 | |
193 | mov valb, [srcq+%2-1] | |
194 | %elif (%2-%%off) == 2 | |
195 | mov valw, [srcq+%2-2] | |
196 | %elifidn %1, body | |
197 | mov vald, [srcq+%2-3] | |
198 | %else | |
199 | movd mm %+ %%mmx_idx, [srcq+%2-3] | |
200 | %endif | |
201 | %endif ; (%2-%%off) >= 1 | |
202 | %endmacro ; READ_NUM_BYTES | |
203 | ||
204 | %macro WRITE_NUM_BYTES 2 | |
; stores to dstq the %2 bytes that READ_NUM_BYTES %1, %2 loaded; registers are
; consumed in the same order as they are filled there.
; the 3-byte case shifts val, so for top/bottom (where VERTICAL_EXTEND repeats
; the write without re-reading) val is refilled from an mm register each time
205 | %assign %%off 0 ; number of leading row bytes stored so far | |
206 | %assign %%mmx_idx 0 ; next mm register to store | |
207 | %assign %%xmm_idx 0 ; next xmm register to store | |
208 | ||
209 | %rep %2/mmsize | |
210 | %if mmsize == 16 | |
211 | movu [dstq+%%off], xmm %+ %%xmm_idx | |
212 | %assign %%xmm_idx %%xmm_idx+1 | |
213 | %else ; mmx | |
214 | movu [dstq+%%off], mm %+ %%mmx_idx | |
215 | %assign %%mmx_idx %%mmx_idx+1 | |
216 | %endif | |
217 | %assign %%off %%off+mmsize | |
218 | %endrep ; %2/mmsize | |
219 | ||
220 | %if mmsize == 16 | |
221 | %if (%2-%%off) >= 8 | |
222 | %if %2 > 16 && (%2-%%off) > 8 | |
223 | movu [dstq+%2-16], xmm %+ %%xmm_idx | |
224 | %assign %%xmm_idx %%xmm_idx+1 | |
225 | %assign %%off %2 | |
226 | %else | |
227 | movq [dstq+%%off], mm %+ %%mmx_idx | |
228 | %assign %%mmx_idx %%mmx_idx+1 | |
229 | %assign %%off %%off+8 | |
230 | %endif | |
231 | %endif ; (%2-%%off) >= 8 | |
232 | %endif | |
233 | ||
234 | %if (%2-%%off) >= 4 | |
235 | %if %2 > 8 && (%2-%%off) > 4 | |
236 | movq [dstq+%2-8], mm %+ %%mmx_idx | |
237 | %assign %%off %2 | |
238 | %else | |
239 | movd [dstq+%%off], mm %+ %%mmx_idx | |
240 | %assign %%off %%off+4 | |
241 | %endif | |
242 | %assign %%mmx_idx %%mmx_idx+1 | |
243 | %endif ; (%2-%%off) >= 4 | |
244 | ||
245 | %if (%2-%%off) >= 1 | |
246 | %if %2 >= 4 | |
247 | movd [dstq+%2-4], mm %+ %%mmx_idx | |
248 | %elif (%2-%%off) == 1 | |
249 | mov [dstq+%2-1], valb | |
250 | %elif (%2-%%off) == 2 | |
251 | mov [dstq+%2-2], valw | |
252 | %elifidn %1, body | |
253 | mov [dstq+%2-3], valw | |
254 | shr vald, 16 | |
255 | mov [dstq+%2-1], valb | |
256 | %else | |
257 | movd vald, mm %+ %%mmx_idx | |
258 | mov [dstq+%2-3], valw | |
259 | shr vald, 16 | |
260 | mov [dstq+%2-1], valb | |
261 | %endif | |
262 | %endif ; (%2-%%off) >= 1 | |
263 | %endmacro ; WRITE_NUM_BYTES | |
264 | ||
265 | ; vertical top/bottom extend and body copy fast loops | |
266 | ; one function is generated per fixed width (emu_edge_vfix$n); each of them | |
267 | ; reads a fixed number of pixels into set registers, and writes | |
268 | ; those out into the destination buffer | |
269 | %macro VERTICAL_EXTEND 2 | |
270 | %assign %%n %1 | |
271 | %rep 1+%2-%1 | |
; widths 1-3 additionally declare the GPR val used by READ/WRITE_NUM_BYTES
272 | %if %%n <= 3 | |
273 | %if ARCH_X86_64 | |
274 | cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ | |
275 | start_y, end_y, val, bh | |
276 | mov bhq, r6mp ; r6mp = bhmp | |
277 | %else ; x86-32 | |
278 | cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh | |
279 | mov dstq, r0mp | |
280 | mov srcq, r2mp | |
281 | mov start_yq, r4mp | |
282 | mov end_yq, r5mp | |
283 | mov bhq, r6mp | |
284 | %define dst_strideq r1mp | |
285 | %define src_strideq r3mp | |
286 | %endif ; x86-64/32 | |
287 | %else | |
288 | %if ARCH_X86_64 | |
289 | cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ | |
290 | start_y, end_y, bh | |
291 | %else ; x86-32 | |
292 | cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh | |
293 | mov srcq, r2mp | |
294 | mov start_yq, r4mp | |
295 | mov end_yq, r5mp | |
296 | mov bhq, r6mp | |
297 | %define dst_strideq r1mp | |
298 | %define src_strideq r3mp | |
299 | %endif ; x86-64/32 | |
300 | %endif | |
301 | ; FIXME move this to c wrapper? | |
302 | sub bhq, end_yq ; bh -= end_y (rows below the body) | |
303 | sub end_yq, start_yq ; end_y -= start_y (rows in the body) | |
304 | ||
305 | ; extend pixels above body | |
306 | test start_yq, start_yq ; if (start_y) { | |
307 | jz .body_loop | |
308 | READ_NUM_BYTES top, %%n ; $variable_regs = read($n) | |
309 | .top_loop: ; do { | |
310 | WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n) | |
311 | add dstq, dst_strideq ; dst += dst_stride | |
312 | dec start_yq ; } while (--start_y) | |
313 | jnz .top_loop ; } | |
314 | ||
315 | ; copy body pixels | |
316 | .body_loop: ; do { | |
317 | READ_NUM_BYTES body, %%n ; $variable_regs = read($n) | |
318 | WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n) | |
319 | add dstq, dst_strideq ; dst += dst_stride | |
320 | add srcq, src_strideq ; src += src_stride | |
321 | dec end_yq ; } while (--end_y) | |
322 | jnz .body_loop | |
323 | ||
324 | ; copy bottom pixels | |
325 | test bhq, bhq ; if (bh) { | |
326 | jz .end | |
327 | sub srcq, src_strideq ; src -= src_stride (back to the last body row) | |
328 | READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n) | |
329 | .bottom_loop: ; do { | |
330 | WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n) | |
331 | add dstq, dst_strideq ; dst += dst_stride | |
332 | dec bhq ; } while (--bh) | |
333 | jnz .bottom_loop ; } | |
334 | ||
335 | .end: | |
336 | RET | |
337 | %assign %%n %%n+1 | |
338 | %endrep ; 1+%2-%1 | |
339 | %endmacro ; VERTICAL_EXTEND | |
340 | ||
341 | INIT_MMX mmx | |
342 | VERTICAL_EXTEND 1, 15 | |
343 | %if ARCH_X86_32 | |
344 | VERTICAL_EXTEND 16, 22 | |
345 | %endif | |
346 | ||
347 | INIT_XMM sse | |
348 | VERTICAL_EXTEND 16, 22 | |
349 | ||
350 | ; left/right (horizontal) fast extend functions | |
351 | ; these are essentially identical to the vertical extend ones above, | |
352 | ; just left/right separated because number of pixels to extend is | |
353 | ; obviously not the same on both sides. | |
354 | ||
355 | %macro READ_V_PIXEL 2 | |
; %1 = width in bytes, %2 = memory operand of the pixel to replicate.
; avx2: broadcasts the byte into m0. otherwise: replicates it into all 4 bytes
; of val, and for %1 >= 8 also splats that dword across m0
f6fa7814 DM |
356 | %if cpuflag(avx2) |
357 | vpbroadcastb m0, %2 | |
358 | %else | |
2ba45a60 DM |
359 | movzx vald, byte %2 |
360 | imul vald, 0x01010101 | |
361 | %if %1 >= 8 | |
362 | movd m0, vald | |
363 | %if mmsize == 16 | |
364 | pshufd m0, m0, q0000 | |
365 | %else | |
366 | punpckldq m0, m0 | |
367 | %endif ; mmsize == 16 | |
368 | %endif ; %1 >= 8 | |
f6fa7814 | 369 | %endif ; avx2 |
2ba45a60 DM |
370 | %endmacro ; READ_V_PIXEL |
371 | ||
372 | %macro WRITE_V_PIXEL 2 | |
; %1 = width in bytes, %2 = destination base register.
; stores %1 bytes of the replicated pixel prepared by READ_V_PIXEL (m0 for
; %1 >= 8, val below that); partial tails use overlapping or narrower stores
373 | %assign %%off 0 | |
374 | ||
375 | %if %1 >= 8 | |
376 | ||
377 | %rep %1/mmsize | |
378 | movu [%2+%%off], m0 | |
379 | %assign %%off %%off+mmsize | |
380 | %endrep ; %1/mmsize | |
381 | ||
382 | %if mmsize == 16 | |
383 | %if %1-%%off >= 8 | |
384 | %if %1 > 16 && %1-%%off > 8 | |
385 | movu [%2+%1-16], m0 | |
386 | %assign %%off %1 | |
387 | %else | |
388 | movq [%2+%%off], m0 | |
389 | %assign %%off %%off+8 | |
390 | %endif | |
391 | %endif ; %1-%%off >= 8 | |
392 | %endif ; mmsize == 16 | |
393 | ||
394 | %if %1-%%off >= 4 | |
395 | %if %1 > 8 && %1-%%off > 4 | |
396 | movq [%2+%1-8], m0 | |
397 | %assign %%off %1 | |
398 | %else | |
399 | movd [%2+%%off], m0 | |
400 | %assign %%off %%off+4 | |
401 | %endif | |
402 | %endif ; %1-%%off >= 4 | |
403 | ||
404 | %else ; %1 < 8 | |
405 | ||
406 | %rep %1/4 | |
407 | mov [%2+%%off], vald | |
408 | %assign %%off %%off+4 | |
409 | %endrep ; %1/4 | |
410 | ||
411 | %endif ; %1 >=/< 8 | |
412 | ||
413 | %if %1-%%off == 2 | |
f6fa7814 DM |
414 | %if cpuflag(avx2) |
415 | movd [%2+%%off-2], m0 | |
416 | %else | |
2ba45a60 | 417 | mov [%2+%%off], valw |
f6fa7814 | 418 | %endif ; avx2 (no val there: 4-byte store overlapping the 2 bytes before) |
2ba45a60 DM |
419 | %endif ; %1-%%off == 2 |
420 | %endmacro ; WRITE_V_PIXEL | |
421 | ||
422 | %macro H_EXTEND 2 | |
; generates emu_edge_hfix$n for n = %1, %1+2, ..., %2: for each of bh rows the
; byte at dst[start_x] is replicated over the first n bytes of the row.
; the avx2 variant broadcasts straight into m0 and so declares no val GPR
423 | %assign %%n %1 | |
424 | %rep 1+(%2-%1)/2 | |
f6fa7814 DM |
425 | %if cpuflag(avx2) |
426 | cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh | |
427 | %else | |
2ba45a60 | 428 | cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val |
f6fa7814 | 429 | %endif |
2ba45a60 DM |
430 | .loop_y: ; do { |
431 | READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = splat(read(1)) | |
432 | WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) | |
433 | add dstq, dst_strideq ; dst += dst_stride | |
434 | dec bhq ; } while (--bh) | |
435 | jnz .loop_y | |
436 | RET | |
437 | %assign %%n %%n+2 | |
438 | %endrep ; 1+(%2-%1)/2 | |
439 | %endmacro ; H_EXTEND | |
440 | ||
441 | INIT_MMX mmx | |
442 | H_EXTEND 2, 14 | |
443 | %if ARCH_X86_32 | |
444 | H_EXTEND 16, 22 | |
445 | %endif | |
446 | ||
447 | INIT_XMM sse2 | |
448 | H_EXTEND 16, 22 | |
449 | ||
f6fa7814 DM |
450 | %if HAVE_AVX2_EXTERNAL |
451 | INIT_XMM avx2 | |
452 | H_EXTEND 8, 22 | |
453 | %endif | |
454 | ||
2ba45a60 DM |
455 | %macro PREFETCH_FN 1 |
456 | cglobal prefetch, 3, 3, 0, buf, stride, h | |
457 | .loop: | |
458 | %1 [bufq] | |
459 | add bufq, strideq | |
460 | dec hd | |
461 | jg .loop | |
462 | REP_RET | |
463 | %endmacro | |
464 | ||
465 | INIT_MMX mmxext | |
466 | PREFETCH_FN prefetcht0 | |
467 | %if ARCH_X86_32 | |
468 | INIT_MMX 3dnow | |
469 | PREFETCH_FN prefetch | |
470 | %endif |