Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* Copyright (c) 2012 Michael Niedermayer | |
3 | ;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com> | |
4 | ;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com> | |
5 | ;* | |
6 | ;* This file is part of FFmpeg. | |
7 | ;* | |
8 | ;* FFmpeg is free software; you can redistribute it and/or | |
9 | ;* modify it under the terms of the GNU Lesser General Public | |
10 | ;* License as published by the Free Software Foundation; either | |
11 | ;* version 2.1 of the License, or (at your option) any later version. | |
12 | ;* | |
13 | ;* FFmpeg is distributed in the hope that it will be useful, | |
14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | ;* Lesser General Public License for more details. | |
17 | ;* | |
18 | ;* You should have received a copy of the GNU Lesser General Public | |
19 | ;* License along with FFmpeg; if not, write to the Free Software | |
20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | ;****************************************************************************** | |
22 | ||
23 | %include "libavutil/x86/x86util.asm" | |
24 | ||
; 'pointer' reserves one native-pointer-sized slot in the struc below
; (8 bytes on x86-64, 4 bytes on x86-32), so the asm-side struct layout
; matches the C-side pointer width on both architectures.
25 | %if ARCH_X86_64 | |
26 | %define pointer resq | |
27 | %else | |
28 | %define pointer resd | |
29 | %endif | |
30 | ||
; Assembly mirror of the leading fields of the C-side ResampleContext.
; Field order, sizes and pointer width must stay in sync with the C struct
; definition; only the fields read by the functions below are declared.
31 | struc ResampleContext | |
32 | .av_class: pointer 1 | |
; filter_bank: base of the coefficient tables, indexed by
;              index * filter_alloc (see the .loop bodies below)
33 | .filter_bank: pointer 1 | |
; filter_length is converted to bytes via 'shl , log2_bps' before use
34 | .filter_length: resd 1 | |
35 | .filter_alloc: resd 1 | |
36 | .ideal_dst_incr: resd 1 | |
37 | .dst_incr: resd 1 | |
; dst_incr_div/dst_incr_mod: per-output-sample whole/fractional advance
; added to index/frac in the main loops
38 | .dst_incr_div: resd 1 | |
39 | .dst_incr_mod: resd 1 | |
; index/frac are read on entry and written back when update_ctx is set
40 | .index: resd 1 | |
41 | .frac: resd 1 | |
; src_incr: frac wraps against this; also the divisor of the linear
;           interpolation coefficient
42 | .src_incr: resd 1 | |
43 | .compensation_distance: resd 1 | |
; phase_shift/phase_mask split 'index' into whole-sample advance and phase
44 | .phase_shift: resd 1 | |
45 | .phase_mask: resd 1 | |
46 | ||
47 | ; there's a few more here but we only care about the first few | |
48 | endstruc | |
49 | ||
50 | SECTION_RODATA | |
51 | ||
; pf_1 / pdbl_1: 1.0 in single/double precision; divided by src_incr in the
;                linear variants to build the interpolation scale factor (m4)
; pd_0x4000:     1 << 14, rounding bias for the int16 paths (the pmaddwd
;                accumulator is shifted right by 15 before being stored)
52 | pf_1: dd 1.0 | |
53 | pdbl_1: dq 1.0 | |
54 | pd_0x4000: dd 0x4000 | |
55 | ||
56 | SECTION .text | |
57 | ||
;------------------------------------------------------------------------------
; RESAMPLE_FNS format, bps, log2_bps [, float_op_suffix, one_constant]
;
; Instantiates two functions for the given sample format:
;   int resample_common_<format>(ResampleContext *ctx, fmt *dst,
;                                const fmt *src, int size, int update_ctx);
;   int resample_linear_<format>(ResampleContext *ctx, fmt *dst,
;                                const fmt *src, int size, int update_ctx);
;
; %1 = sample format name (float, double or int16)
; %2 = bytes per sample (4 / 8 / 2)
; %3 = log2 of bytes per sample (2 / 3 / 1)
; %4 = scalar/packed op suffix for FP ops ("s" or "d") - float/double only
; %5 = RODATA label holding 1.0 in the given precision - float/double only
;
; The return value (rax) is the number of consumed source samples; it is only
; meaningful when update_ctx is non-zero (see the store at the end).
;------------------------------------------------------------------------------
58 | %macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant | |
59 | ; int resample_common_$format(ResampleContext *ctx, $format *dst, | |
60 | ; const $format *src, int size, int update_ctx) | |
61 | %if ARCH_X86_64 ; unix64 and win64 | |
62 | cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \ | |
63 | dst_incr_mod, size, min_filter_count_x4, \ | |
64 | min_filter_len_x4, dst_incr_div, src_incr, \ | |
65 | phase_mask, dst_end, filter_bank | |
66 | ||
67 | ; use red-zone for variable storage | |
; (SysV x86-64 allows leaf functions to use the 128 bytes below rsp without
; adjusting rsp; Win64 has no red zone, so the 5th argument's stack slot
; r4m is reused there instead)
68 | %define ctx_stackq [rsp-0x8] | |
69 | %define src_stackq [rsp-0x10] | |
70 | %if WIN64 | |
71 | %define update_context_stackd r4m | |
72 | %else ; unix64 | |
73 | %define update_context_stackd [rsp-0x14] | |
74 | %endif | |
75 | ||
76 | ; load as many variables in registers as possible; for the rest, store | |
77 | ; on stack so that we have 'ctx' available as one extra register | |
78 | mov sized, r3d | |
79 | mov phase_maskd, [ctxq+ResampleContext.phase_mask] | |
80 | %if UNIX64 | |
81 | mov update_context_stackd, r4d | |
82 | %endif | |
83 | mov indexd, [ctxq+ResampleContext.index] | |
84 | mov fracd, [ctxq+ResampleContext.frac] | |
85 | mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] | |
86 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] | |
87 | mov src_incrd, [ctxq+ResampleContext.src_incr] | |
88 | mov ctx_stackq, ctxq | |
89 | mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] | |
90 | mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] | |
; filter_length in samples -> bytes
91 | shl min_filter_len_x4d, %3 | |
92 | lea dst_endq, [dstq+sizeq*%2] | |
93 | ||
; ctx is fully read; its register (and the last struct reads via ecx/edi/R9d)
; are renamed below so every live value has a named register
94 | %if UNIX64 | |
95 | mov ecx, [ctxq+ResampleContext.phase_shift] | |
96 | mov edi, [ctxq+ResampleContext.filter_alloc] | |
97 | ||
98 | DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ | |
99 | filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ | |
100 | src_incr, phase_mask, dst_end, filter_bank | |
101 | %elif WIN64 | |
102 | mov R9d, [ctxq+ResampleContext.filter_alloc] | |
103 | mov ecx, [ctxq+ResampleContext.phase_shift] | |
104 | ||
105 | DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ | |
106 | filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ | |
107 | src_incr, phase_mask, dst_end, filter_bank | |
108 | %endif | |
109 | ||
; Bias 'src' and 'filter_bank' down by filter_length bytes and let the inner
; loop count upward from -filter_length*bps to 0: the loop test is then a
; single 'js' on the (negative) counter.
110 | neg min_filter_len_x4q | |
111 | sub filter_bankq, min_filter_len_x4q | |
112 | sub srcq, min_filter_len_x4q | |
113 | mov src_stackq, srcq | |
114 | %else ; x86-32 | |
115 | cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ | |
116 | index, min_filter_length_x4, filter_bank | |
117 | ||
118 | ; push temp variables to stack | |
119 | %define ctx_stackq r0mp | |
120 | %define src_stackq r2mp | |
121 | %define update_context_stackd r4m | |
122 | ||
123 | mov dstq, r1mp | |
124 | mov r3, r3mp | |
125 | lea r3, [dstq+r3*%2] | |
; NOTE: the PUSH order below must mirror the [rsp+...] %defines that follow
; DEFINE_ARGS (8 dwords total, popped with 'ADD rsp, 0x20' at the end)
126 | PUSH dword [ctxq+ResampleContext.dst_incr_div] | |
127 | PUSH dword [ctxq+ResampleContext.dst_incr_mod] | |
128 | PUSH dword [ctxq+ResampleContext.filter_alloc] | |
129 | PUSH r3 | |
130 | PUSH dword [ctxq+ResampleContext.phase_mask] | |
131 | PUSH dword [ctxq+ResampleContext.src_incr] | |
132 | mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] | |
133 | mov indexd, [ctxq+ResampleContext.index] | |
134 | shl min_filter_length_x4d, %3 | |
135 | mov fracd, [ctxq+ResampleContext.frac] | |
; same pointer-bias trick as on x86-64 (count up to zero in the inner loop)
136 | neg min_filter_length_x4q | |
137 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] | |
138 | sub r2mp, min_filter_length_x4q | |
139 | sub filter_bankq, min_filter_length_x4q | |
140 | PUSH min_filter_length_x4q | |
141 | PUSH filter_bankq | |
142 | mov phase_shiftd, [ctxq+ResampleContext.phase_shift] | |
143 | ||
144 | DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter | |
145 | ||
146 | %define filter_bankq dword [rsp+0x0] | |
147 | %define min_filter_length_x4q dword [rsp+0x4] | |
148 | %define src_incrd dword [rsp+0x8] | |
149 | %define phase_maskd dword [rsp+0xc] | |
150 | %define dst_endq dword [rsp+0x10] | |
151 | %define filter_allocd dword [rsp+0x14] | |
152 | %define dst_incr_modd dword [rsp+0x18] | |
153 | %define dst_incr_divd dword [rsp+0x1c] | |
154 | ||
155 | mov srcq, r2mp | |
156 | %endif | |
157 | ||
; Outer loop: one output sample per iteration.
; filter = &filter_bank[index * filter_alloc] (both pointers pre-biased by
; -filter_length bytes, see above).
158 | .loop: | |
159 | mov filterd, filter_allocd | |
160 | imul filterd, indexd | |
161 | %if ARCH_X86_64 | |
162 | mov min_filter_count_x4q, min_filter_len_x4q | |
163 | lea filterq, [filter_bankq+filterq*%2] | |
164 | %else ; x86-32 | |
165 | mov min_filter_count_x4q, filter_bankq | |
166 | lea filterq, [min_filter_count_x4q+filterq*%2] | |
167 | mov min_filter_count_x4q, min_filter_length_x4q | |
168 | %endif | |
; seed the accumulator: 0x4000 rounding bias for int16 (result is >>15
; below), zero for float/double
169 | %ifidn %1, int16 | |
170 | movd m0, [pd_0x4000] | |
171 | %else ; float/double | |
172 | xorps m0, m0, m0 | |
173 | %endif | |
174 | ||
; Inner MAC loop: acc += src[i] * filter[i]; the byte counter runs from
; -filter_length*bps up to 0, so 'js' alone terminates it.
175 | align 16 | |
176 | .inner_loop: | |
177 | movu m1, [srcq+min_filter_count_x4q*1] | |
178 | %ifidn %1, int16 | |
179 | PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1 | |
180 | %else ; float/double | |
181 | %if cpuflag(fma4) || cpuflag(fma3) | |
182 | fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 | |
183 | %else | |
184 | mulp%4 m1, m1, [filterq+min_filter_count_x4q*1] | |
185 | addp%4 m0, m0, m1 | |
186 | %endif ; cpuflag | |
187 | %endif | |
188 | add min_filter_count_x4q, mmsize | |
189 | js .inner_loop | |
190 | ||
; Reduce the SIMD accumulator to a single sample and store it; the scalar
; frac/index updates are interleaved with the SIMD ops to hide latency.
191 | %ifidn %1, int16 | |
192 | HADDD m0, m1 | |
193 | psrad m0, 15 | |
194 | add fracd, dst_incr_modd | |
195 | packssdw m0, m0 | |
196 | add indexd, dst_incr_divd | |
197 | movd [dstq], m0 | |
198 | %else ; float/double | |
199 | ; horizontal sum & store | |
200 | %if mmsize == 32 | |
201 | vextractf128 xm1, m0, 0x1 | |
202 | addps xm0, xm1 | |
203 | %endif | |
204 | movhlps xm1, xm0 | |
205 | %ifidn %1, float | |
206 | addps xm0, xm1 | |
207 | shufps xm1, xm0, xm0, q0001 | |
208 | %endif | |
209 | add fracd, dst_incr_modd | |
210 | addp%4 xm0, xm1 | |
211 | add indexd, dst_incr_divd | |
212 | movs%4 [dstq], xm0 | |
213 | %endif | |
; frac overflow: when frac >= src_incr, wrap it and consume one extra
; source position
214 | cmp fracd, src_incrd | |
215 | jl .skip | |
216 | sub fracd, src_incrd | |
217 | inc indexd | |
218 | ||
; 'filter' is dead at this point; rename its register to index_incr for the
; source-pointer advance below
219 | %if UNIX64 | |
220 | DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ | |
221 | index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ | |
222 | src_incr, phase_mask, dst_end, filter_bank | |
223 | %elif WIN64 | |
224 | DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ | |
225 | index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ | |
226 | src_incr, phase_mask, dst_end, filter_bank | |
227 | %else ; x86-32 | |
228 | DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr | |
229 | %endif | |
230 | ||
; Advance src by (index >> phase_shift) whole input samples; the low phase
; bits stay behind in index.
231 | .skip: | |
232 | mov index_incrd, indexd | |
233 | add dstq, %2 | |
234 | and indexd, phase_maskd | |
235 | sar index_incrd, phase_shiftb | |
236 | lea srcq, [srcq+index_incrq*%2] | |
237 | cmp dstq, dst_endq | |
238 | jne .loop | |
239 | ||
240 | %if ARCH_X86_64 | |
241 | DEFINE_ARGS ctx, dst, src, phase_shift, index, frac | |
242 | %else ; x86-32 | |
243 | DEFINE_ARGS src, ctx, update_context, frac, index | |
244 | %endif | |
245 | ||
246 | cmp dword update_context_stackd, 0 | |
247 | jz .skip_store | |
248 | ; strictly speaking, the function should always return the consumed | |
249 | ; number of samples; however, we only use the value if update_context | |
250 | ; is true, so let's just leave it uninitialized otherwise | |
251 | mov ctxq, ctx_stackq | |
252 | movifnidn rax, srcq | |
253 | mov [ctxq+ResampleContext.frac ], fracd | |
254 | sub rax, src_stackq | |
; byte delta -> sample count
255 | mov [ctxq+ResampleContext.index], indexd | |
256 | shr rax, %3 | |
257 | ||
258 | .skip_store: | |
259 | %if ARCH_X86_32 | |
; release the 8 dwords pushed in the x86-32 prologue
260 | ADD rsp, 0x20 | |
261 | %endif | |
262 | RET | |
263 | ||
264 | ; int resample_linear_$format(ResampleContext *ctx, $format *dst, | |
265 | ; const $format *src, int size, int update_ctx) | |
266 | %if ARCH_X86_64 ; unix64 and win64 | |
267 | %if UNIX64 | |
268 | cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \ | |
269 | size, dst_incr_mod, min_filter_count_x4, \ | |
270 | min_filter_len_x4, dst_incr_div, src_incr, \ | |
271 | src, dst_end, filter_bank | |
272 | ||
273 | mov srcq, r2mp | |
274 | %else ; win64 | |
275 | cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \ | |
276 | size, dst_incr_mod, min_filter_count_x4, \ | |
277 | min_filter_len_x4, dst_incr_div, src_incr, \ | |
278 | dst, dst_end, filter_bank | |
279 | ||
280 | mov dstq, r1mp | |
281 | %endif | |
282 | ||
283 | ; use red-zone for variable storage | |
; (see the note in resample_common above; Win64 reuses r4m instead)
284 | %define ctx_stackq [rsp-0x8] | |
285 | %define src_stackq [rsp-0x10] | |
286 | %define phase_mask_stackd [rsp-0x14] | |
287 | %if WIN64 | |
288 | %define update_context_stackd r4m | |
289 | %else ; unix64 | |
290 | %define update_context_stackd [rsp-0x18] | |
291 | %endif | |
292 | ||
293 | ; load as many variables in registers as possible; for the rest, store | |
294 | ; on stack so that we have 'ctx' available as one extra register | |
295 | mov sized, r3d | |
296 | mov phase_maskd, [ctxq+ResampleContext.phase_mask] | |
297 | %if UNIX64 | |
298 | mov update_context_stackd, r4d | |
299 | %endif | |
300 | mov indexd, [ctxq+ResampleContext.index] | |
301 | mov fracd, [ctxq+ResampleContext.frac] | |
302 | mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] | |
303 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] | |
304 | mov src_incrd, [ctxq+ResampleContext.src_incr] | |
305 | mov ctx_stackq, ctxq | |
306 | mov phase_mask_stackd, phase_maskd | |
307 | mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] | |
; m4 = rounding bias (int16) or 1.0/src_incr (float/double); the latter
; scales 'frac' into the [0,1) interpolation coefficient in the loop
308 | %ifidn %1, int16 | |
309 | movd m4, [pd_0x4000] | |
310 | %else ; float/double | |
311 | cvtsi2s%4 xm0, src_incrd | |
312 | movs%4 xm4, [%5] | |
313 | divs%4 xm4, xm0 | |
314 | %endif | |
315 | mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] | |
316 | shl min_filter_len_x4d, %3 | |
317 | lea dst_endq, [dstq+sizeq*%2] | |
318 | ||
319 | %if UNIX64 | |
320 | mov ecx, [ctxq+ResampleContext.phase_shift] | |
321 | mov edi, [ctxq+ResampleContext.filter_alloc] | |
322 | ||
323 | DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \ | |
324 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ | |
325 | dst_incr_div, src_incr, src, dst_end, filter_bank | |
326 | %elif WIN64 | |
327 | mov R9d, [ctxq+ResampleContext.filter_alloc] | |
328 | mov ecx, [ctxq+ResampleContext.phase_shift] | |
329 | ||
330 | DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \ | |
331 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ | |
332 | dst_incr_div, src_incr, dst, dst_end, filter_bank | |
333 | %endif | |
334 | ||
; same pointer-bias trick as in resample_common
335 | neg min_filter_len_x4q | |
336 | sub filter_bankq, min_filter_len_x4q | |
337 | sub srcq, min_filter_len_x4q | |
338 | mov src_stackq, srcq | |
339 | %else ; x86-32 | |
340 | cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ | |
341 | frac, index, dst, filter_bank | |
342 | ||
343 | ; push temp variables to stack | |
344 | %define ctx_stackq r0mp | |
345 | %define src_stackq r2mp | |
346 | %define update_context_stackd r4m | |
347 | ||
348 | mov dstq, r1mp | |
349 | mov r3, r3mp | |
350 | lea r3, [dstq+r3*%2] | |
; NOTE: the PUSH order below must mirror the [rsp+...] %defines that follow
; DEFINE_ARGS (10 dwords total, popped with 'ADD rsp, 0x28' at the end)
351 | PUSH dword [ctxq+ResampleContext.dst_incr_div] | |
352 | PUSH r3 | |
353 | mov r3, dword [ctxq+ResampleContext.filter_alloc] | |
354 | PUSH dword [ctxq+ResampleContext.dst_incr_mod] | |
355 | PUSH r3 | |
; filter_alloc in samples -> bytes (filter_alloc_x4q)
356 | shl r3, %3 | |
357 | PUSH r3 | |
358 | mov r3, dword [ctxq+ResampleContext.src_incr] | |
359 | PUSH dword [ctxq+ResampleContext.phase_mask] | |
360 | PUSH r3d | |
; m4 as on x86-64: rounding bias (int16) or 1.0/src_incr (float/double)
361 | %ifidn %1, int16 | |
362 | movd m4, [pd_0x4000] | |
363 | %else ; float/double | |
364 | cvtsi2s%4 xm0, r3d | |
365 | movs%4 xm4, [%5] | |
366 | divs%4 xm4, xm0 | |
367 | %endif | |
368 | mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] | |
369 | mov indexd, [ctxq+ResampleContext.index] | |
370 | shl min_filter_length_x4d, %3 | |
371 | mov fracd, [ctxq+ResampleContext.frac] | |
372 | neg min_filter_length_x4q | |
373 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] | |
374 | sub r2mp, min_filter_length_x4q | |
375 | sub filter_bankq, min_filter_length_x4q | |
376 | PUSH min_filter_length_x4q | |
377 | PUSH filter_bankq | |
378 | PUSH dword [ctxq+ResampleContext.phase_shift] | |
379 | ||
380 | DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src | |
381 | ||
382 | %define phase_shift_stackd dword [rsp+0x0] | |
383 | %define filter_bankq dword [rsp+0x4] | |
384 | %define min_filter_length_x4q dword [rsp+0x8] | |
385 | %define src_incrd dword [rsp+0xc] | |
386 | %define phase_mask_stackd dword [rsp+0x10] | |
387 | %define filter_alloc_x4q dword [rsp+0x14] | |
388 | %define filter_allocd dword [rsp+0x18] | |
389 | %define dst_incr_modd dword [rsp+0x1c] | |
390 | %define dst_endq dword [rsp+0x20] | |
391 | %define dst_incr_divd dword [rsp+0x24] | |
392 | ||
393 | mov srcq, r2mp | |
394 | %endif | |
395 | ||
; As in the common case, but two adjacent filter phases are accumulated:
; filter1 = phase 'index', filter2 = the next phase (one filter_alloc
; further); the two results are blended by frac/src_incr below.
396 | .loop: | |
397 | mov filter1d, filter_allocd | |
398 | imul filter1d, indexd | |
399 | %if ARCH_X86_64 | |
400 | mov min_filter_count_x4q, min_filter_len_x4q | |
401 | lea filter1q, [filter_bankq+filter1q*%2] | |
402 | lea filter2q, [filter1q+filter_allocq*%2] | |
403 | %else ; x86-32 | |
404 | mov min_filter_count_x4q, filter_bankq | |
405 | lea filter1q, [min_filter_count_x4q+filter1q*%2] | |
406 | mov min_filter_count_x4q, min_filter_length_x4q | |
407 | mov filter2q, filter1q | |
408 | add filter2q, filter_alloc_x4q | |
409 | %endif | |
; m0/m2 accumulate the filter1/filter2 dot products (int16 is pre-seeded
; with the rounding bias)
410 | %ifidn %1, int16 | |
411 | mova m0, m4 | |
412 | mova m2, m4 | |
413 | %else ; float/double | |
414 | xorps m0, m0, m0 | |
415 | xorps m2, m2, m2 | |
416 | %endif | |
417 | ||
; dual MAC loop; same negative-counter termination as in resample_common
418 | align 16 | |
419 | .inner_loop: | |
420 | movu m1, [srcq+min_filter_count_x4q*1] | |
421 | %ifidn %1, int16 | |
422 | %if cpuflag(xop) | |
423 | vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2 | |
424 | vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0 | |
425 | %else | |
426 | pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1] | |
427 | pmaddwd m1, [filter1q+min_filter_count_x4q*1] | |
428 | paddd m2, m3 | |
429 | paddd m0, m1 | |
430 | %endif ; cpuflag | |
431 | %else ; float/double | |
432 | %if cpuflag(fma4) || cpuflag(fma3) | |
433 | fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2 | |
434 | fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0 | |
435 | %else | |
436 | mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1] | |
437 | mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1] | |
438 | addp%4 m2, m2, m3 | |
439 | addp%4 m0, m0, m1 | |
440 | %endif ; cpuflag | |
441 | %endif | |
442 | add min_filter_count_x4q, mmsize | |
443 | js .inner_loop | |
444 | ||
; Horizontal-sum both accumulators, interpolate, store one output sample.
445 | %ifidn %1, int16 | |
446 | %if mmsize == 16 | |
447 | %if cpuflag(xop) | |
448 | vphadddq m2, m2 | |
449 | vphadddq m0, m0 | |
450 | %endif | |
451 | pshufd m3, m2, q0032 | |
452 | pshufd m1, m0, q0032 | |
453 | paddd m2, m3 | |
454 | paddd m0, m1 | |
455 | %endif | |
456 | %if notcpuflag(xop) | |
457 | PSHUFLW m3, m2, q0032 | |
458 | PSHUFLW m1, m0, q0032 | |
459 | paddd m2, m3 | |
460 | paddd m0, m1 | |
461 | %endif | |
; scalar interpolation: eax = v2 - val, then edx:eax = (v2-val)*frac,
; eax /= src_incr (imul/idiv implicitly use edx:eax)
462 | psubd m2, m0 | |
463 | ; This is probably a really bad idea on atom and other machines with a | |
464 | ; long transfer latency between GPRs and XMMs (atom). However, it does | |
465 | ; make the clip a lot simpler... | |
466 | movd eax, m2 | |
467 | add indexd, dst_incr_divd | |
468 | imul fracd | |
469 | idiv src_incrd | |
470 | movd m1, eax | |
471 | add fracd, dst_incr_modd | |
472 | paddd m0, m1 | |
473 | psrad m0, 15 | |
474 | packssdw m0, m0 | |
475 | movd [dstq], m0 | |
476 | ||
477 | ; note that for imul/idiv, I need to move filter to edx/eax for each: | |
478 | ; - 32bit: eax=r0[filter1], edx=r2[filter2] | |
479 | ; - win64: eax=r6[filter1], edx=r1[todo] | |
480 | ; - unix64: eax=r6[filter1], edx=r2[todo] | |
481 | %else ; float/double | |
482 | ; val += (v2 - val) * (FELEML) frac / c->src_incr; | |
483 | %if mmsize == 32 | |
484 | vextractf128 xm1, m0, 0x1 | |
485 | vextractf128 xm3, m2, 0x1 | |
486 | addps xm0, xm1 | |
487 | addps xm2, xm3 | |
488 | %endif | |
; xm1 = frac * (1.0/src_incr), broadcast to all lanes
489 | cvtsi2s%4 xm1, fracd | |
490 | subp%4 xm2, xm0 | |
491 | mulp%4 xm1, xm4 | |
492 | shufp%4 xm1, xm1, q0000 | |
493 | %if cpuflag(fma4) || cpuflag(fma3) | |
494 | fmaddp%4 xm0, xm2, xm1, xm0 | |
495 | %else | |
496 | mulp%4 xm2, xm1 | |
497 | addp%4 xm0, xm2 | |
498 | %endif ; cpuflag | |
499 | ||
500 | ; horizontal sum & store | |
501 | movhlps xm1, xm0 | |
502 | %ifidn %1, float | |
503 | addps xm0, xm1 | |
504 | shufps xm1, xm0, xm0, q0001 | |
505 | %endif | |
506 | add fracd, dst_incr_modd | |
507 | addp%4 xm0, xm1 | |
508 | add indexd, dst_incr_divd | |
509 | movs%4 [dstq], xm0 | |
510 | %endif | |
; frac wrap, as in resample_common
511 | cmp fracd, src_incrd | |
512 | jl .skip | |
513 | sub fracd, src_incrd | |
514 | inc indexd | |
515 | ||
; rename the now-dead filter1/filter2 register to index_incr
516 | %if UNIX64 | |
517 | DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \ | |
518 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ | |
519 | dst_incr_div, src_incr, src, dst_end, filter_bank | |
520 | %elif WIN64 | |
521 | DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \ | |
522 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ | |
523 | dst_incr_div, src_incr, dst, dst_end, filter_bank | |
524 | %else ; x86-32 | |
525 | DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src | |
526 | %endif | |
527 | ||
528 | .skip: | |
529 | %if ARCH_X86_32 | |
; x86-32 keeps phase_shift on the stack; reload it for the 'sar' below
530 | mov phase_shiftd, phase_shift_stackd | |
531 | %endif | |
; advance src by (index >> phase_shift) input samples, keep the phase bits
532 | mov index_incrd, indexd | |
533 | add dstq, %2 | |
534 | and indexd, phase_mask_stackd | |
535 | sar index_incrd, phase_shiftb | |
536 | lea srcq, [srcq+index_incrq*%2] | |
537 | cmp dstq, dst_endq | |
538 | jne .loop | |
539 | ||
540 | %if UNIX64 | |
541 | DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \ | |
542 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ | |
543 | dst_incr_div, src_incr, src, dst_end, filter_bank | |
544 | %elif WIN64 | |
545 | DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \ | |
546 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ | |
547 | dst_incr_div, src_incr, dst, dst_end, filter_bank | |
548 | %else ; x86-32 | |
549 | DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src | |
550 | %endif | |
551 | ||
552 | cmp dword update_context_stackd, 0 | |
553 | jz .skip_store | |
554 | ; strictly speaking, the function should always return the consumed | |
555 | ; number of samples; however, we only use the value if update_context | |
556 | ; is true, so let's just leave it uninitialized otherwise | |
557 | mov ctxq, ctx_stackq | |
558 | movifnidn rax, srcq | |
559 | mov [ctxq+ResampleContext.frac ], fracd | |
560 | sub rax, src_stackq | |
; byte delta -> sample count
561 | mov [ctxq+ResampleContext.index], indexd | |
562 | shr rax, %3 | |
563 | ||
564 | .skip_store: | |
565 | %if ARCH_X86_32 | |
; release the 10 dwords pushed in the x86-32 prologue
566 | ADD rsp, 0x28 | |
567 | %endif | |
568 | RET | |
569 | %endmacro | |
570 | ||
; Instantiations.  Macro arguments: format, bytes/sample, log2(bytes/sample)
; [, scalar-op suffix, 1.0 constant] - see RESAMPLE_FNS above.
571 | INIT_XMM sse | |
572 | RESAMPLE_FNS float, 4, 2, s, pf_1 | |
573 | ||
574 | %if HAVE_AVX_EXTERNAL | |
575 | INIT_YMM avx | |
576 | RESAMPLE_FNS float, 4, 2, s, pf_1 | |
577 | %endif | |
578 | %if HAVE_FMA3_EXTERNAL | |
579 | INIT_YMM fma3 | |
580 | RESAMPLE_FNS float, 4, 2, s, pf_1 | |
581 | %endif | |
582 | %if HAVE_FMA4_EXTERNAL | |
583 | INIT_XMM fma4 | |
584 | RESAMPLE_FNS float, 4, 2, s, pf_1 | |
585 | %endif | |
586 | ||
587 | %if ARCH_X86_32 | |
588 | INIT_MMX mmxext | |
589 | RESAMPLE_FNS int16, 2, 1 | |
590 | %endif | |
591 | ||
592 | INIT_XMM sse2 | |
593 | RESAMPLE_FNS int16, 2, 1 | |
594 | %if HAVE_XOP_EXTERNAL | |
595 | INIT_XMM xop | |
596 | RESAMPLE_FNS int16, 2, 1 | |
597 | %endif | |
598 | ||
; re-select plain sse2 (the optional xop block above may have changed the
; active instruction set) before emitting the double-precision variants
599 | INIT_XMM sse2 | |
600 | RESAMPLE_FNS double, 8, 3, d, pdbl_1 |