Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include "../swscale_internal.h" | |
22 | #include "libavutil/x86/asm.h" | |
23 | #include "libavutil/x86/cpu.h" | |
24 | ||
25 | #define RET 0xC3 // near return opcode for x86 | |
26 | #define PREFETCH "prefetchnta" | |
27 | ||
28 | #if HAVE_INLINE_ASM | |
/**
 * Generate the MMXEXT machine code and lookup tables used by the fast
 * bilinear horizontal scalers.
 *
 * The code is built by concatenating copies of two template fragments (A and
 * B, defined in the inline asm below); each copy produces four output samples
 * and has the immediates of its two pshufw instructions patched for the
 * source pixels those samples need. A RET opcode terminates the sequence.
 *
 * @param dstW       output width; fragments are generated for
 *                   dstW / numSplits output samples, four per fragment
 * @param xInc       source step per output sample; used as 16.16 fixed point
 *                   (xpos >> 16 is the integer position, xpos & 0xFFFF the
 *                   fraction)
 * @param filterCode buffer receiving the generated code, or NULL to write
 *                   nothing and only compute the required size; no bounds
 *                   checking is done here
 * @param filter     receives one 7-bit blend weight per output sample
 *                   (only written when filterCode is non-NULL)
 * @param filterPos  receives the source start position of each group of four
 *                   outputs at index i / 2, plus one trailing entry
 *                   (only written when filterCode is non-NULL)
 * @param numSplits  divisor applied to dstW, see above
 * @return number of code bytes, including the terminating RET
 */
29 | av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, | |
30 | int16_t *filter, int32_t *filterPos, | |
31 | int numSplits) | |
32 | { | |
33 | uint8_t *fragmentA; | |
34 | x86_reg imm8OfPShufW1A; | |
35 | x86_reg imm8OfPShufW2A; | |
36 | x86_reg fragmentLengthA; | |
37 | uint8_t *fragmentB; | |
38 | x86_reg imm8OfPShufW1B; | |
39 | x86_reg imm8OfPShufW2B; | |
40 | x86_reg fragmentLengthB; | |
41 | int fragmentPos; | |
42 | ||
43 | int xpos, i; | |
44 | ||
45 | // create an optimized horizontal scaling routine | |
46 | /* This scaler is made of runtime-generated MMXEXT code using specially tuned | |
47 | * pshufw instructions. For every four output pixels, if four input pixels | |
48 | * are enough for the fast bilinear scaling, then a chunk of fragmentB is | |
49 | * used. If five input pixels are needed, then a chunk of fragmentA is used. | |
50 | */ | |
51 | ||
52 | // code fragment | |
53 | ||
     /* Template A (labels 0 to 9). It loads four weights from (REG_d + REG_a)
      * into mm3 and two 4-byte source groups, one at (REG_c + REG_S) and one a
      * byte further on, interleaves each with mm7 (zeroed by the callers in
      * this file) and shuffles each with a pshufw whose immediate is patched
      * per copy further down. It then computes (mm0 - mm1) * mm3 + (mm1 << 7),
      * stores the four words at (REG_D + REG_a), loads the next source offset
      * from 8(REG_b + REG_a) into esi and advances REG_a by 8.
      * The leading jmp skips the template, so only the address arithmetic
      * after label 9 is executed by this statement. */
54 | __asm__ volatile ( | |
55 | "jmp 9f \n\t" | |
56 | // Begin | |
57 | "0: \n\t" | |
58 | "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" | |
59 | "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" | |
60 | "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t" | |
61 | "punpcklbw %%mm7, %%mm1 \n\t" | |
62 | "punpcklbw %%mm7, %%mm0 \n\t" | |
63 | "pshufw $0xFF, %%mm1, %%mm1 \n\t" | |
64 | "1: \n\t" | |
65 | "pshufw $0xFF, %%mm0, %%mm0 \n\t" | |
66 | "2: \n\t" | |
67 | "psubw %%mm1, %%mm0 \n\t" | |
68 | "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" | |
69 | "pmullw %%mm3, %%mm0 \n\t" | |
70 | "psllw $7, %%mm1 \n\t" | |
71 | "paddw %%mm1, %%mm0 \n\t" | |
72 | ||
73 | "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" | |
74 | ||
75 | "add $8, %%"REG_a" \n\t" | |
76 | // End | |
77 | "9: \n\t" | |
78 | // "int $3 \n\t" | |
     /* %0 = address of label 0 (start of the template).
      * %1, %2 = offset from label 0 of the byte just before label 1 / label 2,
      *          i.e. the last byte of the preceding pshufw (its imm8).
      * %3 = template length in bytes (label 9 - label 0). */
79 | "lea " LOCAL_MANGLE(0b) ", %0 \n\t" | |
80 | "lea " LOCAL_MANGLE(1b) ", %1 \n\t" | |
81 | "lea " LOCAL_MANGLE(2b) ", %2 \n\t" | |
82 | "dec %1 \n\t" | |
83 | "dec %2 \n\t" | |
84 | "sub %0, %1 \n\t" | |
85 | "sub %0, %2 \n\t" | |
86 | "lea " LOCAL_MANGLE(9b) ", %3 \n\t" | |
87 | "sub %0, %3 \n\t" | |
88 | ||
89 | ||
90 | : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), | |
91 | "=r" (fragmentLengthA) | |
92 | ); | |
93 | ||
     /* Template B: same as template A except that only one source group is
      * loaded; mm1 is produced from mm0 by the first pshufw instead of coming
      * from a second load. Start address, imm8 offsets and length are
      * measured the same way as for template A. */
94 | __asm__ volatile ( | |
95 | "jmp 9f \n\t" | |
96 | // Begin | |
97 | "0: \n\t" | |
98 | "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" | |
99 | "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" | |
100 | "punpcklbw %%mm7, %%mm0 \n\t" | |
101 | "pshufw $0xFF, %%mm0, %%mm1 \n\t" | |
102 | "1: \n\t" | |
103 | "pshufw $0xFF, %%mm0, %%mm0 \n\t" | |
104 | "2: \n\t" | |
105 | "psubw %%mm1, %%mm0 \n\t" | |
106 | "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" | |
107 | "pmullw %%mm3, %%mm0 \n\t" | |
108 | "psllw $7, %%mm1 \n\t" | |
109 | "paddw %%mm1, %%mm0 \n\t" | |
110 | ||
111 | "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" | |
112 | ||
113 | "add $8, %%"REG_a" \n\t" | |
114 | // End | |
115 | "9: \n\t" | |
116 | // "int $3 \n\t" | |
117 | "lea " LOCAL_MANGLE(0b) ", %0 \n\t" | |
118 | "lea " LOCAL_MANGLE(1b) ", %1 \n\t" | |
119 | "lea " LOCAL_MANGLE(2b) ", %2 \n\t" | |
120 | "dec %1 \n\t" | |
121 | "dec %2 \n\t" | |
122 | "sub %0, %1 \n\t" | |
123 | "sub %0, %2 \n\t" | |
124 | "lea " LOCAL_MANGLE(9b) ", %3 \n\t" | |
125 | "sub %0, %3 \n\t" | |
126 | ||
127 | ||
128 | : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), | |
129 | "=r" (fragmentLengthB) | |
130 | ); | |
131 | ||
132 | xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers | |
133 | fragmentPos = 0; | |
134 | ||
135 | for (i = 0; i < dstW / numSplits; i++) { | |
136 | int xx = xpos >> 16; | |
137 | ||
     /* One fragment covers four consecutive outputs, so a fragment is emitted
      * only on every fourth i. */
138 | if ((i & 3) == 0) { | |
     /* a..d: integer source offsets of the four outputs relative to xx.
      * inc is 1 when d < 3, which selects template B (single load); otherwise
      * template A is used and inc is 0.
      * maxShift: how far the selectors can still be raised without exceeding
      * the 2-bit maximum of 3.
      * NOTE(review): this assumes d + inc <= 3; presumably the caller only
      * uses this path for scale factors where that holds - confirm. */
139 | int a = 0; | |
140 | int b = ((xpos + xInc) >> 16) - xx; | |
141 | int c = ((xpos + xInc * 2) >> 16) - xx; | |
142 | int d = ((xpos + xInc * 3) >> 16) - xx; | |
143 | int inc = (d + 1 < 4); | |
144 | uint8_t *fragment = inc ? fragmentB : fragmentA; | |
145 | x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A; | |
146 | x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A; | |
147 | x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA; | |
148 | int maxShift = 3 - (d + inc); | |
149 | int shift = 0; | |
150 | ||
151 | if (filterCode) { | |
     /* Weight of the left-hand source sample for each of the four outputs:
      * the 16-bit fractional position, inverted and reduced to 7 bits.
      * filterPos[i / 2] is the source start of this group; one fragment
      * advances REG_a by 8 bytes, i.e. two int32_t entries. */
152 | filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9; | |
153 | filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9; | |
154 | filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9; | |
155 | filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9; | |
156 | filterPos[i / 2] = xx; | |
157 | ||
158 | memcpy(filterCode + fragmentPos, fragment, fragmentLength); | |
159 | ||
     /* Patch the two pshufw immediates of the copy just made: the first
      * selects the right-hand neighbours (offsets + inc), the second the
      * left-hand samples (offsets as they are). Each offset occupies one
      * 2-bit selector field. */
160 | filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) | | |
161 | ((b + inc) << 2) | | |
162 | ((c + inc) << 4) | | |
163 | ((d + inc) << 6); | |
164 | filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) | | |
165 | (c << 4) | | |
166 | (d << 6); | |
167 | ||
     /* Optionally start the load `shift` pixels earlier and compensate in the
      * selectors: 0x55 * shift adds shift to each of the four 2-bit fields,
      * and the stored source position is reduced by the same amount. */
168 | if (i + 4 - inc >= dstW) | |
169 | shift = maxShift; // avoid overread | |
170 | else if ((filterPos[i / 2] & 3) <= maxShift) | |
171 | shift = filterPos[i / 2] & 3; // align | |
172 | ||
173 | if (shift && i >= shift) { | |
174 | filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift; | |
175 | filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift; | |
176 | filterPos[i / 2] -= shift; | |
177 | } | |
178 | } | |
179 | ||
     /* The position advances even when filterCode is NULL, so that a NULL
      * call still yields the required code size. */
180 | fragmentPos += fragmentLength; | |
181 | ||
     /* Keep the code terminated; the next fragment, if any, overwrites this
      * byte. */
182 | if (filterCode) | |
183 | filterCode[fragmentPos] = RET; | |
184 | } | |
185 | xpos += xInc; | |
186 | } | |
     /* Trailing entry holding the source position reached after the last
      * generated output; the callers in this file read filterPos at byte
      * offset REG_a after each run to advance their source pointer. */
187 | if (filterCode) | |
188 | filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part | |
189 | ||
     /* +1 accounts for the terminating RET byte. */
190 | return fragmentPos + 1; | |
191 | } | |
192 | ||
/**
 * Fast bilinear horizontal scaling of one line using the runtime-generated
 * code stored in c->lumMmxextFilterCode.
 *
 * The generated code is run eight times back to back. After each run the
 * source pointer is advanced by the filterPos entry found at the byte offset
 * left in REG_a, the destination pointer is advanced by REG_a bytes and
 * REG_a is reset to zero. The right edge of the output is then fixed up in C.
 *
 * @param c        context supplying hLumFilterPos, hLumFilter and
 *                 lumMmxextFilterCode
 * @param dst      output samples (source pixel value scaled by 128)
 * @param dstWidth number of output samples; used only by the edge fix-up
 * @param src      source pixels
 * @param srcW     number of source pixels; used only by the edge fix-up
 * @param xInc     source step per output sample, 16.16 fixed point; used only
 *                 by the edge fix-up
 */
193 | void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, | |
194 | int dstWidth, const uint8_t *src, | |
195 | int srcW, int xInc) | |
196 | { | |
197 | int32_t *filterPos = c->hLumFilterPos; | |
198 | int16_t *filter = c->hLumFilter; | |
199 | void *mmxextFilterCode = c->lumMmxextFilterCode; | |
200 | int i; | |
201 | #if defined(PIC) | |
202 | uint64_t ebxsave; | |
203 | #endif | |
204 | #if ARCH_X86_64 | |
205 | uint64_t retsave; | |
206 | #endif | |
207 | ||
     /* Operands: %0 src, %1 dst, %2 filter, %3 filterPos, %4 generated code;
      * then ebxsave (PIC only) and retsave (x86-64 only), numbered %5 / %6 in
      * that order depending on which of them exist.
      * Registers: REG_c = src, REG_D = dst, REG_d = filter, REG_b = filterPos,
      * REG_a = running byte offset, mm7 = 0.
      * With PIC, REG_b is saved and restored by hand instead of being listed
      * as a clobber. On x86-64 the quadword at -8(%rsp) is saved and restored
      * as well - presumably because each call below pushes its return address
      * into that slot, which may hold live data; NOTE(review): confirm.
      * NOTE(review): ebxsave / retsave are stored to by the asm but are passed
      * as input m operands, and there is no memory clobber although the
      * called code stores through REG_D; worth checking against the GCC
      * extended asm rules. */
208 | __asm__ volatile( | |
209 | #if defined(PIC) | |
210 | "mov %%"REG_b", %5 \n\t" | |
211 | #if ARCH_X86_64 | |
212 | "mov -8(%%rsp), %%"REG_a" \n\t" | |
213 | "mov %%"REG_a", %6 \n\t" | |
214 | #endif | |
215 | #else | |
216 | #if ARCH_X86_64 | |
217 | "mov -8(%%rsp), %%"REG_a" \n\t" | |
218 | "mov %%"REG_a", %5 \n\t" | |
219 | #endif | |
220 | #endif | |
221 | "pxor %%mm7, %%mm7 \n\t" | |
222 | "mov %0, %%"REG_c" \n\t" | |
223 | "mov %1, %%"REG_D" \n\t" | |
224 | "mov %2, %%"REG_d" \n\t" | |
225 | "mov %3, %%"REG_b" \n\t" | |
226 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
227 | PREFETCH" (%%"REG_c") \n\t" | |
228 | PREFETCH" 32(%%"REG_c") \n\t" | |
229 | PREFETCH" 64(%%"REG_c") \n\t" | |
230 | ||
     /* CALL_MMXEXT_FILTER_CODE = one run of the generated code: preload esi
      * with the first filterPos entry, call the code, then add the filterPos
      * entry at byte offset REG_a to the source pointer, add REG_a to the
      * destination pointer and clear REG_a. The x86-64 variant loads the
      * 32-bit entry into esi first and adds REG_S to the source pointer. */
231 | #if ARCH_X86_64 | |
232 | #define CALL_MMXEXT_FILTER_CODE \ | |
233 | "movl (%%"REG_b"), %%esi \n\t"\ | |
234 | "call *%4 \n\t"\ | |
235 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | |
236 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
237 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
238 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
239 | ||
240 | #else | |
241 | #define CALL_MMXEXT_FILTER_CODE \ | |
242 | "movl (%%"REG_b"), %%esi \n\t"\ | |
243 | "call *%4 \n\t"\ | |
244 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | |
245 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
246 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
247 | ||
248 | #endif /* ARCH_X86_64 */ | |
249 | ||
250 | CALL_MMXEXT_FILTER_CODE | |
251 | CALL_MMXEXT_FILTER_CODE | |
252 | CALL_MMXEXT_FILTER_CODE | |
253 | CALL_MMXEXT_FILTER_CODE | |
254 | CALL_MMXEXT_FILTER_CODE | |
255 | CALL_MMXEXT_FILTER_CODE | |
256 | CALL_MMXEXT_FILTER_CODE | |
257 | CALL_MMXEXT_FILTER_CODE | |
258 | ||
259 | #if defined(PIC) | |
260 | "mov %5, %%"REG_b" \n\t" | |
261 | #if ARCH_X86_64 | |
262 | "mov %6, %%"REG_a" \n\t" | |
263 | "mov %%"REG_a", -8(%%rsp) \n\t" | |
264 | #endif | |
265 | #else | |
266 | #if ARCH_X86_64 | |
267 | "mov %5, %%"REG_a" \n\t" | |
268 | "mov %%"REG_a", -8(%%rsp) \n\t" | |
269 | #endif | |
270 | #endif | |
271 | :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), | |
272 | "m" (mmxextFilterCode) | |
273 | #if defined(PIC) | |
274 | ,"m" (ebxsave) | |
275 | #endif | |
276 | #if ARCH_X86_64 | |
277 | ,"m"(retsave) | |
278 | #endif | |
279 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |
280 | #if !defined(PIC) | |
281 | ,"%"REG_b | |
282 | #endif | |
283 | ); | |
284 | ||
     /* Edge fix-up: walking back from the last output, every sample whose
      * source position (i * xInc) >> 16 lies on or beyond the last source
      * pixel is overwritten with that pixel times 128, the same scale the
      * generated code produces (sample << 7). */
285 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | |
286 | dst[i] = src[srcW-1]*128; | |
287 | } | |
288 | ||
/**
 * Fast bilinear horizontal scaling of two lines (src1 to dst1, src2 to dst2)
 * using the runtime-generated code stored in c->chrMmxextFilterCode.
 *
 * The generated code is run four times for the first plane; the source and
 * destination registers are then reloaded for the second plane and the code
 * is run four more times with the same filter tables. The right edge of both
 * outputs is then fixed up in C.
 *
 * @param c        context supplying hChrFilterPos, hChrFilter and
 *                 chrMmxextFilterCode
 * @param dst1     output samples for src1 (source pixel value scaled by 128)
 * @param dst2     output samples for src2 (source pixel value scaled by 128)
 * @param dstWidth number of output samples per plane; used only by the edge
 *                 fix-up
 * @param src1     first source line
 * @param src2     second source line
 * @param srcW     number of source pixels per line; used only by the edge
 *                 fix-up
 * @param xInc     source step per output sample, 16.16 fixed point; used only
 *                 by the edge fix-up
 */
289 | void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, | |
290 | int dstWidth, const uint8_t *src1, | |
291 | const uint8_t *src2, int srcW, int xInc) | |
292 | { | |
293 | int32_t *filterPos = c->hChrFilterPos; | |
294 | int16_t *filter = c->hChrFilter; | |
295 | void *mmxextFilterCode = c->chrMmxextFilterCode; | |
296 | int i; | |
297 | #if defined(PIC) | |
298 | DECLARE_ALIGNED(8, uint64_t, ebxsave); | |
299 | #endif | |
300 | #if ARCH_X86_64 | |
301 | DECLARE_ALIGNED(8, uint64_t, retsave); | |
302 | #endif | |
303 | ||
     /* Operands: %0 src1, %1 dst1, %2 filter, %3 filterPos, %4 generated code,
      * %5 src2, %6 dst2; then ebxsave (PIC only) and retsave (x86-64 only),
      * numbered %7 / %8 in that order depending on which of them exist.
      * Registers: REG_c = source, REG_D = destination, REG_d = filter,
      * REG_b = filterPos, REG_a = running byte offset, mm7 = 0.
      * With PIC, REG_b is saved and restored by hand instead of being listed
      * as a clobber. On x86-64 the quadword at -8(%rsp) is saved and restored
      * as well - presumably because each call below pushes its return address
      * into that slot, which may hold live data; NOTE(review): confirm.
      * NOTE(review): ebxsave / retsave are stored to by the asm but are passed
      * as input m operands, and there is no memory clobber although the
      * called code stores through REG_D; worth checking against the GCC
      * extended asm rules. */
304 | __asm__ volatile( | |
305 | #if defined(PIC) | |
306 | "mov %%"REG_b", %7 \n\t" | |
307 | #if ARCH_X86_64 | |
308 | "mov -8(%%rsp), %%"REG_a" \n\t" | |
309 | "mov %%"REG_a", %8 \n\t" | |
310 | #endif | |
311 | #else | |
312 | #if ARCH_X86_64 | |
313 | "mov -8(%%rsp), %%"REG_a" \n\t" | |
314 | "mov %%"REG_a", %7 \n\t" | |
315 | #endif | |
316 | #endif | |
317 | "pxor %%mm7, %%mm7 \n\t" | |
318 | "mov %0, %%"REG_c" \n\t" | |
319 | "mov %1, %%"REG_D" \n\t" | |
320 | "mov %2, %%"REG_d" \n\t" | |
321 | "mov %3, %%"REG_b" \n\t" | |
322 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
323 | PREFETCH" (%%"REG_c") \n\t" | |
324 | PREFETCH" 32(%%"REG_c") \n\t" | |
325 | PREFETCH" 64(%%"REG_c") \n\t" | |
326 | ||
     /* First plane: four runs of the generated code (src1 to dst1). */
327 | CALL_MMXEXT_FILTER_CODE | |
328 | CALL_MMXEXT_FILTER_CODE | |
329 | CALL_MMXEXT_FILTER_CODE | |
330 | CALL_MMXEXT_FILTER_CODE | |
     /* Second plane: reset the offset, switch to src2 / dst2 and run the same
      * code four more times; REG_d and REG_b still point at the same filter
      * and filterPos tables. */
331 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
332 | "mov %5, %%"REG_c" \n\t" // src | |
333 | "mov %6, %%"REG_D" \n\t" // buf2 | |
334 | PREFETCH" (%%"REG_c") \n\t" | |
335 | PREFETCH" 32(%%"REG_c") \n\t" | |
336 | PREFETCH" 64(%%"REG_c") \n\t" | |
337 | ||
338 | CALL_MMXEXT_FILTER_CODE | |
339 | CALL_MMXEXT_FILTER_CODE | |
340 | CALL_MMXEXT_FILTER_CODE | |
341 | CALL_MMXEXT_FILTER_CODE | |
342 | ||
343 | #if defined(PIC) | |
344 | "mov %7, %%"REG_b" \n\t" | |
345 | #if ARCH_X86_64 | |
346 | "mov %8, %%"REG_a" \n\t" | |
347 | "mov %%"REG_a", -8(%%rsp) \n\t" | |
348 | #endif | |
349 | #else | |
350 | #if ARCH_X86_64 | |
351 | "mov %7, %%"REG_a" \n\t" | |
352 | "mov %%"REG_a", -8(%%rsp) \n\t" | |
353 | #endif | |
354 | #endif | |
355 | :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), | |
356 | "m" (mmxextFilterCode), "m" (src2), "m"(dst2) | |
357 | #if defined(PIC) | |
358 | ,"m" (ebxsave) | |
359 | #endif | |
360 | #if ARCH_X86_64 | |
361 | ,"m"(retsave) | |
362 | #endif | |
363 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |
364 | #if !defined(PIC) | |
365 | ,"%"REG_b | |
366 | #endif | |
367 | ); | |
368 | ||
     /* Edge fix-up for both planes: walking back from the last output, every
      * sample whose source position (i * xInc) >> 16 lies on or beyond the
      * last source pixel is overwritten with that pixel times 128. */
369 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { | |
370 | dst1[i] = src1[srcW-1]*128; | |
371 | dst2[i] = src2[srcW-1]*128; | |
372 | } | |
373 | } | |
374 | #endif //HAVE_INLINE_ASM |