/*
 * NOTE(review): this file was recovered from a git-blame/table view
 * (commit 2ba45a60); the "Commit | Line | Data" table decoration is
 * extraction residue, not part of the source.
 */
1 | /* |
2 | * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> | |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include <stdint.h> | |
22 | ||
23 | #include "libavutil/x86/asm.h" | |
24 | #include "libswscale/swscale_internal.h" | |
25 | ||
/*
 * MOVNTQ abstracts the 8-byte store used by the output writers below:
 * on MMXEXT builds it is the non-temporal "movntq" (cache-bypassing,
 * suited to write-once pixel output), otherwise a plain "movq".
 * REAL_MOVNTQ stringizes its operands; MOVNTQ is the usual extra level
 * of indirection so macro arguments are expanded before stringization.
 * MOVNTQ2 is the same mnemonic choice as a bare string fragment, for
 * asm code that spells out its own operand list.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH


#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
40 | ||
#if !COMPILE_TEMPLATE_MMXEXT
/*
 * Load the 8-byte dither vector at srcDither and expand it to unsigned
 * 16-bit words: mm3 = low four bytes, mm4 = high four bytes (mm0 is
 * zeroed as the unpack source).  The result is deliberately left in
 * mm3/mm4 for the caller's following asm block to consume — this
 * function and its caller share MMX register state across asm blocks.
 *
 * If rot is non-zero, the eight dither bytes are first rotated by three
 * byte positions (psrlq $24 combined with psllq $40), shifting the
 * dither phase (e.g. for alternate output lines).
 */
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif
68 | ||
/*
 * Vertical scaling of one plane: for each group of 8 output pixels,
 * accumulate filterSize source lines weighted by 16-bit coefficients,
 * add the dither, shift down and pack to unsigned 8-bit output.
 *
 * filter: list of (source pointer, coefficient) pairs walked via REG_d;
 *         a NULL source pointer terminates the inner tap loop (test/jnz).
 * src:    not referenced directly — the per-tap source pointers are
 *         read from the filter list itself.
 * dest:   output plane, written 8 bytes at a time via MOVNTQ2.
 * dither: 8-byte dither vector, expanded into mm3/mm4 by dither_8to16()
 *         (register state intentionally carried between asm blocks).
 * offset: dither rotation flag; also used so that dest-offset indexed
 *         from offset..dstW+offset keeps the dither phase aligned.
 *
 * NOTE(review): dither_8to16() is only defined when
 * !COMPILE_TEMPLATE_MMXEXT (see above), so this function appears to
 * rely on not being instantiated with COMPILE_TEMPLATE_MMXEXT —
 * confirm against the template includers, or guard it likewise.
 */
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    /* Bias the expanded dither (mm3/mm4) by filterSize*8 (psllw $3 of the
     * broadcast count) and drop to working precision (>>4); the result is
     * the rounding/dither seed for each 8-pixel accumulation below. */
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
        );

    /* Main loop: mm6/mm7 hold the accumulator seed so mm3/mm4 can be
     * reloaded for every 8-pixel group; label 1 is both the tap loop and
     * the pixel loop head (jnz ends taps, jb continues pixels). */
    __asm__ volatile(\
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
        "add $8, %%"REG_c" \n\t"\
        "cmp %2, %%"REG_c" \n\t"\
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"REG_d, "%"REG_S, "%"REG_c
        );
}
122 | ||
/*
 * Opens the packed-output asm statement (note the unbalanced
 * "__asm__ volatile(" — must be paired with a *_END macro) and
 * vertically scales the chroma: walks the filter list at
 * CHR_MMX_FILTER_OFFSET(%0), accumulating U into mm3 and V into mm4,
 * both seeded with the VROUNDER_OFFSET rounding bias.  %6 is the byte
 * distance between the U and V rows of each source line; REG_a is the
 * output pixel index.  Label 1 is the per-8-pixel loop head, label 2
 * the per-filter-tap loop (NULL next pointer terminates it).
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\
147 | ||
/*
 * Vertically scales luma (or alpha, when given ALP_MMX_FILTER_OFFSET):
 * walks the filter list at `offset`(%0), accumulating two groups of
 * four 16-bit samples into dst1/dst2 (seeded with VROUNDER_OFFSET).
 * coeff/src1/src2 are caller-chosen scratch MMX registers; reuses
 * local label 2 for the tap loop (NULL next pointer terminates).
 * Emits asm text only — must appear inside an open asm statement.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
166 | ||
/*
 * Standard packed-output prologue: chroma into mm3 (U) / mm4 (V),
 * luma into mm1 (Y1) / mm7 (Y2) — the register contract expected by
 * YSCALEYUV2RGBX and the WRITE* macros.
 */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
170 | ||
/*
 * Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV.
 * Operand contract used throughout this file:
 *   %0 = &c->redDither (base pointer for all *_OFFSET context slots),
 *   %1-%3 = dummies (keep operand numbering stable),
 *   %4 = dest, %5 = dstW_reg, %6 = uv_off.
 * bF8/bFC are the RGB565/555 masking constants referenced via MANGLE.
 */
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
       NAMED_CONSTRAINTS_ADD(bF8,bFC) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
178 | ||
/*
 * Accurate-rounding variant of the chroma prologue: processes two
 * filter taps per iteration (APCK_PTR2/APCK_COEF/APCK_SIZE layout),
 * pairing word samples with punpck{l,h}wd and multiply-accumulating in
 * 32-bit precision via pmaddwd into mm4/mm5 (U) and mm6/mm7 (V).
 * After the tap loop the sums are scaled down (psrad $16), packed back
 * to words, biased with VROUNDER_OFFSET and spilled to the U_TEMP /
 * V_TEMP context slots for the luma/alpha pass to pick up.
 * Opens the asm statement — pair with a *_END macro.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\
230 | ||
/*
 * Accurate-rounding luma/alpha pass, matching the _ACCURATE_UV chroma
 * pass above: two taps per iteration via pmaddwd, 32-bit accumulation
 * into mm1/mm5 (Y1) and mm7/mm6 (Y2), then psrad/packssdw plus the
 * VROUNDER bias.  Ends by reloading U into mm3 and V into mm4 from the
 * U_TEMP/V_TEMP slots spilled by the chroma pass, restoring the
 * mm1/mm3/mm4/mm7 contract expected by YSCALEYUV2RGBX.
 * `offset` selects LUM_MMX_FILTER_OFFSET or ALP_MMX_FILTER_OFFSET.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\
275 | ||
/*
 * Accurate-rounding prologue: same register contract as
 * YSCALEYUV2PACKEDX (U=mm3, V=mm4, Y1=mm1, Y2=mm7), higher precision.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
279 | ||
/*
 * YUV -> RGB matrix step.  Input contract: mm3=U, mm4=V, mm1=Y1,
 * mm7=Y2 (as produced by the YSCALEYUV2PACKEDX* prologues); all
 * coefficients/offsets are read from the context at (%0).
 * Output: 8-bit packed channels for 8 pixels — B in mm2, G in mm4,
 * R in mm5 (see the B1/B2/G1/G2/R1/R2 staging comments inline).
 * Emits asm text only; must run inside an open asm statement.
 */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
315 | ||
/*
 * Interleave the byte-packed b/g/r/a registers into four ARGB
 * quadwords (q0, b, q2, q3 — t is extra scratch) and store 32 bytes
 * at dst + index*4, then advance index by 8 pixels and loop back to
 * label 1 while index < dstw.  WRITEBGR32 is the expansion wrapper so
 * register arguments are substituted before REAL_WRITEBGR32 stringizes
 * them.  Emits asm text only.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
339 | ||
/*
 * Vertical scale + convert to 32-bit RGB, accurate-rounding ("_ar")
 * variant.  Standard yuv2packedX signature: per-component filter lists
 * (lum/chr/alp) are read from the context via the *_MMX_FILTER_OFFSET
 * slots; dest is the packed output row of dstW pixels.
 * With alpha (CONFIG_SWSCALE_ALPHA && c->alpPixBuf): the RGB results
 * are spilled to U_TEMP/V_TEMP/Y_TEMP while the alpha filter pass runs
 * (it clobbers most MMX registers), then alpha is packed into mm1 and
 * written as the A channel.  Without alpha: pcmpeqd sets mm7 to all
 * ones, i.e. opaque A=0xFF.
 */
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
372 | ||
/*
 * Vertical scale + convert to 32-bit RGB, standard-precision variant
 * of yuv2rgb32_X_ar (uses YSCALEYUV2PACKEDX instead of _ACCURATE).
 * With alpha, the alpha plane is filtered by YSCALEYUV2PACKEDX_YA into
 * mm1/mm7 and packed as the A channel; without alpha, pcmpeqd fills A
 * with 0xFF (opaque).
 */
static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
401 | ||
/*
 * Pack mm2=B, mm4=G, mm5=R (8-bit, mm7 must be zero) into RGB565:
 * B and R are masked to their top 5 bits (bF8), G to its top 6 (bFC),
 * then the channels are interleaved bytewise and shifted into the
 * 5-6-5 field positions.  Stores 16 bytes (8 pixels) at dst + index*2
 * and loops back to label 1 while index < dstw.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
429 | ||
/*
 * Vertical scale + convert to RGB565, accurate-rounding variant.
 * Adds the per-channel ordered-dither vectors from the context
 * (BLUE/GREEN/RED_DITHER) before the 5/6/5 truncation when
 * DITHER1XBPP is defined; mm7 is zeroed for WRITERGB16's unpacks.
 */
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
453 | ||
/*
 * Vertical scale + convert to RGB565, standard-precision variant of
 * yuv2rgb565_X_ar (YSCALEYUV2PACKEDX instead of _ACCURATE); optional
 * 1x-bpp dithering applied before truncation when DITHER1XBPP is set.
 */
static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
477 | ||
/*
 * Pack mm2=B, mm4=G, mm5=R (8-bit, mm7 must be zero) into RGB555:
 * all three channels are masked to their top 5 bits (bF8); R gets an
 * extra >>1 and the interleave shifts are $2 instead of $3 to place
 * the 5-5-5 fields.  Stores 16 bytes (8 pixels) at dst + index*2 and
 * loops back to label 1 while index < dstw.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
506 | ||
/*
 * Vertical scale + convert to RGB555, accurate-rounding variant.
 * Same structure as yuv2rgb565_X_ar but packs via WRITERGB15.
 */
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
530 | ||
/*
 * Vertical scale + convert to RGB555, standard-precision variant of
 * yuv2rgb555_X_ar; optional dithering when DITHER1XBPP is defined.
 */
static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
554 | ||
/*
 * Plain-MMX 24-bit packer: expects mm2=B, mm4=G, mm5=R (8-bit) and
 * mm7=0.  Builds four 0RGB dwords per half, then shifts/ORs the pieces
 * into three contiguous quadwords (24 bytes = 8 pixels) stored at dst.
 * Unlike the other writers, dst is a running pointer: it is advanced
 * by $24 each iteration while index counts pixels against dstw
 * (loops to label 1).  Clobbers mm0-mm7.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
607 | ||
/*
 * MMXEXT 24-bit packer: same contract as WRITEBGR24MMX (mm2=B, mm4=G,
 * mm5=R, mm7=0 on entry; dst advanced by $24, index by 8 pixels per
 * iteration) but uses pshufw to replicate channel bytes and the
 * ff_M24A/ff_M24B/ff_M24C masks to select the bytes belonging to each
 * of the three output quadwords, which are then ORed together.
 * Clobbers mm0/mm1/mm3/mm6/mm7.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
655 | ||
/* Select the 24-bit packer matching the instruction set this template
 * instance is compiled for. */
#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
663 | ||
664 | #if HAVE_6REGS | |
/*
 * Vertical scale + convert to 24-bit BGR, accurate-rounding variant
 * (requires 6 free GP registers — see the enclosing HAVE_6REGS guard).
 * Because 24bpp has no power-of-two stride, the write pointer is
 * computed as dest + index*3 (lea index + index*2, then add dest) into
 * REG_c and WRITEBGR24 advances it directly.  The operand list is
 * spelled out here instead of YSCALEYUV2PACKEDX_END because REG_c must
 * be added to the clobber list.
 */
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, "%5", %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
689 | ||
/*
 * Vertical scale + convert to 24-bit BGR, standard-precision variant
 * of yuv2bgr24_X_ar (YSCALEYUV2PACKEDX instead of _ACCURATE); same
 * dest + index*3 pointer computation and explicit operand/clobber list.
 */
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, "%5", %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
714 | #endif /* HAVE_6REGS */ | |
715 | ||
/*
 * Pack to YUYV 4:2:2: packs U (mm3) and V (mm4) to bytes, packs Y
 * (mm1 low / mm7 high) to bytes, interleaves U with V and then Y with
 * UV, producing 16 bytes (8 pixels, 4 UV pairs) stored at
 * dst + index*2.  Advances index by 8 and loops to label 1 while
 * index < dstw.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
732 | ||
/*
 * Vertical scale to packed YUYV 4:2:2, accurate-rounding variant.
 * No RGB matrix is involved: the filtered Y (mm1/mm7) and U/V
 * (mm3/mm4) are simply scaled down (psraw $3) and interleaved by
 * WRITEYUY2.
 */
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
753 | ||
/*
 * Vertical scale to packed YUYV 4:2:2, standard-precision variant of
 * yuv2yuyv422_X_ar (YSCALEYUV2PACKEDX instead of _ACCURATE).
 */
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
774 | ||
/*
 * Two-line (bilinear) chroma interpolation for the 2-tap path:
 * blends uvbuf0 (%2) and uvbuf1 (%3) with the alpha at
 * CHR_MMX_FILTER_OFFSET+8, using the classic a + (b-a)*f formulation
 * (psubw/pmulhw/paddw).  UV_OFF_BYTE in the context gives the V-row
 * distance, added to and then subtracted from the index register.
 * Finishes by subtracting U/V offsets and computing the green
 * contributions, leaving mm2=(U-128)8, mm3=ug, mm4=vg, mm5=(V-128)8.
 * index and c (context pointer) are macro arguments; opens loop
 * label 1.  Emits asm text only.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
801 | ||
/*
 * Two-line (bilinear) luma interpolation: blends buf0 (b1) and buf1
 * (b2) with the alpha at LUM_MMX_FILTER_OFFSET+8, leaving the two
 * interpolated 4-sample groups in mm1 (Y1) and mm7 (Y2) for the
 * COEFF/conversion step.  Emits asm text only.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
815 | ||
816 | #define REAL_YSCALEYUV2RGB_COEFF(c) \ | |
817 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | |
818 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | |
819 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
820 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
821 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
822 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
823 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
824 | "paddw %%mm3, %%mm4 \n\t"\ | |
825 | "movq %%mm2, %%mm0 \n\t"\ | |
826 | "movq %%mm5, %%mm6 \n\t"\ | |
827 | "movq %%mm4, %%mm3 \n\t"\ | |
828 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
829 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
830 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
831 | "paddw %%mm1, %%mm2 \n\t"\ | |
832 | "paddw %%mm1, %%mm5 \n\t"\ | |
833 | "paddw %%mm1, %%mm4 \n\t"\ | |
834 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
835 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
836 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
837 | "paddw %%mm7, %%mm0 \n\t"\ | |
838 | "paddw %%mm7, %%mm6 \n\t"\ | |
839 | "paddw %%mm7, %%mm3 \n\t"\ | |
840 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
841 | "packuswb %%mm0, %%mm2 \n\t"\ | |
842 | "packuswb %%mm6, %%mm5 \n\t"\ | |
843 | "packuswb %%mm3, %%mm4 \n\t"\ | |
844 | ||
845 | #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) | |
846 | ||
847 | #define YSCALEYUV2RGB(index, c) \ | |
848 | REAL_YSCALEYUV2RGB_UV(index, c) \ | |
849 | REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ | |
850 | REAL_YSCALEYUV2RGB_COEFF(c) | |
851 | ||
852 | /**
853 | * vertical bilinear scale YV12 to RGB | |
 * Two-row variant: interpolates buf[0]/buf[1] (luma) and ubuf[0]/ubuf[1]
 * (chroma; V addressed via the uv offset in the context, so vbuf is unused
 * here) and writes 32-bit BGRX/BGRA pixels via WRITEBGR32.
854 | */ | |
855 | static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], | |
856 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
857 | const int16_t *abuf[2], uint8_t *dest, | |
858 | int dstW, int yalpha, int uvalpha, int y) | |
859 | { | |
860 | const int16_t *buf0 = buf[0], *buf1 = buf[1], | |
861 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |
862 | ||
863 | if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { | |
 /* Alpha plane present: also interpolate abuf0/abuf1 and emit the
 * result as the 4th byte of each output pixel. */
864 | const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1]; | |
865 | #if ARCH_X86_64 | |
 /* x86-64 has enough registers: abuf0/abuf1 are plain "r" inputs and
 * r8 serves as the loop index. */
866 | __asm__ volatile( | |
867 | YSCALEYUV2RGB(%%r8, %5) | |
868 | YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) | |
869 | "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ | |
870 | "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ | |
871 | "packuswb %%mm7, %%mm1 \n\t" | |
872 | WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |
873 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), | |
874 | "a" (&c->redDither), | |
875 | "r" (abuf0), "r" (abuf1) | |
876 | : "%r8" | |
877 | ); | |
878 | #else | |
 /* x86-32: too few registers, so abuf0/abuf1 travel through the
 * u_temp/v_temp scratch slots in the context, ebx is spilled to
 * ESP_OFFSET and ebp is pushed so both can be used by the kernel. */
879 | c->u_temp=(intptr_t)abuf0; | |
880 | c->v_temp=(intptr_t)abuf1; | |
881 | __asm__ volatile( | |
882 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
883 | "mov %4, %%"REG_b" \n\t" | |
884 | "push %%"REG_BP" \n\t" | |
885 | YSCALEYUV2RGB(%%REGBP, %5) | |
886 | "push %0 \n\t" | |
887 | "push %1 \n\t" | |
888 | "mov "U_TEMP"(%5), %0 \n\t" | |
889 | "mov "V_TEMP"(%5), %1 \n\t" | |
890 | YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) | |
891 | "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ | |
892 | "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ | |
893 | "packuswb %%mm7, %%mm1 \n\t" | |
894 | "pop %1 \n\t" | |
895 | "pop %0 \n\t" | |
896 | WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |
897 | "pop %%"REG_BP" \n\t" | |
898 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
899 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
900 | "a" (&c->redDither) | |
901 | ); | |
902 | #endif | |
903 | } else { | |
 /* No alpha plane: pcmpeqd sets mm7 to all-ones, i.e. opaque 0xff
 * alpha bytes for every pixel. */
904 | __asm__ volatile( | |
905 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
906 | "mov %4, %%"REG_b" \n\t" | |
907 | "push %%"REG_BP" \n\t" | |
908 | YSCALEYUV2RGB(%%REGBP, %5) | |
909 | "pcmpeqd %%mm7, %%mm7 \n\t" | |
910 | WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |
911 | "pop %%"REG_BP" \n\t" | |
912 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
913 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
914 | "a" (&c->redDither) | |
915 | ); | |
916 | } | |
917 | } | |
918 | ||
/**
 * vertical bilinear scale YV12 to BGR24 (two-row interpolation,
 * packed 3-byte output via WRITEBGR24; mm7 cleared as scratch zero).
 */
919 | static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], | |
920 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
921 | const int16_t *abuf[2], uint8_t *dest, | |
922 | int dstW, int yalpha, int uvalpha, int y) | |
923 | { | |
924 | const int16_t *buf0 = buf[0], *buf1 = buf[1], | |
925 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |
926 | ||
927 | __asm__ volatile( | |
928 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
929 | "mov %4, %%"REG_b" \n\t" | |
930 | "push %%"REG_BP" \n\t" | |
931 | YSCALEYUV2RGB(%%REGBP, %5) | |
932 | "pxor %%mm7, %%mm7 \n\t" | |
933 | WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
934 | "pop %%"REG_BP" \n\t" | |
935 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
936 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
937 | "a" (&c->redDither) | |
938 | NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |
939 | ); | |
940 | } | |
941 | ||
/**
 * vertical bilinear scale YV12 to RGB555, with optional ordered dither
 * (per-channel dither tables added before the 5/5/5 pack when
 * DITHER1XBPP is defined).
 */
942 | static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], | |
943 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
944 | const int16_t *abuf[2], uint8_t *dest, | |
945 | int dstW, int yalpha, int uvalpha, int y) | |
946 | { | |
947 | const int16_t *buf0 = buf[0], *buf1 = buf[1], | |
948 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |
949 | ||
950 | __asm__ volatile( | |
951 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
952 | "mov %4, %%"REG_b" \n\t" | |
953 | "push %%"REG_BP" \n\t" | |
954 | YSCALEYUV2RGB(%%REGBP, %5) | |
955 | "pxor %%mm7, %%mm7 \n\t" | |
956 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
957 | #ifdef DITHER1XBPP | |
958 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |
959 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
960 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
961 | #endif | |
962 | WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
963 | "pop %%"REG_BP" \n\t" | |
964 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
965 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
966 | "a" (&c->redDither) | |
967 | NAMED_CONSTRAINTS_ADD(bF8) | |
968 | ); | |
969 | } | |
970 | ||
/**
 * vertical bilinear scale YV12 to RGB565, with optional ordered dither
 * (DITHER1XBPP) before the 5/6/5 pack in WRITERGB16.
 */
971 | static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], | |
972 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
973 | const int16_t *abuf[2], uint8_t *dest, | |
974 | int dstW, int yalpha, int uvalpha, int y) | |
975 | { | |
976 | const int16_t *buf0 = buf[0], *buf1 = buf[1], | |
977 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |
978 | ||
979 | __asm__ volatile( | |
980 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
981 | "mov %4, %%"REG_b" \n\t" | |
982 | "push %%"REG_BP" \n\t" | |
983 | YSCALEYUV2RGB(%%REGBP, %5) | |
984 | "pxor %%mm7, %%mm7 \n\t" | |
985 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
986 | #ifdef DITHER1XBPP | |
987 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |
988 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
989 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
990 | #endif | |
991 | WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
992 | "pop %%"REG_BP" \n\t" | |
993 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
994 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
995 | "a" (&c->redDither) | |
996 | NAMED_CONSTRAINTS_ADD(bF8,bFC) | |
997 | ); | |
998 | } | |
999 | ||
/*
 * REAL_YSCALEYUV2PACKED: two-row interpolation for packed-YUV (YUYV)
 * output.  Unlike YSCALEYUV2RGB it keeps Y/U/V values rather than RGB:
 * the filter coefficients are pre-shifted right by 3 (and written back to
 * the context) and pixel values use >>7 instead of >>4, matching the
 * precision WRITEYUY2 expects.  Exit: mm3=U, mm4=V, mm1=Y low, mm7=Y high.
 */
1000 | #define REAL_YSCALEYUV2PACKED(index, c) \
1001 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1002 | "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
1003 | "psraw $3, %%mm0 \n\t"\
1004 | "psraw $3, %%mm1 \n\t"\
1005 | "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1006 | "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1007 | "xor "#index", "#index" \n\t"\
1008 | ".p2align 4 \n\t"\
1009 | "1: \n\t"\
1010 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1011 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1012 | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1013 | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1014 | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1015 | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1016 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1017 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1018 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1019 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1020 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1021 | "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
1022 | "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
1023 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
1024 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
1025 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
1026 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
1027 | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
1028 | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
1029 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
1030 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
1031 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1032 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1033 | "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1034 | "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1035 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1036 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1037 | ||
1038 | #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) | |
1039 | ||
/**
 * vertical bilinear scale YV12 to YUYV422 (two-row interpolation,
 * packed output via WRITEYUY2).
 */
1040 | static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], | |
1041 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
1042 | const int16_t *abuf[2], uint8_t *dest, | |
1043 | int dstW, int yalpha, int uvalpha, int y) | |
1044 | { | |
1045 | const int16_t *buf0 = buf[0], *buf1 = buf[1], | |
1046 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |
1047 | ||
1048 | __asm__ volatile( | |
1049 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1050 | "mov %4, %%"REG_b" \n\t" | |
1051 | "push %%"REG_BP" \n\t" | |
1052 | YSCALEYUV2PACKED(%%REGBP, %5) | |
1053 | WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1054 | "pop %%"REG_BP" \n\t" | |
1055 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1056 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1057 | "a" (&c->redDither) | |
1058 | ); | |
1059 | } | |
1060 | ||
/*
 * REAL_YSCALEYUV2RGB1: single-row (unscaled / non-interpolating) YUV->RGB
 * kernel.  Chroma is taken from uvbuf0 (%2) only, shifted down by 4; no
 * vertical filter is applied.  Matrix and packing steps mirror
 * REAL_YSCALEYUV2RGB_COEFF.  Exit: mm2=B, mm4=G, mm5=R for 8 pixels.
 */
1061 | #define REAL_YSCALEYUV2RGB1(index, c) \
1062 | "xor "#index", "#index" \n\t"\
1063 | ".p2align 4 \n\t"\
1064 | "1: \n\t"\
1065 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1066 | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1067 | "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1068 | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1069 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
1070 | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
1071 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1072 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1073 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1074 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1075 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1076 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1077 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1078 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1079 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1080 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1081 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1082 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1083 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1084 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1085 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1086 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1087 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1088 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1089 | "paddw %%mm3, %%mm4 \n\t"\
1090 | "movq %%mm2, %%mm0 \n\t"\
1091 | "movq %%mm5, %%mm6 \n\t"\
1092 | "movq %%mm4, %%mm3 \n\t"\
1093 | "punpcklwd %%mm2, %%mm2 \n\t"\
1094 | "punpcklwd %%mm5, %%mm5 \n\t"\
1095 | "punpcklwd %%mm4, %%mm4 \n\t"\
1096 | "paddw %%mm1, %%mm2 \n\t"\
1097 | "paddw %%mm1, %%mm5 \n\t"\
1098 | "paddw %%mm1, %%mm4 \n\t"\
1099 | "punpckhwd %%mm0, %%mm0 \n\t"\
1100 | "punpckhwd %%mm6, %%mm6 \n\t"\
1101 | "punpckhwd %%mm3, %%mm3 \n\t"\
1102 | "paddw %%mm7, %%mm0 \n\t"\
1103 | "paddw %%mm7, %%mm6 \n\t"\
1104 | "paddw %%mm7, %%mm3 \n\t"\
1105 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1106 | "packuswb %%mm0, %%mm2 \n\t"\
1107 | "packuswb %%mm6, %%mm5 \n\t"\
1108 | "packuswb %%mm3, %%mm4 \n\t"\
1109 | ||
1110 | #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) | |
1111 | ||
1112 | // do vertical chrominance interpolation | |
/*
 * REAL_YSCALEYUV2RGB1b: like YSCALEYUV2RGB1 but averages the two chroma
 * rows uvbuf0/uvbuf1 ((a+b)>>5) instead of using uvbuf0 alone; luma is
 * still single-row.  Exit: mm2=B, mm4=G, mm5=R for 8 pixels.
 */
1113 | #define REAL_YSCALEYUV2RGB1b(index, c) \
1114 | "xor "#index", "#index" \n\t"\
1115 | ".p2align 4 \n\t"\
1116 | "1: \n\t"\
1117 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1118 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1119 | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1120 | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1121 | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1122 | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1123 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1124 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1125 | "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1126 | "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1127 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1128 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1129 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1130 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1131 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1132 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1133 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1134 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1135 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1136 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1137 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1138 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1139 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1140 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1141 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1142 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1143 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1144 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1145 | "paddw %%mm3, %%mm4 \n\t"\
1146 | "movq %%mm2, %%mm0 \n\t"\
1147 | "movq %%mm5, %%mm6 \n\t"\
1148 | "movq %%mm4, %%mm3 \n\t"\
1149 | "punpcklwd %%mm2, %%mm2 \n\t"\
1150 | "punpcklwd %%mm5, %%mm5 \n\t"\
1151 | "punpcklwd %%mm4, %%mm4 \n\t"\
1152 | "paddw %%mm1, %%mm2 \n\t"\
1153 | "paddw %%mm1, %%mm5 \n\t"\
1154 | "paddw %%mm1, %%mm4 \n\t"\
1155 | "punpckhwd %%mm0, %%mm0 \n\t"\
1156 | "punpckhwd %%mm6, %%mm6 \n\t"\
1157 | "punpckhwd %%mm3, %%mm3 \n\t"\
1158 | "paddw %%mm7, %%mm0 \n\t"\
1159 | "paddw %%mm7, %%mm6 \n\t"\
1160 | "paddw %%mm7, %%mm3 \n\t"\
1161 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1162 | "packuswb %%mm0, %%mm2 \n\t"\
1163 | "packuswb %%mm6, %%mm5 \n\t"\
1164 | "packuswb %%mm3, %%mm4 \n\t"\
1165 | ||
1166 | #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) | |
1167 | ||
/* Load 8 alpha samples from abuf0 (%1), >>7, saturate-pack to bytes in mm7. */
1168 | #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1169 | "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1170 | "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1171 | "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1172 | "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1173 | "packuswb %%mm1, %%mm7 \n\t"
1174 | #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) | |
1175 | ||
1176 | /**
1177 | * YV12 to RGB without scaling or interpolating | |
 * Chroma rows are either taken from ubuf[0] alone (uvalpha < 2048, a
 * half-pixel-shift approximation — see note below) or averaged with
 * ubuf[1].  With an alpha plane, abuf0 supplies the 4th byte; otherwise
 * pcmpeqd yields opaque 0xff alpha.
1178 | */ | |
1179 | static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, | |
1180 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
1181 | const int16_t *abuf0, uint8_t *dest, | |
1182 | int dstW, int uvalpha, int y) | |
1183 | { | |
1184 | const int16_t *ubuf0 = ubuf[0]; | |
1185 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 | |
1186 | ||
1187 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |
1188 | const int16_t *ubuf1 = ubuf[0]; | |
1189 | if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { | |
1190 | __asm__ volatile( | |
1191 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1192 | "mov %4, %%"REG_b" \n\t" | |
1193 | "push %%"REG_BP" \n\t" | |
1194 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1195 | YSCALEYUV2RGB1_ALPHA(%%REGBP) | |
1196 | WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |
1197 | "pop %%"REG_BP" \n\t" | |
1198 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1199 | :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1200 | "a" (&c->redDither) | |
1201 | ); | |
1202 | } else { | |
1203 | __asm__ volatile( | |
1204 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1205 | "mov %4, %%"REG_b" \n\t" | |
1206 | "push %%"REG_BP" \n\t" | |
1207 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1208 | "pcmpeqd %%mm7, %%mm7 \n\t" | |
1209 | WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |
1210 | "pop %%"REG_BP" \n\t" | |
1211 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1212 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1213 | "a" (&c->redDither) | |
1214 | ); | |
1215 | } | |
1216 | } else { | |
 /* uvalpha >= 2048: average the two chroma rows via YSCALEYUV2RGB1b. */
1217 | const int16_t *ubuf1 = ubuf[1]; | |
1218 | if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { | |
1219 | __asm__ volatile( | |
1220 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1221 | "mov %4, %%"REG_b" \n\t" | |
1222 | "push %%"REG_BP" \n\t" | |
1223 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1224 | YSCALEYUV2RGB1_ALPHA(%%REGBP) | |
1225 | WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |
1226 | "pop %%"REG_BP" \n\t" | |
1227 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1228 | :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1229 | "a" (&c->redDither) | |
1230 | ); | |
1231 | } else { | |
1232 | __asm__ volatile( | |
1233 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1234 | "mov %4, %%"REG_b" \n\t" | |
1235 | "push %%"REG_BP" \n\t" | |
1236 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1237 | "pcmpeqd %%mm7, %%mm7 \n\t" | |
1238 | WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |
1239 | "pop %%"REG_BP" \n\t" | |
1240 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1241 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1242 | "a" (&c->redDither) | |
1243 | ); | |
1244 | } | |
1245 | } | |
1246 | } | |
1247 | ||
/**
 * YV12 to BGR24 without scaling or interpolating; chroma from ubuf[0]
 * alone (uvalpha < 2048) or averaged with ubuf[1].
 */
1248 | static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, | |
1249 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
1250 | const int16_t *abuf0, uint8_t *dest, | |
1251 | int dstW, int uvalpha, int y) | |
1252 | { | |
1253 | const int16_t *ubuf0 = ubuf[0]; | |
1254 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 | |
1255 | ||
1256 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |
1257 | const int16_t *ubuf1 = ubuf[0]; | |
1258 | __asm__ volatile( | |
1259 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1260 | "mov %4, %%"REG_b" \n\t" | |
1261 | "push %%"REG_BP" \n\t" | |
1262 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1263 | "pxor %%mm7, %%mm7 \n\t" | |
1264 | WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1265 | "pop %%"REG_BP" \n\t" | |
1266 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1267 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1268 | "a" (&c->redDither) | |
1269 | NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |
1270 | ); | |
1271 | } else { | |
1272 | const int16_t *ubuf1 = ubuf[1]; | |
1273 | __asm__ volatile( | |
1274 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1275 | "mov %4, %%"REG_b" \n\t" | |
1276 | "push %%"REG_BP" \n\t" | |
1277 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1278 | "pxor %%mm7, %%mm7 \n\t" | |
1279 | WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1280 | "pop %%"REG_BP" \n\t" | |
1281 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1282 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1283 | "a" (&c->redDither) | |
1284 | NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |
1285 | ); | |
1286 | } | |
1287 | } | |
1288 | ||
/**
 * YV12 to RGB555 without scaling or interpolating, with optional
 * ordered dither (DITHER1XBPP) before the 5/5/5 pack.
 */
1289 | static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, | |
1290 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
1291 | const int16_t *abuf0, uint8_t *dest, | |
1292 | int dstW, int uvalpha, int y) | |
1293 | { | |
1294 | const int16_t *ubuf0 = ubuf[0]; | |
1295 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 | |
1296 | ||
1297 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |
1298 | const int16_t *ubuf1 = ubuf[0]; | |
1299 | __asm__ volatile( | |
1300 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1301 | "mov %4, %%"REG_b" \n\t" | |
1302 | "push %%"REG_BP" \n\t" | |
1303 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1304 | "pxor %%mm7, %%mm7 \n\t" | |
1305 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1306 | #ifdef DITHER1XBPP | |
1307 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |
1308 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1309 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
1310 | #endif | |
1311 | WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1312 | "pop %%"REG_BP" \n\t" | |
1313 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1314 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1315 | "a" (&c->redDither) | |
1316 | NAMED_CONSTRAINTS_ADD(bF8) | |
1317 | ); | |
1318 | } else { | |
1319 | const int16_t *ubuf1 = ubuf[1]; | |
1320 | __asm__ volatile( | |
1321 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1322 | "mov %4, %%"REG_b" \n\t" | |
1323 | "push %%"REG_BP" \n\t" | |
1324 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1325 | "pxor %%mm7, %%mm7 \n\t" | |
1326 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1327 | #ifdef DITHER1XBPP | |
1328 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |
1329 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1330 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
1331 | #endif | |
1332 | WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1333 | "pop %%"REG_BP" \n\t" | |
1334 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1335 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1336 | "a" (&c->redDither) | |
1337 | NAMED_CONSTRAINTS_ADD(bF8) | |
1338 | ); | |
1339 | } | |
1340 | } | |
1341 | ||
/**
 * YV12 to RGB565 without scaling or interpolating, with optional
 * ordered dither (DITHER1XBPP) before the 5/6/5 pack.
 */
1342 | static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, | |
1343 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
1344 | const int16_t *abuf0, uint8_t *dest, | |
1345 | int dstW, int uvalpha, int y) | |
1346 | { | |
1347 | const int16_t *ubuf0 = ubuf[0]; | |
1348 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 | |
1349 | ||
1350 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |
1351 | const int16_t *ubuf1 = ubuf[0]; | |
1352 | __asm__ volatile( | |
1353 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1354 | "mov %4, %%"REG_b" \n\t" | |
1355 | "push %%"REG_BP" \n\t" | |
1356 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1357 | "pxor %%mm7, %%mm7 \n\t" | |
1358 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1359 | #ifdef DITHER1XBPP | |
1360 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |
1361 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1362 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
1363 | #endif | |
1364 | WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1365 | "pop %%"REG_BP" \n\t" | |
1366 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1367 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1368 | "a" (&c->redDither) | |
1369 | NAMED_CONSTRAINTS_ADD(bF8,bFC) | |
1370 | ); | |
1371 | } else { | |
1372 | const int16_t *ubuf1 = ubuf[1]; | |
1373 | __asm__ volatile( | |
1374 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1375 | "mov %4, %%"REG_b" \n\t" | |
1376 | "push %%"REG_BP" \n\t" | |
1377 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1378 | "pxor %%mm7, %%mm7 \n\t" | |
1379 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1380 | #ifdef DITHER1XBPP | |
1381 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |
1382 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1383 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
1384 | #endif | |
1385 | WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1386 | "pop %%"REG_BP" \n\t" | |
1387 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1388 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1389 | "a" (&c->redDither) | |
1390 | NAMED_CONSTRAINTS_ADD(bF8,bFC) | |
1391 | ); | |
1392 | } | |
1393 | } | |
1394 | ||
/*
 * REAL_YSCALEYUV2PACKED1: single-row packed-YUV kernel — Y/U/V are just
 * loaded and shifted down by 7 (no vertical filter, no colour matrix).
 * Exit: mm3=U, mm4=V, mm1=Y low 4 px, mm7=Y high 4 px, for WRITEYUY2.
 */
1395 | #define REAL_YSCALEYUV2PACKED1(index, c) \
1396 | "xor "#index", "#index" \n\t"\
1397 | ".p2align 4 \n\t"\
1398 | "1: \n\t"\
1399 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1400 | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1401 | "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1402 | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1403 | "psraw $7, %%mm3 \n\t" \
1404 | "psraw $7, %%mm4 \n\t" \
1405 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1406 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1407 | "psraw $7, %%mm1 \n\t" \
1408 | "psraw $7, %%mm7 \n\t" \
1409 | ||
1410 | #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) | |
1411 | ||
/*
 * REAL_YSCALEYUV2PACKED1b: as above but chroma is the average of
 * uvbuf0 and uvbuf1 ((a+b)>>8); luma is still single-row >>7.
 */
1412 | #define REAL_YSCALEYUV2PACKED1b(index, c) \
1413 | "xor "#index", "#index" \n\t"\
1414 | ".p2align 4 \n\t"\
1415 | "1: \n\t"\
1416 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1417 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1418 | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1419 | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1420 | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1421 | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1422 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1423 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1424 | "psrlw $8, %%mm3 \n\t" \
1425 | "psrlw $8, %%mm4 \n\t" \
1426 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1427 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1428 | "psraw $7, %%mm1 \n\t" \
1429 | "psraw $7, %%mm7 \n\t"
1430 | #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) | |
1431 | ||
/**
 * YV12 to YUYV422 without scaling or interpolating; chroma from ubuf[0]
 * alone (uvalpha < 2048) or averaged with ubuf[1].
 */
1432 | static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, | |
1433 | const int16_t *ubuf[2], const int16_t *vbuf[2], | |
1434 | const int16_t *abuf0, uint8_t *dest, | |
1435 | int dstW, int uvalpha, int y) | |
1436 | { | |
1437 | const int16_t *ubuf0 = ubuf[0]; | |
1438 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 | |
1439 | ||
1440 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |
1441 | const int16_t *ubuf1 = ubuf[0]; | |
1442 | __asm__ volatile( | |
1443 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1444 | "mov %4, %%"REG_b" \n\t" | |
1445 | "push %%"REG_BP" \n\t" | |
1446 | YSCALEYUV2PACKED1(%%REGBP, %5) | |
1447 | WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1448 | "pop %%"REG_BP" \n\t" | |
1449 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1450 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1451 | "a" (&c->redDither) | |
1452 | ); | |
1453 | } else { | |
1454 | const int16_t *ubuf1 = ubuf[1]; | |
1455 | __asm__ volatile( | |
1456 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1457 | "mov %4, %%"REG_b" \n\t" | |
1458 | "push %%"REG_BP" \n\t" | |
1459 | YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1460 | WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |
1461 | "pop %%"REG_BP" \n\t" | |
1462 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1463 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |
1464 | "a" (&c->redDither) | |
1465 | ); | |
1466 | } | |
1467 | } | |
/**
 * Install this template's MMX/MMXEXT output and horizontal-scaling
 * function pointers into the context.  Packed-output pointers are only
 * set for <=14-bit non-NV12/NV21 destinations without SWS_BITEXACT;
 * SWS_ACCURATE_RND selects the _ar variants, otherwise the MMX vertical
 * filter (use_mmx_vfilter) and plain variants are used.
 */
1468 | static av_cold void RENAME(sws_init_swscale)(SwsContext *c) | |
1469 | { | |
1470 | enum AVPixelFormat dstFormat = c->dstFormat; | |
1471 | ||
1472 | c->use_mmx_vfilter= 0; | |
1473 | if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 | |
1474 | && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { | |
1475 | if (c->flags & SWS_ACCURATE_RND) { | |
1476 | if (!(c->flags & SWS_FULL_CHR_H_INT)) { | |
1477 | switch (c->dstFormat) { | |
1478 | case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; | |
1479 | #if HAVE_6REGS | |
1480 | case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; | |
1481 | #endif | |
1482 | case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; | |
1483 | case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; | |
1484 | case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; | |
1485 | default: break; | |
1486 | } | |
1487 | } | |
1488 | } else { | |
1489 | c->use_mmx_vfilter= 1; | |
1490 | c->yuv2planeX = RENAME(yuv2yuvX ); | |
1491 | if (!(c->flags & SWS_FULL_CHR_H_INT)) { | |
1492 | switch (c->dstFormat) { | |
1493 | case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; | |
1494 | #if HAVE_6REGS | |
1495 | case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; | |
1496 | #endif | |
1497 | case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; | |
1498 | case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; | |
1499 | case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; | |
1500 | default: break; | |
1501 | } | |
1502 | } | |
1503 | } | |
 /* _1 (single-row) and _2 (two-row bilinear) packed-output kernels
 * defined above; shared by both rounding modes. */
1504 | if (!(c->flags & SWS_FULL_CHR_H_INT)) { | |
1505 | switch (c->dstFormat) { | |
1506 | case AV_PIX_FMT_RGB32: | |
1507 | c->yuv2packed1 = RENAME(yuv2rgb32_1); | |
1508 | c->yuv2packed2 = RENAME(yuv2rgb32_2); | |
1509 | break; | |
1510 | case AV_PIX_FMT_BGR24: | |
1511 | c->yuv2packed1 = RENAME(yuv2bgr24_1); | |
1512 | c->yuv2packed2 = RENAME(yuv2bgr24_2); | |
1513 | break; | |
1514 | case AV_PIX_FMT_RGB555: | |
1515 | c->yuv2packed1 = RENAME(yuv2rgb555_1); | |
1516 | c->yuv2packed2 = RENAME(yuv2rgb555_2); | |
1517 | break; | |
1518 | case AV_PIX_FMT_RGB565: | |
1519 | c->yuv2packed1 = RENAME(yuv2rgb565_1); | |
1520 | c->yuv2packed2 = RENAME(yuv2rgb565_2); | |
1521 | break; | |
1522 | case AV_PIX_FMT_YUYV422: | |
1523 | c->yuv2packed1 = RENAME(yuv2yuyv422_1); | |
1524 | c->yuv2packed2 = RENAME(yuv2yuyv422_2); | |
1525 | break; | |
1526 | default: | |
1527 | break; | |
1528 | } | |
1529 | } | |
1530 | } | |
1531 | ||
1532 | if (c->srcBpc == 8 && c->dstBpc <= 14) { | |
1533 | // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). | |
1534 | #if COMPILE_TEMPLATE_MMXEXT | |
1535 | if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { | |
1536 | c->hyscale_fast = ff_hyscale_fast_mmxext; | |
1537 | c->hcscale_fast = ff_hcscale_fast_mmxext; | |
1538 | } else { | |
1539 | #endif /* COMPILE_TEMPLATE_MMXEXT */ | |
1540 | c->hyscale_fast = NULL; | |
1541 | c->hcscale_fast = NULL; | |
1542 | #if COMPILE_TEMPLATE_MMXEXT | |
1543 | } | |
1544 | #endif /* COMPILE_TEMPLATE_MMXEXT */ | |
1545 | } | |
1546 | } |