Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * The simplest mpeg encoder (well, it was the simplest!) | |
3 | * Copyright (c) 2000,2001 Fabrice Bellard | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/attributes.h" | |
23 | #include "libavutil/cpu.h" | |
24 | #include "libavutil/x86/asm.h" | |
25 | #include "libavutil/x86/cpu.h" | |
26 | #include "libavcodec/avcodec.h" | |
27 | #include "libavcodec/dct.h" | |
28 | #include "libavcodec/mpegvideo.h" | |
29 | ||
/*
 * Non-permuted inverse of ff_zigzag_direct, plus one, for the MMX quantizer.
 * ff_dct_encode_init_x86() fills it so that
 *     inv_zigzag_direct16[ff_zigzag_direct[i]] == i + 1
 * for i in 0..63.
 */
DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
32 | ||
/*
 * Instantiate mpegvideoenc_template.c once per enabled inline-asm flavour.
 *
 * Before each #include, the COMPILE_TEMPLATE_* switches are set for the
 * flavour being built and RENAME() / RENAME_FDCT() are redefined to paste a
 * flavour-specific suffix onto identifiers.
 * NOTE(review): presumably the template defines RENAME(dct_quantize), which
 * is how dct_quantize_mmx / _mmxext / _sse2 / _ssse3 (referenced by
 * ff_dct_encode_init_x86()) come into existence -- confirm in the template.
 *
 * Everything here is compiled only when HAVE_6REGS is set.
 */
#if HAVE_6REGS

/* Plain MMX: first instantiation, so no #undef is needed beforehand. */
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _mmx
#define RENAME_FDCT(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */

/* MMXEXT: reset every switch, then enable only COMPILE_TEMPLATE_MMXEXT. */
#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _mmxext
#define RENAME_FDCT(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */

/* SSE2: only COMPILE_TEMPLATE_SSE2 is enabled. */
#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _sse2
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */

/*
 * SSSE3: both COMPILE_TEMPLATE_SSE2 and COMPILE_TEMPLATE_SSSE3 are enabled.
 * RENAME_FDCT() deliberately keeps the _sse2 suffix while RENAME() uses
 * _ssse3.
 * NOTE(review): presumably this makes the SSSE3 quantizer call the SSE2
 * forward DCT instead of a dedicated one -- confirm in the template.
 */
#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _ssse3
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_6REGS */
87 | ||
88 | #if HAVE_INLINE_ASM | |
89 | static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ | |
90 | const int intra= s->mb_intra; | |
91 | int *sum= s->dct_error_sum[intra]; | |
92 | uint16_t *offset= s->dct_offset[intra]; | |
93 | ||
94 | s->dct_count[intra]++; | |
95 | ||
96 | __asm__ volatile( | |
97 | "pxor %%mm7, %%mm7 \n\t" | |
98 | "1: \n\t" | |
99 | "pxor %%mm0, %%mm0 \n\t" | |
100 | "pxor %%mm1, %%mm1 \n\t" | |
101 | "movq (%0), %%mm2 \n\t" | |
102 | "movq 8(%0), %%mm3 \n\t" | |
103 | "pcmpgtw %%mm2, %%mm0 \n\t" | |
104 | "pcmpgtw %%mm3, %%mm1 \n\t" | |
105 | "pxor %%mm0, %%mm2 \n\t" | |
106 | "pxor %%mm1, %%mm3 \n\t" | |
107 | "psubw %%mm0, %%mm2 \n\t" | |
108 | "psubw %%mm1, %%mm3 \n\t" | |
109 | "movq %%mm2, %%mm4 \n\t" | |
110 | "movq %%mm3, %%mm5 \n\t" | |
111 | "psubusw (%2), %%mm2 \n\t" | |
112 | "psubusw 8(%2), %%mm3 \n\t" | |
113 | "pxor %%mm0, %%mm2 \n\t" | |
114 | "pxor %%mm1, %%mm3 \n\t" | |
115 | "psubw %%mm0, %%mm2 \n\t" | |
116 | "psubw %%mm1, %%mm3 \n\t" | |
117 | "movq %%mm2, (%0) \n\t" | |
118 | "movq %%mm3, 8(%0) \n\t" | |
119 | "movq %%mm4, %%mm2 \n\t" | |
120 | "movq %%mm5, %%mm3 \n\t" | |
121 | "punpcklwd %%mm7, %%mm4 \n\t" | |
122 | "punpckhwd %%mm7, %%mm2 \n\t" | |
123 | "punpcklwd %%mm7, %%mm5 \n\t" | |
124 | "punpckhwd %%mm7, %%mm3 \n\t" | |
125 | "paddd (%1), %%mm4 \n\t" | |
126 | "paddd 8(%1), %%mm2 \n\t" | |
127 | "paddd 16(%1), %%mm5 \n\t" | |
128 | "paddd 24(%1), %%mm3 \n\t" | |
129 | "movq %%mm4, (%1) \n\t" | |
130 | "movq %%mm2, 8(%1) \n\t" | |
131 | "movq %%mm5, 16(%1) \n\t" | |
132 | "movq %%mm3, 24(%1) \n\t" | |
133 | "add $16, %0 \n\t" | |
134 | "add $32, %1 \n\t" | |
135 | "add $16, %2 \n\t" | |
136 | "cmp %3, %0 \n\t" | |
137 | " jb 1b \n\t" | |
138 | : "+r" (block), "+r" (sum), "+r" (offset) | |
139 | : "r"(block+64) | |
140 | ); | |
141 | } | |
142 | ||
143 | static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ | |
144 | const int intra= s->mb_intra; | |
145 | int *sum= s->dct_error_sum[intra]; | |
146 | uint16_t *offset= s->dct_offset[intra]; | |
147 | ||
148 | s->dct_count[intra]++; | |
149 | ||
150 | __asm__ volatile( | |
151 | "pxor %%xmm7, %%xmm7 \n\t" | |
152 | "1: \n\t" | |
153 | "pxor %%xmm0, %%xmm0 \n\t" | |
154 | "pxor %%xmm1, %%xmm1 \n\t" | |
155 | "movdqa (%0), %%xmm2 \n\t" | |
156 | "movdqa 16(%0), %%xmm3 \n\t" | |
157 | "pcmpgtw %%xmm2, %%xmm0 \n\t" | |
158 | "pcmpgtw %%xmm3, %%xmm1 \n\t" | |
159 | "pxor %%xmm0, %%xmm2 \n\t" | |
160 | "pxor %%xmm1, %%xmm3 \n\t" | |
161 | "psubw %%xmm0, %%xmm2 \n\t" | |
162 | "psubw %%xmm1, %%xmm3 \n\t" | |
163 | "movdqa %%xmm2, %%xmm4 \n\t" | |
164 | "movdqa %%xmm3, %%xmm5 \n\t" | |
165 | "psubusw (%2), %%xmm2 \n\t" | |
166 | "psubusw 16(%2), %%xmm3 \n\t" | |
167 | "pxor %%xmm0, %%xmm2 \n\t" | |
168 | "pxor %%xmm1, %%xmm3 \n\t" | |
169 | "psubw %%xmm0, %%xmm2 \n\t" | |
170 | "psubw %%xmm1, %%xmm3 \n\t" | |
171 | "movdqa %%xmm2, (%0) \n\t" | |
172 | "movdqa %%xmm3, 16(%0) \n\t" | |
173 | "movdqa %%xmm4, %%xmm6 \n\t" | |
174 | "movdqa %%xmm5, %%xmm0 \n\t" | |
175 | "punpcklwd %%xmm7, %%xmm4 \n\t" | |
176 | "punpckhwd %%xmm7, %%xmm6 \n\t" | |
177 | "punpcklwd %%xmm7, %%xmm5 \n\t" | |
178 | "punpckhwd %%xmm7, %%xmm0 \n\t" | |
179 | "paddd (%1), %%xmm4 \n\t" | |
180 | "paddd 16(%1), %%xmm6 \n\t" | |
181 | "paddd 32(%1), %%xmm5 \n\t" | |
182 | "paddd 48(%1), %%xmm0 \n\t" | |
183 | "movdqa %%xmm4, (%1) \n\t" | |
184 | "movdqa %%xmm6, 16(%1) \n\t" | |
185 | "movdqa %%xmm5, 32(%1) \n\t" | |
186 | "movdqa %%xmm0, 48(%1) \n\t" | |
187 | "add $32, %0 \n\t" | |
188 | "add $64, %1 \n\t" | |
189 | "add $32, %2 \n\t" | |
190 | "cmp %3, %0 \n\t" | |
191 | " jb 1b \n\t" | |
192 | : "+r" (block), "+r" (sum), "+r" (offset) | |
193 | : "r"(block+64) | |
194 | XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", | |
195 | "%xmm4", "%xmm5", "%xmm6", "%xmm7") | |
196 | ); | |
197 | } | |
198 | #endif /* HAVE_INLINE_ASM */ | |
199 | ||
200 | av_cold void ff_dct_encode_init_x86(MpegEncContext *s) | |
201 | { | |
202 | const int dct_algo = s->avctx->dct_algo; | |
203 | int i; | |
204 | ||
205 | for (i = 0; i < 64; i++) | |
206 | inv_zigzag_direct16[ff_zigzag_direct[i]] = i + 1; | |
207 | ||
208 | if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { | |
209 | #if HAVE_MMX_INLINE | |
210 | int cpu_flags = av_get_cpu_flags(); | |
211 | if (INLINE_MMX(cpu_flags)) { | |
212 | #if HAVE_6REGS | |
213 | s->dct_quantize = dct_quantize_mmx; | |
214 | #endif | |
215 | s->denoise_dct = denoise_dct_mmx; | |
216 | } | |
217 | #endif | |
218 | #if HAVE_6REGS && HAVE_MMXEXT_INLINE | |
219 | if (INLINE_MMXEXT(cpu_flags)) | |
220 | s->dct_quantize = dct_quantize_mmxext; | |
221 | #endif | |
222 | #if HAVE_SSE2_INLINE | |
223 | if (INLINE_SSE2(cpu_flags)) { | |
224 | #if HAVE_6REGS | |
225 | s->dct_quantize = dct_quantize_sse2; | |
226 | #endif | |
227 | s->denoise_dct = denoise_dct_sse2; | |
228 | } | |
229 | #endif | |
230 | #if HAVE_6REGS && HAVE_SSSE3_INLINE | |
231 | if (INLINE_SSSE3(cpu_flags)) | |
232 | s->dct_quantize = dct_quantize_ssse3; | |
233 | #endif | |
234 | } | |
235 | } |