Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * DSP utils | |
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | |
4 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "libavutil/attributes.h" | |
24 | #include "libavutil/internal.h" | |
25 | #include "avcodec.h" | |
26 | #include "copy_block.h" | |
27 | #include "simple_idct.h" | |
28 | #include "me_cmp.h" | |
29 | #include "mpegvideo.h" | |
30 | #include "config.h" | |
31 | ||
/* Lookup table of squares with a +256 index bias: after
 * ff_me_cmp_init_static() has run, entry i holds (i - 256) * (i - 256).
 * Until then every entry is zero. */
uint32_t ff_square_tab[512] = { 0, };
33 | ||
34 | static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
35 | int line_size, int h) | |
36 | { | |
37 | int s = 0, i; | |
38 | uint32_t *sq = ff_square_tab + 256; | |
39 | ||
40 | for (i = 0; i < h; i++) { | |
41 | s += sq[pix1[0] - pix2[0]]; | |
42 | s += sq[pix1[1] - pix2[1]]; | |
43 | s += sq[pix1[2] - pix2[2]]; | |
44 | s += sq[pix1[3] - pix2[3]]; | |
45 | pix1 += line_size; | |
46 | pix2 += line_size; | |
47 | } | |
48 | return s; | |
49 | } | |
50 | ||
51 | static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
52 | int line_size, int h) | |
53 | { | |
54 | int s = 0, i; | |
55 | uint32_t *sq = ff_square_tab + 256; | |
56 | ||
57 | for (i = 0; i < h; i++) { | |
58 | s += sq[pix1[0] - pix2[0]]; | |
59 | s += sq[pix1[1] - pix2[1]]; | |
60 | s += sq[pix1[2] - pix2[2]]; | |
61 | s += sq[pix1[3] - pix2[3]]; | |
62 | s += sq[pix1[4] - pix2[4]]; | |
63 | s += sq[pix1[5] - pix2[5]]; | |
64 | s += sq[pix1[6] - pix2[6]]; | |
65 | s += sq[pix1[7] - pix2[7]]; | |
66 | pix1 += line_size; | |
67 | pix2 += line_size; | |
68 | } | |
69 | return s; | |
70 | } | |
71 | ||
72 | static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
73 | int line_size, int h) | |
74 | { | |
75 | int s = 0, i; | |
76 | uint32_t *sq = ff_square_tab + 256; | |
77 | ||
78 | for (i = 0; i < h; i++) { | |
79 | s += sq[pix1[0] - pix2[0]]; | |
80 | s += sq[pix1[1] - pix2[1]]; | |
81 | s += sq[pix1[2] - pix2[2]]; | |
82 | s += sq[pix1[3] - pix2[3]]; | |
83 | s += sq[pix1[4] - pix2[4]]; | |
84 | s += sq[pix1[5] - pix2[5]]; | |
85 | s += sq[pix1[6] - pix2[6]]; | |
86 | s += sq[pix1[7] - pix2[7]]; | |
87 | s += sq[pix1[8] - pix2[8]]; | |
88 | s += sq[pix1[9] - pix2[9]]; | |
89 | s += sq[pix1[10] - pix2[10]]; | |
90 | s += sq[pix1[11] - pix2[11]]; | |
91 | s += sq[pix1[12] - pix2[12]]; | |
92 | s += sq[pix1[13] - pix2[13]]; | |
93 | s += sq[pix1[14] - pix2[14]]; | |
94 | s += sq[pix1[15] - pix2[15]]; | |
95 | ||
96 | pix1 += line_size; | |
97 | pix2 += line_size; | |
98 | } | |
99 | return s; | |
100 | } | |
101 | ||
/**
 * Add up the absolute values of the 64 coefficients of a block.
 *
 * @param block array of 64 coefficients
 * @return sum of |block[k]| for k in 0..63
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *coef      = block;
    const int16_t *const end = block + 64;
    int total = 0;

    while (coef < end) {
        total += FFABS(*coef);
        coef++;
    }
    return total;
}
110 | ||
/* Rounded average of two values: (a + b + 1) / 2, rounding half up.
 * Every argument is parenthesized so that expressions containing
 * lower-precedence operators (|, ?:, ...) are averaged as a whole. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
/* Rounded average of four values: (a + b + c + d + 2) / 4. */
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
113 | ||
114 | static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
115 | int line_size, int h) | |
116 | { | |
117 | int s = 0, i; | |
118 | ||
119 | for (i = 0; i < h; i++) { | |
120 | s += abs(pix1[0] - pix2[0]); | |
121 | s += abs(pix1[1] - pix2[1]); | |
122 | s += abs(pix1[2] - pix2[2]); | |
123 | s += abs(pix1[3] - pix2[3]); | |
124 | s += abs(pix1[4] - pix2[4]); | |
125 | s += abs(pix1[5] - pix2[5]); | |
126 | s += abs(pix1[6] - pix2[6]); | |
127 | s += abs(pix1[7] - pix2[7]); | |
128 | s += abs(pix1[8] - pix2[8]); | |
129 | s += abs(pix1[9] - pix2[9]); | |
130 | s += abs(pix1[10] - pix2[10]); | |
131 | s += abs(pix1[11] - pix2[11]); | |
132 | s += abs(pix1[12] - pix2[12]); | |
133 | s += abs(pix1[13] - pix2[13]); | |
134 | s += abs(pix1[14] - pix2[14]); | |
135 | s += abs(pix1[15] - pix2[15]); | |
136 | pix1 += line_size; | |
137 | pix2 += line_size; | |
138 | } | |
139 | return s; | |
140 | } | |
141 | ||
142 | static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
143 | int line_size, int h) | |
144 | { | |
145 | int s = 0, i; | |
146 | ||
147 | for (i = 0; i < h; i++) { | |
148 | s += abs(pix1[0] - avg2(pix2[0], pix2[1])); | |
149 | s += abs(pix1[1] - avg2(pix2[1], pix2[2])); | |
150 | s += abs(pix1[2] - avg2(pix2[2], pix2[3])); | |
151 | s += abs(pix1[3] - avg2(pix2[3], pix2[4])); | |
152 | s += abs(pix1[4] - avg2(pix2[4], pix2[5])); | |
153 | s += abs(pix1[5] - avg2(pix2[5], pix2[6])); | |
154 | s += abs(pix1[6] - avg2(pix2[6], pix2[7])); | |
155 | s += abs(pix1[7] - avg2(pix2[7], pix2[8])); | |
156 | s += abs(pix1[8] - avg2(pix2[8], pix2[9])); | |
157 | s += abs(pix1[9] - avg2(pix2[9], pix2[10])); | |
158 | s += abs(pix1[10] - avg2(pix2[10], pix2[11])); | |
159 | s += abs(pix1[11] - avg2(pix2[11], pix2[12])); | |
160 | s += abs(pix1[12] - avg2(pix2[12], pix2[13])); | |
161 | s += abs(pix1[13] - avg2(pix2[13], pix2[14])); | |
162 | s += abs(pix1[14] - avg2(pix2[14], pix2[15])); | |
163 | s += abs(pix1[15] - avg2(pix2[15], pix2[16])); | |
164 | pix1 += line_size; | |
165 | pix2 += line_size; | |
166 | } | |
167 | return s; | |
168 | } | |
169 | ||
170 | static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
171 | int line_size, int h) | |
172 | { | |
173 | int s = 0, i; | |
174 | uint8_t *pix3 = pix2 + line_size; | |
175 | ||
176 | for (i = 0; i < h; i++) { | |
177 | s += abs(pix1[0] - avg2(pix2[0], pix3[0])); | |
178 | s += abs(pix1[1] - avg2(pix2[1], pix3[1])); | |
179 | s += abs(pix1[2] - avg2(pix2[2], pix3[2])); | |
180 | s += abs(pix1[3] - avg2(pix2[3], pix3[3])); | |
181 | s += abs(pix1[4] - avg2(pix2[4], pix3[4])); | |
182 | s += abs(pix1[5] - avg2(pix2[5], pix3[5])); | |
183 | s += abs(pix1[6] - avg2(pix2[6], pix3[6])); | |
184 | s += abs(pix1[7] - avg2(pix2[7], pix3[7])); | |
185 | s += abs(pix1[8] - avg2(pix2[8], pix3[8])); | |
186 | s += abs(pix1[9] - avg2(pix2[9], pix3[9])); | |
187 | s += abs(pix1[10] - avg2(pix2[10], pix3[10])); | |
188 | s += abs(pix1[11] - avg2(pix2[11], pix3[11])); | |
189 | s += abs(pix1[12] - avg2(pix2[12], pix3[12])); | |
190 | s += abs(pix1[13] - avg2(pix2[13], pix3[13])); | |
191 | s += abs(pix1[14] - avg2(pix2[14], pix3[14])); | |
192 | s += abs(pix1[15] - avg2(pix2[15], pix3[15])); | |
193 | pix1 += line_size; | |
194 | pix2 += line_size; | |
195 | pix3 += line_size; | |
196 | } | |
197 | return s; | |
198 | } | |
199 | ||
200 | static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
201 | int line_size, int h) | |
202 | { | |
203 | int s = 0, i; | |
204 | uint8_t *pix3 = pix2 + line_size; | |
205 | ||
206 | for (i = 0; i < h; i++) { | |
207 | s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); | |
208 | s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); | |
209 | s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); | |
210 | s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); | |
211 | s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); | |
212 | s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); | |
213 | s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); | |
214 | s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); | |
215 | s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); | |
216 | s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); | |
217 | s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); | |
218 | s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); | |
219 | s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); | |
220 | s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); | |
221 | s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); | |
222 | s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); | |
223 | pix1 += line_size; | |
224 | pix2 += line_size; | |
225 | pix3 += line_size; | |
226 | } | |
227 | return s; | |
228 | } | |
229 | ||
230 | static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
231 | int line_size, int h) | |
232 | { | |
233 | int s = 0, i; | |
234 | ||
235 | for (i = 0; i < h; i++) { | |
236 | s += abs(pix1[0] - pix2[0]); | |
237 | s += abs(pix1[1] - pix2[1]); | |
238 | s += abs(pix1[2] - pix2[2]); | |
239 | s += abs(pix1[3] - pix2[3]); | |
240 | s += abs(pix1[4] - pix2[4]); | |
241 | s += abs(pix1[5] - pix2[5]); | |
242 | s += abs(pix1[6] - pix2[6]); | |
243 | s += abs(pix1[7] - pix2[7]); | |
244 | pix1 += line_size; | |
245 | pix2 += line_size; | |
246 | } | |
247 | return s; | |
248 | } | |
249 | ||
250 | static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
251 | int line_size, int h) | |
252 | { | |
253 | int s = 0, i; | |
254 | ||
255 | for (i = 0; i < h; i++) { | |
256 | s += abs(pix1[0] - avg2(pix2[0], pix2[1])); | |
257 | s += abs(pix1[1] - avg2(pix2[1], pix2[2])); | |
258 | s += abs(pix1[2] - avg2(pix2[2], pix2[3])); | |
259 | s += abs(pix1[3] - avg2(pix2[3], pix2[4])); | |
260 | s += abs(pix1[4] - avg2(pix2[4], pix2[5])); | |
261 | s += abs(pix1[5] - avg2(pix2[5], pix2[6])); | |
262 | s += abs(pix1[6] - avg2(pix2[6], pix2[7])); | |
263 | s += abs(pix1[7] - avg2(pix2[7], pix2[8])); | |
264 | pix1 += line_size; | |
265 | pix2 += line_size; | |
266 | } | |
267 | return s; | |
268 | } | |
269 | ||
270 | static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
271 | int line_size, int h) | |
272 | { | |
273 | int s = 0, i; | |
274 | uint8_t *pix3 = pix2 + line_size; | |
275 | ||
276 | for (i = 0; i < h; i++) { | |
277 | s += abs(pix1[0] - avg2(pix2[0], pix3[0])); | |
278 | s += abs(pix1[1] - avg2(pix2[1], pix3[1])); | |
279 | s += abs(pix1[2] - avg2(pix2[2], pix3[2])); | |
280 | s += abs(pix1[3] - avg2(pix2[3], pix3[3])); | |
281 | s += abs(pix1[4] - avg2(pix2[4], pix3[4])); | |
282 | s += abs(pix1[5] - avg2(pix2[5], pix3[5])); | |
283 | s += abs(pix1[6] - avg2(pix2[6], pix3[6])); | |
284 | s += abs(pix1[7] - avg2(pix2[7], pix3[7])); | |
285 | pix1 += line_size; | |
286 | pix2 += line_size; | |
287 | pix3 += line_size; | |
288 | } | |
289 | return s; | |
290 | } | |
291 | ||
292 | static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
293 | int line_size, int h) | |
294 | { | |
295 | int s = 0, i; | |
296 | uint8_t *pix3 = pix2 + line_size; | |
297 | ||
298 | for (i = 0; i < h; i++) { | |
299 | s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); | |
300 | s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); | |
301 | s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); | |
302 | s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); | |
303 | s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); | |
304 | s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); | |
305 | s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); | |
306 | s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); | |
307 | pix1 += line_size; | |
308 | pix2 += line_size; | |
309 | pix3 += line_size; | |
310 | } | |
311 | return s; | |
312 | } | |
313 | ||
314 | static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) | |
315 | { | |
316 | int score1 = 0, score2 = 0, x, y; | |
317 | ||
318 | for (y = 0; y < h; y++) { | |
319 | for (x = 0; x < 16; x++) | |
320 | score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); | |
321 | if (y + 1 < h) { | |
322 | for (x = 0; x < 15; x++) | |
323 | score2 += FFABS(s1[x] - s1[x + stride] - | |
324 | s1[x + 1] + s1[x + stride + 1]) - | |
325 | FFABS(s2[x] - s2[x + stride] - | |
326 | s2[x + 1] + s2[x + stride + 1]); | |
327 | } | |
328 | s1 += stride; | |
329 | s2 += stride; | |
330 | } | |
331 | ||
332 | if (c) | |
333 | return score1 + FFABS(score2) * c->avctx->nsse_weight; | |
334 | else | |
335 | return score1 + FFABS(score2) * 8; | |
336 | } | |
337 | ||
338 | static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) | |
339 | { | |
340 | int score1 = 0, score2 = 0, x, y; | |
341 | ||
342 | for (y = 0; y < h; y++) { | |
343 | for (x = 0; x < 8; x++) | |
344 | score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); | |
345 | if (y + 1 < h) { | |
346 | for (x = 0; x < 7; x++) | |
347 | score2 += FFABS(s1[x] - s1[x + stride] - | |
348 | s1[x + 1] + s1[x + stride + 1]) - | |
349 | FFABS(s2[x] - s2[x + stride] - | |
350 | s2[x + 1] + s2[x + stride + 1]); | |
351 | } | |
352 | s1 += stride; | |
353 | s2 += stride; | |
354 | } | |
355 | ||
356 | if (c) | |
357 | return score1 + FFABS(score2) * c->avctx->nsse_weight; | |
358 | else | |
359 | return score1 + FFABS(score2) * 8; | |
360 | } | |
361 | ||
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0. Installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
                    int stride, int h)
{
    return 0;
}
367 | ||
368 | void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type) | |
369 | { | |
370 | int i; | |
371 | ||
372 | memset(cmp, 0, sizeof(void *) * 6); | |
373 | ||
374 | for (i = 0; i < 6; i++) { | |
375 | switch (type & 0xFF) { | |
376 | case FF_CMP_SAD: | |
377 | cmp[i] = c->sad[i]; | |
378 | break; | |
379 | case FF_CMP_SATD: | |
380 | cmp[i] = c->hadamard8_diff[i]; | |
381 | break; | |
382 | case FF_CMP_SSE: | |
383 | cmp[i] = c->sse[i]; | |
384 | break; | |
385 | case FF_CMP_DCT: | |
386 | cmp[i] = c->dct_sad[i]; | |
387 | break; | |
388 | case FF_CMP_DCT264: | |
389 | cmp[i] = c->dct264_sad[i]; | |
390 | break; | |
391 | case FF_CMP_DCTMAX: | |
392 | cmp[i] = c->dct_max[i]; | |
393 | break; | |
394 | case FF_CMP_PSNR: | |
395 | cmp[i] = c->quant_psnr[i]; | |
396 | break; | |
397 | case FF_CMP_BIT: | |
398 | cmp[i] = c->bit[i]; | |
399 | break; | |
400 | case FF_CMP_RD: | |
401 | cmp[i] = c->rd[i]; | |
402 | break; | |
403 | case FF_CMP_VSAD: | |
404 | cmp[i] = c->vsad[i]; | |
405 | break; | |
406 | case FF_CMP_VSSE: | |
407 | cmp[i] = c->vsse[i]; | |
408 | break; | |
409 | case FF_CMP_ZERO: | |
410 | cmp[i] = zero_cmp; | |
411 | break; | |
412 | case FF_CMP_NSSE: | |
413 | cmp[i] = c->nsse[i]; | |
414 | break; | |
415 | #if CONFIG_DWT | |
416 | case FF_CMP_W53: | |
417 | cmp[i]= c->w53[i]; | |
418 | break; | |
419 | case FF_CMP_W97: | |
420 | cmp[i]= c->w97[i]; | |
421 | break; | |
422 | #endif | |
423 | default: | |
424 | av_log(NULL, AV_LOG_ERROR, | |
425 | "internal error in cmp function selection\n"); | |
426 | } | |
427 | } | |
428 | } | |
429 | ||
/* Butterfly into separate outputs: o1 = i1 + i2, o2 = i1 - i2.
 * Wrapped in do { } while (0) so the macro is a single statement and is
 * safe under an unbraced if/else. i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1, o2, i1, i2)      \
    do {                                \
        (o1) = (i1) + (i2);             \
        (o2) = (i1) - (i2);             \
    } while (0)

/* In-place butterfly: (x, y) becomes (x + y, x - y).
 * The temporaries carry a suffix so they cannot shadow caller variables
 * that happen to be named a or b. */
#define BUTTERFLY1(x, y)                \
    do {                                \
        int bf_a_ = (x);                \
        int bf_b_ = (y);                \
        (x) = bf_a_ + bf_b_;            \
        (y) = bf_a_ - bf_b_;            \
    } while (0)

/* Final butterfly stage folded into the accumulation: |x + y| + |x - y|. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
444 | ||
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the pixel difference src - dst.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      must be 8 (checked with av_assert2)
 * @return sum of the absolute transformed differences
 */
static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    /* Horizontal pass: for each row, form the 8 differences and run three
     * butterfly stages (neighbour distance 1, then 2, then 4). */
    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0] - dst[stride * i + 0],
                   src[stride * i + 1] - dst[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2] - dst[stride * i + 2],
                   src[stride * i + 3] - dst[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4] - dst[stride * i + 4],
                   src[stride * i + 5] - dst[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6] - dst[stride * i + 6],
                   src[stride * i + 7] - dst[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    /* Vertical pass: for each column, two in-place butterfly stages (row
     * distance 1, then 2); the last stage (row distance 4) is folded into
     * BUTTERFLYA, which adds |a + b| + |a - b| straight to the sum. */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }
    return sum;
}
496 | ||
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the pixels of src themselves, with the DC term removed.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows of src
 * @param h      must be 8 (checked with av_assert2)
 * @return sum of the absolute transformed values minus the absolute DC term
 */
static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
                                uint8_t *dummy, int stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    /* Horizontal pass: three butterfly stages per row (neighbour distance
     * 1, then 2, then 4). */
    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0], src[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2], src[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4], src[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6], src[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    /* Vertical pass: two in-place stages per column (row distance 1, then
     * 2); the last stage (row distance 4) is folded into BUTTERFLYA. */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum +=
            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }

    /* temp[0] + temp[32] is the sum term of the final stage for column 0,
     * whose absolute value was added above; take it back out. */
    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
548 | ||
/**
 * Score an 8x8 block pair in the transform domain: the pixel difference
 * is computed with s->pdsp.diff_pixels, transformed in place with
 * s->fdsp.fdct, and reduced with s->mecc.sum_abs_dctelem.
 *
 * @param s      context providing the three function pointers used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      must be 8 (checked with av_assert2)
 * @return value returned by s->mecc.sum_abs_dctelem for the transformed
 *         difference
 */
static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                        uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    av_assert2(h == 8);

    s->pdsp.diff_pixels(temp, src1, src2, stride);
    s->fdsp.fdct(temp);
    return s->mecc.sum_abs_dctelem(temp);
}
560 | ||
#if CONFIG_GPL
/* One 8-point integer transform pass using only additions, subtractions
 * and shifts. The expansion reads its inputs through SRC(n) and hands each
 * of its eight results to DST(n, value); both helper macros must be defined
 * by the user before expanding DCT8_1D. */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Score an 8x8 block pair as the sum of absolute values of the 2-D
 * DCT8_1D transform of their pixel difference.
 *
 * @param s      context providing pdsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      unused
 * @return sum of the absolute transformed differences
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int i, sum = 0;

    s->pdsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform every row in place. */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* Second pass: transform every column; results are not stored, their
     * absolute values are accumulated directly. */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
        for (i = 0; i < 8; i++)
            DCT8_1D
#undef SRC
#undef DST
            return sum;
}
#endif
614 | ||
615 | static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, | |
616 | uint8_t *src2, int stride, int h) | |
617 | { | |
618 | LOCAL_ALIGNED_16(int16_t, temp, [64]); | |
619 | int sum = 0, i; | |
620 | ||
621 | av_assert2(h == 8); | |
622 | ||
623 | s->pdsp.diff_pixels(temp, src1, src2, stride); | |
624 | s->fdsp.fdct(temp); | |
625 | ||
626 | for (i = 0; i < 64; i++) | |
627 | sum = FFMAX(sum, FFABS(temp[i])); | |
628 | ||
629 | return sum; | |
630 | } | |
631 | ||
/**
 * Score an 8x8 block pair by the squared error introduced when their pixel
 * difference goes through quantization, inverse quantization and the
 * inverse transform.
 *
 * Side effects on the context: s->mb_intra is set to 0 and
 * s->block_last_index[0] is overwritten.
 *
 * @param s      encoder context (modified, see above)
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      must be 8 (checked with av_assert2)
 * @return sum of squared differences between the reconstructed and the
 *         original pixel difference
 */
static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
    /* second half of the buffer keeps an untouched copy of the difference */
    int16_t *const bak = temp + 64;
    int sum = 0, i;

    av_assert2(h == 8);
    s->mb_intra = 0;

    s->pdsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64 * sizeof(int16_t));

    /* i only receives the quantizer's last output argument here; it is
     * reused as the loop counter below */
    s->block_last_index[0 /* FIXME */] =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); // FIXME

    for (i = 0; i < 64; i++)
        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);

    return sum;
}
656 | ||
/**
 * Rate-distortion score for an 8x8 block pair.
 *
 * The pixel difference is quantized, the bits needed for its coefficients
 * are estimated from the context's VLC length tables, and the block is
 * reconstructed to measure the squared error. Both blocks are first copied
 * into local 8-byte-stride buffers, so the inputs are not modified.
 *
 * Side effect: s->block_last_index[0] is overwritten.
 *
 * @param s      encoder context (tables, qscale, mb_intra, DSP functions)
 * @param src1   first block
 * @param src2   second block, used as the prediction for reconstruction
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      must be 8 (checked with av_assert2)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
                   int stride, int h)
{
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length, *last_length;

    av_assert2(h == 8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* the quantizer's return value is used below as the scan position of
     * the last coefficient to code */
    s->block_last_index[0 /* FIXME */] =
        last                           =
            s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        /* intra: coefficient 0 is costed separately via the DC table */
        start_i     = 1;
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        run = 0;
        /* every coefficient before the last one, in scan order */
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                /* bias by 64: levels in -64..63 map to 0..127 and use the
                 * table, anything else costs the escape length */
                level += 64;
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;
                run = 0;
            } else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i = scantable[last];

        level = temp[i] + 64;

        av_assert2(level - 64);

        if ((level & (~127)) == 0) {
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        } else
            bits += esc_length;
    }

    if (last >= 0) {
        if (s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct on top of the local copy of src2 */
    s->idsp.idct_add(lsrc2, 8, temp);

    distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
}
733 | ||
/**
 * Estimate the number of bits needed to code the quantized pixel
 * difference of an 8x8 block pair, using the context's VLC length tables.
 *
 * Side effect: s->block_last_index[0] is overwritten.
 *
 * @param s      encoder context (tables, qscale, mb_intra, DSP functions)
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      must be 8 (checked with av_assert2)
 * @return estimated bit count
 */
static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
                    int stride, int h)
{
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length, *last_length;

    av_assert2(h == 8);

    s->pdsp.diff_pixels(temp, src1, src2, stride);

    /* the quantizer's return value is used below as the scan position of
     * the last coefficient to code */
    s->block_last_index[0 /* FIXME */] =
        last                           =
            s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        /* intra: coefficient 0 is costed separately via the DC table */
        start_i     = 1;
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        run = 0;
        /* every coefficient before the last one, in scan order */
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                /* bias by 64: levels in -64..63 map to 0..127 and use the
                 * table, anything else costs the escape length */
                level += 64;
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;
                run = 0;
            } else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i = scantable[last];

        level = temp[i] + 64;

        av_assert2(level - 64);

        if ((level & (~127)) == 0)
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        else
            bits += esc_length;
    }

    return bits;
}
794 | ||
795 | #define VSAD_INTRA(size) \ | |
796 | static int vsad_intra ## size ## _c(MpegEncContext *c, \ | |
797 | uint8_t *s, uint8_t *dummy, \ | |
798 | int stride, int h) \ | |
799 | { \ | |
800 | int score = 0, x, y; \ | |
801 | \ | |
802 | for (y = 1; y < h; y++) { \ | |
803 | for (x = 0; x < size; x += 4) { \ | |
804 | score += FFABS(s[x] - s[x + stride]) + \ | |
805 | FFABS(s[x + 1] - s[x + stride + 1]) + \ | |
806 | FFABS(s[x + 2] - s[x + 2 + stride]) + \ | |
807 | FFABS(s[x + 3] - s[x + 3 + stride]); \ | |
808 | } \ | |
809 | s += stride; \ | |
810 | } \ | |
811 | \ | |
812 | return score; \ | |
813 | } | |
814 | VSAD_INTRA(8) | |
815 | VSAD_INTRA(16) | |
816 | ||
817 | #define VSAD(size) \ | |
818 | static int vsad ## size ## _c(MpegEncContext *c, \ | |
819 | uint8_t *s1, uint8_t *s2, \ | |
820 | int stride, int h) \ | |
821 | { \ | |
822 | int score = 0, x, y; \ | |
823 | \ | |
824 | for (y = 1; y < h; y++) { \ | |
825 | for (x = 0; x < size; x++) \ | |
826 | score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ | |
827 | s1 += stride; \ | |
828 | s2 += stride; \ | |
829 | } \ | |
830 | \ | |
831 | return score; \ | |
832 | } | |
833 | VSAD(8) | |
834 | VSAD(16) | |
835 | ||
836 | #define SQ(a) ((a) * (a)) | |
837 | #define VSSE_INTRA(size) \ | |
838 | static int vsse_intra ## size ## _c(MpegEncContext *c, \ | |
839 | uint8_t *s, uint8_t *dummy, \ | |
840 | int stride, int h) \ | |
841 | { \ | |
842 | int score = 0, x, y; \ | |
843 | \ | |
844 | for (y = 1; y < h; y++) { \ | |
845 | for (x = 0; x < size; x += 4) { \ | |
846 | score += SQ(s[x] - s[x + stride]) + \ | |
847 | SQ(s[x + 1] - s[x + stride + 1]) + \ | |
848 | SQ(s[x + 2] - s[x + stride + 2]) + \ | |
849 | SQ(s[x + 3] - s[x + stride + 3]); \ | |
850 | } \ | |
851 | s += stride; \ | |
852 | } \ | |
853 | \ | |
854 | return score; \ | |
855 | } | |
856 | VSSE_INTRA(8) | |
857 | VSSE_INTRA(16) | |
858 | ||
859 | #define VSSE(size) \ | |
860 | static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, \ | |
861 | int stride, int h) \ | |
862 | { \ | |
863 | int score = 0, x, y; \ | |
864 | \ | |
865 | for (y = 1; y < h; y++) { \ | |
866 | for (x = 0; x < size; x++) \ | |
867 | score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ | |
868 | s1 += stride; \ | |
869 | s2 += stride; \ | |
870 | } \ | |
871 | \ | |
872 | return score; \ | |
873 | } | |
874 | VSSE(8) | |
875 | VSSE(16) | |
876 | ||
877 | #define WRAPPER8_16_SQ(name8, name16) \ | |
878 | static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ | |
879 | int stride, int h) \ | |
880 | { \ | |
881 | int score = 0; \ | |
882 | \ | |
883 | score += name8(s, dst, src, stride, 8); \ | |
884 | score += name8(s, dst + 8, src + 8, stride, 8); \ | |
885 | if (h == 16) { \ | |
886 | dst += 8 * stride; \ | |
887 | src += 8 * stride; \ | |
888 | score += name8(s, dst, src, stride, 8); \ | |
889 | score += name8(s, dst + 8, src + 8, stride, 8); \ | |
890 | } \ | |
891 | return score; \ | |
892 | } | |
893 | ||
894 | WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) | |
895 | WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) | |
896 | WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) | |
897 | #if CONFIG_GPL | |
898 | WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) | |
899 | #endif | |
900 | WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) | |
901 | WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) | |
902 | WRAPPER8_16_SQ(rd8x8_c, rd16_c) | |
903 | WRAPPER8_16_SQ(bit8x8_c, bit16_c) | |
904 | ||
905 | av_cold void ff_me_cmp_init_static(void) | |
906 | { | |
907 | int i; | |
908 | ||
909 | for (i = 0; i < 512; i++) | |
910 | ff_square_tab[i] = (i - 256) * (i - 256); | |
911 | } | |
912 | ||
913 | int ff_check_alignment(void) | |
914 | { | |
915 | static int did_fail = 0; | |
916 | LOCAL_ALIGNED_16(int, aligned, [4]); | |
917 | ||
918 | if ((intptr_t)aligned & 15) { | |
919 | if (!did_fail) { | |
920 | #if HAVE_MMX || HAVE_ALTIVEC | |
921 | av_log(NULL, AV_LOG_ERROR, | |
922 | "Compiler did not align stack variables. Libavcodec has been miscompiled\n" | |
923 | "and may be very slow or crash. This is not a bug in libavcodec,\n" | |
924 | "but in the compiler. You may try recompiling using gcc >= 4.2.\n" | |
925 | "Do not report crashes to FFmpeg developers.\n"); | |
926 | #endif | |
927 | did_fail=1; | |
928 | } | |
929 | return -1; | |
930 | } | |
931 | return 0; | |
932 | } | |
933 | ||
/**
 * Populate a MECmpContext with the C implementations defined in this file
 * and then give the architecture-specific initializers a chance to run.
 *
 * In the tables filled here, index 0 holds the 16-wide function and index 1
 * the 8-wide one; sse[2] is the 4-wide variant and indices 4/5 of
 * hadamard8_diff, vsad and vsse hold the intra (single-source) variants.
 *
 * @param c     context to fill
 * @param avctx codec context, only forwarded to the per-architecture
 *              initializers
 */
av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
{
    /* return value ignored; the check logs on its own */
    ff_check_alignment();

    c->sum_abs_dctelem = sum_abs_dctelem_c;

    /* TODO [0] 16 [1] 8 */
    /* second index: 0 = plain, 1 = x2, 2 = y2, 3 = xy2 variant */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* installs <name>16_c in slot 0 and <name>8x8_c in slot 1 */
#define SET_CMP_FUNC(name)                      \
    c->name[0] = name ## 16_c;                  \
    c->name[1] = name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4] = hadamard8_intra16_c;
    c->hadamard8_diff[5] = hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0] = pix_abs16_c;
    c->sad[1] = pix_abs8_c;
    c->sse[0] = sse16_c;
    c->sse[1] = sse8_c;
    c->sse[2] = sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0] = vsad16_c;
    c->vsad[1] = vsad8_c;
    c->vsad[4] = vsad_intra16_c;
    c->vsad[5] = vsad_intra8_c;
    c->vsse[0] = vsse16_c;
    c->vsse[1] = vsse8_c;
    c->vsse[4] = vsse_intra16_c;
    c->vsse[5] = vsse_intra8_c;
    c->nsse[0] = nsse16_c;
    c->nsse[1] = nsse8_c;
#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
    ff_dsputil_init_dwt(c);
#endif

    /* architecture hooks run last and receive the fully populated context */
    if (ARCH_ALPHA)
        ff_me_cmp_init_alpha(c, avctx);
    if (ARCH_ARM)
        ff_me_cmp_init_arm(c, avctx);
    if (ARCH_PPC)
        ff_me_cmp_init_ppc(c, avctx);
    if (ARCH_X86)
        ff_me_cmp_init_x86(c, avctx);
}