Imported Debian version 2.5.2~trusty
[deb_ffmpeg.git] / ffmpeg / libpostproc / postprocess.c
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file
25 * postprocessing.
26 */
27
28/*
29 C MMX MMX2 3DNow AltiVec
30isVertDC Ec Ec Ec
31isVertMinMaxOk Ec Ec Ec
32doVertLowPass E e e Ec
33doVertDefFilter Ec Ec e e Ec
34isHorizDC Ec Ec Ec
35isHorizMinMaxOk a E Ec
36doHorizLowPass E e e Ec
37doHorizDefFilter Ec Ec e e Ec
38do_a_deblock Ec E Ec E
39deRing E e e* Ecp
40Vertical RKAlgo1 E a a
41Horizontal RKAlgo1 a a
42Vertical X1# a E E
43Horizontal X1# a E E
44LinIpolDeinterlace e E E*
45CubicIpolDeinterlace a e e*
46LinBlendDeinterlace e E E*
47MedianDeinterlace# E Ec Ec
48TempDeNoiser# E e e Ec
49
* I do not have a 3DNow! CPU, so this is untested; nobody has reported it broken, so it presumably works
51# more or less selfinvented filters so the exactness is not too meaningful
52E = Exact implementation
53e = almost exact implementation (slightly different rounding,...)
54a = alternative / approximate impl
55c = checked against the other implementations (-vo md5)
56p = partially optimized, still some work to do
57*/
58
59/*
60TODO:
61reduce the time wasted on the mem transfer
62unroll stuff if instructions depend too much on the prior one
63move YScale thing to the end instead of fixing QP
64write a faster and higher quality deblocking filter :)
make the main loop more flexible (variable number of blocks at once);
 the per-block if/else handling is slowing things down
67compare the quality & speed of all filters
68split this huge file
69optimize c versions
70try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71...
72*/
73
74//Changelog: use git log
75
76#include "config.h"
77#include "libavutil/avutil.h"
78#include "libavutil/avassert.h"
79#include <inttypes.h>
80#include <stdio.h>
81#include <stdlib.h>
82#include <string.h>
83//#undef HAVE_MMXEXT_INLINE
84//#define HAVE_AMD3DNOW_INLINE
85//#undef HAVE_MMX_INLINE
86//#undef ARCH_X86
87//#define DEBUG_BRIGHTNESS
88#include "postprocess.h"
89#include "postprocess_internal.h"
90#include "libavutil/avstring.h"
91
092a9121
DM
92#include "libavutil/ffversion.h"
93const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
94
2ba45a60
DM
95unsigned postproc_version(void)
96{
97 av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
98 return LIBPOSTPROC_VERSION_INT;
99}
100
101const char *postproc_configuration(void)
102{
103 return FFMPEG_CONFIGURATION;
104}
105
106const char *postproc_license(void)
107{
108#define LICENSE_PREFIX "libpostproc license: "
109 return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
110}
111
112#if HAVE_ALTIVEC_H
113#include <altivec.h>
114#endif
115
/* size of the scratch buffer used while parsing a mode string */
#define GET_MODE_BUFFER_SIZE 500
/* capacity of the per-filter list of unrecognized options */
#define OPTIONS_ARRAY_SIZE    10
/* edge length, in pixels, of the blocks the filters operate on */
#define BLOCK_SIZE             8
#define TEMP_STRIDE            8
120//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
121
122#if ARCH_X86 && HAVE_INLINE_ASM
123DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
124DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
125DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
126DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
127DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
128DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
129DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
130DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
131#endif
132
133DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
134
135
136static const struct PPFilter filters[]=
137{
138 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
139 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
140/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
141 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
142 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
143 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
144 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
145 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
146 {"dr", "dering", 1, 5, 6, DERING},
147 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
148 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
149 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
150 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
151 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
152 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
153 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
154 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
155 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
156 {"be", "bitexact", 1, 0, 0, BITEXACT},
f6fa7814 157 {"vi", "visualize", 1, 0, 0, VISUALIZE},
2ba45a60
DM
158 {NULL, NULL,0,0,0,0} //End Marker
159};
160
/**
 * Preset aliases understood by the mode-string parser.
 *
 * Stored as consecutive (alias, expansion) pairs; a single NULL ends the
 * list. When a filter name equals an alias, the expansion is spliced into
 * the string that is being parsed.
 */
static const char * const replaceTable[]=
{
    /* alias      expansion */
    "default",    "hb:a,vb:a,dr:a",
    "de",         "hb:a,vb:a,dr:a",
    "fast",       "h1:a,v1:a,dr:a",
    "fa",         "h1:a,v1:a,dr:a",
    "ac",         "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
170
171
172#if ARCH_X86 && HAVE_INLINE_ASM
/** Emit an x86 "prefetchnta" instruction for the address p. */
static inline void prefetchnta(const void *p)
{
    __asm__ volatile(
        "prefetchnta (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
179
/** Emit an x86 "prefetcht0" instruction for the address p. */
static inline void prefetcht0(const void *p)
{
    __asm__ volatile(
        "prefetcht0 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
186
/** Emit an x86 "prefetcht1" instruction for the address p. */
static inline void prefetcht1(const void *p)
{
    __asm__ volatile(
        "prefetcht1 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
193
/** Emit an x86 "prefetcht2" instruction for the address p. */
static inline void prefetcht2(const void *p)
{
    __asm__ volatile(
        "prefetcht2 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
200#endif
201
202/* The horizontal functions exist only in C because the MMX
203 * code is faster with vertical filters and transposing. */
204
205/**
206 * Check if the given 8x8 Block is mostly "flat"
207 */
208static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
209{
210 int numEq= 0;
211 int y;
212 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
213 const int dcThreshold= dcOffset*2 + 1;
214
215 for(y=0; y<BLOCK_SIZE; y++){
f6fa7814
DM
216 numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
217 numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
218 numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
219 numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
220 numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
221 numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
222 numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
2ba45a60
DM
223 src+= stride;
224 }
225 return numEq > c->ppMode.flatnessThreshold;
226}
227
228/**
229 * Check if the middle 8x8 Block in the given 8x16 block is flat
230 */
231static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
232{
233 int numEq= 0;
234 int y;
235 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
236 const int dcThreshold= dcOffset*2 + 1;
237
238 src+= stride*4; // src points to begin of the 8x8 Block
239 for(y=0; y<BLOCK_SIZE-1; y++){
f6fa7814
DM
240 numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
241 numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
242 numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
243 numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
244 numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
245 numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
246 numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
247 numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
2ba45a60
DM
248 src+= stride;
249 }
250 return numEq > c->ppMode.flatnessThreshold;
251}
252
/**
 * Cheap range check over eight rows of a block.
 *
 * One pair of columns is sampled per row: (0,5), (2,7), (4,1), (6,3) on
 * rows 0..3, and the same pattern again on rows 4..7.
 *
 * @param src    first pixel of the first row
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; for QP >= 0 a pair fails when its two pixels
 *               differ by more than 2*QP
 * @return 0 as soon as one sampled pair fails, 1 if all eight pass
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    static const uint8_t columns[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    const uint8_t *row = src;
    int line;

    for (line = 0; line < 8; line++) {
        const int a = row[columns[line & 3][0]];
        const int b = row[columns[line & 3][1]];

        /* unsigned compare: rejects both a-b > 2*QP and a-b < -2*QP */
        if ((unsigned)(a - b + 2*QP) > 4*QP)
            return 0;
        row += stride;
    }
    return 1;
}
268
269static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
270{
271 int x;
272 src+= stride*4;
273 for(x=0; x<BLOCK_SIZE; x+=4){
274 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
275 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
276 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
277 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
278 }
279 return 1;
280}
281
282static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
283{
284 if( isHorizDC_C(src, stride, c) ){
f6fa7814 285 return isHorizMinMaxOk_C(src, stride, c->QP);
2ba45a60
DM
286 }else{
287 return 2;
288 }
289}
290
291static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
292{
293 if( isVertDC_C(src, stride, c) ){
f6fa7814 294 return isVertMinMaxOk_C(src, stride, c->QP);
2ba45a60
DM
295 }else{
296 return 2;
297 }
298}
299
300static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
301{
302 int y;
303 for(y=0; y<BLOCK_SIZE; y++){
304 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
305
306 if(FFABS(middleEnergy) < 8*c->QP){
307 const int q=(dst[3] - dst[4])/2;
308 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
309 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
310
311 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
312 d= FFMAX(d, 0);
313
314 d= (5*d + 32) >> 6;
315 d*= FFSIGN(-middleEnergy);
316
317 if(q>0)
318 {
319 d = FFMAX(d, 0);
320 d = FFMIN(d, q);
321 }
322 else
323 {
324 d = FFMIN(d, 0);
325 d = FFMAX(d, q);
326 }
327
328 dst[3]-= d;
329 dst[4]+= d;
330 }
331 dst+= stride;
332 }
333}
334
335/**
336 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
337 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
338 */
339static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
340{
341 int y;
342 for(y=0; y<BLOCK_SIZE; y++){
343 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
344 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
345
346 int sums[10];
347 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
348 sums[1] = sums[0] - first + dst[3];
349 sums[2] = sums[1] - first + dst[4];
350 sums[3] = sums[2] - first + dst[5];
351 sums[4] = sums[3] - first + dst[6];
352 sums[5] = sums[4] - dst[0] + dst[7];
353 sums[6] = sums[5] - dst[1] + last;
354 sums[7] = sums[6] - dst[2] + last;
355 sums[8] = sums[7] - dst[3] + last;
356 sums[9] = sums[8] - dst[4] + last;
357
358 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
359 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
360 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
361 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
362 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
363 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
364 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
365 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
366
367 dst+= stride;
368 }
369}
370
371/**
372 * Experimental Filter 1 (Horizontal)
373 * will not damage linear gradients
374 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
375 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
376 * MMX2 version does correct clipping C version does not
377 * not identical with the vertical one
378 */
379static inline void horizX1Filter(uint8_t *src, int stride, int QP)
380{
381 int y;
382 static uint64_t lut[256];
383 if(!lut[255])
384 {
385 int i;
386 for(i=0; i<256; i++)
387 {
388 int v= i < 128 ? 2*i : 2*(i-256);
389/*
390//Simulate 112242211 9-Tap filter
391 uint64_t a= (v/16) & 0xFF;
392 uint64_t b= (v/8) & 0xFF;
393 uint64_t c= (v/4) & 0xFF;
394 uint64_t d= (3*v/8) & 0xFF;
395*/
396//Simulate piecewise linear interpolation
397 uint64_t a= (v/16) & 0xFF;
398 uint64_t b= (v*3/16) & 0xFF;
399 uint64_t c= (v*5/16) & 0xFF;
400 uint64_t d= (7*v/16) & 0xFF;
401 uint64_t A= (0x100 - a)&0xFF;
402 uint64_t B= (0x100 - b)&0xFF;
403 uint64_t C= (0x100 - c)&0xFF;
404 uint64_t D= (0x100 - c)&0xFF;
405
406 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
407 (D<<24) | (C<<16) | (B<<8) | (A);
408 //lut[i] = (v<<32) | (v<<24);
409 }
410 }
411
412 for(y=0; y<BLOCK_SIZE; y++){
413 int a= src[1] - src[2];
414 int b= src[3] - src[4];
415 int c= src[5] - src[6];
416
417 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
418
419 if(d < QP){
420 int v = d * FFSIGN(-b);
421
422 src[1] +=v/8;
423 src[2] +=v/4;
424 src[3] +=3*v/8;
425 src[4] -=3*v/8;
426 src[5] -=v/4;
427 src[6] -=v/8;
428 }
429 src+=stride;
430 }
431}
432
433/**
434 * accurate deblock filter
435 */
436static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
f6fa7814 437 int stride, const PPContext *c, int mode)
2ba45a60
DM
438{
439 int y;
440 const int QP= c->QP;
441 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
442 const int dcThreshold= dcOffset*2 + 1;
443//START_TIMER
444 src+= step*4; // src points to begin of the 8x8 Block
445 for(y=0; y<8; y++){
446 int numEq= 0;
447
f6fa7814
DM
448 numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
449 numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
450 numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
451 numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
452 numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
453 numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
454 numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
455 numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
456 numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
2ba45a60
DM
457 if(numEq > c->ppMode.flatnessThreshold){
458 int min, max, x;
459
460 if(src[0] > src[step]){
461 max= src[0];
462 min= src[step];
463 }else{
464 max= src[step];
465 min= src[0];
466 }
467 for(x=2; x<8; x+=2){
468 if(src[x*step] > src[(x+1)*step]){
469 if(src[x *step] > max) max= src[ x *step];
470 if(src[(x+1)*step] < min) min= src[(x+1)*step];
471 }else{
472 if(src[(x+1)*step] > max) max= src[(x+1)*step];
473 if(src[ x *step] < min) min= src[ x *step];
474 }
475 }
476 if(max-min < 2*QP){
477 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
478 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
479
480 int sums[10];
481 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
482 sums[1] = sums[0] - first + src[3*step];
483 sums[2] = sums[1] - first + src[4*step];
484 sums[3] = sums[2] - first + src[5*step];
485 sums[4] = sums[3] - first + src[6*step];
486 sums[5] = sums[4] - src[0*step] + src[7*step];
487 sums[6] = sums[5] - src[1*step] + last;
488 sums[7] = sums[6] - src[2*step] + last;
489 sums[8] = sums[7] - src[3*step] + last;
490 sums[9] = sums[8] - src[4*step] + last;
491
f6fa7814
DM
492 if (mode & VISUALIZE) {
493 src[0*step] =
494 src[1*step] =
495 src[2*step] =
496 src[3*step] =
497 src[4*step] =
498 src[5*step] =
499 src[6*step] =
500 src[7*step] = 128;
501 }
2ba45a60
DM
502 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
503 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
504 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
505 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
506 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
507 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
508 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
509 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
510 }
511 }else{
512 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
513
514 if(FFABS(middleEnergy) < 8*QP){
515 const int q=(src[3*step] - src[4*step])/2;
516 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
517 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
518
519 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
520 d= FFMAX(d, 0);
521
522 d= (5*d + 32) >> 6;
523 d*= FFSIGN(-middleEnergy);
524
525 if(q>0){
526 d = FFMAX(d, 0);
527 d = FFMIN(d, q);
528 }else{
529 d = FFMIN(d, 0);
530 d = FFMAX(d, q);
531 }
532
f6fa7814
DM
533 if ((mode & VISUALIZE) && d) {
534 d= (d < 0) ? 32 : -32;
535 src[3*step]= av_clip_uint8(src[3*step] - d);
536 src[4*step]= av_clip_uint8(src[4*step] + d);
537 d = 0;
538 }
539
2ba45a60
DM
540 src[3*step]-= d;
541 src[4*step]+= d;
542 }
543 }
544
545 src += stride;
546 }
547/*if(step==16){
548 STOP_TIMER("step16")
549}else{
550 STOP_TIMER("stepX")
551}*/
552}
553
554//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
555//Plain C versions
556//we always compile C for testing which needs bitexactness
557#define TEMPLATE_PP_C 1
558#include "postprocess_template.c"
559
560#if HAVE_ALTIVEC
561# define TEMPLATE_PP_ALTIVEC 1
562# include "postprocess_altivec_template.c"
563# include "postprocess_template.c"
564#endif
565
566#if ARCH_X86 && HAVE_INLINE_ASM
567# if CONFIG_RUNTIME_CPUDETECT
568# define TEMPLATE_PP_MMX 1
569# include "postprocess_template.c"
570# define TEMPLATE_PP_MMXEXT 1
571# include "postprocess_template.c"
572# define TEMPLATE_PP_3DNOW 1
573# include "postprocess_template.c"
574# define TEMPLATE_PP_SSE2 1
575# include "postprocess_template.c"
576# else
577# if HAVE_SSE2_INLINE
578# define TEMPLATE_PP_SSE2 1
579# include "postprocess_template.c"
580# elif HAVE_MMXEXT_INLINE
581# define TEMPLATE_PP_MMXEXT 1
582# include "postprocess_template.c"
583# elif HAVE_AMD3DNOW_INLINE
584# define TEMPLATE_PP_3DNOW 1
585# include "postprocess_template.c"
586# elif HAVE_MMX_INLINE
587# define TEMPLATE_PP_MMX 1
588# include "postprocess_template.c"
589# endif
590# endif
591#endif
592
593typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
594 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
595
596static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
597 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
598{
599 pp_fn pp = postProcess_C;
600 PPContext *c= (PPContext *)vc;
601 PPMode *ppMode= (PPMode *)vm;
602 c->ppMode= *ppMode; //FIXME
603
604 if (!(ppMode->lumMode & BITEXACT)) {
605#if CONFIG_RUNTIME_CPUDETECT
606#if ARCH_X86 && HAVE_INLINE_ASM
607 // ordered per speed fastest first
608 if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
609 else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
610 else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
611 else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
612#elif HAVE_ALTIVEC
613 if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
614#endif
615#else /* CONFIG_RUNTIME_CPUDETECT */
616#if HAVE_SSE2_INLINE
617 pp = postProcess_SSE2;
618#elif HAVE_MMXEXT_INLINE
619 pp = postProcess_MMX2;
620#elif HAVE_AMD3DNOW_INLINE
621 pp = postProcess_3DNow;
622#elif HAVE_MMX_INLINE
623 pp = postProcess_MMX;
624#elif HAVE_ALTIVEC
625 pp = postProcess_altivec;
626#endif
627#endif /* !CONFIG_RUNTIME_CPUDETECT */
628 }
629
630 pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
631}
632
/* -pp Command line Help
 *
 * The filter names listed here must match the entries of filters[]: the
 * parser compares names with strcmp(), so spelling and case matter.
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
678
679pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
680{
681 char temp[GET_MODE_BUFFER_SIZE];
682 char *p= temp;
683 static const char filterDelimiters[] = ",/";
684 static const char optionDelimiters[] = ":|";
685 struct PPMode *ppMode;
686 char *filterToken;
687
688 if (!name) {
689 av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
690 return NULL;
691 }
692
693 if (!strcmp(name, "help")) {
694 const char *p;
695 for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
696 av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
697 av_log(NULL, AV_LOG_INFO, "%s", temp);
698 }
699 return NULL;
700 }
701
702 ppMode= av_malloc(sizeof(PPMode));
703
704 ppMode->lumMode= 0;
705 ppMode->chromMode= 0;
706 ppMode->maxTmpNoise[0]= 700;
707 ppMode->maxTmpNoise[1]= 1500;
708 ppMode->maxTmpNoise[2]= 3000;
709 ppMode->maxAllowedY= 234;
710 ppMode->minAllowedY= 16;
711 ppMode->baseDcDiff= 256/8;
712 ppMode->flatnessThreshold= 56-16-1;
713 ppMode->maxClippedThreshold= 0.01;
714 ppMode->error=0;
715
716 memset(temp, 0, GET_MODE_BUFFER_SIZE);
717 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
718
719 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
720
721 for(;;){
f6fa7814 722 const char *filterName;
2ba45a60
DM
723 int q= 1000000; //PP_QUALITY_MAX;
724 int chrom=-1;
725 int luma=-1;
f6fa7814
DM
726 const char *option;
727 const char *options[OPTIONS_ARRAY_SIZE];
2ba45a60
DM
728 int i;
729 int filterNameOk=0;
730 int numOfUnknownOptions=0;
731 int enable=1; //does the user want us to enabled or disabled the filter
f6fa7814 732 char *tokstate;
2ba45a60 733
f6fa7814 734 filterToken= av_strtok(p, filterDelimiters, &tokstate);
2ba45a60
DM
735 if(!filterToken) break;
736 p+= strlen(filterToken) + 1; // p points to next filterToken
f6fa7814 737 filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
2ba45a60
DM
738 if (!filterName) {
739 ppMode->error++;
740 break;
741 }
742 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
743
744 if(*filterName == '-'){
745 enable=0;
746 filterName++;
747 }
748
749 for(;;){ //for all options
f6fa7814 750 option= av_strtok(NULL, optionDelimiters, &tokstate);
2ba45a60
DM
751 if(!option) break;
752
753 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
754 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
755 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
756 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
757 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
758 else{
759 options[numOfUnknownOptions] = option;
760 numOfUnknownOptions++;
761 }
762 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
763 }
764 options[numOfUnknownOptions] = NULL;
765
766 /* replace stuff from the replace Table */
767 for(i=0; replaceTable[2*i]; i++){
768 if(!strcmp(replaceTable[2*i], filterName)){
769 int newlen= strlen(replaceTable[2*i + 1]);
770 int plen;
771 int spaceLeft;
772
773 p--, *p=',';
774
775 plen= strlen(p);
776 spaceLeft= p - temp + plen;
777 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
778 ppMode->error++;
779 break;
780 }
781 memmove(p + newlen, p, plen+1);
782 memcpy(p, replaceTable[2*i + 1], newlen);
783 filterNameOk=1;
784 }
785 }
786
787 for(i=0; filters[i].shortName; i++){
788 if( !strcmp(filters[i].longName, filterName)
789 || !strcmp(filters[i].shortName, filterName)){
790 ppMode->lumMode &= ~filters[i].mask;
791 ppMode->chromMode &= ~filters[i].mask;
792
793 filterNameOk=1;
794 if(!enable) break; // user wants to disable it
795
796 if(q >= filters[i].minLumQuality && luma)
797 ppMode->lumMode|= filters[i].mask;
798 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
799 if(q >= filters[i].minChromQuality)
800 ppMode->chromMode|= filters[i].mask;
801
802 if(filters[i].mask == LEVEL_FIX){
803 int o;
804 ppMode->minAllowedY= 16;
805 ppMode->maxAllowedY= 234;
806 for(o=0; options[o]; o++){
807 if( !strcmp(options[o],"fullyrange")
808 ||!strcmp(options[o],"f")){
809 ppMode->minAllowedY= 0;
810 ppMode->maxAllowedY= 255;
811 numOfUnknownOptions--;
812 }
813 }
814 }
815 else if(filters[i].mask == TEMP_NOISE_FILTER)
816 {
817 int o;
818 int numOfNoises=0;
819
820 for(o=0; options[o]; o++){
821 char *tail;
822 ppMode->maxTmpNoise[numOfNoises]=
823 strtol(options[o], &tail, 0);
824 if(tail!=options[o]){
825 numOfNoises++;
826 numOfUnknownOptions--;
827 if(numOfNoises >= 3) break;
828 }
829 }
830 }
831 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
832 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
833 int o;
834
835 for(o=0; options[o] && o<2; o++){
836 char *tail;
837 int val= strtol(options[o], &tail, 0);
838 if(tail==options[o]) break;
839
840 numOfUnknownOptions--;
841 if(o==0) ppMode->baseDcDiff= val;
842 else ppMode->flatnessThreshold= val;
843 }
844 }
845 else if(filters[i].mask == FORCE_QUANT){
846 int o;
847 ppMode->forcedQuant= 15;
848
849 for(o=0; options[o] && o<1; o++){
850 char *tail;
851 int val= strtol(options[o], &tail, 0);
852 if(tail==options[o]) break;
853
854 numOfUnknownOptions--;
855 ppMode->forcedQuant= val;
856 }
857 }
858 }
859 }
860 if(!filterNameOk) ppMode->error++;
861 ppMode->error += numOfUnknownOptions;
862 }
863
864 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
865 if(ppMode->error){
866 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
867 av_free(ppMode);
868 return NULL;
869 }
870 return ppMode;
871}
872
873void pp_free_mode(pp_mode *mode){
874 av_free(mode);
875}
876
/**
 * Replace the buffer at *p with a fresh zero-filled one of `size` bytes.
 *
 * The previous buffer is freed and its contents are NOT carried over.
 * *p receives whatever av_mallocz() returns.
 */
static void reallocAlign(void **p, int size)
{
    void *previous = *p;

    av_free(previous);
    *p = av_mallocz(size);
}
881
882static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
883 int mbWidth = (width+15)>>4;
884 int mbHeight= (height+15)>>4;
885 int i;
886
887 c->stride= stride;
888 c->qpStride= qpStride;
889
f6fa7814
DM
890 reallocAlign((void **)&c->tempDst, stride*24+32);
891 reallocAlign((void **)&c->tempSrc, stride*24);
892 reallocAlign((void **)&c->tempBlocks, 2*16*8);
893 reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
2ba45a60
DM
894 for(i=0; i<256; i++)
895 c->yHistogram[i]= width*height/64*15/256;
896
897 for(i=0; i<3; i++){
898 //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
f6fa7814
DM
899 reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
900 reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
2ba45a60
DM
901 }
902
f6fa7814
DM
903 reallocAlign((void **)&c->deintTemp, 2*width+32);
904 reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
905 reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
906 reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
2ba45a60
DM
907}
908
/** Name callback stored in av_codec_context_class: always "postproc", ptr is ignored. */
static const char * context_to_name(void * ptr)
{
    static const char label[] = "postproc";

    return label;
}
912
913static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
914
915pp_context *pp_get_context(int width, int height, int cpuCaps){
916 PPContext *c= av_malloc(sizeof(PPContext));
917 int stride= FFALIGN(width, 16); //assumed / will realloc if needed
918 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
919
920 memset(c, 0, sizeof(PPContext));
921 c->av_class = &av_codec_context_class;
922 if(cpuCaps&PP_FORMAT){
923 c->hChromaSubSample= cpuCaps&0x3;
924 c->vChromaSubSample= (cpuCaps>>4)&0x3;
925 }else{
926 c->hChromaSubSample= 1;
927 c->vChromaSubSample= 1;
928 }
929 if (cpuCaps & PP_CPU_CAPS_AUTO) {
930 c->cpuCaps = av_get_cpu_flags();
931 } else {
932 c->cpuCaps = 0;
933 if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
934 if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
935 if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
936 if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
937 }
938
939 reallocBuffers(c, width, height, stride, qpStride);
940
941 c->frameNum=-1;
942
943 return c;
944}
945
946void pp_free_context(void *vc){
947 PPContext *c = (PPContext*)vc;
948 int i;
949
f6fa7814
DM
950 for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
951 av_free(c->tempBlurred[i]);
952 for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
953 av_free(c->tempBlurredPast[i]);
2ba45a60
DM
954
955 av_free(c->tempBlocks);
956 av_free(c->yHistogram);
957 av_free(c->tempDst);
958 av_free(c->tempSrc);
959 av_free(c->deintTemp);
960 av_free(c->stdQPTable);
961 av_free(c->nonBQPTable);
962 av_free(c->forcedQPTable);
963
964 memset(c, 0, sizeof(PPContext));
965
966 av_free(c);
967}
968
969void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
970 uint8_t * dst[3], const int dstStride[3],
971 int width, int height,
972 const QP_STORE_T *QP_store, int QPStride,
973 pp_mode *vm, void *vc, int pict_type)
974{
975 int mbWidth = (width+15)>>4;
976 int mbHeight= (height+15)>>4;
f6fa7814
DM
977 PPMode *mode = vm;
978 PPContext *c = vc;
2ba45a60
DM
979 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
980 int absQPStride = FFABS(QPStride);
981
982 // c->stride and c->QPStride are always positive
983 if(c->stride < minStride || c->qpStride < absQPStride)
984 reallocBuffers(c, width, height,
985 FFMAX(minStride, c->stride),
986 FFMAX(c->qpStride, absQPStride));
987
988 if(!QP_store || (mode->lumMode & FORCE_QUANT)){
989 int i;
990 QP_store= c->forcedQPTable;
991 absQPStride = QPStride = 0;
992 if(mode->lumMode & FORCE_QUANT)
993 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
994 else
995 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
996 }
997
998 if(pict_type & PP_PICT_TYPE_QP2){
999 int i;
1000 const int count= FFMAX(mbHeight * absQPStride, mbWidth);
1001 for(i=0; i<(count>>2); i++){
1002 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1003 }
1004 for(i<<=2; i<count; i++){
1005 c->stdQPTable[i] = QP_store[i]>>1;
1006 }
1007 QP_store= c->stdQPTable;
1008 QPStride= absQPStride;
1009 }
1010
1011 if(0){
1012 int x,y;
1013 for(y=0; y<mbHeight; y++){
1014 for(x=0; x<mbWidth; x++){
1015 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1016 }
1017 av_log(c, AV_LOG_INFO, "\n");
1018 }
1019 av_log(c, AV_LOG_INFO, "\n");
1020 }
1021
1022 if((pict_type&7)!=3){
1023 if (QPStride >= 0){
1024 int i;
1025 const int count= FFMAX(mbHeight * QPStride, mbWidth);
1026 for(i=0; i<(count>>2); i++){
1027 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1028 }
1029 for(i<<=2; i<count; i++){
1030 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1031 }
1032 } else {
1033 int i,j;
1034 for(i=0; i<mbHeight; i++) {
1035 for(j=0; j<absQPStride; j++) {
1036 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1037 }
1038 }
1039 }
1040 }
1041
1042 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1043 mode->lumMode, mode->chromMode);
1044
1045 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1046 width, height, QP_store, QPStride, 0, mode, c);
1047
f6fa7814
DM
1048 if (!(src[1] && src[2] && dst[1] && dst[2]))
1049 return;
1050
2ba45a60
DM
1051 width = (width )>>c->hChromaSubSample;
1052 height = (height)>>c->vChromaSubSample;
1053
1054 if(mode->chromMode){
1055 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1056 width, height, QP_store, QPStride, 1, mode, c);
1057 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1058 width, height, QP_store, QPStride, 2, mode, c);
1059 }
1060 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1061 linecpy(dst[1], src[1], height, srcStride[1]);
1062 linecpy(dst[2], src[2], height, srcStride[2]);
1063 }else{
1064 int y;
1065 for(y=0; y<height; y++){
1066 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1067 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1068 }
1069 }
1070}