Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libpostproc / postprocess.c
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file
25 * postprocessing.
26 */
27
28/*
29 C MMX MMX2 3DNow AltiVec
30isVertDC Ec Ec Ec
31isVertMinMaxOk Ec Ec Ec
32doVertLowPass E e e Ec
33doVertDefFilter Ec Ec e e Ec
34isHorizDC Ec Ec Ec
35isHorizMinMaxOk a E Ec
36doHorizLowPass E e e Ec
37doHorizDefFilter Ec Ec e e Ec
38do_a_deblock Ec E Ec E
39deRing E e e* Ecp
40Vertical RKAlgo1 E a a
41Horizontal RKAlgo1 a a
42Vertical X1# a E E
43Horizontal X1# a E E
44LinIpolDeinterlace e E E*
45CubicIpolDeinterlace a e e*
46LinBlendDeinterlace e E E*
47MedianDeinterlace# E Ec Ec
48TempDeNoiser# E e e Ec
49
50* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
# more or less self-invented filters, so the exactness is not too meaningful
52E = Exact implementation
53e = almost exact implementation (slightly different rounding,...)
54a = alternative / approximate impl
55c = checked against the other implementations (-vo md5)
56p = partially optimized, still some work to do
57*/
58
59/*
60TODO:
61reduce the time wasted on the mem transfer
62unroll stuff if instructions depend too much on the prior one
63move YScale thing to the end instead of fixing QP
64write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once;
 the if/else stuff per block is slowing things down)
67compare the quality & speed of all filters
68split this huge file
69optimize c versions
70try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71...
72*/
73
74//Changelog: use git log
75
76#include "config.h"
77#include "libavutil/avutil.h"
78#include "libavutil/avassert.h"
79#include <inttypes.h>
80#include <stdio.h>
81#include <stdlib.h>
82#include <string.h>
83//#undef HAVE_MMXEXT_INLINE
84//#define HAVE_AMD3DNOW_INLINE
85//#undef HAVE_MMX_INLINE
86//#undef ARCH_X86
87//#define DEBUG_BRIGHTNESS
88#include "postprocess.h"
89#include "postprocess_internal.h"
90#include "libavutil/avstring.h"
91
92unsigned postproc_version(void)
93{
94 av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
95 return LIBPOSTPROC_VERSION_INT;
96}
97
98const char *postproc_configuration(void)
99{
100 return FFMPEG_CONFIGURATION;
101}
102
103const char *postproc_license(void)
104{
105#define LICENSE_PREFIX "libpostproc license: "
106 return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
107}
108
109#if HAVE_ALTIVEC_H
110#include <altivec.h>
111#endif
112
/* Size in bytes of the scratch buffer used while parsing a mode string. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity (including the NULL terminator) of the per-filter list of
 * options that the mode parser did not recognize as generic options. */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows/columns walked by the 8x8 block helpers in this file. */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in this part of the file; presumably used by
 * the included templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

#if ARCH_X86 && HAVE_INLINE_ASM
/* 64-bit constants whose value repeats one 16-bit word (wXX) or one byte
 * (bXX). NOTE(review): presumably consumed by the inline-asm templates
 * included further down -- confirm. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* NOTE(review): not referenced in this part of the file; presumably the
 * threshold used by the deringing filter in the templates -- confirm. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
131
132
/*
 * Table of all filters the mode parser knows, searched by short or long name
 * and terminated by an entry whose shortName is NULL.
 * NOTE(review): the positional columns are presumably shortName, longName,
 * chromDefault, minLumQuality, minChromQuality, mask -- confirm against the
 * declaration of struct PPFilter.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",       1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",       1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",     1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",     1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",     1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",     1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",      1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",      1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",         1, 5, 6, DERING},
    {"al", "autolevels",     0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",  1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",   1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",    1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",    1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",       1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",       1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",     1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",       1, 0, 0, BITEXACT},
    {NULL, NULL,0,0,0,0} //End Marker
};
156
/*
 * Alias table used by the mode parser: entries come in pairs, an alias name
 * at the even index followed by the filter list that replaces it at the odd
 * index. The list ends at the first NULL alias name.
 */
static const char * const replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
166
167
168#if ARCH_X86 && HAVE_INLINE_ASM
/** Emit an x86 "prefetchnta" instruction for the address p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile("prefetchnta (%0)\n\t" : : "r" (p));
}
175
/** Emit an x86 "prefetcht0" instruction for the address p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}
182
/** Emit an x86 "prefetcht1" instruction for the address p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}
189
/** Emit an x86 "prefetcht2" instruction for the address p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
196#endif
197
198/* The horizontal functions exist only in C because the MMX
199 * code is faster with vertical filters and transposing. */
200
201/**
202 * Check if the given 8x8 Block is mostly "flat"
203 */
204static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
205{
206 int numEq= 0;
207 int y;
208 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
209 const int dcThreshold= dcOffset*2 + 1;
210
211 for(y=0; y<BLOCK_SIZE; y++){
212 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
213 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
214 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
215 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
216 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
217 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
218 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
219 src+= stride;
220 }
221 return numEq > c->ppMode.flatnessThreshold;
222}
223
224/**
225 * Check if the middle 8x8 Block in the given 8x16 block is flat
226 */
227static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
228{
229 int numEq= 0;
230 int y;
231 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
232 const int dcThreshold= dcOffset*2 + 1;
233
234 src+= stride*4; // src points to begin of the 8x8 Block
235 for(y=0; y<BLOCK_SIZE-1; y++){
236 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
237 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
238 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
239 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
240 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
241 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
242 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
243 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
244 src+= stride;
245 }
246 return numEq > c->ppMode.flatnessThreshold;
247}
248
/**
 * Sparse range check over 8 rows: on each row one pair of samples, five
 * columns apart, is compared and the check fails as soon as a difference
 * lies outside [-2*QP, 2*QP].
 *
 * The sampled column pairs cycle through (0,5), (2,7), (4,1), (6,3).
 *
 * @return 1 if all 8 sampled differences are within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    static const uint8_t colPairs[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    const uint8_t *row = src;
    int line;

    for (line = 0; line < 8; line++, row += stride) {
        const uint8_t *pair = colPairs[line & 3];

        if ((unsigned)(row[pair[0]] - row[pair[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
264
265static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
266{
267 int x;
268 src+= stride*4;
269 for(x=0; x<BLOCK_SIZE; x+=4){
270 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
271 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
272 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
273 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
274 }
275 return 1;
276}
277
278static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
279{
280 if( isHorizDC_C(src, stride, c) ){
281 if( isHorizMinMaxOk_C(src, stride, c->QP) )
282 return 1;
283 else
284 return 0;
285 }else{
286 return 2;
287 }
288}
289
290static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
291{
292 if( isVertDC_C(src, stride, c) ){
293 if( isVertMinMaxOk_C(src, stride, c->QP) )
294 return 1;
295 else
296 return 0;
297 }else{
298 return 2;
299 }
300}
301
302static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
303{
304 int y;
305 for(y=0; y<BLOCK_SIZE; y++){
306 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
307
308 if(FFABS(middleEnergy) < 8*c->QP){
309 const int q=(dst[3] - dst[4])/2;
310 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
311 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
312
313 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
314 d= FFMAX(d, 0);
315
316 d= (5*d + 32) >> 6;
317 d*= FFSIGN(-middleEnergy);
318
319 if(q>0)
320 {
321 d = FFMAX(d, 0);
322 d = FFMIN(d, q);
323 }
324 else
325 {
326 d = FFMIN(d, 0);
327 d = FFMAX(d, q);
328 }
329
330 dst[3]-= d;
331 dst[4]+= d;
332 }
333 dst+= stride;
334 }
335}
336
337/**
338 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
339 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
340 */
341static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
342{
343 int y;
344 for(y=0; y<BLOCK_SIZE; y++){
345 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
346 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
347
348 int sums[10];
349 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
350 sums[1] = sums[0] - first + dst[3];
351 sums[2] = sums[1] - first + dst[4];
352 sums[3] = sums[2] - first + dst[5];
353 sums[4] = sums[3] - first + dst[6];
354 sums[5] = sums[4] - dst[0] + dst[7];
355 sums[6] = sums[5] - dst[1] + last;
356 sums[7] = sums[6] - dst[2] + last;
357 sums[8] = sums[7] - dst[3] + last;
358 sums[9] = sums[8] - dst[4] + last;
359
360 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
361 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
362 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
363 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
364 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
365 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
366 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
367 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
368
369 dst+= stride;
370 }
371}
372
373/**
374 * Experimental Filter 1 (Horizontal)
375 * will not damage linear gradients
376 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
377 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
378 * MMX2 version does correct clipping C version does not
379 * not identical with the vertical one
380 */
381static inline void horizX1Filter(uint8_t *src, int stride, int QP)
382{
383 int y;
384 static uint64_t lut[256];
385 if(!lut[255])
386 {
387 int i;
388 for(i=0; i<256; i++)
389 {
390 int v= i < 128 ? 2*i : 2*(i-256);
391/*
392//Simulate 112242211 9-Tap filter
393 uint64_t a= (v/16) & 0xFF;
394 uint64_t b= (v/8) & 0xFF;
395 uint64_t c= (v/4) & 0xFF;
396 uint64_t d= (3*v/8) & 0xFF;
397*/
398//Simulate piecewise linear interpolation
399 uint64_t a= (v/16) & 0xFF;
400 uint64_t b= (v*3/16) & 0xFF;
401 uint64_t c= (v*5/16) & 0xFF;
402 uint64_t d= (7*v/16) & 0xFF;
403 uint64_t A= (0x100 - a)&0xFF;
404 uint64_t B= (0x100 - b)&0xFF;
405 uint64_t C= (0x100 - c)&0xFF;
406 uint64_t D= (0x100 - c)&0xFF;
407
408 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
409 (D<<24) | (C<<16) | (B<<8) | (A);
410 //lut[i] = (v<<32) | (v<<24);
411 }
412 }
413
414 for(y=0; y<BLOCK_SIZE; y++){
415 int a= src[1] - src[2];
416 int b= src[3] - src[4];
417 int c= src[5] - src[6];
418
419 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
420
421 if(d < QP){
422 int v = d * FFSIGN(-b);
423
424 src[1] +=v/8;
425 src[2] +=v/4;
426 src[3] +=3*v/8;
427 src[4] -=3*v/8;
428 src[5] -=v/4;
429 src[6] -=v/8;
430 }
431 src+=stride;
432 }
433}
434
435/**
436 * accurate deblock filter
437 */
438static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
439 int stride, const PPContext *c)
440{
441 int y;
442 const int QP= c->QP;
443 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
444 const int dcThreshold= dcOffset*2 + 1;
445//START_TIMER
446 src+= step*4; // src points to begin of the 8x8 Block
447 for(y=0; y<8; y++){
448 int numEq= 0;
449
450 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
451 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
452 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
453 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
454 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
455 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
456 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
457 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
458 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
459 if(numEq > c->ppMode.flatnessThreshold){
460 int min, max, x;
461
462 if(src[0] > src[step]){
463 max= src[0];
464 min= src[step];
465 }else{
466 max= src[step];
467 min= src[0];
468 }
469 for(x=2; x<8; x+=2){
470 if(src[x*step] > src[(x+1)*step]){
471 if(src[x *step] > max) max= src[ x *step];
472 if(src[(x+1)*step] < min) min= src[(x+1)*step];
473 }else{
474 if(src[(x+1)*step] > max) max= src[(x+1)*step];
475 if(src[ x *step] < min) min= src[ x *step];
476 }
477 }
478 if(max-min < 2*QP){
479 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
480 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
481
482 int sums[10];
483 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
484 sums[1] = sums[0] - first + src[3*step];
485 sums[2] = sums[1] - first + src[4*step];
486 sums[3] = sums[2] - first + src[5*step];
487 sums[4] = sums[3] - first + src[6*step];
488 sums[5] = sums[4] - src[0*step] + src[7*step];
489 sums[6] = sums[5] - src[1*step] + last;
490 sums[7] = sums[6] - src[2*step] + last;
491 sums[8] = sums[7] - src[3*step] + last;
492 sums[9] = sums[8] - src[4*step] + last;
493
494 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
495 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
496 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
497 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
498 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
499 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
500 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
501 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
502 }
503 }else{
504 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
505
506 if(FFABS(middleEnergy) < 8*QP){
507 const int q=(src[3*step] - src[4*step])/2;
508 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
509 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
510
511 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
512 d= FFMAX(d, 0);
513
514 d= (5*d + 32) >> 6;
515 d*= FFSIGN(-middleEnergy);
516
517 if(q>0){
518 d = FFMAX(d, 0);
519 d = FFMIN(d, q);
520 }else{
521 d = FFMIN(d, 0);
522 d = FFMAX(d, q);
523 }
524
525 src[3*step]-= d;
526 src[4*step]+= d;
527 }
528 }
529
530 src += stride;
531 }
532/*if(step==16){
533 STOP_TIMER("step16")
534}else{
535 STOP_TIMER("stepX")
536}*/
537}
538
539//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
540//Plain C versions
541//we always compile C for testing which needs bitexactness
542#define TEMPLATE_PP_C 1
543#include "postprocess_template.c"
544
545#if HAVE_ALTIVEC
546# define TEMPLATE_PP_ALTIVEC 1
547# include "postprocess_altivec_template.c"
548# include "postprocess_template.c"
549#endif
550
551#if ARCH_X86 && HAVE_INLINE_ASM
552# if CONFIG_RUNTIME_CPUDETECT
553# define TEMPLATE_PP_MMX 1
554# include "postprocess_template.c"
555# define TEMPLATE_PP_MMXEXT 1
556# include "postprocess_template.c"
557# define TEMPLATE_PP_3DNOW 1
558# include "postprocess_template.c"
559# define TEMPLATE_PP_SSE2 1
560# include "postprocess_template.c"
561# else
562# if HAVE_SSE2_INLINE
563# define TEMPLATE_PP_SSE2 1
564# include "postprocess_template.c"
565# elif HAVE_MMXEXT_INLINE
566# define TEMPLATE_PP_MMXEXT 1
567# include "postprocess_template.c"
568# elif HAVE_AMD3DNOW_INLINE
569# define TEMPLATE_PP_3DNOW 1
570# include "postprocess_template.c"
571# elif HAVE_MMX_INLINE
572# define TEMPLATE_PP_MMX 1
573# include "postprocess_template.c"
574# endif
575# endif
576#endif
577
/* Signature shared by the per-CPU postProcess_* implementations that
 * postProcess() selects between and then calls for one plane. */
typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                      const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
580
581static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
582 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
583{
584 pp_fn pp = postProcess_C;
585 PPContext *c= (PPContext *)vc;
586 PPMode *ppMode= (PPMode *)vm;
587 c->ppMode= *ppMode; //FIXME
588
589 if (!(ppMode->lumMode & BITEXACT)) {
590#if CONFIG_RUNTIME_CPUDETECT
591#if ARCH_X86 && HAVE_INLINE_ASM
592 // ordered per speed fastest first
593 if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
594 else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
595 else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
596 else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
597#elif HAVE_ALTIVEC
598 if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
599#endif
600#else /* CONFIG_RUNTIME_CPUDETECT */
601#if HAVE_SSE2_INLINE
602 pp = postProcess_SSE2;
603#elif HAVE_MMXEXT_INLINE
604 pp = postProcess_MMX2;
605#elif HAVE_AMD3DNOW_INLINE
606 pp = postProcess_3DNow;
607#elif HAVE_MMX_INLINE
608 pp = postProcess_MMX;
609#elif HAVE_ALTIVEC
610 pp = postProcess_altivec;
611#endif
612#endif /* !CONFIG_RUNTIME_CPUDETECT */
613 }
614
615 pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
616}
617
/* -pp Command line Help
 *
 * The long names listed here must match the names the mode parser accepts
 * (the entries of filters[] and replaceTable[]), because the parser compares
 * them case-sensitively.
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
663
664pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
665{
666 char temp[GET_MODE_BUFFER_SIZE];
667 char *p= temp;
668 static const char filterDelimiters[] = ",/";
669 static const char optionDelimiters[] = ":|";
670 struct PPMode *ppMode;
671 char *filterToken;
672
673 if (!name) {
674 av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
675 return NULL;
676 }
677
678 if (!strcmp(name, "help")) {
679 const char *p;
680 for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
681 av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
682 av_log(NULL, AV_LOG_INFO, "%s", temp);
683 }
684 return NULL;
685 }
686
687 ppMode= av_malloc(sizeof(PPMode));
688
689 ppMode->lumMode= 0;
690 ppMode->chromMode= 0;
691 ppMode->maxTmpNoise[0]= 700;
692 ppMode->maxTmpNoise[1]= 1500;
693 ppMode->maxTmpNoise[2]= 3000;
694 ppMode->maxAllowedY= 234;
695 ppMode->minAllowedY= 16;
696 ppMode->baseDcDiff= 256/8;
697 ppMode->flatnessThreshold= 56-16-1;
698 ppMode->maxClippedThreshold= 0.01;
699 ppMode->error=0;
700
701 memset(temp, 0, GET_MODE_BUFFER_SIZE);
702 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
703
704 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
705
706 for(;;){
707 char *filterName;
708 int q= 1000000; //PP_QUALITY_MAX;
709 int chrom=-1;
710 int luma=-1;
711 char *option;
712 char *options[OPTIONS_ARRAY_SIZE];
713 int i;
714 int filterNameOk=0;
715 int numOfUnknownOptions=0;
716 int enable=1; //does the user want us to enabled or disabled the filter
717
718 filterToken= strtok(p, filterDelimiters);
719 if(!filterToken) break;
720 p+= strlen(filterToken) + 1; // p points to next filterToken
721 filterName= strtok(filterToken, optionDelimiters);
722 if (!filterName) {
723 ppMode->error++;
724 break;
725 }
726 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
727
728 if(*filterName == '-'){
729 enable=0;
730 filterName++;
731 }
732
733 for(;;){ //for all options
734 option= strtok(NULL, optionDelimiters);
735 if(!option) break;
736
737 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
738 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
739 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
740 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
741 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
742 else{
743 options[numOfUnknownOptions] = option;
744 numOfUnknownOptions++;
745 }
746 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
747 }
748 options[numOfUnknownOptions] = NULL;
749
750 /* replace stuff from the replace Table */
751 for(i=0; replaceTable[2*i]; i++){
752 if(!strcmp(replaceTable[2*i], filterName)){
753 int newlen= strlen(replaceTable[2*i + 1]);
754 int plen;
755 int spaceLeft;
756
757 p--, *p=',';
758
759 plen= strlen(p);
760 spaceLeft= p - temp + plen;
761 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
762 ppMode->error++;
763 break;
764 }
765 memmove(p + newlen, p, plen+1);
766 memcpy(p, replaceTable[2*i + 1], newlen);
767 filterNameOk=1;
768 }
769 }
770
771 for(i=0; filters[i].shortName; i++){
772 if( !strcmp(filters[i].longName, filterName)
773 || !strcmp(filters[i].shortName, filterName)){
774 ppMode->lumMode &= ~filters[i].mask;
775 ppMode->chromMode &= ~filters[i].mask;
776
777 filterNameOk=1;
778 if(!enable) break; // user wants to disable it
779
780 if(q >= filters[i].minLumQuality && luma)
781 ppMode->lumMode|= filters[i].mask;
782 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
783 if(q >= filters[i].minChromQuality)
784 ppMode->chromMode|= filters[i].mask;
785
786 if(filters[i].mask == LEVEL_FIX){
787 int o;
788 ppMode->minAllowedY= 16;
789 ppMode->maxAllowedY= 234;
790 for(o=0; options[o]; o++){
791 if( !strcmp(options[o],"fullyrange")
792 ||!strcmp(options[o],"f")){
793 ppMode->minAllowedY= 0;
794 ppMode->maxAllowedY= 255;
795 numOfUnknownOptions--;
796 }
797 }
798 }
799 else if(filters[i].mask == TEMP_NOISE_FILTER)
800 {
801 int o;
802 int numOfNoises=0;
803
804 for(o=0; options[o]; o++){
805 char *tail;
806 ppMode->maxTmpNoise[numOfNoises]=
807 strtol(options[o], &tail, 0);
808 if(tail!=options[o]){
809 numOfNoises++;
810 numOfUnknownOptions--;
811 if(numOfNoises >= 3) break;
812 }
813 }
814 }
815 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
816 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
817 int o;
818
819 for(o=0; options[o] && o<2; o++){
820 char *tail;
821 int val= strtol(options[o], &tail, 0);
822 if(tail==options[o]) break;
823
824 numOfUnknownOptions--;
825 if(o==0) ppMode->baseDcDiff= val;
826 else ppMode->flatnessThreshold= val;
827 }
828 }
829 else if(filters[i].mask == FORCE_QUANT){
830 int o;
831 ppMode->forcedQuant= 15;
832
833 for(o=0; options[o] && o<1; o++){
834 char *tail;
835 int val= strtol(options[o], &tail, 0);
836 if(tail==options[o]) break;
837
838 numOfUnknownOptions--;
839 ppMode->forcedQuant= val;
840 }
841 }
842 }
843 }
844 if(!filterNameOk) ppMode->error++;
845 ppMode->error += numOfUnknownOptions;
846 }
847
848 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
849 if(ppMode->error){
850 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
851 av_free(ppMode);
852 return NULL;
853 }
854 return ppMode;
855}
856
857void pp_free_mode(pp_mode *mode){
858 av_free(mode);
859}
860
/**
 * Replace the buffer *p with a new zero-filled buffer of `size` bytes.
 *
 * The old buffer is freed first and its contents are not preserved. The
 * `alignment` argument is not used by this function. *p receives whatever
 * av_mallocz() returns.
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
865
866static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
867 int mbWidth = (width+15)>>4;
868 int mbHeight= (height+15)>>4;
869 int i;
870
871 c->stride= stride;
872 c->qpStride= qpStride;
873
874 reallocAlign((void **)&c->tempDst, 8, stride*24+32);
875 reallocAlign((void **)&c->tempSrc, 8, stride*24);
876 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
877 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
878 for(i=0; i<256; i++)
879 c->yHistogram[i]= width*height/64*15/256;
880
881 for(i=0; i<3; i++){
882 //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
883 reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
884 reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
885 }
886
887 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
888 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
889 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
890 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
891}
892
/**
 * Name callback stored in the context's AVClass; the result does not depend
 * on ptr.
 *
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    return "postproc";
}
896
/* AVClass installed as av_class of every context created by pp_get_context(),
 * which is what lets the context be passed to av_log(). Initialized with the
 * name "Postproc", the context_to_name callback and a NULL third member. */
static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
898
899pp_context *pp_get_context(int width, int height, int cpuCaps){
900 PPContext *c= av_malloc(sizeof(PPContext));
901 int stride= FFALIGN(width, 16); //assumed / will realloc if needed
902 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
903
904 memset(c, 0, sizeof(PPContext));
905 c->av_class = &av_codec_context_class;
906 if(cpuCaps&PP_FORMAT){
907 c->hChromaSubSample= cpuCaps&0x3;
908 c->vChromaSubSample= (cpuCaps>>4)&0x3;
909 }else{
910 c->hChromaSubSample= 1;
911 c->vChromaSubSample= 1;
912 }
913 if (cpuCaps & PP_CPU_CAPS_AUTO) {
914 c->cpuCaps = av_get_cpu_flags();
915 } else {
916 c->cpuCaps = 0;
917 if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
918 if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
919 if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
920 if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
921 }
922
923 reallocBuffers(c, width, height, stride, qpStride);
924
925 c->frameNum=-1;
926
927 return c;
928}
929
930void pp_free_context(void *vc){
931 PPContext *c = (PPContext*)vc;
932 int i;
933
934 for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
935 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
936
937 av_free(c->tempBlocks);
938 av_free(c->yHistogram);
939 av_free(c->tempDst);
940 av_free(c->tempSrc);
941 av_free(c->deintTemp);
942 av_free(c->stdQPTable);
943 av_free(c->nonBQPTable);
944 av_free(c->forcedQPTable);
945
946 memset(c, 0, sizeof(PPContext));
947
948 av_free(c);
949}
950
951void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
952 uint8_t * dst[3], const int dstStride[3],
953 int width, int height,
954 const QP_STORE_T *QP_store, int QPStride,
955 pp_mode *vm, void *vc, int pict_type)
956{
957 int mbWidth = (width+15)>>4;
958 int mbHeight= (height+15)>>4;
959 PPMode *mode = (PPMode*)vm;
960 PPContext *c = (PPContext*)vc;
961 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
962 int absQPStride = FFABS(QPStride);
963
964 // c->stride and c->QPStride are always positive
965 if(c->stride < minStride || c->qpStride < absQPStride)
966 reallocBuffers(c, width, height,
967 FFMAX(minStride, c->stride),
968 FFMAX(c->qpStride, absQPStride));
969
970 if(!QP_store || (mode->lumMode & FORCE_QUANT)){
971 int i;
972 QP_store= c->forcedQPTable;
973 absQPStride = QPStride = 0;
974 if(mode->lumMode & FORCE_QUANT)
975 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
976 else
977 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
978 }
979
980 if(pict_type & PP_PICT_TYPE_QP2){
981 int i;
982 const int count= FFMAX(mbHeight * absQPStride, mbWidth);
983 for(i=0; i<(count>>2); i++){
984 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
985 }
986 for(i<<=2; i<count; i++){
987 c->stdQPTable[i] = QP_store[i]>>1;
988 }
989 QP_store= c->stdQPTable;
990 QPStride= absQPStride;
991 }
992
993 if(0){
994 int x,y;
995 for(y=0; y<mbHeight; y++){
996 for(x=0; x<mbWidth; x++){
997 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
998 }
999 av_log(c, AV_LOG_INFO, "\n");
1000 }
1001 av_log(c, AV_LOG_INFO, "\n");
1002 }
1003
1004 if((pict_type&7)!=3){
1005 if (QPStride >= 0){
1006 int i;
1007 const int count= FFMAX(mbHeight * QPStride, mbWidth);
1008 for(i=0; i<(count>>2); i++){
1009 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1010 }
1011 for(i<<=2; i<count; i++){
1012 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1013 }
1014 } else {
1015 int i,j;
1016 for(i=0; i<mbHeight; i++) {
1017 for(j=0; j<absQPStride; j++) {
1018 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1019 }
1020 }
1021 }
1022 }
1023
1024 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1025 mode->lumMode, mode->chromMode);
1026
1027 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1028 width, height, QP_store, QPStride, 0, mode, c);
1029
1030 width = (width )>>c->hChromaSubSample;
1031 height = (height)>>c->vChromaSubSample;
1032
1033 if(mode->chromMode){
1034 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1035 width, height, QP_store, QPStride, 1, mode, c);
1036 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1037 width, height, QP_store, QPStride, 2, mode, c);
1038 }
1039 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1040 linecpy(dst[1], src[1], height, srcStride[1]);
1041 linecpy(dst[2], src[2], height, srcStride[2]);
1042 }else{
1043 int y;
1044 for(y=0; y<height; y++){
1045 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1046 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1047 }
1048 }
1049}