[deb_ffmpeg.git] / ffmpeg / libavcodec / ppc / fft_vsx.c

/*
 * FFT  transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#include "fft_vsx.h"

#if HAVE_VSX

/*
 * fft32 step of the interleaved VSX path, assembled from smaller kernels:
 * fft16_vsx_interleave() at z, fft8_vsx_interleave() at z + 16 and z + 24,
 * then pass_vsx_interleave() over z with the ff_cos_32 table and n = 4.
 * The pass must run last, after all three sub-kernels.
 */
static void fft32_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 16;
    FFTComplex *const sub_b = z + 24;

    fft16_vsx_interleave(z);
    fft8_vsx_interleave(sub_a);
    fft8_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_32, 4);
}

/*
 * fft64 step of the interleaved VSX path: fft32_vsx_interleave() at z,
 * fft16_vsx_interleave() at z + 32 and z + 48, then
 * pass_vsx_interleave() over z with the ff_cos_64 table and n = 8.
 */
static void fft64_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 32;
    FFTComplex *const sub_b = z + 48;

    fft32_vsx_interleave(z);
    fft16_vsx_interleave(sub_a);
    fft16_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_64, 8);
}
/*
 * fft128 step of the interleaved VSX path: fft64_vsx_interleave() at z,
 * fft32_vsx_interleave() at z + 64 and z + 96, then
 * pass_vsx_interleave() over z with the ff_cos_128 table and n = 16.
 */
static void fft128_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 64;
    FFTComplex *const sub_b = z + 96;

    fft64_vsx_interleave(z);
    fft32_vsx_interleave(sub_a);
    fft32_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_128, 16);
}
/*
 * fft256 step of the interleaved VSX path: fft128_vsx_interleave() at z,
 * fft64_vsx_interleave() at z + 128 and z + 192, then
 * pass_vsx_interleave() over z with the ff_cos_256 table and n = 32.
 */
static void fft256_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 128;
    FFTComplex *const sub_b = z + 192;

    fft128_vsx_interleave(z);
    fft64_vsx_interleave(sub_a);
    fft64_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_256, 32);
}
/*
 * fft512 step of the interleaved VSX path: fft256_vsx_interleave() at z,
 * fft128_vsx_interleave() at z + 256 and z + 384, then
 * pass_vsx_interleave() over z with the ff_cos_512 table and n = 64.
 */
static void fft512_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 256;
    FFTComplex *const sub_b = z + 384;

    fft256_vsx_interleave(z);
    fft128_vsx_interleave(sub_a);
    fft128_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_512, 64);
}
/*
 * fft1024 step of the interleaved VSX path: fft512_vsx_interleave() at z,
 * fft256_vsx_interleave() at z + 512 and z + 768, then
 * pass_vsx_interleave() over z with the ff_cos_1024 table and n = 128.
 */
static void fft1024_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 512;
    FFTComplex *const sub_b = z + 768;

    fft512_vsx_interleave(z);
    fft256_vsx_interleave(sub_a);
    fft256_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_1024, 128);
}
/*
 * fft2048 step of the interleaved VSX path: fft1024_vsx_interleave() at z,
 * fft512_vsx_interleave() at z + 1024 and z + 1536, then
 * pass_vsx_interleave() over z with the ff_cos_2048 table and n = 256.
 */
static void fft2048_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 1024;
    FFTComplex *const sub_b = z + 1536;

    fft1024_vsx_interleave(z);
    fft512_vsx_interleave(sub_a);
    fft512_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_2048, 256);
}
/*
 * fft4096 step of the interleaved VSX path: fft2048_vsx_interleave() at z,
 * fft1024_vsx_interleave() at z + 2048 and z + 3072, then
 * pass_vsx_interleave() over z with the ff_cos_4096 table and n = 512.
 */
static void fft4096_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 2048;
    FFTComplex *const sub_b = z + 3072;

    fft2048_vsx_interleave(z);
    fft1024_vsx_interleave(sub_a);
    fft1024_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_4096, 512);
}
/*
 * fft8192 step of the interleaved VSX path: fft4096_vsx_interleave() at z,
 * fft2048_vsx_interleave() at z + 4096 and z + 6144, then
 * pass_vsx_interleave() over z with the ff_cos_8192 table and n = 1024.
 */
static void fft8192_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 4096;
    FFTComplex *const sub_b = z + 6144;

    fft4096_vsx_interleave(z);
    fft2048_vsx_interleave(sub_a);
    fft2048_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_8192, 1024);
}
/*
 * fft16384 step of the interleaved VSX path: fft8192_vsx_interleave() at z,
 * fft4096_vsx_interleave() at z + 8192 and z + 12288, then
 * pass_vsx_interleave() over z with the ff_cos_16384 table and n = 2048.
 */
static void fft16384_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 8192;
    FFTComplex *const sub_b = z + 12288;

    fft8192_vsx_interleave(z);
    fft4096_vsx_interleave(sub_a);
    fft4096_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_16384, 2048);
}
/*
 * fft32768 step of the interleaved VSX path: fft16384_vsx_interleave() at z,
 * fft8192_vsx_interleave() at z + 16384 and z + 24576, then
 * pass_vsx_interleave() over z with the ff_cos_32768 table and n = 4096.
 */
static void fft32768_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 16384;
    FFTComplex *const sub_b = z + 24576;

    fft16384_vsx_interleave(z);
    fft8192_vsx_interleave(sub_a);
    fft8192_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_32768, 4096);
}
/*
 * fft65536 step of the interleaved VSX path: fft32768_vsx_interleave() at z,
 * fft16384_vsx_interleave() at z + 32768 and z + 49152, then
 * pass_vsx_interleave() over z with the ff_cos_65536 table and n = 8192.
 */
static void fft65536_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 32768;
    FFTComplex *const sub_b = z + 49152;

    fft32768_vsx_interleave(z);
    fft16384_vsx_interleave(sub_a);
    fft16384_vsx_interleave(sub_b);
    pass_vsx_interleave(z, ff_cos_65536, 8192);
}

/*
 * fft32 step of the non-interleaved VSX path, assembled from smaller
 * kernels: fft16_vsx() at z, fft8_vsx() at z + 16 and z + 24, then
 * pass_vsx() over z with the ff_cos_32 table and n = 4.
 * The pass must run last, after all three sub-kernels.
 */
static void fft32_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 16;
    FFTComplex *const sub_b = z + 24;

    fft16_vsx(z);
    fft8_vsx(sub_a);
    fft8_vsx(sub_b);
    pass_vsx(z, ff_cos_32, 4);
}

/*
 * fft64 step of the non-interleaved VSX path: fft32_vsx() at z,
 * fft16_vsx() at z + 32 and z + 48, then pass_vsx() over z with the
 * ff_cos_64 table and n = 8.
 */
static void fft64_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 32;
    FFTComplex *const sub_b = z + 48;

    fft32_vsx(z);
    fft16_vsx(sub_a);
    fft16_vsx(sub_b);
    pass_vsx(z, ff_cos_64, 8);
}
/*
 * fft128 step of the non-interleaved VSX path: fft64_vsx() at z,
 * fft32_vsx() at z + 64 and z + 96, then pass_vsx() over z with the
 * ff_cos_128 table and n = 16.
 */
static void fft128_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 64;
    FFTComplex *const sub_b = z + 96;

    fft64_vsx(z);
    fft32_vsx(sub_a);
    fft32_vsx(sub_b);
    pass_vsx(z, ff_cos_128, 16);
}
/*
 * fft256 step of the non-interleaved VSX path: fft128_vsx() at z,
 * fft64_vsx() at z + 128 and z + 192, then pass_vsx() over z with the
 * ff_cos_256 table and n = 32.
 */
static void fft256_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 128;
    FFTComplex *const sub_b = z + 192;

    fft128_vsx(z);
    fft64_vsx(sub_a);
    fft64_vsx(sub_b);
    pass_vsx(z, ff_cos_256, 32);
}
/*
 * fft512 step of the non-interleaved VSX path: fft256_vsx() at z,
 * fft128_vsx() at z + 256 and z + 384, then pass_vsx() over z with the
 * ff_cos_512 table and n = 64.
 */
static void fft512_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 256;
    FFTComplex *const sub_b = z + 384;

    fft256_vsx(z);
    fft128_vsx(sub_a);
    fft128_vsx(sub_b);
    pass_vsx(z, ff_cos_512, 64);
}
/*
 * fft1024 step of the non-interleaved VSX path: fft512_vsx() at z,
 * fft256_vsx() at z + 512 and z + 768, then pass_vsx() over z with the
 * ff_cos_1024 table and n = 128.
 */
static void fft1024_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 512;
    FFTComplex *const sub_b = z + 768;

    fft512_vsx(z);
    fft256_vsx(sub_a);
    fft256_vsx(sub_b);
    pass_vsx(z, ff_cos_1024, 128);
}
/*
 * fft2048 step of the non-interleaved VSX path: fft1024_vsx() at z,
 * fft512_vsx() at z + 1024 and z + 1536, then pass_vsx() over z with the
 * ff_cos_2048 table and n = 256.
 */
static void fft2048_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 1024;
    FFTComplex *const sub_b = z + 1536;

    fft1024_vsx(z);
    fft512_vsx(sub_a);
    fft512_vsx(sub_b);
    pass_vsx(z, ff_cos_2048, 256);
}
/*
 * fft4096 step of the non-interleaved VSX path: fft2048_vsx() at z,
 * fft1024_vsx() at z + 2048 and z + 3072, then pass_vsx() over z with the
 * ff_cos_4096 table and n = 512.
 */
static void fft4096_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 2048;
    FFTComplex *const sub_b = z + 3072;

    fft2048_vsx(z);
    fft1024_vsx(sub_a);
    fft1024_vsx(sub_b);
    pass_vsx(z, ff_cos_4096, 512);
}
/*
 * fft8192 step of the non-interleaved VSX path: fft4096_vsx() at z,
 * fft2048_vsx() at z + 4096 and z + 6144, then pass_vsx() over z with the
 * ff_cos_8192 table and n = 1024.
 */
static void fft8192_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 4096;
    FFTComplex *const sub_b = z + 6144;

    fft4096_vsx(z);
    fft2048_vsx(sub_a);
    fft2048_vsx(sub_b);
    pass_vsx(z, ff_cos_8192, 1024);
}
/*
 * fft16384 step of the non-interleaved VSX path: fft8192_vsx() at z,
 * fft4096_vsx() at z + 8192 and z + 12288, then pass_vsx() over z with the
 * ff_cos_16384 table and n = 2048.
 */
static void fft16384_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 8192;
    FFTComplex *const sub_b = z + 12288;

    fft8192_vsx(z);
    fft4096_vsx(sub_a);
    fft4096_vsx(sub_b);
    pass_vsx(z, ff_cos_16384, 2048);
}
/*
 * fft32768 step of the non-interleaved VSX path: fft16384_vsx() at z,
 * fft8192_vsx() at z + 16384 and z + 24576, then pass_vsx() over z with
 * the ff_cos_32768 table and n = 4096.
 */
static void fft32768_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 16384;
    FFTComplex *const sub_b = z + 24576;

    fft16384_vsx(z);
    fft8192_vsx(sub_a);
    fft8192_vsx(sub_b);
    pass_vsx(z, ff_cos_32768, 4096);
}
/*
 * fft65536 step of the non-interleaved VSX path: fft32768_vsx() at z,
 * fft16384_vsx() at z + 32768 and z + 49152, then pass_vsx() over z with
 * the ff_cos_65536 table and n = 8192.
 */
static void fft65536_vsx(FFTComplex *z)
{
    FFTComplex *const sub_a = z + 32768;
    FFTComplex *const sub_b = z + 49152;

    fft32768_vsx(z);
    fft16384_vsx(sub_a);
    fft16384_vsx(sub_b);
    pass_vsx(z, ff_cos_65536, 8192);
}

/*
 * Non-interleaved VSX kernels, ordered by size. ff_fft_calc_vsx() indexes
 * this table with s->nbits - 2, so entry i serves nbits == i + 2.
 */
static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
    fft4_vsx,      /* nbits  2 */
    fft8_vsx,      /* nbits  3 */
    fft16_vsx,     /* nbits  4 */
    fft32_vsx,     /* nbits  5 */
    fft64_vsx,     /* nbits  6 */
    fft128_vsx,    /* nbits  7 */
    fft256_vsx,    /* nbits  8 */
    fft512_vsx,    /* nbits  9 */
    fft1024_vsx,   /* nbits 10 */
    fft2048_vsx,   /* nbits 11 */
    fft4096_vsx,   /* nbits 12 */
    fft8192_vsx,   /* nbits 13 */
    fft16384_vsx,  /* nbits 14 */
    fft32768_vsx,  /* nbits 15 */
    fft65536_vsx,  /* nbits 16 */
};
/*
 * Interleaved VSX kernels, ordered by size. ff_fft_calc_interleave_vsx()
 * indexes this table with s->nbits - 2, so entry i serves nbits == i + 2.
 */
static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
    fft4_vsx_interleave,      /* nbits  2 */
    fft8_vsx_interleave,      /* nbits  3 */
    fft16_vsx_interleave,     /* nbits  4 */
    fft32_vsx_interleave,     /* nbits  5 */
    fft64_vsx_interleave,     /* nbits  6 */
    fft128_vsx_interleave,    /* nbits  7 */
    fft256_vsx_interleave,    /* nbits  8 */
    fft512_vsx_interleave,    /* nbits  9 */
    fft1024_vsx_interleave,   /* nbits 10 */
    fft2048_vsx_interleave,   /* nbits 11 */
    fft4096_vsx_interleave,   /* nbits 12 */
    fft8192_vsx_interleave,   /* nbits 13 */
    fft16384_vsx_interleave,  /* nbits 14 */
    fft32768_vsx_interleave,  /* nbits 15 */
    fft65536_vsx_interleave,  /* nbits 16 */
};
/**
 * Run the interleaved VSX transform selected by the context on z.
 *
 * @param s context; only s->nbits is read, and it is used unchecked as
 *          index (nbits - 2) into fft_dispatch_vsx_interleave
 * @param z buffer handed unchanged to the selected kernel
 */
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const kernel)(FFTComplex*) = fft_dispatch_vsx_interleave[s->nbits - 2];

    kernel(z);
}
/**
 * Run the non-interleaved VSX transform selected by the context on z.
 *
 * @param s context; only s->nbits is read, and it is used unchecked as
 *          index (nbits - 2) into fft_dispatch_vsx
 * @param z buffer handed unchanged to the selected kernel
 */
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const kernel)(FFTComplex*) = fft_dispatch_vsx[s->nbits - 2];

    kernel(z);
}
#endif /* HAVE_VSX */
Commit	Line	Data
	1	/*
	2	* FFT transform, optimized with VSX built-in functions
	3	* Copyright (c) 2014 Rong Yan
	4	*
	5	* This algorithm (though not any of the implementation details) is
	6	* based on libdjbfft by D. J. Bernstein.
	7	*
	8	* This file is part of FFmpeg.
	9	*
	10	* FFmpeg is free software; you can redistribute it and/or
	11	* modify it under the terms of the GNU Lesser General Public
	12	* License as published by the Free Software Foundation; either
	13	* version 2.1 of the License, or (at your option) any later version.
	14	*
	15	* FFmpeg is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	18	* Lesser General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU Lesser General Public
	21	* License along with FFmpeg; if not, write to the Free Software
	22	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	23	*/
	24
	25
	26	#include "config.h"
	27	#include "libavutil/cpu.h"
	28	#include "libavutil/ppc/types_altivec.h"
	29	#include "libavutil/ppc/util_altivec.h"
	30	#include "libavcodec/fft.h"
	31	#include "libavcodec/fft-internal.h"
	32	#include "fft_vsx.h"
	33
	34	#if HAVE_VSX
	35
	36	static void fft32_vsx_interleave(FFTComplex *z)
	37	{
	38	fft16_vsx_interleave(z);
	39	fft8_vsx_interleave(z+16);
	40	fft8_vsx_interleave(z+24);
	41	pass_vsx_interleave(z,ff_cos_32,4);
	42	}
	43
	44	static void fft64_vsx_interleave(FFTComplex *z)
	45	{
	46	fft32_vsx_interleave(z);
	47	fft16_vsx_interleave(z+32);
	48	fft16_vsx_interleave(z+48);
	49	pass_vsx_interleave(z,ff_cos_64, 8);
	50	}
	51	static void fft128_vsx_interleave(FFTComplex *z)
	52	{
	53	fft64_vsx_interleave(z);
	54	fft32_vsx_interleave(z+64);
	55	fft32_vsx_interleave(z+96);
	56	pass_vsx_interleave(z,ff_cos_128,16);
	57	}
	58	static void fft256_vsx_interleave(FFTComplex *z)
	59	{
	60	fft128_vsx_interleave(z);
	61	fft64_vsx_interleave(z+128);
	62	fft64_vsx_interleave(z+192);
	63	pass_vsx_interleave(z,ff_cos_256,32);
	64	}
	65	static void fft512_vsx_interleave(FFTComplex *z)
	66	{
	67	fft256_vsx_interleave(z);
	68	fft128_vsx_interleave(z+256);
	69	fft128_vsx_interleave(z+384);
	70	pass_vsx_interleave(z,ff_cos_512,64);
	71	}
	72	static void fft1024_vsx_interleave(FFTComplex *z)
	73	{
	74	fft512_vsx_interleave(z);
	75	fft256_vsx_interleave(z+512);
	76	fft256_vsx_interleave(z+768);
	77	pass_vsx_interleave(z,ff_cos_1024,128);
	78
	79	}
	80	static void fft2048_vsx_interleave(FFTComplex *z)
	81	{
	82	fft1024_vsx_interleave(z);
	83	fft512_vsx_interleave(z+1024);
	84	fft512_vsx_interleave(z+1536);
	85	pass_vsx_interleave(z,ff_cos_2048,256);
	86	}
	87	static void fft4096_vsx_interleave(FFTComplex *z)
	88	{
	89	fft2048_vsx_interleave(z);
	90	fft1024_vsx_interleave(z+2048);
	91	fft1024_vsx_interleave(z+3072);
	92	pass_vsx_interleave(z,ff_cos_4096, 512);
	93	}
	94	static void fft8192_vsx_interleave(FFTComplex *z)
	95	{
	96	fft4096_vsx_interleave(z);
	97	fft2048_vsx_interleave(z+4096);
	98	fft2048_vsx_interleave(z+6144);
	99	pass_vsx_interleave(z,ff_cos_8192,1024);
	100	}
	101	static void fft16384_vsx_interleave(FFTComplex *z)
	102	{
	103	fft8192_vsx_interleave(z);
	104	fft4096_vsx_interleave(z+8192);
	105	fft4096_vsx_interleave(z+12288);
	106	pass_vsx_interleave(z,ff_cos_16384,2048);
	107	}
	108	static void fft32768_vsx_interleave(FFTComplex *z)
	109	{
	110	fft16384_vsx_interleave(z);
	111	fft8192_vsx_interleave(z+16384);
	112	fft8192_vsx_interleave(z+24576);
	113	pass_vsx_interleave(z,ff_cos_32768,4096);
	114	}
	115	static void fft65536_vsx_interleave(FFTComplex *z)
	116	{
	117	fft32768_vsx_interleave(z);
	118	fft16384_vsx_interleave(z+32768);
	119	fft16384_vsx_interleave(z+49152);
	120	pass_vsx_interleave(z,ff_cos_65536,8192);
	121	}
	122
	123	static void fft32_vsx(FFTComplex *z)
	124	{
	125	fft16_vsx(z);
	126	fft8_vsx(z+16);
	127	fft8_vsx(z+24);
	128	pass_vsx(z,ff_cos_32,4);
	129	}
	130
	131	static void fft64_vsx(FFTComplex *z)
	132	{
	133	fft32_vsx(z);
	134	fft16_vsx(z+32);
	135	fft16_vsx(z+48);
	136	pass_vsx(z,ff_cos_64, 8);
	137	}
	138	static void fft128_vsx(FFTComplex *z)
	139	{
	140	fft64_vsx(z);
	141	fft32_vsx(z+64);
	142	fft32_vsx(z+96);
	143	pass_vsx(z,ff_cos_128,16);
	144	}
	145	static void fft256_vsx(FFTComplex *z)
	146	{
	147	fft128_vsx(z);
	148	fft64_vsx(z+128);
	149	fft64_vsx(z+192);
	150	pass_vsx(z,ff_cos_256,32);
	151	}
	152	static void fft512_vsx(FFTComplex *z)
	153	{
	154	fft256_vsx(z);
	155	fft128_vsx(z+256);
	156	fft128_vsx(z+384);
	157	pass_vsx(z,ff_cos_512,64);
	158	}
	159	static void fft1024_vsx(FFTComplex *z)
	160	{
	161	fft512_vsx(z);
	162	fft256_vsx(z+512);
	163	fft256_vsx(z+768);
	164	pass_vsx(z,ff_cos_1024,128);
	165
	166	}
	167	static void fft2048_vsx(FFTComplex *z)
	168	{
	169	fft1024_vsx(z);
	170	fft512_vsx(z+1024);
	171	fft512_vsx(z+1536);
	172	pass_vsx(z,ff_cos_2048,256);
	173	}
	174	static void fft4096_vsx(FFTComplex *z)
	175	{
	176	fft2048_vsx(z);
	177	fft1024_vsx(z+2048);
	178	fft1024_vsx(z+3072);
	179	pass_vsx(z,ff_cos_4096, 512);
	180	}
	181	static void fft8192_vsx(FFTComplex *z)
	182	{
	183	fft4096_vsx(z);
	184	fft2048_vsx(z+4096);
	185	fft2048_vsx(z+6144);
	186	pass_vsx(z,ff_cos_8192,1024);
	187	}
	188	static void fft16384_vsx(FFTComplex *z)
	189	{
	190	fft8192_vsx(z);
	191	fft4096_vsx(z+8192);
	192	fft4096_vsx(z+12288);
	193	pass_vsx(z,ff_cos_16384,2048);
	194	}
	195	static void fft32768_vsx(FFTComplex *z)
	196	{
	197	fft16384_vsx(z);
	198	fft8192_vsx(z+16384);
	199	fft8192_vsx(z+24576);
	200	pass_vsx(z,ff_cos_32768,4096);
	201	}
	202	static void fft65536_vsx(FFTComplex *z)
	203	{
	204	fft32768_vsx(z);
	205	fft16384_vsx(z+32768);
	206	fft16384_vsx(z+49152);
	207	pass_vsx(z,ff_cos_65536,8192);
	208	}
	209
	210	static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
	211	fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
	212	fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
	213	};
	214	static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
	215	fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
	216	fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
	217	fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
	218	};
	219	void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
	220	{
	221	fft_dispatch_vsx_interleave[s->nbits-2](z);
	222	}
	223	void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
	224	{
	225	fft_dispatch_vsx[s->nbits-2](z);
	226	}
	227	#endif /* HAVE_VSX */