[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / mdct_vfp.S

/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Symbolic register names shared by the macros below and ff_imdct_half_vfp.
 *
 * Some names deliberately share one core register, so only one name of each
 * group can hold a live value at any time:
 *     ORIGOUT  / J0          -> a2
 *     OLDFPSCR / REVTAB_HI   -> v5
 *     IN_HI    / OUT_HI      -> v6
 */
CONTEXT .req    a1      @ 1st argument: base register for the context field loads
ORIGOUT .req    a2      @ 2nd argument: output pointer as passed in
IN      .req    a3      @ 3rd argument: input pointer
OUT     .req    v1      @ copy of the output pointer kept in a callee-saved register
REVTAB  .req    v2      @ table of halfword output indices
TCOS    .req    v3      @ cosine twiddle table
TSIN    .req    v4      @ sine twiddle table
OLDFPSCR .req   v5      @ FPSCR value read on entry (shares v5 with REVTAB_HI)
J0      .req    a2      @ J0-J3: scratch store addresses for the pre-rotation
J1      .req    a4
J2      .req    ip
J3      .req    lr
REVTAB_HI .req  v5      @ descending REVTAB pointer (rolled pre-rotation only)
IN_HI   .req    v6      @ descending input pointer (rolled pre-rotation only)
OUT_HI  .req    v6      @ descending output pointer (rolled post-rotation only)
TCOS_HI .req    sl      @ descending TCOS pointer (rolled loops)
TSIN_HI .req    fp      @ descending TSIN pointer (rolled loops)

/*
 * prerotation_innerloop
 *
 * One step of the fully unrolled pre-rotation.  Takes no macro arguments;
 * it reads the assembler symbols k and n4 (set by the code that expands it)
 * and advances k by 2 when done.
 *
 * The instructions tagged "vector operation" rely on FPSCR having been set
 * up for short vectors of length 4, so each of them processes four
 * consecutive registers.  One expansion therefore computes four lanes:
 *     s8-s11  = s0-s3 * s16-s19  -  s4-s7 * s20-s23
 *     s12-s15 = s0-s3 * s20-s23  +  s4-s7 * s16-s19
 * with s16-s19 loaded from TCOS, s20-s23 from TSIN and s0-s7 from IN.
 * Two lanes use table index k and k+1 (ascending side), the other two use
 * n4-k-2 and n4-k-1 (descending side).
 *
 * Each lane's pair of results is stored at OUT + 8 * j, where j is the
 * matching entry read from REVTAB.
 *
 * Integer and VFP instructions are interleaved (presumably to hide VFP
 * latency); do not reorder.
 * Clobbers: J0-J3 (a2, a4, ip, lr), s0-s23.
 */
.macro prerotation_innerloop
 .set trig_lo, k
 .set trig_hi, n4 - k - 2
 .set in_lo, trig_lo * 2
 .set in_hi, trig_hi * 2
        vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17
        vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19
        @ s0-s3: odd-numbered input floats, taken from the high end first
        vldr    s0, [IN, #in_hi*4 + 12]
        vldr    s1, [IN, #in_hi*4 + 4]
        vldr    s2, [IN, #in_lo*4 + 12]
        vldr    s3, [IN, #in_lo*4 + 4]
        vmul.f  s8, s0, s16                     @ vector operation
        vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21
        vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23
        @ s4-s7: even-numbered input floats, taken from the low end first
        vldr    s4, [IN, #in_lo*4]
        vldr    s5, [IN, #in_lo*4 + 8]
        vldr    s6, [IN, #in_hi*4]
        vldr    s7, [IN, #in_hi*4 + 8]
        @ One word load fetches two adjacent halfword REVTAB entries
        ldr     J0, [REVTAB, #trig_lo*2]
        vmul.f  s12, s0, s20                    @ vector operation
        ldr     J2, [REVTAB, #trig_hi*2]
        mov     J1, J0, lsr #16
        and     J0, J0, #255                    @ halfword value will be < n4
        vmls.f  s8, s4, s20                     @ vector operation
        mov     J3, J2, lsr #16
        and     J2, J2, #255                    @ halfword value will be < n4
        @ Turn each index into a store address: 8 bytes per output pair
        add     J0, OUT, J0, lsl #3
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        @ First word of each pair comes from s8-s11, second from s12-s15
        vstr    s8, [J0]
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
 .set k, k + 2
.endm

/*
 * prerotation_innerloop_rolled
 *
 * Loop-body form of the pre-rotation step: the same arithmetic as
 * prerotation_innerloop, but addressed through moving pointers instead of
 * assemble-time offsets.  Per expansion:
 *     TCOS, TSIN        advance by 8 bytes   (vldmia with writeback)
 *     TCOS_HI, TSIN_HI  retreat by 8 bytes   (vldmdb with writeback)
 *     IN                advances by 16 bytes
 *     IN_HI             retreats by 16 bytes
 *     REVTAB            advances by 4 bytes  (two post-indexed ldrh)
 *     REVTAB_HI         retreats by 4 bytes  (two pre-indexed ldrh)
 * The enclosing loop is expected to compare IN against IN_HI to terminate.
 *
 * Requires FPSCR set for short vectors of length 4 (see the lines tagged
 * "vector operation").  Instruction order is significant; do not reorder.
 * Clobbers: J0-J3 (a2, a4, ip, lr), s0-s23.
 */
.macro prerotation_innerloop_rolled
        vldmia  TCOS!, {s16,s17}
        vldmdb  TCOS_HI!, {s18,s19}
        @ s0-s3: odd-numbered floats, high end first (IN_HI not yet moved)
        vldr    s0, [IN_HI, #-4]
        vldr    s1, [IN_HI, #-12]
        vldr    s2, [IN, #12]
        vldr    s3, [IN, #4]
        vmul.f  s8, s0, s16                     @ vector operation
        vldmia  TSIN!, {s20,s21}
        vldmdb  TSIN_HI!, {s22,s23}
        @ s4-s7: even-numbered floats, low end first
        vldr    s4, [IN]
        vldr    s5, [IN, #8]
        vldr    s6, [IN_HI, #-16]
        vldr    s7, [IN_HI, #-8]
        vmul.f  s12, s0, s20                    @ vector operation
        add     IN, IN, #16
        sub     IN_HI, IN_HI, #16
        ldrh    J0, [REVTAB], #2
        ldrh    J1, [REVTAB], #2
        vmls.f  s8, s4, s20                     @ vector operation
        @ J3 is read first so that J2 ends up with the lower-addressed entry
        ldrh    J3, [REVTAB_HI, #-2]!
        ldrh    J2, [REVTAB_HI, #-2]!
        @ Turn each index into a store address: 8 bytes per output pair
        add     J0, OUT, J0, lsl #3
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        vstr    s8, [J0]
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
.endm

/*
 * postrotation_innerloop tail, head
 *
 * One software-pipelined step of the fully unrolled post-rotation, working
 * in place on OUT.  It reads the assembler symbols k and n8.  Either
 * argument may be left empty to drop that half of the step:
 *
 *   head - start the step for the current k: load four TSIN and four TCOS
 *          values plus eight floats from OUT, and issue the two multiplies
 *          by TSIN.  k is advanced by 2 at the end.
 *   tail - finish the step that the previous head started (index k - 2):
 *          apply the two multiply-accumulates by TCOS and store to OUT.
 *
 * Combined effect of a head and its later tail (four lanes each, FPSCR must
 * be set for short vectors of length 4):
 *     s8-s11  = s4-s7 * tsin  -  s0-s3 * tcos
 *     s12-s15 = s0-s3 * tsin  +  s4-s7 * tcos
 * s0-s3 are loaded from even float offsets of OUT, s4-s7 from odd ones.
 * Indices run outwards from the middle: n8-k-2 downwards and n8+k upwards.
 *
 * The TCOS values alternate between s20-s23 and s24-s27 according to bit 1
 * of k, so the set loaded by a head is still intact when its tail runs in
 * the following expansion.
 *
 * Head and tail instructions are interleaved and reuse s0-s15; the result
 * depends on the exact order shown.  Do not reorder.
 */
.macro postrotation_innerloop tail, head
 .set trig_lo_head, n8 - k - 2
 .set trig_hi_head, n8 + k
 .set out_lo_head, trig_lo_head * 2
 .set out_hi_head, trig_hi_head * 2
 .set trig_lo_tail, n8 - (k - 2) - 2
 .set trig_hi_tail, n8 + (k - 2)
 .set out_lo_tail, trig_lo_tail * 2
 .set out_hi_tail, trig_hi_tail * 2
 .if (k & 2) == 0
  TCOS_D0_HEAD .req d10 @ s20,s21
  TCOS_D1_HEAD .req d11 @ s22,s23
  TCOS_S0_TAIL .req s24
 .else
  TCOS_D0_HEAD .req d12 @ s24,s25
  TCOS_D1_HEAD .req d13 @ s26,s27
  TCOS_S0_TAIL .req s20
 .endif
 .ifnc "\tail",""
        vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation
 .endif
 .ifnc "\head",""
        vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
        vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
        vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation
 .endif
 .ifnc "\head",""
        @ s0-s7 may be reloaded now: both tail multiply-accumulates are issued
        vldr    s0, [OUT, #out_lo_head*4]
        vldr    s1, [OUT, #out_lo_head*4 + 8]
        vldr    s2, [OUT, #out_hi_head*4]
        vldr    s3, [OUT, #out_hi_head*4 + 8]
        vldr    s4, [OUT, #out_lo_head*4 + 4]
        vldr    s5, [OUT, #out_lo_head*4 + 12]
        vldr    s6, [OUT, #out_hi_head*4 + 4]
        vldr    s7, [OUT, #out_hi_head*4 + 12]
 .endif
 .ifnc "\tail",""
        @ s8-s11 go to the even float slots, in the order they were loaded
        vstr    s8, [OUT, #out_lo_tail*4]
        vstr    s9, [OUT, #out_lo_tail*4 + 8]
        vstr    s10, [OUT, #out_hi_tail*4]
        vstr    s11, [OUT, #out_hi_tail*4 + 8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector operation
 .endif
 .ifnc "\tail",""
        @ s12-s15 go to the odd float slots in mirrored order (hi side first)
        vstr    s12, [OUT, #out_hi_tail*4 + 12]
        vstr    s13, [OUT, #out_hi_tail*4 + 4]
        vstr    s14, [OUT, #out_lo_tail*4 + 12]
        vstr    s15, [OUT, #out_lo_tail*4 + 4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector operation
        vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
 .endif
 .unreq TCOS_D0_HEAD
 .unreq TCOS_D1_HEAD
 .unreq TCOS_S0_TAIL
 .ifnc "\head",""
  .set k, k + 2
 .endif
.endm

/*
 * postrotation_innerloop_rolled tail, head,
 *         tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head,
 *         tcos_s0_tail, out_offset_head, out_offset_tail
 *
 * Loop-body form of the software-pipelined post-rotation step, working in
 * place through the moving pointers OUT (ascending side) and OUT_HI
 * (descending side).  Arguments:
 *   tail, head           non-empty to include that half of the step
 *   tcos_s0..s3_head     four consecutive single registers that the head
 *                        fills from TCOS (first two) and TCOS_HI (last two)
 *   tcos_s0_tail         first register of the TCOS set the tail consumes,
 *                        i.e. the set filled by the previous head
 *   out_offset_head      byte offset applied to OUT / OUT_HI by the head loads
 *   out_offset_tail      byte offset applied to OUT / OUT_HI by the tail stores
 *
 * A head advances TSIN and TCOS by 8 bytes and retreats TSIN_HI and TCOS_HI
 * by 8 bytes.  OUT and OUT_HI are not modified here; the caller moves them
 * and compensates through the two offset arguments.
 *
 * Combined effect of a head and its later tail (four lanes each, FPSCR must
 * be set for short vectors of length 4):
 *     s8-s11  = s4-s7 * s16-s19  -  s0-s3 * tcos
 *     s12-s15 = s0-s3 * s16-s19  +  s4-s7 * tcos
 *
 * Head and tail instructions are interleaved and reuse s0-s15; the result
 * depends on the exact order shown.  Do not reorder.
 */
.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
 .ifnc "\tail",""
        vmls.f  s8, s0, \tcos_s0_tail       @ vector operation
 .endif
 .ifnc "\head",""
        vldmia  TSIN!, {s16,s17}
        vldmdb  TSIN_HI!, {s18,s19}
        vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, \tcos_s0_tail      @ vector operation
 .endif
 .ifnc "\head",""
        @ s0-s7 may be reloaded now: both tail multiply-accumulates are issued
        vldr    s0, [OUT, #+\out_offset_head+0]
        vldr    s1, [OUT, #+\out_offset_head+8]
        vldr    s2, [OUT_HI, #-\out_offset_head-16]
        vldr    s3, [OUT_HI, #-\out_offset_head-8]
        vldr    s4, [OUT, #+\out_offset_head+4]
        vldr    s5, [OUT, #+\out_offset_head+12]
        vldr    s6, [OUT_HI, #-\out_offset_head-12]
        vldr    s7, [OUT_HI, #-\out_offset_head-4]
 .endif
 .ifnc "\tail",""
        @ s8-s11 go back to the slots that s0-s3 were loaded from
        vstr    s8, [OUT, #+\out_offset_tail+0]
        vstr    s9, [OUT, #+\out_offset_tail+8]
        vstr    s10, [OUT_HI, #-\out_offset_tail-16]
        vstr    s11, [OUT_HI, #-\out_offset_tail-8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector operation
 .endif
 .ifnc "\tail",""
        @ s12-s15 go to the remaining slots in mirrored order (OUT_HI side first)
        vstr    s12, [OUT_HI, #-\out_offset_tail-4]
        vstr    s13, [OUT_HI, #-\out_offset_tail-12]
        vstr    s14, [OUT, #+\out_offset_tail+12]
        vstr    s15, [OUT, #+\out_offset_tail+4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector operation
        vldmdb  TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
 .endif
.endm


/* void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * Pre-rotates the input into the output buffer (scattered through the
 * REVTAB index table), runs an FFT in place on the output buffer, then
 * post-rotates the output buffer in place.
 *
 * In:   a1 = s, a2 = output, a3 = input
 * Out:  nothing in registers; results are written through the output pointer
 *
 * Context fields read, as word offsets from s:
 *     2 -> REVTAB, 5 -> mdct_bits, 6 -> TCOS, 7 -> TSIN, 9 -> fft_calc
 *
 * Two code paths, chosen on mdct_bits:
 *     mdct_bits == 6   fully unrolled rotations, FFT by ff_fft16_vfp
 *     anything else    rolled rotation loops, FFT through s->fft_calc
 *
 * FPSCR is switched to 0x03030000 (RunFast, short vectors of length 4,
 * stride 1) around the rotation code only; the value found on entry is put
 * back before every call and before returning.
 *
 * Saved and restored: v1-v6, sl, fp (as needed per path) and s16-s27.
 * Both paths keep sp 8-byte aligned at their call instruction, as the
 * procedure call standard requires at public interfaces.
 */
function ff_imdct_half_vfp, export=1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        teq     ip, #6
        bne     10f                         @ every other size takes the rolled path

        /* ---- Unrolled path: n is fixed at 64 ---- */
 .set n, 1<<6
 .set n2, n/2
 .set n4, n/4
 .set n8, n/8

        /* 24 + 48 = 72 bytes pushed: sp is 8-byte aligned at the bl below */
        push    {v1-v5,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     OUT, ORIGOUT                @ a2 is reused as J0 below
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]

 .set k, 0
 .rept n8/2
        prerotation_innerloop
 .endr

        fmxr    FPSCR, OLDFPSCR             @ callee must see the caller FPSCR
        mov     a1, OUT
        bl      X(ff_fft16_vfp)
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        /* Pipelined: one head, n8/2 - 1 combined steps, one final tail */
 .set k, 0
        postrotation_innerloop , head
 .rept n8/2 - 1
        postrotation_innerloop tail, head
 .endr
        postrotation_innerloop tail

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {v1-v5,pc}

        /* ---- Rolled path: n = 1 << mdct_bits, with mdct_bits still in ip ---- */
10:
        /*
         * a4 carries no value that needs preserving; it is pushed purely as
         * padding.  Without it the frame at the blx below would be
         * 36 + 48 + 8 = 92 bytes, leaving sp only 4-byte aligned at the call
         * to s->fft_calc.  With it the frame is 40 + 48 + 8 = 96 bytes.
         */
        push    {a4,v1-v6,sl,fp,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     OUT, ORIGOUT                @ a2 is reused as J0 below
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]
        mov     lr, lr, lsl ip              @ lr = n = 1 << mdct_bits

        /* OLDFPSCR shares v5 with REVTAB_HI, so park it on the stack first */
        push    {CONTEXT,OLDFPSCR}
        add     IN_HI, IN, lr, lsl #1       @ IN + 2n bytes
        add     REVTAB_HI, REVTAB, lr, lsr #1   @ REVTAB + n/2 bytes
        add     TCOS_HI, TCOS, lr           @ TCOS + n bytes
        add     TSIN_HI, TSIN, lr           @ TSIN + n bytes
0:      prerotation_innerloop_rolled
        teq     IN, IN_HI                   @ done when both input pointers meet
        bne     0b
        ldmia   sp, {CONTEXT,OLDFPSCR}      @ reload, but keep the stack copy

        mov     ORIGOUT, OUT                @ 2nd argument for fft_calc
        fmxr    FPSCR, OLDFPSCR             @ callee must see the caller FPSCR
        ldr     ip, [CONTEXT, #9*4]
        blx     ip                          @ s->fft_calc(s, output)

        pop     {CONTEXT,OLDFPSCR}          @ a1 was not preserved by the call
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     lr, lr, lsl ip              @ lr = n again
        /* The pre-rotation loop left TCOS and TSIN n/2 bytes past their start */
        sub     TCOS, TCOS, lr, lsr #1
        sub     TSIN, TSIN, lr, lsr #1
        add     OUT_HI, OUT, lr, lsl #1     @ OUT + 2n bytes
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
        /*
         * Pipelined post-rotation.  Each pass through the loop runs two
         * steps that alternate between the s20-s23 and s24-s27 TCOS sets.
         * OUT and OUT_HI move by 32 bytes per pass, hence the 0 / 16 / -16
         * offsets handed to the macro.
         * NOTE(review): the loop ends only when TSIN and TSIN_HI become
         * exactly equal right after the step at label 1, which appears to
         * need n to be a multiple of 32 - confirm against the callers.
         */
        postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
        b       1f
0:      add     OUT, OUT, #32
        sub     OUT_HI, OUT_HI, #32
        postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
1:      postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
        teq     TSIN, TSIN_HI
        bne     0b
        postrotation_innerloop_rolled tail,,,,,, s24,, 16

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {a4,v1-v6,sl,fp,pc}         @ a4 is scratch; this just drops the padding word
endfunc

        @ Drop every register alias created with .req at the top of this
        @ file, so that the names are no longer defined past this point.
        .unreq  CONTEXT
        .unreq  ORIGOUT
        .unreq  IN
        .unreq  OUT
        .unreq  REVTAB
        .unreq  TCOS
        .unreq  TSIN
        .unreq  OLDFPSCR
        .unreq  J0
        .unreq  J1
        .unreq  J2
        .unreq  J3
        .unreq  REVTAB_HI
        .unreq  IN_HI
        .unreq  OUT_HI
        .unreq  TCOS_HI
        .unreq  TSIN_HI
Commit	Line	Data
	1	/*
	2	* Copyright (c) 2013 RISC OS Open Ltd
	3	* Author: Ben Avison <bavison@riscosopen.org>
	4	*
	5	* This file is part of FFmpeg.
	6	*
	7	* FFmpeg is free software; you can redistribute it and/or
	8	* modify it under the terms of the GNU Lesser General Public
	9	* License as published by the Free Software Foundation; either
	10	* version 2.1 of the License, or (at your option) any later version.
	11	*
	12	* FFmpeg is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	* Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public
	18	* License along with FFmpeg; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	20	*/
	21
	22	#include "libavutil/arm/asm.S"
	23
	24	CONTEXT .req a1
	25	ORIGOUT .req a2
	26	IN .req a3
	27	OUT .req v1
	28	REVTAB .req v2
	29	TCOS .req v3
	30	TSIN .req v4
	31	OLDFPSCR .req v5
	32	J0 .req a2
	33	J1 .req a4
	34	J2 .req ip
	35	J3 .req lr
	36	REVTAB_HI .req v5
	37	IN_HI .req v6
	38	OUT_HI .req v6
	39	TCOS_HI .req sl
	40	TSIN_HI .req fp
	41
	42	.macro prerotation_innerloop
	43	.set trig_lo, k
	44	.set trig_hi, n4 - k - 2
	45	.set in_lo, trig_lo * 2
	46	.set in_hi, trig_hi * 2
	47	vldr d8, [TCOS, #trig_lo*4] @ s16,s17
	48	vldr d9, [TCOS, #trig_hi*4] @ s18,s19
	49	vldr s0, [IN, #in_hi*4 + 12]
	50	vldr s1, [IN, #in_hi*4 + 4]
	51	vldr s2, [IN, #in_lo*4 + 12]
	52	vldr s3, [IN, #in_lo*4 + 4]
	53	vmul.f s8, s0, s16 @ vector operation
	54	vldr d10, [TSIN, #trig_lo*4] @ s20,s21
	55	vldr d11, [TSIN, #trig_hi*4] @ s22,s23
	56	vldr s4, [IN, #in_lo*4]
	57	vldr s5, [IN, #in_lo*4 + 8]
	58	vldr s6, [IN, #in_hi*4]
	59	vldr s7, [IN, #in_hi*4 + 8]
	60	ldr J0, [REVTAB, #trig_lo*2]
	61	vmul.f s12, s0, s20 @ vector operation
	62	ldr J2, [REVTAB, #trig_hi*2]
	63	mov J1, J0, lsr #16
	64	and J0, J0, #255 @ halfword value will be < n4
	65	vmls.f s8, s4, s20 @ vector operation
	66	mov J3, J2, lsr #16
	67	and J2, J2, #255 @ halfword value will be < n4
	68	add J0, OUT, J0, lsl #3
	69	vmla.f s12, s4, s16 @ vector operation
	70	add J1, OUT, J1, lsl #3
	71	add J2, OUT, J2, lsl #3
	72	add J3, OUT, J3, lsl #3
	73	vstr s8, [J0]
	74	vstr s9, [J1]
	75	vstr s10, [J2]
	76	vstr s11, [J3]
	77	vstr s12, [J0, #4]
	78	vstr s13, [J1, #4]
	79	vstr s14, [J2, #4]
	80	vstr s15, [J3, #4]
	81	.set k, k + 2
	82	.endm
	83
	84	.macro prerotation_innerloop_rolled
	85	vldmia TCOS!, {s16,s17}
	86	vldmdb TCOS_HI!, {s18,s19}
	87	vldr s0, [IN_HI, #-4]
	88	vldr s1, [IN_HI, #-12]
	89	vldr s2, [IN, #12]
	90	vldr s3, [IN, #4]
	91	vmul.f s8, s0, s16 @ vector operation
	92	vldmia TSIN!, {s20,s21}
	93	vldmdb TSIN_HI!, {s22,s23}
	94	vldr s4, [IN]
	95	vldr s5, [IN, #8]
	96	vldr s6, [IN_HI, #-16]
	97	vldr s7, [IN_HI, #-8]
	98	vmul.f s12, s0, s20 @ vector operation
	99	add IN, IN, #16
	100	sub IN_HI, IN_HI, #16
	101	ldrh J0, [REVTAB], #2
	102	ldrh J1, [REVTAB], #2
	103	vmls.f s8, s4, s20 @ vector operation
	104	ldrh J3, [REVTAB_HI, #-2]!
	105	ldrh J2, [REVTAB_HI, #-2]!
	106	add J0, OUT, J0, lsl #3
	107	vmla.f s12, s4, s16 @ vector operation
	108	add J1, OUT, J1, lsl #3
	109	add J2, OUT, J2, lsl #3
	110	add J3, OUT, J3, lsl #3
	111	vstr s8, [J0]
	112	vstr s9, [J1]
	113	vstr s10, [J2]
	114	vstr s11, [J3]
	115	vstr s12, [J0, #4]
	116	vstr s13, [J1, #4]
	117	vstr s14, [J2, #4]
	118	vstr s15, [J3, #4]
	119	.endm
	120
	121	.macro postrotation_innerloop tail, head
	122	.set trig_lo_head, n8 - k - 2
	123	.set trig_hi_head, n8 + k
	124	.set out_lo_head, trig_lo_head * 2
	125	.set out_hi_head, trig_hi_head * 2
	126	.set trig_lo_tail, n8 - (k - 2) - 2
	127	.set trig_hi_tail, n8 + (k - 2)
	128	.set out_lo_tail, trig_lo_tail * 2
	129	.set out_hi_tail, trig_hi_tail * 2
	130	.if (k & 2) == 0
	131	TCOS_D0_HEAD .req d10 @ s20,s21
	132	TCOS_D1_HEAD .req d11 @ s22,s23
	133	TCOS_S0_TAIL .req s24
	134	.else
	135	TCOS_D0_HEAD .req d12 @ s24,s25
	136	TCOS_D1_HEAD .req d13 @ s26,s27
	137	TCOS_S0_TAIL .req s20
	138	.endif
	139	.ifnc "\tail",""
	140	vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
	141	.endif
	142	.ifnc "\head",""
	143	vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
	144	vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
	145	vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
	146	.endif
	147	.ifnc "\tail",""
	148	vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
	149	.endif
	150	.ifnc "\head",""
	151	vldr s0, [OUT, #out_lo_head*4]
	152	vldr s1, [OUT, #out_lo_head*4 + 8]
	153	vldr s2, [OUT, #out_hi_head*4]
	154	vldr s3, [OUT, #out_hi_head*4 + 8]
	155	vldr s4, [OUT, #out_lo_head*4 + 4]
	156	vldr s5, [OUT, #out_lo_head*4 + 12]
	157	vldr s6, [OUT, #out_hi_head*4 + 4]
	158	vldr s7, [OUT, #out_hi_head*4 + 12]
	159	.endif
	160	.ifnc "\tail",""
	161	vstr s8, [OUT, #out_lo_tail*4]
	162	vstr s9, [OUT, #out_lo_tail*4 + 8]
	163	vstr s10, [OUT, #out_hi_tail*4]
	164	vstr s11, [OUT, #out_hi_tail*4 + 8]
	165	.endif
	166	.ifnc "\head",""
	167	vmul.f s8, s4, s16 @ vector operation
	168	.endif
	169	.ifnc "\tail",""
	170	vstr s12, [OUT, #out_hi_tail*4 + 12]
	171	vstr s13, [OUT, #out_hi_tail*4 + 4]
	172	vstr s14, [OUT, #out_lo_tail*4 + 12]
	173	vstr s15, [OUT, #out_lo_tail*4 + 4]
	174	.endif
	175	.ifnc "\head",""
	176	vmul.f s12, s0, s16 @ vector operation
	177	vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
	178	.endif
	179	.unreq TCOS_D0_HEAD
	180	.unreq TCOS_D1_HEAD
	181	.unreq TCOS_S0_TAIL
	182	.ifnc "\head",""
	183	.set k, k + 2
	184	.endif
	185	.endm
	186
	187	.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
	188	.ifnc "\tail",""
	189	vmls.f s8, s0, \tcos_s0_tail @ vector operation
	190	.endif
	191	.ifnc "\head",""
	192	vldmia TSIN!, {s16,s17}
	193	vldmdb TSIN_HI!, {s18,s19}
	194	vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
	195	.endif
	196	.ifnc "\tail",""
	197	vmla.f s12, s4, \tcos_s0_tail @ vector operation
	198	.endif
	199	.ifnc "\head",""
	200	vldr s0, [OUT, #+\out_offset_head+0]
	201	vldr s1, [OUT, #+\out_offset_head+8]
	202	vldr s2, [OUT_HI, #-\out_offset_head-16]
	203	vldr s3, [OUT_HI, #-\out_offset_head-8]
	204	vldr s4, [OUT, #+\out_offset_head+4]
	205	vldr s5, [OUT, #+\out_offset_head+12]
	206	vldr s6, [OUT_HI, #-\out_offset_head-12]
	207	vldr s7, [OUT_HI, #-\out_offset_head-4]
	208	.endif
	209	.ifnc "\tail",""
	210	vstr s8, [OUT, #+\out_offset_tail+0]
	211	vstr s9, [OUT, #+\out_offset_tail+8]
	212	vstr s10, [OUT_HI, #-\out_offset_tail-16]
	213	vstr s11, [OUT_HI, #-\out_offset_tail-8]
	214	.endif
	215	.ifnc "\head",""
	216	vmul.f s8, s4, s16 @ vector operation
	217	.endif
	218	.ifnc "\tail",""
	219	vstr s12, [OUT_HI, #-\out_offset_tail-4]
	220	vstr s13, [OUT_HI, #-\out_offset_tail-12]
	221	vstr s14, [OUT, #+\out_offset_tail+12]
	222	vstr s15, [OUT, #+\out_offset_tail+4]
	223	.endif
	224	.ifnc "\head",""
	225	vmul.f s12, s0, s16 @ vector operation
	226	vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
	227	.endif
	228	.endm
	229
	230
	231	/* void ff_imdct_half_vfp(FFTContext *s,
	232	* FFTSample *output,
	233	* const FFTSample *input)
	234	*/
	235	function ff_imdct_half_vfp, export=1
	236	ldr ip, [CONTEXT, #5*4] @ mdct_bits
	237	teq ip, #6
	238	bne 10f
	239
	240	.set n, 1<<6
	241	.set n2, n/2
	242	.set n4, n/4
	243	.set n8, n/8
	244
	245	push {v1-v5,lr}
	246	vpush {s16-s27}
	247	fmrx OLDFPSCR, FPSCR
	248	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
	249	fmxr FPSCR, lr
	250	mov OUT, ORIGOUT
	251	ldr REVTAB, [CONTEXT, #2*4]
	252	ldr TCOS, [CONTEXT, #6*4]
	253	ldr TSIN, [CONTEXT, #7*4]
	254
	255	.set k, 0
	256	.rept n8/2
	257	prerotation_innerloop
	258	.endr
	259
	260	fmxr FPSCR, OLDFPSCR
	261	mov a1, OUT
	262	bl X(ff_fft16_vfp)
	263	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
	264	fmxr FPSCR, lr
	265
	266	.set k, 0
	267	postrotation_innerloop , head
	268	.rept n8/2 - 1
	269	postrotation_innerloop tail, head
	270	.endr
	271	postrotation_innerloop tail
	272
	273	fmxr FPSCR, OLDFPSCR
	274	vpop {s16-s27}
	275	pop {v1-v5,pc}
	276
	277	10:
	278	push {v1-v6,sl,fp,lr}
	279	vpush {s16-s27}
	280	fmrx OLDFPSCR, FPSCR
	281	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
	282	fmxr FPSCR, lr
	283	mov lr, #1
	284	mov OUT, ORIGOUT
	285	ldr REVTAB, [CONTEXT, #2*4]
	286	ldr TCOS, [CONTEXT, #6*4]
	287	ldr TSIN, [CONTEXT, #7*4]
	288	mov lr, lr, lsl ip
	289
	290	push {CONTEXT,OLDFPSCR}
	291	add IN_HI, IN, lr, lsl #1
	292	add REVTAB_HI, REVTAB, lr, lsr #1
	293	add TCOS_HI, TCOS, lr
	294	add TSIN_HI, TSIN, lr
	295	0: prerotation_innerloop_rolled
	296	teq IN, IN_HI
	297	bne 0b
	298	ldmia sp, {CONTEXT,OLDFPSCR}
	299
	300	mov ORIGOUT, OUT
	301	fmxr FPSCR, OLDFPSCR
	302	ldr ip, [CONTEXT, #9*4]
	303	blx ip @ s->fft_calc(s, output)
	304
	305	pop {CONTEXT,OLDFPSCR}
	306	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
	307	ldr ip, [CONTEXT, #5*4] @ mdct_bits
	308	fmxr FPSCR, lr
	309	mov lr, #1
	310	mov lr, lr, lsl ip
	311	sub TCOS, TCOS, lr, lsr #1
	312	sub TSIN, TSIN, lr, lsr #1
	313	add OUT_HI, OUT, lr, lsl #1
	314	add TCOS_HI, TCOS, lr
	315	add TSIN_HI, TSIN, lr
	316	postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
	317	b 1f
	318	0: add OUT, OUT, #32
	319	sub OUT_HI, OUT_HI, #32
	320	postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
	321	1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
	322	teq TSIN, TSIN_HI
	323	bne 0b
	324	postrotation_innerloop_rolled tail,,,,,, s24,, 16
	325
	326	fmxr FPSCR, OLDFPSCR
	327	vpop {s16-s27}
	328	pop {v1-v6,sl,fp,pc}
	329	endfunc
	330
	331	.unreq CONTEXT
	332	.unreq ORIGOUT
	333	.unreq IN
	334	.unreq OUT
	335	.unreq REVTAB
	336	.unreq TCOS
	337	.unreq TSIN
	338	.unreq OLDFPSCR
	339	.unreq J0
	340	.unreq J1
	341	.unreq J2
	342	.unreq J3
	343	.unreq REVTAB_HI
	344	.unreq IN_HI
	345	.unreq OUT_HI
	346	.unreq TCOS_HI
	347	.unreq TSIN_HI