[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / mdct_vfp.S

/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Symbolic register names used throughout this file.
 *
 * Several names intentionally map to the same physical register because
 * their live ranges do not overlap:
 *   ORIGOUT / J0          -> a2
 *   OLDFPSCR / REVTAB_HI  -> v5
 *   IN_HI / OUT_HI        -> v6
 */

/* The three arguments of ff_imdct_half_vfp as they arrive */
CONTEXT   .req  a1
ORIGOUT   .req  a2
IN        .req  a3

/* Values held in registers that the function prologue saves */
OUT       .req  v1
REVTAB    .req  v2
TCOS      .req  v3
TSIN      .req  v4
OLDFPSCR  .req  v5

/* Per-element store addresses computed in the pre-rotation macros */
J0        .req  a2
J1        .req  a4
J2        .req  ip
J3        .req  lr

/* Upper-end pointers, only used by the rolled (looping) macros */
REVTAB_HI .req  v5
IN_HI     .req  v6
OUT_HI    .req  v6
TCOS_HI   .req  sl
TSIN_HI   .req  fp

/*
 * Unrolled pre-rotation step with assembly-time constant offsets.
 *
 * Relies on the assembler symbols k and n4 being set by the code that
 * expands the macro; k is advanced by 2 at the end of every expansion.
 * One expansion processes four table positions: trig_lo and trig_lo+1
 * counted from the start of the tables, trig_hi and trig_hi+1 counted
 * from position n4.
 *
 * The instructions tagged "vector operation" depend on the short-vector
 * mode (length 4, stride 1) that the calling code writes to FPSCR, so
 * each of them covers four consecutive registers:
 *     s8..s11  = s0..s3 * s16..s19 - s4..s7 * s20..s23
 *     s12..s15 = s0..s3 * s20..s23 + s4..s7 * s16..s19
 * with s16-s19 loaded from TCOS, s20-s23 from TSIN and s0-s7 from IN.
 * Result pair i (s8+i, s12+i) is written as two adjacent words at
 * OUT + 8 * index, the index being a halfword taken from REVTAB.
 *
 * Overwrites J0-J3 (J0 is the same register as ORIGOUT), s0-s15 and
 * s16-s23.  IN, OUT, REVTAB, TCOS and TSIN are left unchanged.
 */
.macro prerotation_innerloop
 .set trig_lo, k
 .set trig_hi, n4 - k - 2
 .set in_lo, trig_lo * 2
 .set in_hi, trig_hi * 2
        vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17
        vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19
        @ s0-s3: odd-numbered input words, high pair first and in descending order
        vldr    s0, [IN, #in_hi*4 + 12]
        vldr    s1, [IN, #in_hi*4 + 4]
        vldr    s2, [IN, #in_lo*4 + 12]
        vldr    s3, [IN, #in_lo*4 + 4]
        vmul.f  s8, s0, s16                     @ vector operation
        vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21
        vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23
        @ s4-s7: even-numbered input words, low pair first and in ascending order
        vldr    s4, [IN, #in_lo*4]
        vldr    s5, [IN, #in_lo*4 + 8]
        vldr    s6, [IN, #in_hi*4]
        vldr    s7, [IN, #in_hi*4 + 8]
        @ each word load fetches two adjacent 16-bit REVTAB entries at once
        ldr     J0, [REVTAB, #trig_lo*2]
        vmul.f  s12, s0, s20                    @ vector operation
        ldr     J2, [REVTAB, #trig_hi*2]
        @ upper halfword goes to J1/J3, lower halfword stays in J0/J2
        mov     J1, J0, lsr #16
        and     J0, J0, #255                    @ halfword value will be < n4
        vmls.f  s8, s4, s20                     @ vector operation
        mov     J3, J2, lsr #16
        and     J2, J2, #255                    @ halfword value will be < n4
        @ turn each index into the address of an 8-byte output element
        add     J0, OUT, J0, lsl #3
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        @ first word of each element from s8-s11, second word from s12-s15
        vstr    s8, [J0]
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
 .set k, k + 2
.endm

/*
 * Pre-rotation step for use inside a run-time loop.
 *
 * Performs the same arithmetic as prerotation_innerloop, but instead of
 * constant offsets it walks pointers: IN, REVTAB, TCOS and TSIN move
 * upwards while IN_HI, REVTAB_HI, TCOS_HI and TSIN_HI move downwards.
 * Per expansion IN advances by 16 bytes and IN_HI retreats by 16 bytes;
 * the TCOS/TSIN pointers each move by two words and the REVTAB pointers
 * by two halfwords.
 *
 * The instructions tagged "vector operation" depend on the short-vector
 * mode (length 4, stride 1) that the calling code writes to FPSCR:
 *     s8..s11  = s0..s3 * s16..s19 - s4..s7 * s20..s23
 *     s12..s15 = s0..s3 * s20..s23 + s4..s7 * s16..s19
 *
 * Overwrites J0-J3 (J0 is the same register as ORIGOUT), s0-s15 and
 * s16-s23, in addition to the eight pointers listed above.
 */
.macro prerotation_innerloop_rolled
        vldmia  TCOS!, {s16,s17}
        vldmdb  TCOS_HI!, {s18,s19}
        @ s0-s3: odd-numbered input words, taken from the high end first
        vldr    s0, [IN_HI, #-4]
        vldr    s1, [IN_HI, #-12]
        vldr    s2, [IN, #12]
        vldr    s3, [IN, #4]
        vmul.f  s8, s0, s16                     @ vector operation
        vldmia  TSIN!, {s20,s21}
        vldmdb  TSIN_HI!, {s22,s23}
        @ s4-s7: even-numbered input words, taken from the low end first
        vldr    s4, [IN]
        vldr    s5, [IN, #8]
        vldr    s6, [IN_HI, #-16]
        vldr    s7, [IN_HI, #-8]
        vmul.f  s12, s0, s20                    @ vector operation
        add     IN, IN, #16
        sub     IN_HI, IN_HI, #16
        ldrh    J0, [REVTAB], #2
        ldrh    J1, [REVTAB], #2
        vmls.f  s8, s4, s20                     @ vector operation
        @ pre-decrement reads: J3 gets the higher entry, J2 the one below it
        ldrh    J3, [REVTAB_HI, #-2]!
        ldrh    J2, [REVTAB_HI, #-2]!
        @ turn each index into the address of an 8-byte output element
        add     J0, OUT, J0, lsl #3
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        @ first word of each element from s8-s11, second word from s12-s15
        vstr    s8, [J0]
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
.endm

/*
 * Unrolled, software-pipelined post-rotation step with assembly-time
 * constant offsets.  Works in place on the buffer addressed by OUT.
 *
 * Arguments (each one is tested only for being empty or non-empty):
 *   tail - when non-empty, emit the second half of the PREVIOUS step:
 *          the two multiply-accumulates with the TCOS values followed
 *          by the eight stores.
 *   head - when non-empty, emit the first half of the CURRENT step:
 *          the TSIN/TCOS and data loads plus the two multiplies by the
 *          TSIN values, then advance the assembler symbol k by 2.
 * Head and tail instructions are interleaved when both are requested.
 *
 * Relies on the assembler symbols k and n8 being set by the expanding
 * code.  The head works at positions n8-k-2 and n8+k, the tail at the
 * positions the head used when k was 2 smaller.
 *
 * The TCOS values go to d10/d11 when (k & 2) == 0 and to d12/d13
 * otherwise.  The tail reads the opposite set (starting at s24 or s20),
 * which is the set filled by the preceding head because k has been
 * advanced in between; the head can therefore load new TCOS values
 * while the tail still needs the old ones.
 *
 * The instructions tagged "vector operation" depend on the short-vector
 * mode (length 4, stride 1) that the calling code writes to FPSCR.
 * Over one head plus the following tail the arithmetic is
 *     s8..s11  = s4..s7 * s16..s19 - s0..s3 * tcos0..tcos3
 *     s12..s15 = s0..s3 * s16..s19 + s4..s7 * tcos0..tcos3
 * with s16-s19 loaded from TSIN, s0-s3 from the even words and s4-s7
 * from the odd words of OUT.
 *
 * Overwrites s0-s19 and s20-s27.  No core register is modified.
 */
.macro postrotation_innerloop tail, head
 .set trig_lo_head, n8 - k - 2
 .set trig_hi_head, n8 + k
 .set out_lo_head, trig_lo_head * 2
 .set out_hi_head, trig_hi_head * 2
 .set trig_lo_tail, n8 - (k - 2) - 2
 .set trig_hi_tail, n8 + (k - 2)
 .set out_lo_tail, trig_lo_tail * 2
 .set out_hi_tail, trig_hi_tail * 2
 .if (k & 2) == 0
  TCOS_D0_HEAD .req d10 @ s20,s21
  TCOS_D1_HEAD .req d11 @ s22,s23
  TCOS_S0_TAIL .req s24
 .else
  TCOS_D0_HEAD .req d12 @ s24,s25
  TCOS_D1_HEAD .req d13 @ s26,s27
  TCOS_S0_TAIL .req s20
 .endif
 .ifnc "\tail",""
        vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation
 .endif
 .ifnc "\head",""
        vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
        vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
        vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation
 .endif
 .ifnc "\head",""
        @ s0-s3: first word of each element, s4-s7: second word of each element
        vldr    s0, [OUT, #out_lo_head*4]
        vldr    s1, [OUT, #out_lo_head*4 + 8]
        vldr    s2, [OUT, #out_hi_head*4]
        vldr    s3, [OUT, #out_hi_head*4 + 8]
        vldr    s4, [OUT, #out_lo_head*4 + 4]
        vldr    s5, [OUT, #out_lo_head*4 + 12]
        vldr    s6, [OUT, #out_hi_head*4 + 4]
        vldr    s7, [OUT, #out_hi_head*4 + 12]
 .endif
 .ifnc "\tail",""
        @ s8-s11 return to the first-word slots in the order they were loaded
        vstr    s8, [OUT, #out_lo_tail*4]
        vstr    s9, [OUT, #out_lo_tail*4 + 8]
        vstr    s10, [OUT, #out_hi_tail*4]
        vstr    s11, [OUT, #out_hi_tail*4 + 8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector operation
 .endif
 .ifnc "\tail",""
        @ s12-s15 go to the second-word slots in reversed order (hi before lo)
        vstr    s12, [OUT, #out_hi_tail*4 + 12]
        vstr    s13, [OUT, #out_hi_tail*4 + 4]
        vstr    s14, [OUT, #out_lo_tail*4 + 12]
        vstr    s15, [OUT, #out_lo_tail*4 + 4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector operation
        vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
 .endif
 .unreq TCOS_D0_HEAD
 .unreq TCOS_D1_HEAD
 .unreq TCOS_S0_TAIL
 .ifnc "\head",""
  .set k, k + 2
 .endif
.endm

/*
 * Software-pipelined post-rotation step for use inside a run-time loop.
 * Works in place on the buffer addressed by OUT (low end, positive
 * offsets) and OUT_HI (high end, negative offsets).
 *
 * Arguments:
 *   tail            - when non-empty, emit the second half of the
 *                     previous step (multiply-accumulates and stores).
 *   head            - when non-empty, emit the first half of the
 *                     current step (loads and first multiplies).
 *   tcos_s0_head,
 *   tcos_s1_head    - registers receiving two words from TCOS
 *                     (post-incremented).
 *   tcos_s2_head,
 *   tcos_s3_head    - registers receiving two words from TCOS_HI
 *                     (pre-decremented).
 *   tcos_s0_tail    - first register of the TCOS set that the previous
 *                     head filled; used as vector operand by the tail.
 *   out_offset_head - byte offset applied to the head loads.
 *   out_offset_tail - byte offset applied to the tail stores.
 * Arguments that belong to a half which is not emitted may be left
 * empty.
 *
 * When head is emitted, TSIN and TCOS advance by two words and TSIN_HI
 * and TCOS_HI retreat by two words.  OUT and OUT_HI are not modified
 * here; the expanding code adjusts them.
 *
 * The instructions tagged "vector operation" depend on the short-vector
 * mode (length 4, stride 1) that the calling code writes to FPSCR.
 *
 * Overwrites s0-s19 and whichever TCOS registers are passed in.
 */
.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
 .ifnc "\tail",""
        vmls.f  s8, s0, \tcos_s0_tail       @ vector operation
 .endif
 .ifnc "\head",""
        vldmia  TSIN!, {s16,s17}
        vldmdb  TSIN_HI!, {s18,s19}
        vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, \tcos_s0_tail      @ vector operation
 .endif
 .ifnc "\head",""
        @ s0-s3: first word of each element, s4-s7: second word of each element
        vldr    s0, [OUT, #+\out_offset_head+0]
        vldr    s1, [OUT, #+\out_offset_head+8]
        vldr    s2, [OUT_HI, #-\out_offset_head-16]
        vldr    s3, [OUT_HI, #-\out_offset_head-8]
        vldr    s4, [OUT, #+\out_offset_head+4]
        vldr    s5, [OUT, #+\out_offset_head+12]
        vldr    s6, [OUT_HI, #-\out_offset_head-12]
        vldr    s7, [OUT_HI, #-\out_offset_head-4]
 .endif
 .ifnc "\tail",""
        @ s8-s11 return to the first-word slots in the order they were loaded
        vstr    s8, [OUT, #+\out_offset_tail+0]
        vstr    s9, [OUT, #+\out_offset_tail+8]
        vstr    s10, [OUT_HI, #-\out_offset_tail-16]
        vstr    s11, [OUT_HI, #-\out_offset_tail-8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector operation
 .endif
 .ifnc "\tail",""
        @ s12-s15 go to the second-word slots in reversed order (high end first)
        vstr    s12, [OUT_HI, #-\out_offset_tail-4]
        vstr    s13, [OUT_HI, #-\out_offset_tail-12]
        vstr    s14, [OUT, #+\out_offset_tail+12]
        vstr    s15, [OUT, #+\out_offset_tail+4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector operation
        vldmdb  TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
 .endif
.endm


/* void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * Runs three stages: pre-rotation from input into output, an in-place
 * FFT on output, and an in-place post-rotation of output.
 *
 * The word at context offset 5*4 (mdct_bits) selects the code path:
 *   - equal to 6: fully unrolled pre/post-rotation with constant
 *     offsets, FFT done by ff_fft16_vfp;
 *   - anything else: looping pre/post-rotation driven by pointers, FFT
 *     done through the function pointer at context offset 9*4.
 * Context offsets 2*4, 6*4 and 7*4 supply the REVTAB, TCOS and TSIN
 * table pointers.
 *
 * FPSCR is switched to 0x03030000 (RunFast, short vectors of length 4)
 * for the rotation stages only; the value found on entry is put back
 * before each FFT call and before returning.
 *
 * Stack: both paths keep sp a multiple of 8 at their outgoing call,
 * given an 8-byte aligned sp on entry.
 */
function ff_imdct_half_vfp, export=1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        teq     ip, #6
        bne     10f

        @ ---- fixed-size path, mdct_bits == 6 ----
 .set n, 1<<6
 .set n2, n/2
 .set n4, n/4
 .set n8, n/8

        @ 6 core words + 48 bytes of VFP registers: sp stays 8-byte aligned
        push    {v1-v5,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        @ ORIGOUT shares its register with J0, so keep the pointer in OUT
        mov     OUT, ORIGOUT
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]

 .set k, 0
 .rept n8/2
        prerotation_innerloop
 .endr

        @ the FFT routine is entered with the FPSCR value found on entry
        fmxr    FPSCR, OLDFPSCR
        mov     a1, OUT
        bl      X(ff_fft16_vfp)
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        @ pipeline: one head, n8/2 - 1 combined steps, one final tail
 .set k, 0
        postrotation_innerloop , head
 .rept n8/2 - 1
        postrotation_innerloop tail, head
 .endr
        postrotation_innerloop tail

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {v1-v5,pc}

        @ ---- generic path, any other mdct_bits ----
10:
        @ a4 is pushed purely as padding.  Ten core words, the 48-byte
        @ vpush and the two-word push further down add up to a multiple
        @ of 8, so sp is 8-byte aligned at the blx as the procedure call
        @ standard requires.  Without the padding word it would be off by 4.
        push    {a4,v1-v6,sl,fp,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     lr, #1
        @ ORIGOUT shares its register with J0, so keep the pointer in OUT
        mov     OUT, ORIGOUT
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]
        mov     lr, lr, lsl ip              @ lr = 1 << mdct_bits

        @ OLDFPSCR must be saved because REVTAB_HI reuses its register;
        @ CONTEXT is saved so that it can be recovered after the call
        push    {CONTEXT,OLDFPSCR}
        @ byte offsets of the upper ends: 2*lr, lr/2, lr and lr
        add     IN_HI, IN, lr, lsl #1
        add     REVTAB_HI, REVTAB, lr, lsr #1
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
        @ IN climbs and IN_HI descends until the two pointers meet
0:      prerotation_innerloop_rolled
        teq     IN, IN_HI
        bne     0b
        @ read the saved pair back but leave it on the stack for later
        ldmia   sp, {CONTEXT,OLDFPSCR}

        mov     ORIGOUT, OUT
        fmxr    FPSCR, OLDFPSCR
        ldr     ip, [CONTEXT, #9*4]
        blx     ip                          @ s->fft_calc(s, output)

        pop     {CONTEXT,OLDFPSCR}
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     lr, lr, lsl ip              @ lr = 1 << mdct_bits
        @ step TCOS and TSIN back by lr/2 bytes, then rebuild the upper-end pointers
        sub     TCOS, TCOS, lr, lsr #1
        sub     TSIN, TSIN, lr, lsr #1
        add     OUT_HI, OUT, lr, lsl #1
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
        @ Pipelined loop, two steps per pass.  The two steps swap between
        @ the s20-s23 and s24-s27 TCOS sets; the first pass enters at 1:
        @ because the very first head has already been issued here.
        postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
        b       1f
0:      add     OUT, OUT, #32
        sub     OUT_HI, OUT_HI, #32
        postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
1:      postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
        teq     TSIN, TSIN_HI
        bne     0b
        @ drain the pipeline: tail of the last step, no new head
        postrotation_innerloop_rolled tail,,,,,, s24,, 16

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        @ a4 receives the padding word; it is a scratch register, so this is harmless
        pop     {a4,v1-v6,sl,fp,pc}
endfunc

        /* Release every register alias defined at the top of this file,
         * in the reverse order of definition. */
        .unreq  TSIN_HI
        .unreq  TCOS_HI
        .unreq  OUT_HI
        .unreq  IN_HI
        .unreq  REVTAB_HI
        .unreq  J3
        .unreq  J2
        .unreq  J1
        .unreq  J0
        .unreq  OLDFPSCR
        .unreq  TSIN
        .unreq  TCOS
        .unreq  REVTAB
        .unreq  OUT
        .unreq  IN
        .unreq  ORIGOUT
        .unreq  CONTEXT
Commit	Line	Data
2ba45a60 DM	1	/*
	2	* Copyright (c) 2013 RISC OS Open Ltd
	3	* Author: Ben Avison <bavison@riscosopen.org>
	4	*
	5	* This file is part of FFmpeg.
	6	*
	7	* FFmpeg is free software; you can redistribute it and/or
	8	* modify it under the terms of the GNU Lesser General Public
	9	* License as published by the Free Software Foundation; either
	10	* version 2.1 of the License, or (at your option) any later version.
	11	*
	12	* FFmpeg is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	* Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public
	18	* License along with FFmpeg; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	20	*/
	21
	22	#include "libavutil/arm/asm.S"
	23
	24	CONTEXT .req a1
	25	ORIGOUT .req a2
	26	IN .req a3
	27	OUT .req v1
	28	REVTAB .req v2
	29	TCOS .req v3
	30	TSIN .req v4
	31	OLDFPSCR .req v5
	32	J0 .req a2
	33	J1 .req a4
	34	J2 .req ip
	35	J3 .req lr
	36	REVTAB_HI .req v5
	37	IN_HI .req v6
	38	OUT_HI .req v6
	39	TCOS_HI .req sl
	40	TSIN_HI .req fp
	41
	42	.macro prerotation_innerloop
	43	.set trig_lo, k
	44	.set trig_hi, n4 - k - 2
	45	.set in_lo, trig_lo * 2
	46	.set in_hi, trig_hi * 2
	47	vldr d8, [TCOS, #trig_lo*4] @ s16,s17
	48	vldr d9, [TCOS, #trig_hi*4] @ s18,s19
	49	vldr s0, [IN, #in_hi*4 + 12]
	50	vldr s1, [IN, #in_hi*4 + 4]
	51	vldr s2, [IN, #in_lo*4 + 12]
	52	vldr s3, [IN, #in_lo*4 + 4]
	53	vmul.f s8, s0, s16 @ vector operation
	54	vldr d10, [TSIN, #trig_lo*4] @ s20,s21
	55	vldr d11, [TSIN, #trig_hi*4] @ s22,s23
	56	vldr s4, [IN, #in_lo*4]
	57	vldr s5, [IN, #in_lo*4 + 8]
	58	vldr s6, [IN, #in_hi*4]
	59	vldr s7, [IN, #in_hi*4 + 8]
	60	ldr J0, [REVTAB, #trig_lo*2]
	61	vmul.f s12, s0, s20 @ vector operation
	62	ldr J2, [REVTAB, #trig_hi*2]
	63	mov J1, J0, lsr #16
	64	and J0, J0, #255 @ halfword value will be < n4
65	vmls.f s8, s4, s20 @ vector operation
66	mov J3, J2, lsr #16
67	and J2, J2, #255 @ halfword value will be < n4
68	add J0, OUT, J0, lsl #3
69	vmla.f s12, s4, s16 @ vector operation
70	add J1, OUT, J1, lsl #3
71	add J2, OUT, J2, lsl #3
72	add J3, OUT, J3, lsl #3
73	vstr s8, [J0]
74	vstr s9, [J1]
75	vstr s10, [J2]
76	vstr s11, [J3]
77	vstr s12, [J0, #4]
78	vstr s13, [J1, #4]
79	vstr s14, [J2, #4]
80	vstr s15, [J3, #4]
81	.set k, k + 2
82	.endm
83
84	.macro prerotation_innerloop_rolled
85	vldmia TCOS!, {s16,s17}
86	vldmdb TCOS_HI!, {s18,s19}
87	vldr s0, [IN_HI, #-4]
88	vldr s1, [IN_HI, #-12]
89	vldr s2, [IN, #12]
90	vldr s3, [IN, #4]
91	vmul.f s8, s0, s16 @ vector operation
92	vldmia TSIN!, {s20,s21}
93	vldmdb TSIN_HI!, {s22,s23}
94	vldr s4, [IN]
95	vldr s5, [IN, #8]
96	vldr s6, [IN_HI, #-16]
97	vldr s7, [IN_HI, #-8]
98	vmul.f s12, s0, s20 @ vector operation
99	add IN, IN, #16
100	sub IN_HI, IN_HI, #16
101	ldrh J0, [REVTAB], #2
102	ldrh J1, [REVTAB], #2
103	vmls.f s8, s4, s20 @ vector operation
104	ldrh J3, [REVTAB_HI, #-2]!
105	ldrh J2, [REVTAB_HI, #-2]!
106	add J0, OUT, J0, lsl #3
107	vmla.f s12, s4, s16 @ vector operation
108	add J1, OUT, J1, lsl #3
109	add J2, OUT, J2, lsl #3
110	add J3, OUT, J3, lsl #3
111	vstr s8, [J0]
112	vstr s9, [J1]
113	vstr s10, [J2]
114	vstr s11, [J3]
115	vstr s12, [J0, #4]
116	vstr s13, [J1, #4]
117	vstr s14, [J2, #4]
118	vstr s15, [J3, #4]
119	.endm
120
121	.macro postrotation_innerloop tail, head
122	.set trig_lo_head, n8 - k - 2
123	.set trig_hi_head, n8 + k
124	.set out_lo_head, trig_lo_head * 2
125	.set out_hi_head, trig_hi_head * 2
126	.set trig_lo_tail, n8 - (k - 2) - 2
127	.set trig_hi_tail, n8 + (k - 2)
128	.set out_lo_tail, trig_lo_tail * 2
129	.set out_hi_tail, trig_hi_tail * 2
130	.if (k & 2) == 0
131	TCOS_D0_HEAD .req d10 @ s20,s21
132	TCOS_D1_HEAD .req d11 @ s22,s23
133	TCOS_S0_TAIL .req s24
134	.else
135	TCOS_D0_HEAD .req d12 @ s24,s25
136	TCOS_D1_HEAD .req d13 @ s26,s27
137	TCOS_S0_TAIL .req s20
138	.endif
139	.ifnc "\tail",""
140	vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
141	.endif
142	.ifnc "\head",""
143	vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
144	vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
145	vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
146	.endif
147	.ifnc "\tail",""
148	vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
149	.endif
150	.ifnc "\head",""
151	vldr s0, [OUT, #out_lo_head*4]
152	vldr s1, [OUT, #out_lo_head*4 + 8]
153	vldr s2, [OUT, #out_hi_head*4]
154	vldr s3, [OUT, #out_hi_head*4 + 8]
155	vldr s4, [OUT, #out_lo_head*4 + 4]
156	vldr s5, [OUT, #out_lo_head*4 + 12]
157	vldr s6, [OUT, #out_hi_head*4 + 4]
158	vldr s7, [OUT, #out_hi_head*4 + 12]
159	.endif
160	.ifnc "\tail",""
161	vstr s8, [OUT, #out_lo_tail*4]
162	vstr s9, [OUT, #out_lo_tail*4 + 8]
163	vstr s10, [OUT, #out_hi_tail*4]
164	vstr s11, [OUT, #out_hi_tail*4 + 8]
165	.endif
166	.ifnc "\head",""
167	vmul.f s8, s4, s16 @ vector operation
168	.endif
169	.ifnc "\tail",""
170	vstr s12, [OUT, #out_hi_tail*4 + 12]
171	vstr s13, [OUT, #out_hi_tail*4 + 4]
172	vstr s14, [OUT, #out_lo_tail*4 + 12]
173	vstr s15, [OUT, #out_lo_tail*4 + 4]
174	.endif
175	.ifnc "\head",""
176	vmul.f s12, s0, s16 @ vector operation
177	vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
178	.endif
179	.unreq TCOS_D0_HEAD
180	.unreq TCOS_D1_HEAD
181	.unreq TCOS_S0_TAIL
182	.ifnc "\head",""
183	.set k, k + 2
184	.endif
185	.endm
186
187	.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
188	.ifnc "\tail",""
189	vmls.f s8, s0, \tcos_s0_tail @ vector operation
190	.endif
191	.ifnc "\head",""
192	vldmia TSIN!, {s16,s17}
193	vldmdb TSIN_HI!, {s18,s19}
194	vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
195	.endif
196	.ifnc "\tail",""
197	vmla.f s12, s4, \tcos_s0_tail @ vector operation
198	.endif
199	.ifnc "\head",""
200	vldr s0, [OUT, #+\out_offset_head+0]
201	vldr s1, [OUT, #+\out_offset_head+8]
202	vldr s2, [OUT_HI, #-\out_offset_head-16]
203	vldr s3, [OUT_HI, #-\out_offset_head-8]
204	vldr s4, [OUT, #+\out_offset_head+4]
205	vldr s5, [OUT, #+\out_offset_head+12]
206	vldr s6, [OUT_HI, #-\out_offset_head-12]
207	vldr s7, [OUT_HI, #-\out_offset_head-4]
208	.endif
209	.ifnc "\tail",""
210	vstr s8, [OUT, #+\out_offset_tail+0]
211	vstr s9, [OUT, #+\out_offset_tail+8]
212	vstr s10, [OUT_HI, #-\out_offset_tail-16]
213	vstr s11, [OUT_HI, #-\out_offset_tail-8]
214	.endif
215	.ifnc "\head",""
216	vmul.f s8, s4, s16 @ vector operation
217	.endif
218	.ifnc "\tail",""
219	vstr s12, [OUT_HI, #-\out_offset_tail-4]
220	vstr s13, [OUT_HI, #-\out_offset_tail-12]
221	vstr s14, [OUT, #+\out_offset_tail+12]
222	vstr s15, [OUT, #+\out_offset_tail+4]
223	.endif
224	.ifnc "\head",""
225	vmul.f s12, s0, s16 @ vector operation
226	vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
227	.endif
228	.endm
229
230
231	/* void ff_imdct_half_vfp(FFTContext *s,
232	* FFTSample *output,
233	* const FFTSample *input)
234	*/
235	function ff_imdct_half_vfp, export=1
236	ldr ip, [CONTEXT, #5*4] @ mdct_bits
237	teq ip, #6
238	bne 10f
239
240	.set n, 1<<6
241	.set n2, n/2
242	.set n4, n/4
243	.set n8, n/8
244
245	push {v1-v5,lr}
246	vpush {s16-s27}
247	fmrx OLDFPSCR, FPSCR
248	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
249	fmxr FPSCR, lr
250	mov OUT, ORIGOUT
251	ldr REVTAB, [CONTEXT, #2*4]
252	ldr TCOS, [CONTEXT, #6*4]
253	ldr TSIN, [CONTEXT, #7*4]
254
255	.set k, 0
256	.rept n8/2
257	prerotation_innerloop
258	.endr
259
260	fmxr FPSCR, OLDFPSCR
261	mov a1, OUT
262	bl X(ff_fft16_vfp)
263	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
264	fmxr FPSCR, lr
265
266	.set k, 0
267	postrotation_innerloop , head
268	.rept n8/2 - 1
269	postrotation_innerloop tail, head
270	.endr
271	postrotation_innerloop tail
272
273	fmxr FPSCR, OLDFPSCR
274	vpop {s16-s27}
275	pop {v1-v5,pc}
276
277	10:
278	push {v1-v6,sl,fp,lr}
279	vpush {s16-s27}
280	fmrx OLDFPSCR, FPSCR
281	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
282	fmxr FPSCR, lr
283	mov lr, #1
284	mov OUT, ORIGOUT
285	ldr REVTAB, [CONTEXT, #2*4]
286	ldr TCOS, [CONTEXT, #6*4]
287	ldr TSIN, [CONTEXT, #7*4]
288	mov lr, lr, lsl ip
289
290	push {CONTEXT,OLDFPSCR}
291	add IN_HI, IN, lr, lsl #1
292	add REVTAB_HI, REVTAB, lr, lsr #1
293	add TCOS_HI, TCOS, lr
294	add TSIN_HI, TSIN, lr
295	0: prerotation_innerloop_rolled
296	teq IN, IN_HI
297	bne 0b
298	ldmia sp, {CONTEXT,OLDFPSCR}
299
300	mov ORIGOUT, OUT
301	fmxr FPSCR, OLDFPSCR
302	ldr ip, [CONTEXT, #9*4]
303	blx ip @ s->fft_calc(s, output)
304
305	pop {CONTEXT,OLDFPSCR}
306	ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
307	ldr ip, [CONTEXT, #5*4] @ mdct_bits
308	fmxr FPSCR, lr
309	mov lr, #1
310	mov lr, lr, lsl ip
311	sub TCOS, TCOS, lr, lsr #1
312	sub TSIN, TSIN, lr, lsr #1
313	add OUT_HI, OUT, lr, lsl #1
314	add TCOS_HI, TCOS, lr
315	add TSIN_HI, TSIN, lr
316	postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
317	b 1f
318	0: add OUT, OUT, #32
319	sub OUT_HI, OUT_HI, #32
320	postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
321	1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
322	teq TSIN, TSIN_HI
323	bne 0b
324	postrotation_innerloop_rolled tail,,,,,, s24,, 16
325
326	fmxr FPSCR, OLDFPSCR
327	vpop {s16-s27}
328	pop {v1-v6,sl,fp,pc}
329	endfunc
330
331	.unreq CONTEXT
332	.unreq ORIGOUT
333	.unreq IN
334	.unreq OUT
335	.unreq REVTAB
336	.unreq TCOS
337	.unreq TSIN
338	.unreq OLDFPSCR
339	.unreq J0
340	.unreq J1
341	.unreq J2
342	.unreq J3
343	.unreq REVTAB_HI
344	.unreq IN_HI
345	.unreq OUT_HI
346	.unreq TCOS_HI
347	.unreq TSIN_HI