@ ffmpeg/libavcodec/arm/synth_filter_vfp.S
/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

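@ Register aliases. r0-r2 carry the incoming function arguments; once those
@ have been consumed, the same physical registers are re-aliased (as I,
@ P_SB2_UP and OLDFPSCR) for use by the main loops.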
IMDCT           .req    r0
ORIG_P_SB       .req    r1
P_SB_OFF        .req    r2
I               .req    r0
P_SB2_UP        .req    r1
OLDFPSCR        .req    r2
P_SB2_DN        .req    r3
P_WIN_DN        .req    r4
P_OUT_DN        .req    r5
P_SB            .req    r6
J_WRAP          .req    r7
P_WIN_UP        .req    r12
P_OUT_UP        .req    r14

SCALE           .req    s0
SBUF_DAT_REV0   .req    s4
SBUF_DAT_REV1   .req    s5
SBUF_DAT_REV2   .req    s6
SBUF_DAT_REV3   .req    s7
VA0             .req    s8
VA3             .req    s11
VB0             .req    s12
VB3             .req    s15
VC0             .req    s8
VC3             .req    s11
VD0             .req    s12
VD3             .req    s15
SBUF_DAT0       .req    s16
SBUF_DAT1       .req    s17
SBUF_DAT2       .req    s18
SBUF_DAT3       .req    s19
SBUF_DAT_ALT0   .req    s20
SBUF_DAT_ALT1   .req    s21
SBUF_DAT_ALT2   .req    s22
SBUF_DAT_ALT3   .req    s23
WIN_DN_DAT0     .req    s24
WIN_UP_DAT0     .req    s28

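@ Note that VA/VC and VB/VD deliberately alias the same physical registers
@ (s8-s11 and s12-s15): the first pass ("ab" halves) and the second pass
@ ("cd" halves) never run concurrently. The 0/3 suffixes mark the first
@ and last elements of length-4 VFP short vectors.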

.macro inner_loop half, tail, head
 .if (OFFSET & (64*4)) == 0             @ even numbered call
        SBUF_DAT_THIS0 .req SBUF_DAT0
        SBUF_DAT_THIS1 .req SBUF_DAT1
        SBUF_DAT_THIS2 .req SBUF_DAT2
        SBUF_DAT_THIS3 .req SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]     @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]    @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]        @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]        @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f              @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4
2:
  .set  J, J - 64
  .set  OFFSET, OFFSET + 64*4
 .endif
 .unreq SBUF_DAT_THIS0
 .unreq SBUF_DAT_THIS1
 .unreq SBUF_DAT_THIS2
 .unreq SBUF_DAT_THIS3
.endm
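
/* One inner_loop call advances the windowed multiply-accumulate by one
 * 64-float stride of the 512-float ring buffer, four lanes at a time via
 * VFP short vectors. The head part issues this stride's loads (plus the
 * lane-reversing vmov shuffle); the tail part consumes the reversed data
 * loaded by the previous call, so loads and arithmetic overlap (software
 * pipelining). Alternate calls use the SBUF_DAT and SBUF_DAT_ALT banks so
 * a head never overwrites data that its predecessor's tail still needs,
 * and the teq/bne pair rewinds P_SB when the ring buffer wraps. In rough
 * C terms one head+tail pair performs (names illustrative, not taken from
 * the C sources):
 *
 *     for (k = 0; k < 4; k++) {
 *         up[k]   += sbuf[offset + k] * win_up[offset + k];
 *         down[k] += sign * sbuf_rev[k] * win_dn[offset + k];
 *         // sign is -1 in the "ab" pass (vmls) and +1 in "cd" (vmla)
 *     }
 *     // sbuf_rev[] then becomes this stride's loads in reversed order
 */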


/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
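/* The routine follows the same outline as FFmpeg's generic C synth filter
 * (libavcodec/synth_filter.c): run the half-size IMDCT into the ring
 * buffer at the current offset, window 512 buffered floats down to 32
 * output samples, and refresh synth_buf2 for the next call. A condensed,
 * non-authoritative C sketch -- the exact window tap indices and the
 * ring-buffer wraparound are elided; consult the C source for those:
 *
 *     float *synth_buf = synth_buf_ptr + *synth_buf_offset;
 *     imdct_half(imdct, synth_buf, in);
 *     for (i = 0; i < 16; i++) {
 *         float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
 *         for (j = 0; j < 512; j += 64) {
 *             a -= window[...] * synth_buf[...];  // "down" taps, reversed lanes
 *             b += window[...] * synth_buf[...];  // "up" taps
 *             c += window[...] * synth_buf[...];
 *             d += window[...] * synth_buf[...];  // reversed lanes
 *         }
 *         out[i] = a * scale;   out[i + 16] = b * scale;
 *         synth_buf2[i] = c;    synth_buf2[i + 16] = d;
 *     }
 *     *synth_buf_offset = (*synth_buf_offset - 32) & 511;
 */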
function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63         @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]          @ rotate offset, modulo buffer size, ready for next call
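        @ The three instructions above are the C update
        @ *synth_buf_offset = (*synth_buf_offset - 32) & (512 - 32);
        @ the offset steps down by 32 floats per call, modulo the 512-float
        @ ring buffer (it is always a multiple of 32, so masking with 480
        @ is equivalent to masking with 511).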
        ldr     a3, [sp, #(16+6+2)*4]   @ fetch in from stack, to pass to imdct_half
VFP     vmov    s16, SCALE              @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
        bl      X(ff_imdct_half_vfp)
VFP     vmov    SCALE, s16

        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
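        @ 0x03030000 decodes as: FPSCR bits 16-17 (LEN field) = 3, giving
        @ short vectors of length 4 with stride 1 (STRIDE bits 20-21 = 0);
        @ bits 24-25 set flush-to-zero and default-NaN, i.e. RunFast mode.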
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

#define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */
        add     P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range
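        @ vldr encodes immediate offsets of +/-1020 in multiples of 4, so
        @ biasing the base pointers by 956 keeps every OFFSET inner_loop
        @ generates (-956 up through +836, plus the +8 second word) in
        @ range. 956 (0xEF << 2) and 956 + 16*4 = 1020 (0xFF << 2) are
        @ both encodable as ARM add immediates, which is what the comment
        @ on the #define refers to.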
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4
1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set   J, 512 - 64
 .set   OFFSET, -IMM_OFF_SKEW
        inner_loop ab,, head
 .rept  7
        inner_loop ab, tail, head
 .endr
        inner_loop ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE         @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

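        @ Second pass: the same pipelined inner loop, but the VC/VD
        @ accumulators are cleared rather than preloaded, the results are
        @ not scaled, and they are stored back to synth_buf2 (for the next
        @ call) instead of to out[].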
        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
1:
        vldr.d  d4, zero                @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero                @ d6 = VD0
        vldr.d  d7, zero
 .set   J, 512 - 64
 .set   OFFSET, -IMM_OFF_SKEW
        inner_loop cd,, head
 .rept  7
        inner_loop cd, tail, head
 .endr
        inner_loop cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
endfunc

 .unreq IMDCT
 .unreq ORIG_P_SB
 .unreq P_SB_OFF
 .unreq I
 .unreq P_SB2_UP
 .unreq OLDFPSCR
 .unreq P_SB2_DN
 .unreq P_WIN_DN
 .unreq P_OUT_DN
 .unreq P_SB
 .unreq J_WRAP
 .unreq P_WIN_UP
 .unreq P_OUT_UP

 .unreq SCALE
 .unreq SBUF_DAT_REV0
 .unreq SBUF_DAT_REV1
 .unreq SBUF_DAT_REV2
 .unreq SBUF_DAT_REV3
 .unreq VA0
 .unreq VA3
 .unreq VB0
 .unreq VB3
 .unreq VC0
 .unreq VC3
 .unreq VD0
 .unreq VD3
 .unreq SBUF_DAT0
 .unreq SBUF_DAT1
 .unreq SBUF_DAT2
 .unreq SBUF_DAT3
 .unreq SBUF_DAT_ALT0
 .unreq SBUF_DAT_ALT1
 .unreq SBUF_DAT_ALT2
 .unreq SBUF_DAT_ALT3
 .unreq WIN_DN_DAT0
 .unreq WIN_UP_DAT0

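@ vldr has no immediate form, so the VC/VD accumulators are cleared by
@ loading from this 8-byte literal of zeros; .align 3 keeps the literal
@ doubleword-aligned for the vldr.d loads.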
 .align 3
zero:   .word   0, 0