/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
| 23 | |
| 24 | IMDCT .req r0 |
| 25 | ORIG_P_SB .req r1 |
| 26 | P_SB_OFF .req r2 |
| 27 | I .req r0 |
| 28 | P_SB2_UP .req r1 |
| 29 | OLDFPSCR .req r2 |
| 30 | P_SB2_DN .req r3 |
| 31 | P_WIN_DN .req r4 |
| 32 | P_OUT_DN .req r5 |
| 33 | P_SB .req r6 |
| 34 | J_WRAP .req r7 |
| 35 | P_WIN_UP .req r12 |
| 36 | P_OUT_UP .req r14 |
| 37 | |
| 38 | SCALE .req s0 |
| 39 | SBUF_DAT_REV0 .req s4 |
| 40 | SBUF_DAT_REV1 .req s5 |
| 41 | SBUF_DAT_REV2 .req s6 |
| 42 | SBUF_DAT_REV3 .req s7 |
| 43 | VA0 .req s8 |
| 44 | VA3 .req s11 |
| 45 | VB0 .req s12 |
| 46 | VB3 .req s15 |
| 47 | VC0 .req s8 |
| 48 | VC3 .req s11 |
| 49 | VD0 .req s12 |
| 50 | VD3 .req s15 |
| 51 | SBUF_DAT0 .req s16 |
| 52 | SBUF_DAT1 .req s17 |
| 53 | SBUF_DAT2 .req s18 |
| 54 | SBUF_DAT3 .req s19 |
| 55 | SBUF_DAT_ALT0 .req s20 |
| 56 | SBUF_DAT_ALT1 .req s21 |
| 57 | SBUF_DAT_ALT2 .req s22 |
| 58 | SBUF_DAT_ALT3 .req s23 |
| 59 | WIN_DN_DAT0 .req s24 |
| 60 | WIN_UP_DAT0 .req s28 |
| 61 | |
| 62 | |
| 63 | .macro inner_loop half, tail, head |
| 64 | .if (OFFSET & (64*4)) == 0 @ even numbered call |
| 65 | SBUF_DAT_THIS0 .req SBUF_DAT0 |
| 66 | SBUF_DAT_THIS1 .req SBUF_DAT1 |
| 67 | SBUF_DAT_THIS2 .req SBUF_DAT2 |
| 68 | SBUF_DAT_THIS3 .req SBUF_DAT3 |
| 69 | .ifnc "\head","" |
| 70 | vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT |
| 71 | vldr d9, [P_SB, #OFFSET+8] |
| 72 | .endif |
| 73 | .else |
| 74 | SBUF_DAT_THIS0 .req SBUF_DAT_ALT0 |
| 75 | SBUF_DAT_THIS1 .req SBUF_DAT_ALT1 |
| 76 | SBUF_DAT_THIS2 .req SBUF_DAT_ALT2 |
| 77 | SBUF_DAT_THIS3 .req SBUF_DAT_ALT3 |
| 78 | .ifnc "\head","" |
| 79 | vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT |
| 80 | vldr d11, [P_SB, #OFFSET+8] |
| 81 | .endif |
| 82 | .endif |
| 83 | .ifnc "\tail","" |
| 84 | .ifc "\half","ab" |
| 85 | vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors |
| 86 | .else |
| 87 | vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors |
| 88 | .endif |
| 89 | .endif |
| 90 | .ifnc "\head","" |
| 91 | vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT |
| 92 | vldr d15, [P_WIN_UP, #OFFSET+8] |
| 93 | vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT |
| 94 | vldr d13, [P_WIN_DN, #OFFSET+8] |
| 95 | vmov SBUF_DAT_REV3, SBUF_DAT_THIS0 |
| 96 | vmov SBUF_DAT_REV2, SBUF_DAT_THIS1 |
| 97 | vmov SBUF_DAT_REV1, SBUF_DAT_THIS2 |
| 98 | vmov SBUF_DAT_REV0, SBUF_DAT_THIS3 |
| 99 | .ifc "\half","ab" |
| 100 | vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0 |
| 101 | .else |
| 102 | vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0 |
| 103 | .endif |
| 104 | teq J_WRAP, #J |
| 105 | bne 2f @ strongly predictable, so better than cond exec in this case |
| 106 | sub P_SB, P_SB, #512*4 |
| 107 | 2: |
| 108 | .set J, J - 64 |
| 109 | .set OFFSET, OFFSET + 64*4 |
| 110 | .endif |
| 111 | .unreq SBUF_DAT_THIS0 |
| 112 | .unreq SBUF_DAT_THIS1 |
| 113 | .unreq SBUF_DAT_THIS2 |
| 114 | .unreq SBUF_DAT_THIS3 |
| 115 | .endm |
| 116 | |
| 117 | |
| 118 | /* void ff_synth_filter_float_vfp(FFTContext *imdct, |
| 119 | * float *synth_buf_ptr, int *synth_buf_offset, |
| 120 | * float synth_buf2[32], const float window[512], |
| 121 | * float out[32], const float in[32], float scale) |
| 122 | */ |
| 123 | function ff_synth_filter_float_vfp, export=1 |
| 124 | push {r3-r7,lr} |
| 125 | vpush {s16-s31} |
| 126 | ldr lr, [P_SB_OFF] |
| 127 | add a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half |
| 128 | mov P_SB, a2 @ and keep a copy for ourselves |
| 129 | bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop |
| 130 | sub lr, lr, #32 |
| 131 | and lr, lr, #512-32 |
| 132 | str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call |
| 133 | ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half |
| 134 | VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case |
| 135 | bl X(ff_imdct_half_vfp) |
| 136 | VFP vmov SCALE, s16 |
| 137 | |
| 138 | fmrx OLDFPSCR, FPSCR |
| 139 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 |
| 140 | fmxr FPSCR, lr |
| 141 | ldr P_SB2_DN, [sp, #16*4] |
| 142 | ldr P_WIN_DN, [sp, #(16+6+0)*4] |
| 143 | ldr P_OUT_DN, [sp, #(16+6+1)*4] |
| 144 | NOVFP vldr SCALE, [sp, #(16+6+3)*4] |
| 145 | |
| 146 | #define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */ |
| 147 | add P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range |
| 148 | add P_SB2_UP, P_SB2_DN, #16*4 |
| 149 | add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW |
| 150 | add P_OUT_UP, P_OUT_DN, #16*4 |
| 151 | add P_SB2_DN, P_SB2_DN, #16*4 |
| 152 | add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW |
| 153 | add P_OUT_DN, P_OUT_DN, #16*4 |
| 154 | mov I, #4 |
| 155 | 1: |
| 156 | vldmia P_SB2_UP!, {VB0-VB3} |
| 157 | vldmdb P_SB2_DN!, {VA0-VA3} |
| 158 | .set J, 512 - 64 |
| 159 | .set OFFSET, -IMM_OFF_SKEW |
| 160 | inner_loop ab,, head |
| 161 | .rept 7 |
| 162 | inner_loop ab, tail, head |
| 163 | .endr |
| 164 | inner_loop ab, tail |
| 165 | add P_WIN_UP, P_WIN_UP, #4*4 |
| 166 | sub P_WIN_DN, P_WIN_DN, #4*4 |
| 167 | vmul.f VB0, VB0, SCALE @ SCALE treated as scalar |
| 168 | add P_SB, P_SB, #(512+4)*4 |
| 169 | subs I, I, #1 |
| 170 | vmul.f VA0, VA0, SCALE |
| 171 | vstmia P_OUT_UP!, {VB0-VB3} |
| 172 | vstmdb P_OUT_DN!, {VA0-VA3} |
| 173 | bne 1b |
| 174 | |
| 175 | add P_SB2_DN, P_SB2_DN, #(16+28-12)*4 |
| 176 | sub P_SB2_UP, P_SB2_UP, #(16+16)*4 |
| 177 | add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4 |
| 178 | mov I, #4 |
| 179 | 1: |
| 180 | vldr.d d4, zero @ d4 = VC0 |
| 181 | vldr.d d5, zero |
| 182 | vldr.d d6, zero @ d6 = VD0 |
| 183 | vldr.d d7, zero |
| 184 | .set J, 512 - 64 |
| 185 | .set OFFSET, -IMM_OFF_SKEW |
| 186 | inner_loop cd,, head |
| 187 | .rept 7 |
| 188 | inner_loop cd, tail, head |
| 189 | .endr |
| 190 | inner_loop cd, tail |
| 191 | add P_WIN_UP, P_WIN_UP, #4*4 |
| 192 | sub P_WIN_DN, P_WIN_DN, #4*4 |
| 193 | add P_SB, P_SB, #(512+4)*4 |
| 194 | subs I, I, #1 |
| 195 | vstmia P_SB2_UP!, {VC0-VC3} |
| 196 | vstmdb P_SB2_DN!, {VD0-VD3} |
| 197 | bne 1b |
| 198 | |
| 199 | fmxr FPSCR, OLDFPSCR |
| 200 | vpop {s16-s31} |
| 201 | pop {r3-r7,pc} |
| 202 | endfunc |
| 203 | |
| 204 | .unreq IMDCT |
| 205 | .unreq ORIG_P_SB |
| 206 | .unreq P_SB_OFF |
| 207 | .unreq I |
| 208 | .unreq P_SB2_UP |
| 209 | .unreq OLDFPSCR |
| 210 | .unreq P_SB2_DN |
| 211 | .unreq P_WIN_DN |
| 212 | .unreq P_OUT_DN |
| 213 | .unreq P_SB |
| 214 | .unreq J_WRAP |
| 215 | .unreq P_WIN_UP |
| 216 | .unreq P_OUT_UP |
| 217 | |
| 218 | .unreq SCALE |
| 219 | .unreq SBUF_DAT_REV0 |
| 220 | .unreq SBUF_DAT_REV1 |
| 221 | .unreq SBUF_DAT_REV2 |
| 222 | .unreq SBUF_DAT_REV3 |
| 223 | .unreq VA0 |
| 224 | .unreq VA3 |
| 225 | .unreq VB0 |
| 226 | .unreq VB3 |
| 227 | .unreq VC0 |
| 228 | .unreq VC3 |
| 229 | .unreq VD0 |
| 230 | .unreq VD3 |
| 231 | .unreq SBUF_DAT0 |
| 232 | .unreq SBUF_DAT1 |
| 233 | .unreq SBUF_DAT2 |
| 234 | .unreq SBUF_DAT3 |
| 235 | .unreq SBUF_DAT_ALT0 |
| 236 | .unreq SBUF_DAT_ALT1 |
| 237 | .unreq SBUF_DAT_ALT2 |
| 238 | .unreq SBUF_DAT_ALT3 |
| 239 | .unreq WIN_DN_DAT0 |
| 240 | .unreq WIN_UP_DAT0 |
| 241 | |
| 242 | .align 3 |
| 243 | zero: .word 0, 0 |