[deb_ffmpeg.git] / ffmpeg / libavcodec / aarch64 / mpegaudiodsp_neon.S

/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Fixed-point formats referenced by the integer (fixed) code path.
#define FRAC_BITS   23   // fractional bits for sb_samples and dct
#define WFRAC_BITS  16   // fractional bits for window
// Right shift applied to each 64-bit accumulated sum of products before it is
// saturated to 16 bits. Assuming the two multiplicands use the formats named
// above, a product carries WFRAC_BITS + FRAC_BITS fractional bits and the
// shift leaves 15 of them. The low OUT_SHIFT bits that are shifted out are
// kept by the fixed path as the dither remainder.
#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)

// Index vector for the tbl instruction: picks source bytes 12-15, 8-11, 4-7
// and 0-3 in that order, so the four 32-bit words of a 128-bit register come
// out reversed while the bytes inside each word keep their order.
const   tbl_rev128.s, align=4
        .byte           12, 13, 14, 15,  8,  9, 10, 11
        .byte            4,  5,  6,  7,  0,  1,  2,  3
endconst

// apply_window type, st
//
// Emits the exported function ff_mpadsp_apply_window_<type>_neon.
//
//   type  fixed selects the integer path: 64-bit accumulation, a dither
//         remainder carried from one output to the next, and 16-bit saturated
//         output. Any other value selects the single-precision path.
//   st    element-size suffix used by the lane stores to the output
//         (h for the fixed instantiation, s for the float one).
//
// The helper macros MLA, MLA2, MLS, MLS2 and round_sample must be defined
// before this macro is invoked. They are purged at the end of the expansion so
// that the next instantiation can provide its own versions.
//
// Registers on entry, as used by the code below:
//   x0  buffer of 32-bit elements; its first 32 elements are copied to element
//       offset 512 of the same buffer before the loops read from it
//   x1  window coefficients (32-bit elements)
//   x2  pointer to the 32-bit dither state (read and written by the fixed path
//       only)
//   x3  output samples
//   w4  output stride in elements (sign-extended, then scaled to bytes)
//
// The body writes only x0-x15, v0-v7 and v16-v31 and contains no stack access.
.macro   apply_window   type, st
function ff_mpadsp_apply_window_\type\()_neon, export=1
        // Copy the first 32 words at x0 to word offset 512 of the same buffer.
        mov             x7,  x0
        sxtw            x4,  w4  // incr
        add             x8,  x0,  #512<<2
        ld1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x7],  #64
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x7],  #64
        st1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x8],  #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x8],  #64
        // v27 = tbl index vector that reverses the four words of a register.
        movrel          x15, tbl_rev128.s
        ld1             {v27.4s}, [x15]
        // Scale incr from output elements to bytes: 2 per element for the
        // 16-bit fixed output, 4 per element otherwise.
.ifc \type, fixed
        lsl             x4,  x4,  #1
.else
        lsl             x4,  x4,  #2
.endif
        // x10 = second read pointer into the buffer, starting at word 45 of
        // the incoming x0. x0 and x1 are then both advanced by 16 words.
        add             x10, x0,  #45<<2
        add             x0,  x0,  #16<<2
        add             x1,  x1,  #16<<2
        // x5 = x3 + 31 * incr (in bytes): output pointer that walks backwards.
        add             x5,  x3,  x4,  lsl #5
        sub             x5,  x5,  x4            // samples2
        neg             x13, x4                 // -incr
        // x9 = stride of 64 words, used by every register post-indexed load.
        mov             x9,  #64<<2
.ifc \type, fixed
        // Constants and initial accumulator for the dither chain:
        //   v16 = { dither_state sign-extended to 64 bits, 0 }
        //   v29 = 0
        //   v30 = { (1<<OUT_SHIFT)-1, 0 }   remainder mask, low 64-bit lane
        //   v31 = { 0, (1<<OUT_SHIFT)-1 }   remainder mask, high 64-bit lane
        ld1r            {v16.2s}, [x2]          // dither_state
        sxtl            v16.2d, v16.2s
        movi            v29.2d, #0
        movi            v30.2d, #(1<<OUT_SHIFT)-1
        trn1            v31.2d, v29.2d, v30.2d
        trn2            v30.2d, v30.2d, v29.2d
        trn1            v16.2d, v16.2d, v29.2d
.else
        // Float: the accumulator starts at zero. v28 stays zero for the whole
        // function and is what v16 is reset from after each outer pass.
        movi            v16.4s, #0
        movi            v28.4s, #0
.endif
        // Outer loop, x14 = 4, 3, 2, 1. Each pass writes four outputs forwards
        // through x3 and four backwards through x5, except the first pass,
        // which skips one backward store.
        mov             x14, #4
1:
        // Per-pass pointers (byte arithmetic):
        //   x8  = x0                   forward buffer pointer
        //   x6  = x1 - 16 * x14        window pointer w
        //   x7  = x1 - 12 + 16 * x14   window pointer w2
        //   x11 = w + 32 words, x12 = w2 + 32 words
        // x10 is not reloaded here; it is carried over between passes.
        mov             x8,  x0
        sub             x7,  x1,  #3<<2
        sub             x6,  x1,  x14, lsl #4
        add             x7,  x7,  x14, lsl #4
        add             x11, x6, #(32)<<2      // w  + 32
        add             x12, x7, #(32)<<2      // w2 + 32
        // x15 = inner trip count. v17-v19 are cleared; v16 keeps its incoming
        // value (dither state or carried remainder for fixed, zero for float).
        mov             x15, #8
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
2:
        // Inner loop: eight steps, every pointer advancing by x9 per step.
        // The flags set by subs are consumed by b.gt at the bottom; the loads
        // and vector operations in between leave NZCV untouched.
        subs            x15, x15, #1
        ld1             {v0.4s},  [x8],  x9
        ld1             {v1.4s},  [x10], x9
        ld1             {v2.4s},  [x6],  x9
        ld1             {v3.4s},  [x7],  x9
        // v6, v7 = v0, v1 with their four words in reverse order.
        tbl             v6.16b, {v0.16b}, v27.16b
        tbl             v7.16b, {v1.16b}, v27.16b
        ld1             {v4.4s},  [x11], x9
        ld1             {v5.4s},  [x12], x9
        // v16/v17 += v2 * v0      v18/v19 -= v3 * v6
        // v16/v17 -= v4 * v7      v18/v19 -= v5 * v1
        // In the fixed instantiation the plain macros accumulate words 0-1 and
        // the "2" macros words 2-3 as 64-bit sums. In the float instantiation
        // the "2" macros expand to nothing and v16 / v18 hold all four lanes.
        MLA             v16, v2, v0
        MLA2            v17, v2, v0
        MLS             v18, v3, v6
        MLS2            v19, v3, v6
        MLS             v16, v4, v7
        MLS2            v17, v4, v7
        MLS             v18, v5, v1
        MLS2            v19, v5, v1
        b.gt            2b

        // Z is set only on the first outer pass (x14 == 4). Both b.eq below
        // consume these flags; no instruction in between writes NZCV.
        cmp             x14, #4
        // Rewind x10 by the eight strides consumed in the inner loop.
        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)

.ifc \type, fixed
        // Start the remainder chain: v28 = low OUT_SHIFT bits of v16 lane 0,
        // moved into the high 64-bit lane.
        and             v28.16b, v16.16b, v30.16b
        ext             v28.16b, v29.16b, v28.16b, #8

        // The first pass skips v19 lane 1, matching the skipped store below.
        b.eq            4f
        round_sample    v19, 1, 1
4:
        // Each round_sample adds the remainder carried in v28 to one 64-bit
        // lane and leaves the new remainder of that lane in v28, placed in the
        // lane the following call reads. Lanes are visited in the same order
        // as the stores below. shrn/shrn2 then drop the low OUT_SHIFT bits and
        // sqxtn saturates the 32-bit results to 16 bits:
        //   v16.4h = forward outputs, v18.4h = backward outputs.
        round_sample    v16, 1, 0
        shrn            v16.2s, v16.2d,  #OUT_SHIFT
        round_sample    v19, 0, 0
        shrn            v19.2s, v19.2d,  #OUT_SHIFT
        round_sample    v17, 0, 1
        round_sample    v18, 1, 1
        round_sample    v17, 1, 0
        shrn2           v16.4s, v17.2d,  #OUT_SHIFT
        round_sample    v18, 0, 0
        shrn2           v19.4s, v18.2d,  #OUT_SHIFT
        sqxtn           v16.4h, v16.4s
        sqxtn           v18.4h, v19.4s
.else
        // Swap the two 64-bit halves of v18 so that the lane indices used by
        // the stores below (1, 0, 3, 2) pick accumulator lanes 3, 2, 1, 0.
        ext             v18.16b, v18.16b, v18.16b, #8
.endif

        // Interleaved stores: x3 advances by incr, x5 steps back by incr.
        // The first backward store is skipped on the first outer pass.
        st1             {v16.\st\()}[0], [x3], x4
        b.eq            4f
        st1             {v18.\st\()}[1], [x5], x13
4:
        st1             {v16.\st\()}[1], [x3], x4
        st1             {v18.\st\()}[0], [x5], x13
        st1             {v16.\st\()}[2], [x3], x4
        st1             {v18.\st\()}[3], [x5], x13
        st1             {v16.\st\()}[3], [x3], x4
        st1             {v18.\st\()}[2], [x5], x13

        // The next accumulation starts from v28: the carried remainder in
        // lane 0 for fixed, zero for float.
        mov             v16.16b, v28.16b

        // Next pass: x0 moves forward by four words and x10 back by four.
        // b.gt uses the flags from subs (add and sub do not write NZCV).
        subs            x14, x14, #1
        add             x0,  x0,  #4<<2
        sub             x10, x10, #4<<2
        b.gt            1b

// computing samples[16]
        // Eight pairs of two-word loads, from x1 + 32 words and from x0, both
        // stepping by x9; each pair is multiplied and subtracted from v16.
        // Only lane 0 of the result is stored.
        add             x6,  x1,  #32<<2
        ld1             {v0.2s},  [x6],  x9
        ld1             {v1.2s},  [x0],  x9
.rept   3
        ld1             {v2.2s},  [x6],  x9
        ld1             {v3.2s},  [x0],  x9
        MLS             v16, v0,  v1
        ld1             {v0.2s},  [x6],  x9
        ld1             {v1.2s},  [x0],  x9
        MLS             v16, v2,  v3
.endr
        ld1             {v2.2s},  [x6],  x9
        ld1             {v3.2s},  [x0],  x9
        MLS             v16, v0,  v1
        MLS             v16, v2,  v3

.ifc \type, fixed
        // The low OUT_SHIFT bits of lane 0 become the new dither state; the
        // shifted and saturated value of lane 0 is the last output sample.
        and             v28.16b, v16.16b, v30.16b
        shrn            v20.2s,  v16.2d,  #OUT_SHIFT
        xtn             v28.2s,  v28.2d
        sqxtn           v20.4h,  v20.4s
        st1             {v28.s}[0], [x2]        // save dither_state
        st1             {v20.h}[0], [x3]
.else
        st1             {v16.s}[0], [x3]
.endif

        ret
endfunc
// Drop the per-type helper macros so the next instantiation can redefine them.
.purgem round_sample
.purgem MLA
.purgem MLA2
.purgem MLS
.purgem MLS2
.endm


// round_sample r, idx, next   (fixed-point variant)
//
// One step of the dither remainder chain used by apply_window.
//   r     accumulator register holding two 64-bit sums
//   idx   lane of r being rounded: 0 = low 64-bit lane, non-zero = high lane
//   next  lane that the following round_sample call will process
//
// Expects, as set up by apply_window: v28 = remainder carried in lane idx
// (other lane zero), v29 = 0, v30 / v31 = remainder masks for the low / high
// lane. On exit r has the carried remainder added and v28 holds the new
// remainder of lane idx, already moved into lane next.
.macro  round_sample    r, idx, next
        add             \r\().2d, \r\().2d, v28.2d
.if \idx
        and             v28.16b,  \r\().16b,  v31.16b
.else
        and             v28.16b,  \r\().16b,  v30.16b
.endif
.if \next != \idx
  .if \next
        ext             v28.16b, v29.16b, v28.16b, #8
  .else
        ext             v28.16b, v28.16b, v29.16b, #8
  .endif
.endif
.endm
// Fixed-point multiply-accumulate helpers used by apply_window.
// Each one widens 32-bit signed products into the 64-bit lanes of d.

// Low half: words 0-1 of s1 and s2.
.macro  MLA             d, s1, s2
        smlal           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLS             d, s1, s2
        smlsl           \d\().2d, \s1\().2s, \s2\().2s
.endm

// High half: words 2-3 of s1 and s2.
.macro  MLA2            d, s1, s2
        smlal2          \d\().2d, \s1\().4s, \s2\().4s
.endm
.macro  MLS2            d, s1, s2
        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
.endm

// Instantiate the fixed-point function; output lanes are stored as halfwords.
apply_window fixed, h


// Single-precision helpers used by apply_window.
// One fmla / fmls covers all four lanes of d.
.macro  MLA             d, s1, s2
        fmla            \d\().4s, \s1\().4s, \s2\().4s
.endm
.macro  MLS             d, s1, s2
        fmls            \d\().4s, \s1\().4s, \s2\().4s
.endm

// The high-half and rounding helpers have no work to do for float and expand
// to nothing.
.macro  MLA2            d, s1, s2
.endm
.macro  MLS2            d, s1, s2
.endm
.macro  round_sample    r, idx, next
.endm

// Instantiate the float function; output lanes are stored as 32-bit words.
apply_window float, s
Commit	Line	Data
2ba45a60 DM	1	/*
	2	* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
	3	*
	4	* This file is part of FFmpeg.
	5	*
	6	* FFmpeg is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2.1 of the License, or (at your option) any later version.
	10	*
	11	* FFmpeg is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with FFmpeg; if not, write to the Free Software
	18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	19	*/
	20
	21	#include "libavutil/aarch64/asm.S"
	22
	23	#define FRAC_BITS 23 // fractional bits for sb_samples and dct
	24	#define WFRAC_BITS 16 // fractional bits for window
	25	#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
	26
	27	const tbl_rev128.s, align=4
	28	.byte 12, 13, 14, 15
	29	.byte 8, 9, 10, 11
	30	.byte 4, 5, 6, 7
	31	.byte 0, 1, 2, 3
	32	endconst
	33
	34	.macro apply_window type, st
	35	function ff_mpadsp_apply_window_\type\()_neon, export=1
	36	mov x7, x0
	37	sxtw x4, w4 // incr
	38	add x8, x0, #512<<2
	39	ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64
	40	ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64
	41	st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64
	42	st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64
	43	movrel x15, tbl_rev128.s
	44	ld1 {v27.4s}, [x15]
	45	.ifc \type, fixed
	46	lsl x4, x4, #1
	47	.else
	48	lsl x4, x4, #2
	49	.endif
	50	add x10, x0, #45<<2
	51	add x0, x0, #16<<2
	52	add x1, x1, #16<<2
	53	add x5, x3, x4, lsl #5
	54	sub x5, x5, x4 // samples2
	55	neg x13, x4 // -incr
	56	mov x9, #64<<2
	57	.ifc \type, fixed
	58	ld1r {v16.2s}, [x2] // dither_state
	59	sxtl v16.2d, v16.2s
	60	movi v29.2d, #0
	61	movi v30.2d, #(1<<OUT_SHIFT)-1
	62	trn1 v31.2d, v29.2d, v30.2d
	63	trn2 v30.2d, v30.2d, v29.2d
	64	trn1 v16.2d, v16.2d, v29.2d
65	.else
66	movi v16.4s, #0
67	movi v28.4s, #0
68	.endif
69	mov x14, #4
70	1:
71	mov x8, x0
72	sub x7, x1, #3<<2
73	sub x6, x1, x14, lsl #4
74	add x7, x7, x14, lsl #4
75	add x11, x6, #(32)<<2 // w + 32
76	add x12, x7, #(32)<<2 // w2 + 32
77	mov x15, #8
78	movi v17.2d, #0
79	movi v18.2d, #0
80	movi v19.2d, #0
81	2:
82	subs x15, x15, #1
83	ld1 {v0.4s}, [x8], x9
84	ld1 {v1.4s}, [x10], x9
85	ld1 {v2.4s}, [x6], x9
86	ld1 {v3.4s}, [x7], x9
87	tbl v6.16b, {v0.16b}, v27.16b
88	tbl v7.16b, {v1.16b}, v27.16b
89	ld1 {v4.4s}, [x11], x9
90	ld1 {v5.4s}, [x12], x9
91	MLA v16, v2, v0
92	MLA2 v17, v2, v0
93	MLS v18, v3, v6
94	MLS2 v19, v3, v6
95	MLS v16, v4, v7
96	MLS2 v17, v4, v7
97	MLS v18, v5, v1
98	MLS2 v19, v5, v1
99	b.gt 2b
100
101	cmp x14, #4
102	sub x10, x10, #64<<5 // 64 * 8 * sizeof(int32_t)
103
104	.ifc \type, fixed
105	and v28.16b, v16.16b, v30.16b
106	ext v28.16b, v29.16b, v28.16b, #8
107
108	b.eq 4f
109	round_sample v19, 1, 1
110	4:
111	round_sample v16, 1, 0
112	shrn v16.2s, v16.2d, #OUT_SHIFT
113	round_sample v19, 0, 0
114	shrn v19.2s, v19.2d, #OUT_SHIFT
115	round_sample v17, 0, 1
116	round_sample v18, 1, 1
117	round_sample v17, 1, 0
118	shrn2 v16.4s, v17.2d, #OUT_SHIFT
119	round_sample v18, 0, 0
120	shrn2 v19.4s, v18.2d, #OUT_SHIFT
121	sqxtn v16.4h, v16.4s
122	sqxtn v18.4h, v19.4s
123	.else
124	ext v18.16b, v18.16b, v18.16b, #8
125	.endif
126
127	st1 {v16.\st\()}[0], [x3], x4
128	b.eq 4f
129	st1 {v18.\st\()}[1], [x5], x13
130	4:
131	st1 {v16.\st\()}[1], [x3], x4
132	st1 {v18.\st\()}[0], [x5], x13
133	st1 {v16.\st\()}[2], [x3], x4
134	st1 {v18.\st\()}[3], [x5], x13
135	st1 {v16.\st\()}[3], [x3], x4
136	st1 {v18.\st\()}[2], [x5], x13
137
138	mov v16.16b, v28.16b
139
140	subs x14, x14, #1
141	add x0, x0, #4<<2
142	sub x10, x10, #4<<2
143	b.gt 1b
144
145	// comuting samples[16]
146	add x6, x1, #32<<2
147	ld1 {v0.2s}, [x6], x9
148	ld1 {v1.2s}, [x0], x9
149	.rept 3
150	ld1 {v2.2s}, [x6], x9
151	ld1 {v3.2s}, [x0], x9
152	MLS v16, v0, v1
153	ld1 {v0.2s}, [x6], x9
154	ld1 {v1.2s}, [x0], x9
155	MLS v16, v2, v3
156	.endr
157	ld1 {v2.2s}, [x6], x9
158	ld1 {v3.2s}, [x0], x9
159	MLS v16, v0, v1
160	MLS v16, v2, v3
161
162	.ifc \type, fixed
163	and v28.16b, v16.16b, v30.16b
164	shrn v20.2s, v16.2d, #OUT_SHIFT
165	xtn v28.2s, v28.2d
166	sqxtn v20.4h, v20.4s
167	st1 {v28.s}[0], [x2] // save dither_state
168	st1 {v20.h}[0], [x3]
169	.else
170	st1 {v16.s}[0], [x3]
171	.endif
172
173	ret
174	endfunc
175	.purgem round_sample
176	.purgem MLA
177	.purgem MLA2
178	.purgem MLS
179	.purgem MLS2
180	.endm
181
182
183	.macro round_sample r, idx, next
184	add \r\().2d, \r\().2d, v28.2d
185	.if \idx == 0
186	and v28.16b, \r\().16b, v30.16b
187	.else // \idx == 1
188	and v28.16b, \r\().16b, v31.16b
189	.endif
190	.if \idx != \next
191	.if \next == 0
192	ext v28.16b, v28.16b, v29.16b, #8
193	.else
194	ext v28.16b, v29.16b, v28.16b, #8
195	.endif
196	.endif
197	.endm
198	.macro MLA d, s1, s2
199	smlal \d\().2d, \s1\().2s, \s2\().2s
200	.endm
201	.macro MLA2 d, s1, s2
202	smlal2 \d\().2d, \s1\().4s, \s2\().4s
203	.endm
204	.macro MLS d, s1, s2
205	smlsl \d\().2d, \s1\().2s, \s2\().2s
206	.endm
207	.macro MLS2 d, s1, s2
208	smlsl2 \d\().2d, \s1\().4s, \s2\().4s
209	.endm
210	apply_window fixed, h
211
212
213	// nothing to do for round_sample and ML{A,S}2
214	.macro round_sample r, idx, next
215	.endm
216	.macro MLA2 d, s1, s2
217	.endm
218	.macro MLS2 d, s1, s2
219	.endm
220	.macro MLA d, s1, s2
221	fmla \d\().4s, \s1\().4s, \s2\().4s
222	.endm
223	.macro MLS d, s1, s2
224	fmls \d\().4s, \s1\().4s, \s2\().4s
225	.endm
226	apply_window float, s