Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* FLAC DSP SIMD optimizations | |
3 | ;* | |
4 | ;* Copyright (C) 2014 Loren Merritt | |
f6fa7814 | 5 | ;* Copyright (C) 2014 James Almer |
2ba45a60 DM |
6 | ;* |
7 | ;* This file is part of FFmpeg. | |
8 | ;* | |
9 | ;* FFmpeg is free software; you can redistribute it and/or | |
10 | ;* modify it under the terms of the GNU Lesser General Public | |
11 | ;* License as published by the Free Software Foundation; either | |
12 | ;* version 2.1 of the License, or (at your option) any later version. | |
13 | ;* | |
14 | ;* FFmpeg is distributed in the hope that it will be useful, | |
15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;* Lesser General Public License for more details. | |
18 | ;* | |
19 | ;* You should have received a copy of the GNU Lesser General Public | |
20 | ;* License along with FFmpeg; if not, write to the Free Software | |
21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | ;****************************************************************************** | |
23 | ||
24 | %include "libavutil/x86/x86util.asm" | |
25 | ||
26 | SECTION .text | |
27 | ||
;------------------------------------------------------------------------------
; LPC_32 <cpu>
;
; Emits flac_lpc_32 for the given instruction set.
; Register arguments (names as declared in cglobal):
;   decoded    - dword sample buffer; the first pred_order entries are read as
;                history, the following entries are updated in place
;   coeffs     - dword predictor coefficients
;   pred_order - number of coefficients
;   qlevel     - right-shift count applied to each 64-bit sum
;   len        - total number of entries in decoded
;
; Two output samples are produced per pass of .loop_sample, using m2 and m3
; as independent accumulators.
; NOTE(review): PMACSDQL comes from x86util.asm; it is assumed here to be a
; signed 32x32->64 multiply-accumulate (dst = src1*src2 + acc) - confirm.
;------------------------------------------------------------------------------
%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub         lend, pred_orderd               ; len = number of samples to predict
    jle .ret
%if ARCH_X86_64
    ; pred_order is a 32-bit int argument; widen it before it is used as a
    ; 64-bit index, since the upper half of the register is not guaranteed.
    ; movsxd leaves the flags untouched.
    movsxd      pred_orderq, pred_orderd
%endif
    lea         decodedq, [decodedq+pred_orderq*4-8]
    lea         coeffsq, [coeffsq+pred_orderq*4] ; coeffsq = one past the last coefficient
    neg         pred_orderq                     ; negative index, counts up towards zero
    movd        m4, qlevelm
ALIGN 16
.loop_sample:
    movd        m0, [decodedq+pred_orderq*4+8]  ; oldest history sample for this pass
    add         decodedq, 8                     ; decodedq = first of the two outputs
    movd        m1, [coeffsq+pred_orderq*4]     ; first coefficient
    pxor        m2, m2                          ; accumulator for output 0
    pxor        m3, m3                          ; accumulator for output 1
    lea         jq, [pred_orderq+1]
    test        jq, jq
    jz .end_order                               ; pred_order == 1: only the final tap remains
.loop_order:
    PMACSDQL    m2, m0, m1, m2, m0
    movd        m0, [decodedq+jq*4]             ; next history sample
    PMACSDQL    m3, m1, m0, m3, m1              ; same coefficient, one sample later
    movd        m1, [coeffsq+jq*4]              ; next coefficient
    inc         jq
    jl .loop_order
.end_order:
    PMACSDQL    m2, m0, m1, m2, m0              ; final tap of output 0
    psrlq       m2, m4
    movd        m0, [decodedq]
    paddd       m0, m2
    movd        [decodedq], m0                  ; m0 keeps the new sample for output 1
    sub         lend, 2
    jl .ret                                     ; only one sample was left
    PMACSDQL    m3, m1, m0, m3, m1              ; final tap of output 1 uses output 0
    psrlq       m3, m4
    movd        m1, [decodedq+4]
    paddd       m1, m3
    movd        [decodedq+4], m1
    jg .loop_sample                             ; flags still come from "sub lend, 2"
.ret:
    REP_RET
%endmacro
71 | ||
; Instantiate flac_lpc_32: the XOP build is emitted only when
; HAVE_XOP_EXTERNAL is non-zero, the SSE4 build unconditionally.
%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4
f6fa7814 DM |
76 | |
;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16 <name>, <regA>, <regB> [, <op>]
;   name - function name infix (ls, rs, ms or indep2)
;   regA - index of the xmm register stored as the first word of each pair
;   regB - index of the xmm register stored as the second word of each pair
;   op   - add or sub, used to form m2 from m0 and m1; not used for indep2
;
; The shift count (5th argument) is kept in m3. Both input pointers, the
; output pointer and len are turned into an end-pointer plus a negative byte
; offset, so a single register (lenq) drives the loop. Each pass consumes
; 16 bytes from each input and writes 16 bytes of interleaved words.
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32 || WIN64
    movd        m3, r4m
%if ARCH_X86_32
    mov         lend, lenm
%endif
%else ; UNIX64
    movd        m3, r4d
%endif
    shl         lend, 2                 ; samples -> bytes of dword input
    mov         in1q, [in0q + gprsize]  ; in0q still holds the pointer array here
    mov         in0q, [in0q]
    mov         outq, [outq]
    add         in1q, lenq
    add         in0q, lenq
    add         outq, lenq
    neg         lenq                    ; offset runs from -bytes up to 0

align 16
.loop:
    mova        m0, [in0q + lenq]
    mova        m1, [in1q + lenq]
%ifidn %1, ms
    psrad       m2, m1, 1
    psubd       m0, m2                  ; m0 = in0 - (in1 >> 1)
%endif
%ifnidn %1, indep2
    p%4d        m2, m0, m1              ; m2 = m0 (op) m1
%endif
    packssdw    m%2, m%2                ; dwords -> signed-saturated words
    packssdw    m%3, m%3
    punpcklwd   m%2, m%3                ; interleave the two channels
    psllw       m%2, m3
    mova        [outq + lenq], m%2
    add         lenq, 16
    jl .loop
    REP_RET
%endmacro
120 | ||
; SSE2 builds of the 16-bit stereo decorrelation functions.
; Arguments: name, first stored register, second stored register, combine op.
INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add
125 | ||
;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32 <name>, <regA>, <regB>, <tmp>, <op>
;   name - function name infix (ls, rs or ms)
;   regA - index of the xmm register holding the first dword of each pair
;   regB - index of the xmm register holding the second dword of each pair
;   tmp  - scratch xmm register index handed to SBUTTERFLY
;   op   - add or sub, used to form m2 from m0 and m1
;
; The shift count (5th argument) is kept in m3. in1q is converted to the
; distance between the two input buffers, so only in0q has to advance.
; Each pass handles mmsize/4 samples per channel and writes 2*mmsize bytes.
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32 || WIN64
    movd        m3, r4m
%if ARCH_X86_32
    mov         lend, lenm
%endif
%else ; UNIX64
    movd        m3, r4d
%endif
    mov         in1q, [in0q + gprsize]  ; in0q still holds the pointer array here
    mov         in0q, [in0q]
    mov         outq, [outq]
    sub         in1q, in0q              ; in1q = in1 - in0

align 16
.loop:
    mova        m0, [in0q]
    mova        m1, [in0q + in1q]
%ifidn %1, ms
    psrad       m2, m1, 1
    psubd       m0, m2                  ; m0 = in0 - (in1 >> 1)
%endif
    p%5d        m2, m0, m1              ; m2 = m0 (op) m1
    pslld       m%2, m3
    pslld       m%3, m3

    SBUTTERFLY  dq, %2, %3, %4          ; interleave the two channels

    mova        [outq         ], m%2
    mova        [outq + mmsize], m%3

    add         in0q, mmsize
    add         outq, mmsize*2
    sub         lend, mmsize/4
    jg .loop
    REP_RET
%endmacro
168 | ||
; SSE2 builds of the 32-bit stereo decorrelation functions.
; Arguments: name, first stored register, second stored register,
; SBUTTERFLY scratch register, combine op.
INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add
173 | ||
174 | ;----------------------------------------------------------------------------------------- | |
175 | ;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels, | |
176 | ; int len, int shift); | |
177 | ;----------------------------------------------------------------------------------------- | |
; TRANSPOSE8x4D r0, r1, r2, r3, r4, r5, r6, r7, tmp
; Rearranges eight xmm registers of dwords using one scratch register:
; a dword-interleave pass over the pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7),
; a qword-interleave pass over (r0,r2) (r1,r3) (r4,r6) (r5,r7), and finally
; two register renames. The statement order is significant because
; SBUTTERFLY and SWAP (defined outside this file) rename their operands.
%macro TRANSPOSE8x4D 9
    ; pass 1: dword interleave
    SBUTTERFLY  dq,  %1, %2, %9
    SBUTTERFLY  dq,  %3, %4, %9
    SBUTTERFLY  dq,  %5, %6, %9
    SBUTTERFLY  dq,  %7, %8, %9
    ; pass 2: qword interleave
    SBUTTERFLY  qdq, %1, %3, %9
    SBUTTERFLY  qdq, %2, %4, %9
    SBUTTERFLY  qdq, %5, %7, %9
    SBUTTERFLY  qdq, %6, %8, %9
    ; put the registers into their final order
    SWAP        %2, %5
    SWAP        %4, %7
%endmacro
190 | ||
; FLAC_DECORRELATE_INDEP <bps>, <channels>, <shiftreg>, <w|d>
;%1 = bps
;%2 = channels
;%3 = last xmm reg used
;%4 = word/dword (shift instruction)
;
; Emits flac_decorrelate_indep<channels>_<bps>. The function reads mmsize
; bytes from every channel buffer per pass, interleaves the channels (packing
; dwords to saturated words first when bps is 16), shifts the result left by
; the count held in the last xmm register and stores REPCOUNT registers to
; the output. Channel pointers 1..N-1 are kept as distances from channel 0,
; so only in0q advances.
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
    ; Fetch the shift count (5th argument) and, where needed, len.
%if ARCH_X86_32
    movd        m%3, r4m
%if %2 == 6
    ; The argument names are redefined without len, and len is then
    ; accessed directly as a dword in its argument slot (r3m).
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define lend dword r3m
%else
    mov         lend, lenm
%endif
%elif WIN64
    movd        m%3, r4m
%else ; UNIX64
    movd        m%3, r4d
%endif

    ; Load channel pointers 1..N-1 while in0q still points at the array.
%assign %%i 1
%rep %2-1
    mov         in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep

    mov         in0q, [in0q]
    mov         outq, [outq]

    ; Turn channel pointers 1..N-1 into distances from channel 0.
%assign %%i 1
%rep %2-1
    sub         in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep

align 16
.loop:
    mova        m0, [in0q]

    ; Load the first REPCOUNT channels into m0..m(REPCOUNT-1).
%assign %%i 1
%rep REPCOUNT-1
    mova        m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep

%if %1 == 32

    ; 32-bit samples: interleave the loaded registers as dwords.
%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY  dq, 0, 1, 6
    SBUTTERFLY  dq, 2, 3, 6
    SBUTTERFLY  dq, 4, 5, 6

    punpcklqdq  m6, m0, m2
    punpckhqdq  m2, m4
    shufps      m4, m0, 0xe4
    punpcklqdq  m0, m1, m3
    punpckhqdq  m3, m5
    shufps      m5, m1, 0xe4
    ; rename so that m0..m5 are in store order
    SWAP        0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY  dq, 0, 1, 2
%endif

%else ; %1 == 16

    ; 16-bit samples: pack each loaded channel together with a channel from
    ; the upper half into saturated words, then interleave.
%if %2 == 8
    packssdw    m0, [in0q + in4q]
    packssdw    m1, [in0q + in5q]
    packssdw    m2, [in0q + in6q]
    packssdw    m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw    m0, [in0q + in3q]
    packssdw    m1, [in0q + in4q]
    packssdw    m2, [in0q + in5q]
    pshufd      m3, m0, q1032
    punpcklwd   m0, m1
    punpckhwd   m1, m2
    punpcklwd   m2, m3

    shufps      m3, m0, m2, q2020
    shufps      m0, m1, q2031
    shufps      m2, m1, q3131
    shufps      m1, m2, m3, q3120
    shufps      m3, m0, q0220
    shufps      m0, m2, q3113
    ; rename so that m0..m2 are in store order
    SWAP        2, 0, 3
%else ; %2 == 4
    packssdw    m0, [in0q + in2q]
    packssdw    m1, [in0q + in3q]
    SBUTTERFLY  wd, 0, 1, 2
    SBUTTERFLY  dq, 0, 1, 2
%endif

%endif

    ; Apply the left shift to every output register.
%assign %%i 0
%rep REPCOUNT
    psll%4      m %+ %%i, m%3
%assign %%i %%i+1
%endrep

    ; Store the REPCOUNT output registers back to back.
%assign %%i 0
%rep REPCOUNT
    mova        [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep

    add         in0q, mmsize
    add         outq, mmsize*REPCOUNT
    sub         lend, mmsize/4
    jg .loop
    REP_RET
%endmacro
310 | ||
; Instantiations of the independent-channel functions.
; FLAC_DECORRELATE_INDEP arguments: bps, channels, last xmm register (holds
; the shift count), shift instruction suffix. The 8-channel variants are
; emitted only when ARCH_X86_64 is set.

; --- SSE2 ---
INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

; --- AVX ---
INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif