Commit | Line | Data |
---|---|---|
f6fa7814 DM |
1 | ;****************************************************************************** |
2 | ;* SIMD-optimized MLP DSP functions | |
3 | ;* Copyright (c) 2014 James Almer <jamrial@gmail.com> | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION_TEXT | |
25 | ||
26 | %if ARCH_X86_64 | |
27 | ||
;------------------------------------------------------------------------------
; SHLX reg, cnt
;   %1 = register to shift in place, written WITH its size suffix (e.g. noiseq)
;   %2 = name of the count register WITHOUT a size suffix; the macro appends
;        "b" or "q" itself
;
; Left-shifts %1 by the count held in %2.
;   * non-BMI2 build: plain "shl" with the byte alias of %2 as count. A variable
;     shl count must live in cl, so the caller has to alias %2 to rcx.
;   * BMI2 build: three-operand "shlx", which takes the count from any GPR.
;------------------------------------------------------------------------------
%macro SHLX 2
%if notcpuflag(bmi2)
    shl          %1, %2b
%else
    shlx         %1, %1, %2q
%endif
%endmacro
35 | ||
;------------------------------------------------------------------------------
; REMATRIX
;   In:   samplesq -> 8 int32 samples, coeffsq -> 8 int32 coefficients
;         (both accessed with movdqa, so they must be aligned to the vector
;         size in use: 16 bytes for xmm, 32 bytes for ymm)
;   Out:  low 128 bits of m0 = two signed 64-bit partial sums of
;         samples[i] * coeffs[i] over i = 0..7; the caller still has to add
;         the two qwords together
;   Clobbers: m1-m3, plus m4 in the non-AVX2 build
;
; pmuldq only multiplies the even dword lanes (signed 32x32 -> 64), so the odd
; lanes are first moved into even position with pshufd q2301 (swap of adjacent
; dwords) and multiplied separately.
;------------------------------------------------------------------------------
%macro REMATRIX 0
    movdqa        m1, [coeffsq ]
    movdqa        m0, [samplesq]
    pshufd        m3, m1, q2301         ; odd coefficients -> even lanes
    pshufd        m2, m0, q2301         ; odd samples      -> even lanes
    pmuldq        m0, m1                ; even-lane products
    pmuldq        m3, m2                ; odd-lane products
    paddq         m0, m3
%if cpuflag(avx2)
    ; One ymm load already covered all 8 channels: fold the upper 128 bits
    ; onto the lower 128 bits.
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%else
    ; 128-bit registers: repeat the same scheme for channels 4..7 and
    ; accumulate into m0.
    movdqa        m2, [coeffsq  + 16]
    movdqa        m1, [samplesq + 16]
    pshufd        m4, m2, q2301
    pshufd        m3, m1, q2301
    pmuldq        m1, m2
    pmuldq        m4, m3
    paddq         m0, m1
    paddq         m0, m4
%endif
%endmacro
58 | ||
;------------------------------------------------------------------------------
; LOOP_END  (tail of one iteration when matrix_noise_shift == 0)
;   In:   xm0        = two 64-bit partial sums
;         blsbs_ptrq = pointer to the current bypassed_lsbs byte
;         samplesq   = pointer to the current row of samples
;         dest_chq   = byte offset of the destination channel inside a row
;         maskd, cntq (end value for blsbs_ptrq)
;   Does: samples[dest_ch] = ((sum >> 14) & mask) + *bypassed_lsbs
;         then advances both pointers by one row
;   Out:  flags of "cmp blsbs_ptrq, cntq" - the caller branches on them, so the
;         cmp has to remain the last flag-writing instruction
;   Clobbers: xm1, accum, blsbs
;------------------------------------------------------------------------------
%macro LOOP_END 0
    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    pshufd       xm1, xm0, q0032                    ; high qword -> low qword
    paddq        xm0, xm1                           ; low qword = full 64-bit sum
    movq      accumq, xm0
    sar       accumq, 14                            ; accum >>= 14 (arithmetic)
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq                          ; reached the end of the block?
%endmacro
72 | ||
;------------------------------------------------------------------------------
; LOOP_SHIFT_END  (tail of one iteration when matrix_noise_shift != 0)
;   In:   xm0 = two 64-bit partial sums
;         indexd, index2d, auspd, noise_bufferq, blsbs_ptrq, samplesq,
;         dest_chq, maskd, cntq, and the shift count register "mns"
;   Note: MLP_REMATRIX_CHANNEL sets this up so that, on entry,
;           auspd = access_unit_size_pow2 - 1   (used as an index mask)
;           mns   = matrix_noise_shift + 7
;   Does: index &= auspd
;         accum  = sum + ((int64_t)noise_buffer[index] << mns)
;         index += index2
;         samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs
;         then advances both pointers by one row
;   Out:  flags of "cmp blsbs_ptrq, cntq" - the caller branches on them, so the
;         cmp has to remain the last flag-writing instruction
;   Clobbers: xm1, accum, noise
;------------------------------------------------------------------------------
%macro LOOP_SHIFT_END 0
    and       indexd, auspd                         ; index &= access_unit_size_pow2 - 1
    movsx     noiseq, byte [noise_bufferq + indexq] ; sign-extended noise_buffer[index]
    add       indexd, index2d                       ; index += index2
    SHLX      noiseq, mns                           ; noise <<= matrix_noise_shift + 7
    pshufd       xm1, xm0, q0032                    ; high qword -> low qword
    paddq        xm0, xm1                           ; low qword = full 64-bit sum
    movq      accumq, xm0
    add       accumq, noiseq                        ; accum += shifted noise
    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (noise reg is free again)
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    sar       accumq, 14                            ; accum >>= 14 (arithmetic)
    and       accumd, maskd                         ; accum &= mask
    add       accumd, noised                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq                          ; reached the end of the block?
%endmacro
91 | ||
;------------------------------------------------------------------------------
; REMATRIX4 (private helper of MLP_REMATRIX_CHANNEL)
;   xm0 = two signed 64-bit partial sums of samples[i] * coeffs[i], i = 0..3.
;   Always works on 128-bit registers, also in the ymm build.
;   Clobbers xm1-xm3.
;------------------------------------------------------------------------------
%macro REMATRIX4 0
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301        ; odd samples      -> even lanes
    pshufd       xm3, xm1, q2301        ; odd coefficients -> even lanes
    pmuldq       xm0, xm1               ; even-lane products
    pmuldq       xm3, xm2               ; odd-lane products
    paddq        xm0, xm3
%endmacro

;------------------------------------------------------------------------------
;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                             int index, unsigned int dest_ch, uint16_t blockpos,
;                             unsigned int maxchan, int matrix_noise_shift,
;                             int access_unit_size_pow2, int32_t mask)
;
; For each of the blockpos rows (a row is 32 bytes of samples and 8 bytes of
; bypassed_lsbs) the dot product of the row with coeffs is computed, optionally
; combined with shifted noise, shifted right by 14, masked, offset by the row's
; bypassed_lsbs byte and stored at samples[dest_ch] of that row.
;
; Four loops exist: {no noise shift, noise shift} x {4 lanes, 8 lanes}.
; The 4-lane loops are taken when maxchan < 4.
;
; NOTE(review): the loops are do-while shaped, so blockpos == 0 is not handled
; here - presumably the caller guarantees blockpos > 0; confirm.
;
; Register names (x86inc argument order, r0..r11 on entry):
;   samples, coeffs, blsbs_ptr, blsbs, index, dest_ch, blockpos, maxchan, mns,
;   accum, mask, cnt
; "blsbs" occupies the slot of the noise_buffer argument; the no-shift loops
; never read the noise buffer and use that register as scratch.
; The .shift path renames the registers with DEFINE_ARGS (see below).
;------------------------------------------------------------------------------
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    ; cglobal is told to auto-load no arguments, so the ones that are needed
    ; are fetched by hand. These loads are independent of each other.
    movzx  blockposq, word blockposm                ; blockpos is 16 bit: zero extend
    mov        maskd, maskm                         ; load mask
    mov     maxchand, maxchanm                      ; load maxchan
    mov         mnsd, mnsm                          ; load matrix_noise_shift
%if WIN64
    mov     dest_chd, dest_chm                      ; dest_ch is not in a register on WIN64
%endif
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = end of bypassed_lsbs (8 bytes per row)
    shl     dest_chd, 2                             ; channel index -> byte offset (int32)
    test        mnsd, mnsd                          ; matrix_noise_shift != 0?
    jnz .shift                                      ;   yes: noise has to be mixed in
    cmp     maxchand, 4                             ; maxchan < 4?
    ; NOTE(review): signed compare although the prototype declares maxchan as
    ; unsigned; identical as long as maxchan < 2^31 - confirm against caller.
    jl .loop4                                       ;   yes: 4 lanes are enough

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8
    RET

align 16
.loop4:
    ; Process up to 4 channels
    REMATRIX4
    LOOP_END
    jne .loop4
    RET

.shift:
    mov          r9d, r9m                           ; load access_unit_size_pow2
%if WIN64
    mov       indexd, indexm                        ; index is not in a register on WIN64
%endif
%if notcpuflag(bmi2) ; sse4
    ; shl needs its variable count in cl: park whatever the ABI put into rcx
    ; in r6 (blockpos, no longer needed) and hand rcx over to "mns".
    mov           r6, rcx
%if WIN64
    ; r0 = rcx, i.e. samples moves to r6
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx, i.e. noise_buffer moves to r6
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]                      ; rcx = matrix_noise_shift + 7
%else ; bmi2
    ; shlx accepts any gpr as count, not just cl, so everything stays in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7                             ; matrix_noise_shift += 7
%endif ; cpuflag
    ; Keep this order: sub and cmp write the flags, lea does not, and in every
    ; register layout index2 lives in r7, so r7 (maxchan) must be compared
    ; before the lea overwrites it.
    sub        auspd, 1                             ; access_unit_size_pow2 -= 1 (index mask)
    cmp          r7d, 4                             ; maxchan < 4?
    lea      index2q, [indexq*2 + 1]                ; index2 = 2 * index + 1;
    jl .loop4_shift                                 ;   yes: 4 lanes are enough

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels
    REMATRIX4
    LOOP_SHIFT_END
    jne .loop4_shift
    RET
%endmacro
188 | ||
; Emit one copy of the function per instruction-set variant.

; 128-bit vectors, SSE4. No BMI2 here, so SHLX expands to "shl reg, cl".
INIT_XMM sse4
MLP_REMATRIX_CHANNEL

; 256-bit vectors, AVX2 with BMI2 (SHLX expands to "shlx").
; Only assembled when HAVE_AVX2_EXTERNAL is set.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif
195 | ||
196 | %endif ; ARCH_X86_64 |