; Imported Debian version 2.4.3~trusty1
; [deb_ffmpeg.git] / ffmpeg / libswresample / x86 / rematrix.asm
; Commit 2ba45a60 (DM)
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"


; Read-only constants; the section is requested with 32-byte alignment so the
; tables can be used directly as aligned SIMD memory operands.
SECTION_RODATA 32
dw1: times 8  dd 1      ; dwords of 1: shifted to build the rounding term in MIX2_INT16
w1:  times 16 dw 1      ; words of 1: rounding term and interleave partner in MIX1_INT16

SECTION .text

;------------------------------------------------------------------------------
; MIX2_FLT a|u
;
; Emits mix_2_1_%1_float(out, in1, in2, coeffp, index1, index2, len):
;     out[i] = in1[i] * coeffp[index1] + in2[i] * coeffp[index2]
; using packed single-precision arithmetic (mulps/addps). len is scaled by 4,
; i.e. it counts 4-byte elements.
;
;   %1 == a : aligned memory accesses; branches to the u body when in1, in2
;             or out is not mmsize-aligned.
;   %1 == u : unaligned loads (movu) and stores.
;
; The loop is bottom-tested and consumes 2*mmsize bytes per pass, so it always
; runs at least once and advances in whole 2*mmsize-byte chunks.
; NOTE(review): callers presumably pass a non-zero len that is a multiple of
; that chunk size - confirm against the C caller.
;------------------------------------------------------------------------------
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    ; Any misaligned pointer -> continue in the unaligned variant. The target
    ; label sits right after that variant's cglobal line, which uses the same
    ; argument/register specification as this one.
    test    in1q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test    in2q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test    outq, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
%else
; Entry point for the aligned variant's fallback. The trailing colon makes
; this an explicit label definition (avoids NASM's orphan-label warning).
mix_2_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m4, [coeffpq + 4*index1q]      ; m4 = coeffp[index1] in every lane
    VBROADCASTSS m5, [coeffpq + 4*index2q]      ; m5 = coeffp[index2] in every lane
    ; NOTE(review): the 32-bit lend is shifted here while MIX1_FLT shifts lenq;
    ; which width is right depends on the C prototype - left unchanged.
    shl     lend, 2                             ; element count -> byte count
    ; Point all three buffers at their end and walk a negative offset up to 0.
    add     in1q, lenq
    add     in2q, lenq
    add     outq, lenq
    neg     lenq
.next:
%ifidn %1, a
    mulps   m0, m4, [in1q + lenq         ]
    mulps   m1, m5, [in2q + lenq         ]
    mulps   m2, m4, [in1q + lenq + mmsize]
    mulps   m3, m5, [in2q + lenq + mmsize]
%else
    movu    m0, [in1q + lenq         ]
    movu    m1, [in2q + lenq         ]
    movu    m2, [in1q + lenq + mmsize]
    movu    m3, [in2q + lenq + mmsize]
    mulps   m0, m0, m4
    mulps   m1, m1, m5
    mulps   m2, m2, m4
    mulps   m3, m3, m5
%endif
    addps   m0, m0, m1                          ; in1*c1 + in2*c2, first vector
    addps   m2, m2, m3                          ; same for the second vector
    mov%1   [outq + lenq         ], m0
    mov%1   [outq + lenq + mmsize], m2
    add     lenq, mmsize*2
        jl .next                                ; offset still negative -> more data
    REP_RET
%endmacro

;------------------------------------------------------------------------------
; MIX1_FLT a|u
;
; Emits mix_1_1_%1_float(out, in, coeffp, index, len):
;     out[i] = in[i] * coeffp[index]
; using packed single-precision multiplies (mulps). len is scaled by 4, i.e.
; it counts 4-byte elements.
;
;   %1 == a : aligned memory accesses; branches to the u body when in or out
;             is not mmsize-aligned.
;   %1 == u : unaligned loads (movu) and stores.
;
; The loop is bottom-tested and consumes 2*mmsize bytes per pass, so it always
; runs at least once and advances in whole 2*mmsize-byte chunks.
; NOTE(review): callers presumably pass a non-zero len that is a multiple of
; that chunk size - confirm against the C caller.
;------------------------------------------------------------------------------
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
    ; Any misaligned pointer -> continue in the unaligned variant, whose
    ; cglobal line uses the same argument/register specification.
    test    inq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
    test    outq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
%else
; Entry point for the aligned variant's fallback. The trailing colon makes
; this an explicit label definition (avoids NASM's orphan-label warning).
mix_1_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m2, [coeffpq + 4*indexq]       ; m2 = coeffp[index] in every lane
    ; NOTE(review): the full-width lenq is shifted here while MIX2_FLT shifts
    ; lend; which width is right depends on the C prototype - left unchanged.
    shl     lenq, 2                             ; element count -> byte count
    ; Point both buffers at their end and walk a negative offset up to 0.
    add     inq,  lenq
    add     outq, lenq
    neg     lenq
.next:
%ifidn %1, a
    mulps   m0, m2, [inq + lenq         ]
    mulps   m1, m2, [inq + lenq + mmsize]
%else
    movu    m0, [inq + lenq         ]
    movu    m1, [inq + lenq + mmsize]
    mulps   m0, m0, m2
    mulps   m1, m1, m2
%endif
    mov%1   [outq + lenq         ], m0
    mov%1   [outq + lenq + mmsize], m1
    add     lenq, mmsize*2
        jl .next                                ; offset still negative -> more data
    REP_RET
%endmacro

;------------------------------------------------------------------------------
; MIX1_INT16 a|u
;
; Emits mix_1_1_%1_int16(out, in, coeffp, index, len) for 16-bit samples.
; The 32-bit entry coeffp[index] is read as two 16-bit halves: the low half is
; the multiplier, the high half is the right-shift count. Per sample:
;     out[i] = saturate16((in[i] * coeff + ((1 << shift) >> 1)) >> shift)
; The rounding term is built with 16-bit lanes (psllw/psrlw on w1).
;
;   %1 == a : aligned loads/stores; branches to the u body when in or out is
;             not mmsize-aligned.
;   %1 == u : unaligned loads/stores.
;
; len is doubled to get a byte count (2 bytes per sample). The loop is
; bottom-tested and consumes 2*mmsize bytes per pass, so it always runs at
; least once.
; NOTE(review): callers presumably pass a non-zero len that is a multiple of
; the chunk size - confirm against the C caller.
;------------------------------------------------------------------------------
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
    ; Any misaligned pointer -> continue in the unaligned variant, whose
    ; cglobal line uses the same argument/register specification.
    test    inq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
    test    outq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
%else
; Entry point for the aligned variant's fallback. The trailing colon makes
; this an explicit label definition (avoids NASM's orphan-label warning).
mix_1_1_int16_u_int %+ SUFFIX:
%endif
    movd        m4, [coeffpq + 4*indexq]        ; low word = coeff, high word = shift
    SPLATW      m5, m4                          ; presumably replicates the low word (coeff); see x86util
    psllq       m4, 32                          ; drop everything above bit 31 ...
    psrlq       m4, 48                          ; ... and leave bits 16..31 (shift) as the count
    mova        m0, [w1]
    psllw       m0, m4                          ; 1 << shift in every word
    psrlw       m0, 1                           ; rounding term (1 << shift) >> 1
    punpcklwd   m5, m0                          ; m5 = {coeff, round} word pairs
    add         lenq, lenq                      ; sample count -> byte count
    ; Point both buffers at their end and walk a negative offset up to 0.
    add         inq,  lenq
    add         outq, lenq
    neg         lenq
.next:
    mov%1       m0, [inq + lenq         ]
    mov%1       m2, [inq + lenq + mmsize]
    mova        m1, m0
    mova        m3, m2
    ; Pair every sample with a constant 1 so that pmaddwd against
    ; {coeff, round} yields sample*coeff + round in a single instruction.
    punpcklwd   m0, [w1]
    punpckhwd   m1, [w1]
    punpcklwd   m2, [w1]
    punpckhwd   m3, [w1]
    pmaddwd     m0, m5
    pmaddwd     m1, m5
    pmaddwd     m2, m5
    pmaddwd     m3, m5
    psrad       m0, m4                          ; scale back by the shift count
    psrad       m1, m4
    psrad       m2, m4
    psrad       m3, m4
    packssdw    m0, m1                          ; saturate the 32-bit results to int16
    packssdw    m2, m3
    mov%1       [outq + lenq         ], m0
    mov%1       [outq + lenq + mmsize], m2
    add         lenq, mmsize*2
        jl .next                                ; offset still negative -> more data
%if mmsize == 8
    emms                                        ; leave MMX state before returning
    RET
%else
    REP_RET
%endif
%endmacro

;------------------------------------------------------------------------------
; MIX2_INT16 a|u
;
; Emits mix_2_1_%1_int16(out, in1, in2, coeffp, index1, index2, len) for
; 16-bit samples. The low 16 bits of coeffp[index1] and coeffp[index2] are the
; two multipliers; the high 16 bits of coeffp[index1] are the right-shift
; count used for both. Per sample:
;     out[i] = saturate16((in1[i]*c1 + in2[i]*c2 + ((1 << shift) >> 1)) >> shift)
; The rounding term is built with 32-bit lanes (pslld/psrld on dw1).
;
;   %1 == a : aligned loads/stores; branches to the u body when in1, in2 or
;             out is not mmsize-aligned.
;   %1 == u : unaligned loads/stores.
;
; len is doubled to get a byte count (2 bytes per sample). The loop is
; bottom-tested and consumes 2*mmsize bytes per pass, so it always runs at
; least once.
; NOTE(review): callers presumably pass a non-zero len that is a multiple of
; the chunk size - confirm against the C caller.
;------------------------------------------------------------------------------
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    ; Any misaligned pointer -> continue in the unaligned variant, whose
    ; cglobal line uses the same argument/register specification.
    test    in1q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test    in2q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test    outq, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
%else
; Entry point for the aligned variant's fallback. The trailing colon makes
; this an explicit label definition (avoids NASM's orphan-label warning).
mix_2_1_int16_u_int %+ SUFFIX:
%endif
    movd        m4, [coeffpq + 4*index1q]       ; low word = c1, high word = shift
    movd        m6, [coeffpq + 4*index2q]       ; low word = c2
    SPLATW      m5, m4                          ; presumably replicates the low word (c1); see x86util
    SPLATW      m6, m6                          ; likewise for c2
    psllq       m4, 32                          ; drop everything above bit 31 ...
    psrlq       m4, 48                          ; ... and leave bits 16..31 (shift) as the count
    mova        m7, [dw1]
    pslld       m7, m4                          ; 1 << shift in every dword
    psrld       m7, 1                           ; rounding term (1 << shift) >> 1
    punpcklwd   m5, m6                          ; m5 = {c1, c2} word pairs
    ; NOTE(review): the 32-bit lend is doubled here while MIX1_INT16 doubles
    ; lenq; which width is right depends on the C prototype - left unchanged.
    add         lend, lend                      ; sample count -> byte count
    ; Point all three buffers at their end and walk a negative offset up to 0.
    add         in1q, lenq
    add         in2q, lenq
    add         outq, lenq
    neg         lenq
.next:
    ; Interleave in1/in2 samples so that pmaddwd against {c1, c2} produces
    ; in1*c1 + in2*c2 per 32-bit lane.
    mov%1       m0, [in1q + lenq         ]
    mov%1       m2, [in2q + lenq         ]
    mova        m1, m0
    punpcklwd   m0, m2
    punpckhwd   m1, m2

    mov%1       m2, [in1q + lenq + mmsize]
    mov%1       m6, [in2q + lenq + mmsize]      ; m6 is free again: c2 already lives in m5
    mova        m3, m2
    punpcklwd   m2, m6
    punpckhwd   m3, m6

    pmaddwd     m0, m5
    pmaddwd     m1, m5
    pmaddwd     m2, m5
    pmaddwd     m3, m5
    paddd       m0, m7                          ; add the rounding term
    paddd       m1, m7
    paddd       m2, m7
    paddd       m3, m7
    psrad       m0, m4                          ; scale back by the shift count
    psrad       m1, m4
    psrad       m2, m4
    psrad       m3, m4
    packssdw    m0, m1                          ; saturate the 32-bit results to int16
    packssdw    m2, m3
    mov%1       [outq + lenq         ], m0
    mov%1       [outq + lenq + mmsize], m2
    add         lenq, mmsize*2
        jl .next                                ; offset still negative -> more data
%if mmsize == 8
    emms                                        ; leave MMX state before returning
    RET
%else
    REP_RET
%endif
%endmacro


; Instantiate the kernels per instruction set. Each "a" variant branches to a
; label that only the matching "u" variant defines, so the two are always
; instantiated as a pair under the same INIT_* setting.
INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a

INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

; The AVX float kernels are only built when HAVE_AVX_EXTERNAL is set.
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif