;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION_TEXT | |
25 | ||
%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
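;
; Roughly equivalent C (a sketch for illustration; the _c name is assumed,
; not taken from this file; the asm additionally requires order to be a
; multiple of the number of elements processed per iteration):
;
;     int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
;                                   int order)
;     {
;         int32_t res = 0;
;         while (order--)
;             res += *v1++ * *v2++;
;         return res;
;     }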
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor    m2, m2
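    ; v1q/v2q now point one element past their ends and orderq runs from
    ; -2*order (in bytes) up to 0, so the offset doubles as the loop counter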
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
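    ; reduce the dword partial sums in m2 to a single dword in the low lane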
    HADDD   m2, m0
    movd    eax, m2
%if mmsize == 8
    emms
%endif
    RET
%endmacro
50 | ||
51 | INIT_MMX mmxext | |
52 | SCALARPRODUCT | |
53 | INIT_XMM sse2 | |
54 | SCALARPRODUCT | |
55 | ||
56 | ||
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
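    ; m4/m5 now hold min/max broadcast to every dword lane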
.loop:
%assign %%i 0
%rep %2
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
%if %3
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
%if %3
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
%endif
%assign %%i %%i+4*(%3+1)
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro
119 | ||
120 | INIT_MMX mmx | |
121 | %define CLIPD CLIPD_MMX | |
122 | VECTOR_CLIP_INT32 0, 1, 0, 0 | |
123 | INIT_XMM sse2 | |
124 | VECTOR_CLIP_INT32 6, 1, 0, 0, _int | |
125 | %define CLIPD CLIPD_SSE2 | |
126 | VECTOR_CLIP_INT32 6, 2, 0, 1 | |
127 | INIT_XMM sse4 | |
128 | %define CLIPD CLIPD_SSE41 | |
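; m8 is only defined when 16 SIMD registers are available (x86-64), so the
; wider 8*mmsize-per-iteration variant is built only there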
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
134 | ||
135 | ;----------------------------------------------------- | |
136 | ;void ff_vector_clipf(float *dst, const float *src, | |
137 | ; float min, float max, int len) | |
138 | ;----------------------------------------------------- | |
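;
; Roughly equivalent C (a sketch for illustration; the _c name is assumed;
; the max-then-min order mirrors the maxps/minps sequence below):
;
;     void vector_clipf_c(float *dst, const float *src,
;                         float min, float max, int len)
;     {
;         int i;
;         for (i = 0; i < len; i++) {
;             float v = src[i];
;             v = v > min ? v : min;   /* maxps */
;             v = v < max ? v : max;   /* minps */
;             dst[i] = v;
;         }
;     }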
INIT_XMM sse
%if UNIX64
cglobal vector_clipf, 3,3,6, dst, src, len
%else
cglobal vector_clipf, 5,5,6, dst, src, min, max, len
%endif
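; UNIX64 receives min/max in xmm0/xmm1 already; WIN64 passes them in
; xmm2/xmm3 (hence the SWAPs below); x86-32 loads them from the stack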
%if WIN64
    SWAP 0, 2
    SWAP 1, 3
%elif ARCH_X86_32
    movss   m0, minm
    movss   m1, maxm
%endif
    SPLATD  m0
    SPLATD  m1
    shl     lend, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
    mova    m2, [srcq+lenq+mmsize*0]
    mova    m3, [srcq+lenq+mmsize*1]
    mova    m4, [srcq+lenq+mmsize*2]
    mova    m5, [srcq+lenq+mmsize*3]
    maxps   m2, m0
    maxps   m3, m0
    maxps   m4, m0
    maxps   m5, m0
    minps   m2, m1
    minps   m3, m1
    minps   m4, m1
    minps   m5, m1
    mova    [dstq+lenq+mmsize*0], m2
    mova    [dstq+lenq+mmsize*1], m3
    mova    [dstq+lenq+mmsize*2], m4
    mova    [dstq+lenq+mmsize*3], m5
    add     lenq, mmsize*4
    jl .loop
    REP_RET