Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;***************************************************************************** |
2 | ;* SSE2/SSE4-optimized 10-bit H.264 weighted prediction code | |
3 | ;***************************************************************************** | |
4 | ;* Copyright (C) 2005-2011 x264 project | |
5 | ;* | |
6 | ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> | |
7 | ;* | |
8 | ;* This file is part of FFmpeg. | |
9 | ;* | |
10 | ;* FFmpeg is free software; you can redistribute it and/or | |
11 | ;* modify it under the terms of the GNU Lesser General Public | |
12 | ;* License as published by the Free Software Foundation; either | |
13 | ;* version 2.1 of the License, or (at your option) any later version. | |
14 | ;* | |
15 | ;* FFmpeg is distributed in the hope that it will be useful, | |
16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | ;* Lesser General Public License for more details. | |
19 | ;* | |
20 | ;* You should have received a copy of the GNU Lesser General Public | |
21 | ;* License along with FFmpeg; if not, write to the Free Software | |
22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 | ;****************************************************************************** | |
24 | ||
25 | %include "libavutil/x86/x86util.asm" | |
26 | ||
; Read-only constants shared by the weight and biweight macros below.
27 | SECTION_RODATA 32 | |
28 | ||
; Eight words of 1023 ((1<<10)-1), the largest 10-bit sample value; loaded as
; the upper clip bound by WEIGHT_SETUP and BIWEIGHT_SETUP.
29 | pw_pixel_max: times 8 dw ((1 << 10)-1) | |
; 128-bit value with 1 in the low qword and 0 in the high qword; added to the
; log2_denom shift count so the final shift is by log2_denom+1.
30 | sq_1: dq 1 | |
31 | dq 0 | |
32 | ||
; Defined outside this file; presumably words of 1 (it is shifted left by
; log2_denom in WEIGHT_SETUP to form 1<<log2_denom) -- confirm at its definition.
33 | cextern pw_1 | |
34 | ||
35 | SECTION .text | |
36 | ||
37 | ;----------------------------------------------------------------------------- | |
38 | ; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height, | |
39 | ; int log2_denom, int weight, int offset); | |
40 | ;----------------------------------------------------------------------------- | |
; WEIGHT_PROLOGUE: common entry code for the h264_weight_*_10 functions.
; Going by the prototype comment above, the arguments are
;   r0 = dst, r1 = stride, r2 = height, r3 = log2_denom, r4 = weight, r5 = offset.
; log2_denom is deliberately not loaded here: WEIGHT_SETUP reads it straight
; from r3m, which leaves r3 available as a scratch register afterwards.
; NOTE(review): stride and height are loaded as 32-bit values (r1d/r2d) while
; the row loops advance with the full-width "add r0, r1"; this relies on the
; upper half of r1 being valid on 64-bit targets -- confirm against the caller.
41 | %macro WEIGHT_PROLOGUE 0 | |
; Local label marking the start of the shared prologue.
42 | .prologue: | |
; x86inc prologue; presumably 0 auto-loaded args, 6 GPRs, 8 vector registers.
43 | PROLOGUE 0,6,8 | |
; Each movifnidn fetches one argument into its register (presumably a no-op
; when the argument already lives there).
44 | movifnidn r0, r0mp | |
45 | movifnidn r1d, r1m | |
46 | movifnidn r2d, r2m | |
47 | movifnidn r4d, r4m | |
48 | movifnidn r5d, r5m | |
49 | %endmacro | |
50 | ||
; WEIGHT_SETUP: build the loop-invariant vectors consumed by WEIGHT_OP.
;   m0 = words of (1 << log2_denom)
;   m2 = shift count, log2_denom + 1
;   m3 = dwords of { low word: weight*2, high word: offset*8 + 1 }
;   m4 = pw_pixel_max (upper clip bound)
;   m7 = 0 (lower clip bound; only needed when SSE4 packusdw is unavailable)
; Expects weight in r4 and offset in r5 (see WEIGHT_PROLOGUE) and log2_denom
; in r3m. Clobbers r4 and r5.
;
; The weight is masked to 15 bits before being doubled and merged. Without the
; mask, a negative weight is added as a sign-extended 32-bit value, which
; borrows 1 from the high word and silently removes the "+1" rounding term
; from offset*8+1.
51 | %macro WEIGHT_SETUP 0 | |
52 | mova m0, [pw_1] | |
53 | movd m2, r3m | |
54 | pslld m0, m2 ; 1<<log2_denom | |
55 | SPLATW m0, m0 | |
 | and r4d, 0x7fff ; (weight&0x7fff)*2 == (weight*2)&0xffff: keeps the low word exact, no carry/borrow into the high word | |
56 | shl r5, 19 ; *8, move to upper half of dword | |
57 | lea r5, [r5+r4*2+0x10000] ; high word = offset*8+1, low word = weight*2 (16-bit two's complement) | |
58 | movd m3, r5d ; weight<<1 | 1+(offset<<(3)) | |
59 | pshufd m3, m3, 0 ; broadcast the packed dword to all four lanes | |
60 | mova m4, [pw_pixel_max] | |
61 | paddw m2, [sq_1] ; log2_denom+1 | |
62 | %if notcpuflag(sse4) | |
63 | pxor m7, m7 | |
64 | %endif | |
65 | %endmacro | |
66 | ||
; WEIGHT_OP off            : weight 8 pixels (16 bytes) at [r0+off].
; WEIGHT_OP off0, off1     : weight 4 pixels at [r0+off0] and 4 at [r0+off1];
;                            the first group ends up in the low half of m5 and
;                            the second in the high half.
; Uses the vectors prepared by WEIGHT_SETUP: m0 = 1<<log2_denom words,
; m2 = log2_denom+1, m3 = (weight*2, offset*8+1) word pairs, m4 = pixel max,
; m7 = zero (non-SSE4 only). Result is left in m5; m6 is clobbered.
67 | %macro WEIGHT_OP 1-2 | |
68 | %if %0==1 | |
69 | mova m5, [r0+%1] | |
; Interleave each pixel word with a (1<<log2_denom) word so that one pmaddwd
; produces pixel*(weight*2) + (1<<log2_denom)*(offset*8+1) per dword.
70 | punpckhwd m6, m5, m0 | |
71 | punpcklwd m5, m0 | |
72 | %else | |
73 | movq m5, [r0+%1] | |
74 | movq m6, [r0+%2] | |
75 | punpcklwd m5, m0 | |
76 | punpcklwd m6, m0 | |
77 | %endif | |
78 | pmaddwd m5, m3 | |
79 | pmaddwd m6, m3 | |
; Arithmetic shift by log2_denom+1 (the weight was doubled in setup).
80 | psrad m5, m2 | |
81 | psrad m6, m2 | |
82 | %if cpuflag(sse4) | |
; packusdw saturates to unsigned 16-bit, so only the upper bound still needs
; to be enforced.
83 | packusdw m5, m6 | |
84 | pminsw m5, m4 | |
85 | %else | |
; Signed pack, then CLIPW (x86util macro) presumably clamps to [m7, m4],
; i.e. [0, pixel max].
86 | packssdw m5, m6 | |
87 | CLIPW m5, m7, m4 | |
88 | %endif | |
89 | %endmacro | |
90 | ||
; WEIGHT_FUNC_DBL: emits h264_weight_16_10, in-place weighted prediction for a
; block 16 pixels wide. A row of 16 two-byte samples is 32 bytes, handled as
; two 16-byte WEIGHT_OP passes at offsets 0 and 16.
91 | %macro WEIGHT_FUNC_DBL 0 | |
92 | cglobal h264_weight_16_10 | |
93 | WEIGHT_PROLOGUE | |
94 | WEIGHT_SETUP | |
; One iteration per row: r0 = current row, r1 = stride, r2d = rows remaining.
95 | .nextrow: | |
96 | WEIGHT_OP 0 | |
97 | mova [r0 ], m5 | |
98 | WEIGHT_OP 16 | |
99 | mova [r0+16], m5 | |
100 | add r0, r1 | |
; The loop tests the counter only after the first row, so height is assumed
; to be at least 1.
101 | dec r2d | |
102 | jnz .nextrow | |
103 | REP_RET | |
104 | %endmacro | |
105 | ||
; Instantiate the function once per instruction set.
106 | INIT_XMM sse2 | |
107 | WEIGHT_FUNC_DBL | |
108 | INIT_XMM sse4 | |
109 | WEIGHT_FUNC_DBL | |
110 | ||
111 | ||
; WEIGHT_FUNC_MM: emits h264_weight_8_10, in-place weighted prediction for a
; block 8 pixels wide (one 16-byte WEIGHT_OP per row). Despite the "_MM"
; suffix, the instantiations below all use INIT_XMM.
112 | %macro WEIGHT_FUNC_MM 0 | |
113 | cglobal h264_weight_8_10 | |
114 | WEIGHT_PROLOGUE | |
115 | WEIGHT_SETUP | |
; One iteration per row: r0 = current row, r1 = stride, r2d = rows remaining.
116 | .nextrow: | |
117 | WEIGHT_OP 0 | |
118 | mova [r0], m5 | |
119 | add r0, r1 | |
; Counter is tested after the first row, so height is assumed to be >= 1.
120 | dec r2d | |
121 | jnz .nextrow | |
122 | REP_RET | |
123 | %endmacro | |
124 | ||
; Instantiate the function once per instruction set.
125 | INIT_XMM sse2 | |
126 | WEIGHT_FUNC_MM | |
127 | INIT_XMM sse4 | |
128 | WEIGHT_FUNC_MM | |
129 | ||
130 | ||
; WEIGHT_FUNC_HALF_MM: emits h264_weight_4_10, in-place weighted prediction
; for a block 4 pixels wide. Two rows (4 pixels = 8 bytes each) are packed
; into one register and processed per iteration.
131 | %macro WEIGHT_FUNC_HALF_MM 0 | |
132 | cglobal h264_weight_4_10 | |
133 | WEIGHT_PROLOGUE | |
; Two rows per iteration, so the loop count is height/2.
; NOTE(review): assumes height is even and >= 2; an odd trailing row would be
; skipped and height 1 would make the counter start at 0 -- confirm callers.
134 | sar r2d, 1 | |
135 | WEIGHT_SETUP | |
; r3 = 2*stride. r3 is reused as scratch only after WEIGHT_SETUP has read
; log2_denom from r3m.
136 | lea r3, [r1*2] | |
137 | .nextrow: | |
; Low half of m5 = row at [r0], high half = row at [r0+r1].
138 | WEIGHT_OP 0, r1 | |
139 | movh [r0], m5 | |
140 | movhps [r0+r1], m5 | |
141 | add r0, r3 | |
142 | dec r2d | |
143 | jnz .nextrow | |
144 | REP_RET | |
145 | %endmacro | |
146 | ||
; Instantiate the function once per instruction set.
147 | INIT_XMM sse2 | |
148 | WEIGHT_FUNC_HALF_MM | |
149 | INIT_XMM sse4 | |
150 | WEIGHT_FUNC_HALF_MM | |
151 | ||
152 | ||
153 | ;----------------------------------------------------------------------------- | |
154 | ; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride, | |
155 | ; int height, int log2_denom, int weightd, | |
156 | ; int weights, int offset); | |
157 | ;----------------------------------------------------------------------------- | |
; Choose the register behind the temporary t0, which the biweight code uses to
; hold the offset argument: r3 on 32-bit x86, r7 otherwise. On 32-bit builds
; t0 therefore shares r3 with the row counter, which is why BIWEIGHT_SETUP
; loads height into r3d only after it has finished with t0.
158 | %if ARCH_X86_32 | |
159 | DECLARE_REG_TMP 3 | |
160 | %else | |
161 | DECLARE_REG_TMP 7 | |
162 | %endif | |
163 | ||
; BIWEIGHT_PROLOGUE: common entry code for the h264_biweight_*_10 functions.
; Going by the prototype comment above, the arguments are
;   r0 = dst, r1 = src, r2 = stride, r3 = height, r4 = log2_denom,
;   r5 = weightd, r6 = weights, arg 7 = offset (loaded into t0).
; height (r3) and log2_denom (r4) are not loaded here; BIWEIGHT_SETUP fetches
; them from r3m and r4m.
164 | %macro BIWEIGHT_PROLOGUE 0 | |
; Local label marking the start of the shared prologue.
165 | .prologue: | |
; x86inc prologue; presumably 0 auto-loaded args, 8 GPRs, 8 vector registers.
166 | PROLOGUE 0,8,8 | |
; Each movifnidn fetches one argument into its register (presumably a no-op
; when the argument already lives there).
167 | movifnidn r0, r0mp | |
168 | movifnidn r1, r1mp | |
169 | movifnidn r2d, r2m | |
170 | movifnidn r5d, r5m | |
171 | movifnidn r6d, r6m | |
; offset goes into the temporary register selected by DECLARE_REG_TMP.
172 | movifnidn t0d, r7m | |
173 | %endmacro | |
174 | ||
; BIWEIGHT_SETUP: build the loop-invariant vectors consumed by BIWEIGHT.
;   m4 = dwords of { low word: weightd, high word: weights }
;   m5 = dwords of ((offset<<2)+1) << log2_denom   (rounding/offset term)
;   m6 = shift count, log2_denom + 1
;   m3 = pw_pixel_max (upper clip bound)
;   m7 = 0 (lower clip bound; only needed when SSE4 packusdw is unavailable)
; Expects weightd in r5, weights in r6 and offset in t0 (see
; BIWEIGHT_PROLOGUE); reads log2_denom from r4m and height from r3m.
; Clobbers r5, r6 and t0; leaves height in r3d.
;
; weightd is zero-extended from 16 bits before weights is OR-ed into the upper
; half. Without that, a negative weightd has bits 16-31 set and the OR forces
; the weights word to 0xFFFF regardless of its real value.
175 | %macro BIWEIGHT_SETUP 0 | |
176 | lea t0, [t0*4+1] ; (offset<<2)+1, always odd so no separate "|1" is needed | |
177 | movzx r5d, r5w ; keep only the low 16 bits of weightd | |
178 | shl r6, 16 | |
179 | or r5, r6 | |
180 | movd m4, r5d ; weightd | weights | |
181 | movd m5, t0d ; ((offset<<2)+1)|1 | |
182 | movd m6, r4m ; log2_denom | |
183 | pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom | |
184 | paddd m6, [sq_1] ; log2_denom+1 | |
185 | pshufd m4, m4, 0 ; broadcast the packed weights to all four lanes | |
186 | pshufd m5, m5, 0 ; broadcast the rounding/offset term | |
187 | mova m3, [pw_pixel_max] | |
188 | movifnidn r3d, r3m ; height; loaded last because t0 aliases r3 on x86-32 | |
189 | %if notcpuflag(sse4) | |
190 | pxor m7, m7 | |
191 | %endif | |
192 | %endmacro | |
193 | ||
; BIWEIGHT off          : blend 8 pixels (16 bytes) of [r0+off] and [r1+off].
; BIWEIGHT off0, off1   : blend 4 pixels at off0 and 4 at off1; the first
;                         group ends up in the low half of m0 and the second
;                         in the high half.
; Uses the vectors prepared by BIWEIGHT_SETUP: m4 = (weightd, weights) word
; pairs, m5 = rounding/offset term, m6 = log2_denom+1, m3 = pixel max,
; m7 = zero (non-SSE4 only). Result is left in m0; m1 and m2 are clobbered.
194 | %macro BIWEIGHT 1-2 | |
195 | %if %0==1 | |
196 | mova m0, [r0+%1] | |
197 | mova m1, [r1+%1] | |
; Interleave [r0] and [r1] words so each dword holds one pixel from each
; buffer, matching the (weightd, weights) layout of m4.
198 | punpckhwd m2, m0, m1 | |
199 | punpcklwd m0, m1 | |
200 | %else | |
201 | movq m0, [r0+%1] | |
202 | movq m1, [r1+%1] | |
203 | punpcklwd m0, m1 | |
204 | movq m2, [r0+%2] | |
205 | movq m1, [r1+%2] | |
206 | punpcklwd m2, m1 | |
207 | %endif | |
; Per pixel: [r0]*weightd + [r1]*weights.
208 | pmaddwd m0, m4 | |
209 | pmaddwd m2, m4 | |
; Add the rounding/offset term, then shift right by log2_denom+1.
210 | paddd m0, m5 | |
211 | paddd m2, m5 | |
212 | psrad m0, m6 | |
213 | psrad m2, m6 | |
214 | %if cpuflag(sse4) | |
; packusdw saturates to unsigned 16-bit, so only the upper bound still needs
; to be enforced.
215 | packusdw m0, m2 | |
216 | pminsw m0, m3 | |
217 | %else | |
; Signed pack, then CLIPW (x86util macro) presumably clamps to [m7, m3],
; i.e. [0, pixel max].
218 | packssdw m0, m2 | |
219 | CLIPW m0, m7, m3 | |
220 | %endif | |
221 | %endmacro | |
222 | ||
; BIWEIGHT_FUNC_DBL: emits h264_biweight_16_10, bi-directional weighted
; prediction for a block 16 pixels wide. The result is written to [r0]. A row
; of 16 two-byte samples is 32 bytes, handled as two 16-byte BIWEIGHT passes.
223 | %macro BIWEIGHT_FUNC_DBL 0 | |
224 | cglobal h264_biweight_16_10 | |
225 | BIWEIGHT_PROLOGUE | |
226 | BIWEIGHT_SETUP | |
; One iteration per row: r0/r1 = current rows of the two buffers,
; r2 = stride (shared by both), r3d = rows remaining.
227 | .nextrow: | |
228 | BIWEIGHT 0 | |
229 | mova [r0 ], m0 | |
230 | BIWEIGHT 16 | |
231 | mova [r0+16], m0 | |
232 | add r0, r2 | |
233 | add r1, r2 | |
; Counter is tested after the first row, so height is assumed to be >= 1.
234 | dec r3d | |
235 | jnz .nextrow | |
236 | REP_RET | |
237 | %endmacro | |
238 | ||
; Instantiate the function once per instruction set.
239 | INIT_XMM sse2 | |
240 | BIWEIGHT_FUNC_DBL | |
241 | INIT_XMM sse4 | |
242 | BIWEIGHT_FUNC_DBL | |
243 | ||
; BIWEIGHT_FUNC: emits h264_biweight_8_10, bi-directional weighted prediction
; for a block 8 pixels wide (one 16-byte BIWEIGHT per row). The result is
; written to [r0].
244 | %macro BIWEIGHT_FUNC 0 | |
245 | cglobal h264_biweight_8_10 | |
246 | BIWEIGHT_PROLOGUE | |
247 | BIWEIGHT_SETUP | |
; One iteration per row: r0/r1 = current rows of the two buffers,
; r2 = stride (shared by both), r3d = rows remaining.
248 | .nextrow: | |
249 | BIWEIGHT 0 | |
250 | mova [r0], m0 | |
251 | add r0, r2 | |
252 | add r1, r2 | |
; Counter is tested after the first row, so height is assumed to be >= 1.
253 | dec r3d | |
254 | jnz .nextrow | |
255 | REP_RET | |
256 | %endmacro | |
257 | ||
; Instantiate the function once per instruction set.
258 | INIT_XMM sse2 | |
259 | BIWEIGHT_FUNC | |
260 | INIT_XMM sse4 | |
261 | BIWEIGHT_FUNC | |
262 | ||
; BIWEIGHT_FUNC_HALF: emits h264_biweight_4_10, bi-directional weighted
; prediction for a block 4 pixels wide. Two rows (4 pixels = 8 bytes each) are
; packed into one register and processed per iteration; the result is written
; to [r0].
263 | %macro BIWEIGHT_FUNC_HALF 0 | |
264 | cglobal h264_biweight_4_10 | |
265 | BIWEIGHT_PROLOGUE | |
266 | BIWEIGHT_SETUP | |
; Two rows per iteration, so the loop count is height/2.
; NOTE(review): assumes height is even and >= 2; an odd trailing row would be
; skipped and height 1 would make the counter start at 0 -- confirm callers.
267 | sar r3d, 1 | |
; r4 = 2*stride. r4 is reused as scratch only after BIWEIGHT_SETUP has read
; log2_denom from r4m.
268 | lea r4, [r2*2] | |
269 | .nextrow: | |
; Low half of m0 = row at offset 0, high half = row at offset r2 (stride).
270 | BIWEIGHT 0, r2 | |
271 | movh [r0 ], m0 | |
272 | movhps [r0+r2], m0 | |
273 | add r0, r4 | |
274 | add r1, r4 | |
275 | dec r3d | |
276 | jnz .nextrow | |
277 | REP_RET | |
278 | %endmacro | |
279 | ||
; Instantiate the function once per instruction set.
280 | INIT_XMM sse2 | |
281 | BIWEIGHT_FUNC_HALF | |
282 | INIT_XMM sse4 | |
283 | BIWEIGHT_FUNC_HALF |
283 | BIWEIGHT_FUNC_HALF |