; /*
; * Provide SIMD optimizations for transform_add functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
max_pixels_10: times 16 dw ((1 << 10)-1)


SECTION .text

; The tr_add macros and functions below were largely inspired by the x264
; project's code in h264_idct.asm.
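;
; 8-bit note: byte adds can only saturate unsigned, so each residual is
; split into its positive part (packuswb of the coeffs) and its negative
; part (packuswb of 0-coeffs), and the destination is updated as
;     dst = psubusb(paddusb(dst, pos), neg)
; which equals av_clip_uint8(dst + coeff), since at most one of pos/neg is
; nonzero per sample. TR_ADD_MMX_4_8 handles two rows of a 4x4 block.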
%macro TR_ADD_MMX_4_8 0
    mova         m2, [r1]
    mova         m4, [r1+8]
    pxor         m3, m3
    psubw        m3, m2
    packuswb     m2, m2
    packuswb     m3, m3
    pxor         m5, m5
    psubw        m5, m4
    packuswb     m4, m4
    packuswb     m5, m5

    movh         m0, [r0]
    movh         m1, [r0+r2]
    paddusb      m0, m2
    paddusb      m1, m4
    psubusb      m0, m3
    psubusb      m1, m5
    movh         [r0], m0
    movh         [r0+r2], m1
%endmacro


INIT_MMX mmxext
; void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add4_8, 3, 4, 6
    TR_ADD_MMX_4_8
    add          r1, 16
    lea          r0, [r0+r2*2]
    TR_ADD_MMX_4_8
    RET

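; TR_ADD_SSE_8_8 adds one 8x4 block of residuals: rows 0/2 share one xmm
; register and rows 1/3 another (movq loads the low half, movhps the high
; half), using the same split-saturation trick as above. Expects
; r3 = 3*stride.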
%macro TR_ADD_SSE_8_8 0
    pxor         m3, m3
    mova         m4, [r1]
    mova         m6, [r1+16]
    mova         m0, [r1+32]
    mova         m2, [r1+48]
    psubw        m5, m3, m4
    psubw        m7, m3, m6
    psubw        m1, m3, m0
    packuswb     m4, m0
    packuswb     m5, m1
    psubw        m3, m2
    packuswb     m6, m2
    packuswb     m7, m3

    movq         m0, [r0]
    movq         m1, [r0+r2]
    movhps       m0, [r0+r2*2]
    movhps       m1, [r0+r3]
    paddusb      m0, m4
    paddusb      m1, m6
    psubusb      m0, m5
    psubusb      m1, m7
    movq         [r0], m0
    movq         [r0+r2], m1
    movhps       [r0+r2*2], m0
    movhps       [r0+r3], m1
%endmacro

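; TR_ADD_SSE_16_32_8 adds two groups of 16 (SSE) or 32 (AVX2) residuals.
; %1: byte offset into the coefficient buffer (r1); %2, %3: the two
; destination addresses (two rows of a 16-pixel block, or the two halves
; of one 32-pixel row). m0 must hold zero on entry. Under AVX2 the upper
; 16 coefficients of each group are loaded into the high lane with
; vinserti128, so one ymm register covers a full 32-pixel row.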
%macro TR_ADD_SSE_16_32_8 3
    mova         xm2, [r1+%1]
    mova         xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128  m2, m2, [r1+%1+32], 1
    vinserti128  m6, m6, [r1+%1+48], 1
%endif
%if cpuflag(avx)
    psubw        m1, m0, m2
    psubw        m5, m0, m6
%else
    mova         m1, m0
    mova         m5, m0
    psubw        m1, m2
    psubw        m5, m6
%endif
    packuswb     m2, m6
    packuswb     m1, m5

    mova         xm4, [r1+%1+mmsize*2]
    mova         xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128  m4, m4, [r1+%1+96], 1
    vinserti128  m6, m6, [r1+%1+112], 1
%endif
%if cpuflag(avx)
    psubw        m3, m0, m4
    psubw        m5, m0, m6
%else
    mova         m3, m0
    mova         m5, m0
    psubw        m3, m4
    psubw        m5, m6
%endif
    packuswb     m4, m6
    packuswb     m3, m5

    paddusb      m2, [%2]
    paddusb      m4, [%3]
    psubusb      m2, m1
    psubusb      m4, m3
    mova         [%2], m2
    mova         [%3], m4
%endmacro


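; TRANSFORM_ADD_8 instantiates the 8-bit add8/add16/add32 functions for the
; SIMD flavour selected by the preceding INIT_XMM (sse2 and avx below).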
%macro TRANSFORM_ADD_8 0
; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add8_8, 3, 4, 8
    lea          r3, [r2*3]
    TR_ADD_SSE_8_8
    add          r1, 64
    lea          r0, [r0+r2*4]
    TR_ADD_SSE_8_8
    RET

; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add16_8, 3, 4, 7
    pxor         m0, m0
    lea          r3, [r2*3]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
%rep 3
    add          r1, 128
    lea          r0, [r0+r2*4]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
%endrep
    RET

; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add32_8, 3, 4, 7
    pxor         m0, m0
    TR_ADD_SSE_16_32_8 0, r0, r0+16
    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
%rep 15
    add          r1, 128
    lea          r0, [r0+r2*2]
    TR_ADD_SSE_16_32_8 0, r0, r0+16
    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
%endrep
    RET
%endmacro

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add32_8, 3, 4, 7
    pxor         m0, m0
    lea          r3, [r2*3]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
%rep 7
    add          r1, 256
    lea          r0, [r0+r2*4]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
%endrep
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
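; 10-bit note: samples are 16-bit words, so the residual is added with a
; plain paddw and the result is clamped to [0, 1023] via CLIPW, with the
; min (zero) and max (max_pixels_10) bounds preloaded by each caller.
; TR_ADD_SSE_8_10 adds four rows of 8 pixels; %1: dst, %2: stride,
; %3: 3*stride, %4: coeffs; bounds in m4/m5.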
%macro TR_ADD_SSE_8_10 4
    mova         m0, [%4]
    mova         m1, [%4+16]
    mova         m2, [%4+32]
    mova         m3, [%4+48]
    paddw        m0, [%1]
    paddw        m1, [%1+%2]
    paddw        m2, [%1+%2*2]
    paddw        m3, [%1+%3]
    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+%2], m1
    mova         [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

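; TR_ADD_MMX4_10 adds two rows of 4 pixels. %1: dst, %2: stride, %3: coeffs;
; bounds are expected in m2 (min) and m3 (max).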
%macro TR_ADD_MMX4_10 3
    mova         m0, [%1]
    mova         m1, [%1+%2]
    paddw        m0, [%3]
    paddw        m1, [%3+8]
    CLIPW        m0, m2, m3
    CLIPW        m1, m2, m3
    mova         [%1], m0
    mova         [%1+%2], m1
%endmacro

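; TRANS_ADD_SSE_16_10 adds two rows of 16 pixels (two xmm registers per
; row). %1: dst, %2: stride, %3: coeffs; bounds in m4/m5.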
%macro TRANS_ADD_SSE_16_10 3
    mova         m0, [%3]
    mova         m1, [%3+16]
    mova         m2, [%3+32]
    mova         m3, [%3+48]
    paddw        m0, [%1]
    paddw        m1, [%1+16]
    paddw        m2, [%1+%2]
    paddw        m3, [%1+%2+16]
    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+16], m1
    mova         [%1+%2], m2
    mova         [%1+%2+16], m3
%endmacro

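; TRANS_ADD_SSE_32_10 adds one row of 32 pixels (four xmm registers).
; %1: dst, %2: coeffs; bounds in m4/m5.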
%macro TRANS_ADD_SSE_32_10 2
    mova         m0, [%2]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    mova         m3, [%2+48]

    paddw        m0, [%1]
    paddw        m1, [%1+16]
    paddw        m2, [%1+32]
    paddw        m3, [%1+48]
    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+16], m1
    mova         [%1+32], m2
    mova         [%1+48], m3
%endmacro

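; The AVX2 variants below do the same with ymm registers (16 words each):
; TRANS_ADD16_AVX2 covers four rows of 16 pixels (%1: dst, %2: stride,
; %3: 3*stride, %4: coeffs) and TRANS_ADD32_AVX2 two rows of 32 pixels
; (%1: dst, %2: stride, %3: coeffs).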
%macro TRANS_ADD16_AVX2 4
    mova         m0, [%4]
    mova         m1, [%4+32]
    mova         m2, [%4+64]
    mova         m3, [%4+96]

    paddw        m0, [%1]
    paddw        m1, [%1+%2]
    paddw        m2, [%1+%2*2]
    paddw        m3, [%1+%3]

    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+%2], m1
    mova         [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

%macro TRANS_ADD32_AVX2 3
    mova         m0, [%3]
    mova         m1, [%3+32]
    mova         m2, [%3+64]
    mova         m3, [%3+96]

    paddw        m0, [%1]
    paddw        m1, [%1+32]
    paddw        m2, [%1+%2]
    paddw        m3, [%1+%2+32]

    CLIPW        m0, m4, m5
    CLIPW        m1, m4, m5
    CLIPW        m2, m4, m5
    CLIPW        m3, m4, m5
    mova         [%1], m0
    mova         [%1+32], m1
    mova         [%1+%2], m2
    mova         [%1+%2+32], m3
%endmacro


INIT_MMX mmxext
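; void ff_hevc_transform_add4_10_mmxext(pixel *dst, int16_t *coeffs, ptrdiff_t stride)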
cglobal hevc_transform_add4_10, 3, 4, 6
    pxor         m2, m2
    mova         m3, [max_pixels_10]
    TR_ADD_MMX4_10 r0, r2, r1
    add          r1, 16
    lea          r0, [r0+r2*2]
    TR_ADD_MMX4_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal hevc_transform_add8_10, 3, 4, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]
    lea          r3, [r2*3]

    TR_ADD_SSE_8_10 r0, r2, r3, r1
    lea          r0, [r0+r2*4]
    add          r1, 64
    TR_ADD_SSE_8_10 r0, r2, r3, r1
    RET

cglobal hevc_transform_add16_10, 3, 4, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]

    TRANS_ADD_SSE_16_10 r0, r2, r1
%rep 7
    lea          r0, [r0+r2*2]
    add          r1, 64
    TRANS_ADD_SSE_16_10 r0, r2, r1
%endrep
    RET

cglobal hevc_transform_add32_10, 3, 4, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]

    TRANS_ADD_SSE_32_10 r0, r1
%rep 31
    lea          r0, [r0+r2]
    add          r1, 64
    TRANS_ADD_SSE_32_10 r0, r1
%endrep
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2

cglobal hevc_transform_add16_10, 3, 4, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]
    lea          r3, [r2*3]

    TRANS_ADD16_AVX2 r0, r2, r3, r1
%rep 3
    lea          r0, [r0+r2*4]
    add          r1, 128
    TRANS_ADD16_AVX2 r0, r2, r3, r1
%endrep
    RET

cglobal hevc_transform_add32_10, 3, 4, 6
    pxor         m4, m4
    mova         m5, [max_pixels_10]

    TRANS_ADD32_AVX2 r0, r2, r1
%rep 15
    lea          r0, [r0+r2*2]
    add          r1, 128
    TRANS_ADD32_AVX2 r0, r2, r1
%endrep
    RET
%endif ;HAVE_AVX2_EXTERNAL