; /*
; * Provide SIMD optimizations for transform_add functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
max_pixels_10:          times 16  dw ((1 << 10)-1)


SECTION .text

; The tr_add macros and functions below were largely inspired by the x264 project's h264_idct.asm.
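;
; All functions in this file add the int16_t residual block ("coeffs") to the
; destination pixel block and clip the result to the valid pixel range,
; roughly equivalent to this C (illustrative sketch only, not the exact
; FFmpeg reference code):
;     for (y = 0; y < size; y++)
;         for (x = 0; x < size; x++)
;             dst[y * stride + x] = av_clip_uint8(dst[y * stride + x] + coeffs[y * size + x]);
; For 8-bit output the saturating byte adds paddusb/psubusb are used, so each
; coefficient is first split into its positive part and the magnitude of its
; negative part (both packed to bytes with packuswb); for 10-bit output the
; residual is added with paddw and clipped to [0, 1023] with CLIPW.
;
; TR_ADD_MMX_4_8 adds two rows of 4 coefficients (16 bytes at r1) to a 4x2
; block of 8-bit pixels at r0 (stride in r2).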
%macro TR_ADD_MMX_4_8 0
    mova              m2, [r1]
    mova              m4, [r1+8]
    pxor              m3, m3
    psubw             m3, m2
    packuswb          m2, m2
    packuswb          m3, m3
    pxor              m5, m5
    psubw             m5, m4
    packuswb          m4, m4
    packuswb          m5, m5

    movh              m0, [r0     ]
    movh              m1, [r0+r2  ]
    paddusb           m0, m2
    paddusb           m1, m4
    psubusb           m0, m3
    psubusb           m1, m5
    movh       [r0     ], m0
    movh       [r0+r2  ], m1
%endmacro


INIT_MMX mmxext
; void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add4_8, 3, 4, 6
    TR_ADD_MMX_4_8
    add               r1, 16
    lea               r0, [r0+r2*2]
    TR_ADD_MMX_4_8
    RET

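; TR_ADD_SSE_8_8 adds 4 rows of 8 coefficients (64 bytes at r1) to an 8x4
; block of 8-bit pixels at r0 (stride in r2, r3 = 3*stride), again splitting
; the coefficients into positive and negative parts for the unsigned
; saturating byte adds.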
%macro TR_ADD_SSE_8_8 0
    pxor              m3, m3
    mova              m4, [r1]
    mova              m6, [r1+16]
    mova              m0, [r1+32]
    mova              m2, [r1+48]
    psubw             m5, m3, m4
    psubw             m7, m3, m6
    psubw             m1, m3, m0
    packuswb          m4, m0
    packuswb          m5, m1
    psubw             m3, m2
    packuswb          m6, m2
    packuswb          m7, m3

    movq              m0, [r0     ]
    movq              m1, [r0+r2  ]
    movhps            m0, [r0+r2*2]
    movhps            m1, [r0+r3  ]
    paddusb           m0, m4
    paddusb           m1, m6
    psubusb           m0, m5
    psubusb           m1, m7
    movq       [r0     ], m0
    movq       [r0+r2  ], m1
    movhps     [r0+2*r2], m0
    movhps     [r0+r3  ], m1
%endmacro

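; TR_ADD_SSE_16_32_8 expects m0 == 0 on entry and takes:
;   %1 = byte offset into the coefficient buffer (r1)
;   %2, %3 = two destination addresses, each covering mmsize pixels
; With XMM registers it processes 2x16 pixels; with YMM (avx2) the upper
; half of each register is filled via vinserti128, giving 2x32 pixels.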
%macro TR_ADD_SSE_16_32_8 3
    mova             xm2, [r1+%1   ]
    mova             xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128       m2, m2, [r1+%1+32], 1
    vinserti128       m6, m6, [r1+%1+48], 1
%endif
%if cpuflag(avx)
    psubw             m1, m0, m2
    psubw             m5, m0, m6
%else
    mova              m1, m0
    mova              m5, m0
    psubw             m1, m2
    psubw             m5, m6
%endif
    packuswb          m2, m6
    packuswb          m1, m5

    mova             xm4, [r1+%1+mmsize*2   ]
    mova             xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128       m4, m4, [r1+%1+96 ], 1
    vinserti128       m6, m6, [r1+%1+112], 1
%endif
%if cpuflag(avx)
    psubw             m3, m0, m4
    psubw             m5, m0, m6
%else
    mova              m3, m0
    mova              m5, m0
    psubw             m3, m4
    psubw             m5, m6
%endif
    packuswb          m4, m6
    packuswb          m3, m5

    paddusb           m2, [%2]
    paddusb           m4, [%3]
    psubusb           m2, m1
    psubusb           m4, m3
    mova            [%2], m2
    mova            [%3], m4
%endmacro


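; TRANSFORM_ADD_8 instantiates the 8x8, 16x16 and 32x32 8-bit transform_add
; functions for the currently selected instruction set (see the INIT_XMM
; sse2/avx invocations below).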
%macro TRANSFORM_ADD_8 0
; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add8_8, 3, 4, 8
    lea               r3, [r2*3]
    TR_ADD_SSE_8_8
    add               r1, 64
    lea               r0, [r0+r2*4]
    TR_ADD_SSE_8_8
    RET

; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add16_8, 3, 4, 7
    pxor              m0, m0
    lea               r3, [r2*3]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
%rep 3
    add               r1, 128
    lea               r0, [r0+r2*4]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
%endrep
    RET

; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add32_8, 3, 4, 7
    pxor              m0, m0
    TR_ADD_SSE_16_32_8 0, r0, r0+16
    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
%rep 15
    add               r1, 128
    lea               r0, [r0+r2*2]
    TR_ADD_SSE_16_32_8 0, r0, r0+16
    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
%endrep
    RET
%endmacro

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add32_8, 3, 4, 7
    pxor              m0, m0
    lea               r3, [r2*3]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
%rep 7
    add               r1, 256
    lea               r0, [r0+r2*4]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
%endrep
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
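; The 10-bit variants operate on uint16_t pixels: the residual is added with
; paddw and the result clipped to [0, 1023] via CLIPW, with the zero and
; max_pixels_10 bounds loaded into registers by the callers.
;
; TR_ADD_SSE_8_10: %1 = dst, %2 = stride, %3 = 3*stride, %4 = coeffs;
; adds 4 rows of 8 pixels.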
%macro TR_ADD_SSE_8_10 4
    mova              m0, [%4]
    mova              m1, [%4+16]
    mova              m2, [%4+32]
    mova              m3, [%4+48]
    paddw             m0, [%1+0   ]
    paddw             m1, [%1+%2  ]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3  ]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova       [%1+0   ], m0
    mova       [%1+%2  ], m1
    mova       [%1+%2*2], m2
    mova       [%1+%3  ], m3
%endmacro

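; TR_ADD_MMX4_10: %1 = dst, %2 = stride, %3 = coeffs; adds 2 rows of
; 4 pixels, clipping against m2 (zero) and m3 (max_pixels_10).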
%macro TR_ADD_MMX4_10 3
    mova              m0, [%1+0   ]
    mova              m1, [%1+%2  ]
    paddw             m0, [%3]
    paddw             m1, [%3+8]
    CLIPW             m0, m2, m3
    CLIPW             m1, m2, m3
    mova       [%1+0   ], m0
    mova       [%1+%2  ], m1
%endmacro

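; TRANS_ADD_SSE_16_10: %1 = dst, %2 = stride, %3 = coeffs; adds 2 rows of
; 16 pixels (two XMM stores per row).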
%macro TRANS_ADD_SSE_16_10 3
    mova              m0, [%3]
    mova              m1, [%3+16]
    mova              m2, [%3+32]
    mova              m3, [%3+48]
    paddw             m0, [%1      ]
    paddw             m1, [%1+16   ]
    paddw             m2, [%1+%2   ]
    paddw             m3, [%1+%2+16]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova      [%1      ], m0
    mova      [%1+16   ], m1
    mova      [%1+%2   ], m2
    mova      [%1+%2+16], m3
%endmacro

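; TRANS_ADD_SSE_32_10: %1 = dst, %2 = coeffs; adds a single row of
; 32 pixels (four XMM stores).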
%macro TRANS_ADD_SSE_32_10 2
    mova              m0, [%2]
    mova              m1, [%2+16]
    mova              m2, [%2+32]
    mova              m3, [%2+48]

    paddw             m0, [%1   ]
    paddw             m1, [%1+16]
    paddw             m2, [%1+32]
    paddw             m3, [%1+48]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova         [%1   ], m0
    mova         [%1+16], m1
    mova         [%1+32], m2
    mova         [%1+48], m3
%endmacro

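; TRANS_ADD16_AVX2: %1 = dst, %2 = stride, %3 = 3*stride, %4 = coeffs;
; adds 4 rows of 16 pixels, one YMM register per row.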
%macro TRANS_ADD16_AVX2 4
    mova              m0, [%4]
    mova              m1, [%4+32]
    mova              m2, [%4+64]
    mova              m3, [%4+96]

    paddw             m0, [%1+0   ]
    paddw             m1, [%1+%2  ]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3  ]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova       [%1+0   ], m0
    mova       [%1+%2  ], m1
    mova       [%1+%2*2], m2
    mova       [%1+%3  ], m3
%endmacro

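; TRANS_ADD32_AVX2: %1 = dst, %2 = stride, %3 = coeffs; adds 2 rows of
; 32 pixels, two YMM registers per row.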
%macro TRANS_ADD32_AVX2 3
    mova              m0, [%3]
    mova              m1, [%3+32]
    mova              m2, [%3+64]
    mova              m3, [%3+96]

    paddw             m0, [%1      ]
    paddw             m1, [%1+32   ]
    paddw             m2, [%1+%2   ]
    paddw             m3, [%1+%2+32]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova      [%1      ], m0
    mova      [%1+32   ], m1
    mova      [%1+%2   ], m2
    mova      [%1+%2+32], m3
%endmacro


INIT_MMX mmxext
cglobal hevc_transform_add4_10,3,4, 6
    pxor              m2, m2
    mova              m3, [max_pixels_10]
    TR_ADD_MMX4_10    r0, r2, r1
    add               r1, 16
    lea               r0, [r0+2*r2]
    TR_ADD_MMX4_10    r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal hevc_transform_add8_10,3,4,6
    pxor              m4, m4
    mova              m5, [max_pixels_10]
    lea               r3, [r2*3]

    TR_ADD_SSE_8_10   r0, r2, r3, r1
    lea               r0, [r0+r2*4]
    add               r1, 64
    TR_ADD_SSE_8_10   r0, r2, r3, r1
    RET

cglobal hevc_transform_add16_10,3,4,6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    TRANS_ADD_SSE_16_10 r0, r2, r1
%rep 7
    lea               r0, [r0+r2*2]
    add               r1, 64
    TRANS_ADD_SSE_16_10 r0, r2, r1
%endrep
    RET

cglobal hevc_transform_add32_10,3,4,6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    TRANS_ADD_SSE_32_10 r0, r1
%rep 31
    lea               r0, [r0+r2]
    add               r1, 64
    TRANS_ADD_SSE_32_10 r0, r1
%endrep
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2

cglobal hevc_transform_add16_10,3,4,6
    pxor              m4, m4
    mova              m5, [max_pixels_10]
    lea               r3, [r2*3]

    TRANS_ADD16_AVX2  r0, r2, r3, r1
%rep 3
    lea               r0, [r0+r2*4]
    add               r1, 128
    TRANS_ADD16_AVX2  r0, r2, r3, r1
%endrep
    RET

cglobal hevc_transform_add32_10,3,4,6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    TRANS_ADD32_AVX2  r0, r2, r1
%rep 15
    lea               r0, [r0+r2*2]
    add               r1, 128
    TRANS_ADD32_AVX2  r0, r2, r1
%endrep
    RET
%endif ;HAVE_AVX2_EXTERNAL