;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
%define W6sh2  8867 ; W6 = 35468 =  8867<<2
%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
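; The Wn values are (approximately) round(sqrt(2) * cos(n*pi/16) * 2^16),
; i.e. the scaled cosine constants of the simple IDCT. Wnsh2 = Wn >> 2 brings
; them into pmaddwd's signed 16-bit operand range; the "+/- e" terms in the
; comments above record the rounding error this introduces per constant.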

%if ARCH_X86_64

SECTION_RODATA

w4_plus_w2: times 4 dw W4sh2, +W2sh2
w4_min_w2:  times 4 dw W4sh2, -W2sh2
w4_plus_w6: times 4 dw W4sh2, +W6sh2
w4_min_w6:  times 4 dw W4sh2, -W6sh2
w1_plus_w3: times 4 dw W1sh2, +W3sh2
w3_min_w1:  times 4 dw W3sh2, -W1sh2
w7_plus_w3: times 4 dw W7sh2, +W3sh2
w3_min_w7:  times 4 dw W3sh2, -W7sh2
w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1:  times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5:  times 4 dw W7sh2, -W5sh2
pw_88:      times 8 dw 0x2008

cextern pw_1
cextern pw_4
cextern pw_512
cextern pw_1019

section .text align=16

; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
    punpckl%1 m%2, m%4, m%5
    punpckh%1 m%3, m%4, m%5
%endmacro

; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
    psubd    %3, %1, %5 ; { a0 - b0 }[0-3]
    psubd    %4, %2, %6 ; { a0 - b0 }[4-7]
    paddd    %1, %5     ; { a0 + b0 }[0-3]
    paddd    %2, %6     ; { a0 + b0 }[4-7]
    psrad    %1, %7
    psrad    %2, %7
    psrad    %3, %7
    psrad    %4, %7
    packssdw %1, %2     ; row[0]
    packssdw %3, %4     ; row[7]
%endmacro

; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
%macro IDCT_1D 2
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifidn %1, col
    paddw m10, [pw_88]
%endif
%ifidn %1, row
    paddw m10, [pw_1]
%endif
    SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd m2, m0, [w4_plus_w6]
    pmaddwd m3, m1, [w4_plus_w6]
    pmaddwd m4, m0, [w4_min_w6]
    pmaddwd m5, m1, [w4_min_w6]
    pmaddwd m6, m0, [w4_min_w2]
    pmaddwd m7, m1, [w4_min_w2]
    pmaddwd m0, [w4_plus_w2]
    pmaddwd m1, [w4_plus_w2]

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd m10, m8, [w4_plus_w6]
    pmaddwd m11, m9, [w4_plus_w6]
    paddd m0, m10 ; a0[0-3]
    paddd m1, m11 ; a0[4-7]
    pmaddwd m10, m8, [w4_min_w6]
    pmaddwd m11, m9, [w4_min_w6]
    paddd m6, m10 ; a3[0-3]
    paddd m7, m11 ; a3[4-7]
    pmaddwd m10, m8, [w4_min_w2]
    pmaddwd m11, m9, [w4_min_w2]
    pmaddwd m8, [w4_plus_w2]
    pmaddwd m9, [w4_plus_w2]
    psubd m4, m10 ; a2[0-3] intermediate
    psubd m5, m11 ; a2[4-7] intermediate
    psubd m2, m8  ; a1[0-3] intermediate
    psubd m3, m9  ; a1[4-7] intermediate

    ; load/store
    mova [r2+  0], m0
    mova [r2+ 32], m2
    mova [r2+ 64], m4
    mova [r2+ 96], m6
    mova m10, [r2+ 16] ; { row[1] }[0-7]
    mova m8,  [r2+ 48] ; { row[3] }[0-7]
    mova m13, [r2+ 80] ; { row[5] }[0-7]
    mova m14, [r2+112] ; { row[7] }[0-7]
    mova [r2+ 16], m1
    mova [r2+ 48], m3
    mova [r2+ 80], m5
    mova [r2+112], m7
%ifidn %1, row
    pmullw m10, [r3+ 16]
    pmullw m8,  [r3+ 48]
    pmullw m13, [r3+ 80]
    pmullw m14, [r3+112]
%endif
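    ; (row pass only) the odd rows are dequantized by qmat here; the even
    ; rows were already multiplied before IDCT_1D was invoked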

    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd m2, m0, [w3_min_w7]
    pmaddwd m3, m1, [w3_min_w7]
    pmaddwd m4, m0, [w5_min_w1]
    pmaddwd m5, m1, [w5_min_w1]
    pmaddwd m6, m0, [w7_min_w5]
    pmaddwd m7, m1, [w7_min_w5]
    pmaddwd m0, [w1_plus_w3]
    pmaddwd m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0,  W5, row[5]);
    ; MAC(b0,  W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2,  W7, row[5]);
    ; MAC(b2,  W3, row[7]);
    ; MAC(b3,  W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]

    pmaddwd m10, m8, [w1_plus_w5]
    pmaddwd m11, m9, [w1_plus_w5]
    pmaddwd m12, m8, [w5_plus_w7]
    pmaddwd m13, m9, [w5_plus_w7]
    psubd m2, m10 ; b1[0-3]
    psubd m3, m11 ; b1[4-7]
    paddd m0, m12 ; b0[0-3]
    paddd m1, m13 ; b0[4-7]
    pmaddwd m12, m8, [w7_plus_w3]
    pmaddwd m13, m9, [w7_plus_w3]
    pmaddwd m8, [w3_min_w1]
    pmaddwd m9, [w3_min_w1]
    paddd m4, m12 ; b2[0-3]
    paddd m5, m13 ; b2[4-7]
    paddd m6, m8  ; b3[0-3]
    paddd m7, m9  ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    mova m8, [r2+  0] ; a0[0-3]
    mova m9, [r2+ 16] ; a0[4-7]
    SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
    mova m0, [r2+ 32] ; a1[0-3]
    mova m1, [r2+ 48] ; a1[4-7]
    SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
    mova m1, [r2+ 64] ; a2[0-3]
    mova m2, [r2+ 80] ; a2[4-7]
    SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
    mova m2, [r2+ 96] ; a3[0-3]
    mova m3, [r2+112] ; a3[4-7]
    SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
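    ; at this point the eight output rows live in m8, m0, m1, m2, m4, m11,
    ; m9, m10 (row[0] through row[7]), which is the order the transpose and
    ; the final clip/store in the caller expect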
%endmacro

; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
;                                  int16_t *block, const int16_t *qmat);
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
    movsxd r1, r1d
    pxor m15, m15 ; zero

    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
    mova m10, [r2+ 0] ; { row[0] }[0-7]
    mova m8,  [r2+32] ; { row[2] }[0-7]
    mova m13, [r2+64] ; { row[4] }[0-7]
    mova m12, [r2+96] ; { row[6] }[0-7]

    pmullw m10, [r3+ 0]
    pmullw m8,  [r3+32]
    pmullw m13, [r3+64]
    pmullw m12, [r3+96]

    IDCT_1D row, 15

    ; transpose for second part of IDCT
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova [r2+ 16], m0
    mova [r2+ 48], m2
    mova [r2+ 80], m11
    mova [r2+112], m10
    SWAP 8, 10
    SWAP 1, 8
    SWAP 4, 13
    SWAP 9, 12
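    ; the SWAPs rename registers so the column pass finds its inputs where
    ; IDCT_1D expects them: m10 = row[0], m8 = row[2], m13 = row[4],
    ; m12 = row[6]; the odd rows were spilled back to the block buffer above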

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D col, 18

    ; clip/store
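    ; clamp to [4, 1019], the legal sample range for 10-bit video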
    mova m3, [pw_4]
    mova m5, [pw_1019]
    pmaxsw m8,  m3
    pmaxsw m0,  m3
    pmaxsw m1,  m3
    pmaxsw m2,  m3
    pmaxsw m4,  m3
    pmaxsw m11, m3
    pmaxsw m9,  m3
    pmaxsw m10, m3
    pminsw m8,  m5
    pminsw m0,  m5
    pminsw m1,  m5
    pminsw m2,  m5
    pminsw m4,  m5
    pminsw m11, m5
    pminsw m9,  m5
    pminsw m10, m5

    lea r2, [r1*3]
    mova [r0     ], m8
    mova [r0+r1  ], m0
    mova [r0+r1*2], m1
    mova [r0+r2  ], m2
    lea r0, [r0+r1*4]
    mova [r0     ], m4
    mova [r0+r1  ], m11
    mova [r0+r1*2], m9
    mova [r0+r2  ], m10
    RET
%endmacro

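; sign-extend 8 packed words into two registers of 4 dwords each; SSE4 uses
; pmovsxwd directly, SSE2 emulates it by interleaving with a computed sign mask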
%macro SIGNEXTEND 2-3
%if cpuflag(sse4) ; dstlow, dsthigh
    movhlps %2, %1
    pmovsxwd %1, %1
    pmovsxwd %2, %2
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
    pxor %3, %3
    pcmpgtw %3, %1
    mova %2, %1
    punpcklwd %1, %3
    punpckhwd %2, %3
%endif
%endmacro

INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
%endif

%endif