[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / vp9itxfm.asm
1;******************************************************************************
2;* VP9 IDCT SIMD optimizations
3;*
4;* Copyright (C) 2013 Clément Bœsch <u pkh me>
5;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
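; Note: the coefficients below are VP9's Q14 trig constants, e.g. 11585 ~= cos(pi/4)*2^14
; and 15137/6270 ~= cos/sin(pi/8)*2^14. The '*x2' variants are stored pre-doubled because
; pmulhrsw computes (a*b + 2^14) >> 15, so multiplying by 2*c yields the Q14 product
; (a*c + 2^13) >> 14 directly; the interleaved pw_a_b pairs feed pmaddwd instead, with
; pd_8192 (1<<13) as the matching rounding bias before the >>14.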
28pw_11585x2: times 8 dw 23170
29pw_m11585x2: times 8 dw -23170
30
31%macro VP9_IDCT_COEFFS 2-3 0
32pw_%1x2: times 8 dw %1*2
33pw_m%1x2: times 8 dw -%1*2
34pw_%2x2: times 8 dw %2*2
35pw_m%2x2: times 8 dw -%2*2
36pw_m%1_%2: times 4 dw -%1, %2
37pw_%2_%1: times 4 dw %2, %1
38pw_m%2_m%1: times 4 dw -%2, -%1
39%if %3 == 1
40pw_m%2_%1: times 4 dw -%2, %1
41pw_%1_%2: times 4 dw %1, %2
42%endif
43%endmacro
44
45VP9_IDCT_COEFFS 15137, 6270, 1
46VP9_IDCT_COEFFS 16069, 3196, 1
47VP9_IDCT_COEFFS 9102, 13623, 1
48VP9_IDCT_COEFFS 16305, 1606
49VP9_IDCT_COEFFS 10394, 12665
50VP9_IDCT_COEFFS 14449, 7723
51VP9_IDCT_COEFFS 4756, 15679
52VP9_IDCT_COEFFS 16364, 804
53VP9_IDCT_COEFFS 11003, 12140
54VP9_IDCT_COEFFS 14811, 7005
55VP9_IDCT_COEFFS 5520, 15426
56VP9_IDCT_COEFFS 15893, 3981
57VP9_IDCT_COEFFS 8423, 14053
58VP9_IDCT_COEFFS 13160, 9760
59VP9_IDCT_COEFFS 2404, 16207
60
61pw_5283_13377: times 4 dw 5283, 13377
62pw_9929_13377: times 4 dw 9929, 13377
63pw_15212_m13377: times 4 dw 15212, -13377
64pw_15212_9929: times 4 dw 15212, 9929
65pw_m5283_m15212: times 4 dw -5283, -15212
66pw_13377x2: times 8 dw 13377*2
67
68pd_8192: times 4 dd 8192
69
70cextern pw_512
71cextern pw_1024
72cextern pw_2048
73cextern pw_m1
74
75SECTION .text
76
77; (a*x + b*y + round) >> shift
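; concretely, for VP9_UNPACK_MULSUB_2W_4X dst1, dst2, c1, c2, rnd, ...: the input words
; are interleaved so that each pmaddwd lane forms one output, giving
; dst1 = (dst1*c2 - dst2*c1 + 8192) >> 14 and dst2 = (dst1*c1 + dst2*c2 + 8192) >> 14
; (a Q14 rotation by the angle encoded by c1/c2)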
78%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
79 pmaddwd m%1, m%2, %4
80 pmaddwd m%2, %5
81 paddd m%1, %3
82 paddd m%2, %3
83 psrad m%1, 14
84 psrad m%2, 14
85%endmacro
86
87%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
88 VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
89 VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
90 packssdw m%1, m%7
91 packssdw m%2, m%6
92%endmacro
93
94%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
95%if %0 == 7
96 punpckhwd m%6, m%2, m%1
97 punpcklwd m%2, m%1
98 VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
99%else
100 punpckhwd m%8, m%4, m%3
101 punpcklwd m%2, m%4, m%3
102 VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
103%endif
104%endmacro
105
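; The 2D variant below performs the same rotation but leaves the 32-bit pmaddwd results
; unrounded in two register pairs; VP9_RND_SH_SUMSUB_BA then adds/subtracts those pairs
; at full precision before the single (+8192)>>14 rounding and repack to words.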
106%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
107 punpckhwd m%4, m%2, m%1
108 punpcklwd m%2, m%1
109 pmaddwd m%3, m%4, [pw_m%5_%6]
110 pmaddwd m%4, [pw_%6_%5]
111 pmaddwd m%1, m%2, [pw_m%5_%6]
112 pmaddwd m%2, [pw_%6_%5]
113%endmacro
114
115%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
116 SUMSUB_BA d, %1, %2, %5
117 SUMSUB_BA d, %3, %4, %5
118 paddd m%1, %6
119 paddd m%2, %6
120 paddd m%3, %6
121 paddd m%4, %6
122 psrad m%1, 14
123 psrad m%2, 14
124 psrad m%3, 14
125 psrad m%4, 14
126 packssdw m%1, m%3
127 packssdw m%2, m%4
128%endmacro
129
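; VP9_STORE_2X below adds two rows of 16-bit residuals (%1/%2) to the destination: the
; pixels are zero-extended against the zero register (%5), the residuals are added, and
; packuswb clamps the sums back to 8 bits before storing.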
130%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
131 movh m%3, [%6]
132 movh m%4, [%6+strideq]
133 punpcklbw m%3, m%5
134 punpcklbw m%4, m%5
135 paddw m%3, m%1
136 paddw m%4, m%2
137 packuswb m%3, m%5
138 packuswb m%4, m%5
139 movh [%6], m%3
140 movh [%6+strideq], m%4
141%endmacro
142
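; ZERO_BLOCK clears an nnzcpl x nnzcpl block of 16-bit coefficients (nnzcpl*2 bytes per
; row, rows %2 bytes apart); every transform path below uses it (or explicit stores of
; the zero register) to hand the coefficient buffer back zeroed.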
143%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
144%assign %%y 0
145%rep %3
146%assign %%x 0
147%rep %3*2/mmsize
148 mova [%1+%%y+%%x], %4
149%assign %%x (%%x+mmsize)
150%endrep
151%assign %%y (%%y+%2)
152%endrep
153%endmacro
154
155;-------------------------------------------------------------------------------------------
156; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
157;-------------------------------------------------------------------------------------------
158
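; 1-D inverse Walsh-Hadamard transform (used for VP9 lossless blocks): adds, subtracts
; and a single halving only, operating on coefficients the caller has already shifted
; right by 2 before the first pass.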
159%macro VP9_IWHT4_1D 0
160 SWAP 1, 2, 3
161 paddw m0, m2
162 psubw m3, m1
163 psubw m4, m0, m3
164 psraw m4, 1
165 psubw m5, m4, m1
166 SWAP 5, 1
167 psubw m4, m2
168 SWAP 4, 2
169 psubw m0, m1
170 paddw m3, m2
171 SWAP 3, 2, 1
172%endmacro
173
174INIT_MMX mmx
175cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
176 mova m0, [blockq+0*8]
177 mova m1, [blockq+1*8]
178 mova m2, [blockq+2*8]
179 mova m3, [blockq+3*8]
180 psraw m0, 2
181 psraw m1, 2
182 psraw m2, 2
183 psraw m3, 2
184
185 VP9_IWHT4_1D
186 TRANSPOSE4x4W 0, 1, 2, 3, 4
187 VP9_IWHT4_1D
188
189 pxor m4, m4
190 VP9_STORE_2X 0, 1, 5, 6, 4
191 lea dstq, [dstq+strideq*2]
192 VP9_STORE_2X 2, 3, 5, 6, 4
193 ZERO_BLOCK blockq, 8, 4, m4
194 RET
195
196;-------------------------------------------------------------------------------------------
197; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
198;-------------------------------------------------------------------------------------------
199
200%macro VP9_IDCT4_1D_FINALIZE 0
201 SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
202 SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
203 SWAP 0, 3, 2 ; 3102 -> 0123
204%endmacro
205
206%macro VP9_IDCT4_1D 0
207 SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
208 pmulhrsw m2, m6 ; m2=t0
209 pmulhrsw m0, m6 ; m0=t1
210 VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
211 VP9_IDCT4_1D_FINALIZE
212%endmacro
213
214; 2x2 top left corner
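; (with only the top-left 2x2 coefficients nonzero, IN(2)/IN(3) are zero, so
; t0 = t1 = IN(0)*11585 >> 14 and t2/t3 reduce to plain Q14 multiplies of IN(1),
; hence one pmulhrsw per output instead of the full VP9_IDCT4_1D)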
215%macro VP9_IDCT4_2x2_1D 0
216 pmulhrsw m0, m5 ; m0=t1
217 mova m2, m0 ; m2=t0
218 mova m3, m1
219 pmulhrsw m1, m6 ; m1=t2
220 pmulhrsw m3, m7 ; m3=t3
221 VP9_IDCT4_1D_FINALIZE
222%endmacro
223
224%macro VP9_IDCT4_WRITEOUT 0
225 mova m5, [pw_2048]
226 pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
227 pmulhrsw m1, m5
228 VP9_STORE_2X 0, 1, 6, 7, 4
229 lea dstq, [dstq+2*strideq]
230 pmulhrsw m2, m5
231 pmulhrsw m3, m5
232 VP9_STORE_2X 2, 3, 6, 7, 4
233%endmacro
234
235INIT_MMX ssse3
236cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
237
238 cmp eobd, 4 ; 2x2 or smaller
239 jg .idctfull
240
241 cmp eobd, 1 ; faster path for when only DC is set
242 jne .idct2x2
243
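; dc-only: both 1-D passes reduce to a multiply by 11585 (Q14), done as two pmulhrsw
; with the doubled constant; the result is broadcast to all four columns, the lone
; coefficient is cleared, and the rows are written out with the final (x+8)>>4 rounding.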
244 movd m0, [blockq]
245 mova m5, [pw_11585x2]
246 pmulhrsw m0, m5
247 pmulhrsw m0, m5
248 pshufw m0, m0, 0
249 pxor m4, m4
250 movh [blockq], m4
251 pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
252 VP9_STORE_2X 0, 0, 6, 7, 4
253 lea dstq, [dstq+2*strideq]
254 VP9_STORE_2X 0, 0, 6, 7, 4
255 RET
256
257; faster path for when only top left 2x2 block is set
258.idct2x2:
259 movd m0, [blockq+0]
260 movd m1, [blockq+8]
261 mova m5, [pw_11585x2]
262 mova m6, [pw_6270x2]
263 mova m7, [pw_15137x2]
264 VP9_IDCT4_2x2_1D
265 TRANSPOSE4x4W 0, 1, 2, 3, 4
266 VP9_IDCT4_2x2_1D
267 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
268 movh [blockq+ 0], m4
269 movh [blockq+ 8], m4
270 VP9_IDCT4_WRITEOUT
271 RET
272
273.idctfull: ; generic full 4x4 idct/idct
274 mova m0, [blockq+ 0]
275 mova m1, [blockq+ 8]
276 mova m2, [blockq+16]
277 mova m3, [blockq+24]
278 mova m6, [pw_11585x2]
279 mova m7, [pd_8192] ; rounding
280 VP9_IDCT4_1D
281 TRANSPOSE4x4W 0, 1, 2, 3, 4
282 VP9_IDCT4_1D
283 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
284 mova [blockq+ 0], m4
285 mova [blockq+ 8], m4
286 mova [blockq+16], m4
287 mova [blockq+24], m4
288 VP9_IDCT4_WRITEOUT
289 RET
290
291;-------------------------------------------------------------------------------------------
292; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
293;-------------------------------------------------------------------------------------------
294
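; The IADST4 1-D transform keeps the four rows in mmx registers but moves copies into
; xmm (movq2dq) for the pmaddwd-based rotations and their 32-bit intermediates, then
; packs the results back to mmx before the final SWAP.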
295%macro VP9_IADST4_1D 0
296 movq2dq xmm0, m0
297 movq2dq xmm1, m1
298 movq2dq xmm2, m2
299 movq2dq xmm3, m3
300 paddw m3, m0
301 punpcklwd xmm0, xmm1
302 punpcklwd xmm2, xmm3
303 pmaddwd xmm1, xmm0, [pw_5283_13377]
304 pmaddwd xmm4, xmm0, [pw_9929_13377]
305 pmaddwd xmm0, [pw_15212_m13377]
306 pmaddwd xmm3, xmm2, [pw_15212_9929]
307 pmaddwd xmm2, [pw_m5283_m15212]
308 psubw m3, m2
309 paddd xmm0, xmm2
310 paddd xmm3, [pd_8192]
311 paddd xmm2, [pd_8192]
312 paddd xmm1, xmm3
313 paddd xmm0, xmm3
314 paddd xmm4, xmm2
315 psrad xmm1, 14
316 psrad xmm0, 14
317 psrad xmm4, 14
318 pmulhrsw m3, [pw_13377x2] ; out2
319 packssdw xmm0, xmm0
320 packssdw xmm1, xmm1
321 packssdw xmm4, xmm4
322 movdq2q m0, xmm0 ; out3
323 movdq2q m1, xmm1 ; out0
324 movdq2q m2, xmm4 ; out1
325 SWAP 0, 1, 2, 3
326%endmacro
327
328%macro IADST4_FN 5
329INIT_MMX %5
330cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
331 mova m0, [blockq+ 0]
332 mova m1, [blockq+ 8]
333 mova m2, [blockq+16]
334 mova m3, [blockq+24]
335 mova m6, [pw_11585x2]
336 mova m7, [pd_8192] ; rounding
337 VP9_%2_1D
338 TRANSPOSE4x4W 0, 1, 2, 3, 4
339 VP9_%4_1D
340 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
341 mova [blockq+ 0], m4
342 mova [blockq+ 8], m4
343 mova [blockq+16], m4
344 mova [blockq+24], m4
345 VP9_IDCT4_WRITEOUT
346 RET
347%endmacro
348
349IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
350IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
351IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
352
353%if ARCH_X86_64 ; TODO: 32-bit? (32-bit is limited to 8 xmm registers, and we use more)
354
355;-------------------------------------------------------------------------------------------
356; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
357;-------------------------------------------------------------------------------------------
358
359%macro VP9_IDCT8_1D_FINALIZE 0
360 SUMSUB_BA w, 3, 10, 4 ; m3=t0+t7, m10=t0-t7
361 SUMSUB_BA w, 1, 2, 4 ; m1=t1+t6, m2=t1-t6
362 SUMSUB_BA w, 11, 0, 4 ; m11=t2+t5, m0=t2-t5
363 SUMSUB_BA w, 9, 8, 4 ; m9=t3+t4, m8=t3-t4
364 SWAP 11, 10, 2
365 SWAP 3, 9, 0
366%endmacro
367
368%macro VP9_IDCT8_1D 0
369 SUMSUB_BA w, 8, 0, 4 ; m8=IN(0)+IN(4) m0=IN(0)-IN(4)
370 pmulhrsw m8, m12 ; m8=t0a
371 pmulhrsw m0, m12 ; m0=t1a
372 VP9_UNPACK_MULSUB_2W_4X 2, 10, 15137, 6270, m7, 4, 5 ; m2=t2a, m10=t3a
373 VP9_UNPACK_MULSUB_2W_4X 1, 11, 16069, 3196, m7, 4, 5 ; m1=t4a, m11=t7a
374 VP9_UNPACK_MULSUB_2W_4X 9, 3, 9102, 13623, m7, 4, 5 ; m9=t5a, m3=t6a
375 SUMSUB_BA w, 10, 8, 4 ; m10=t0a+t3a (t0), m8=t0a-t3a (t3)
376 SUMSUB_BA w, 2, 0, 4 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
377 SUMSUB_BA w, 9, 1, 4 ; m9=t4a+t5a (t4), m1=t4a-t5a (t5a)
378 SUMSUB_BA w, 3, 11, 4 ; m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
379 SUMSUB_BA w, 1, 11, 4 ; m1=t6a+t5a (t6), m11=t6a-t5a (t5)
380 pmulhrsw m1, m12 ; m1=t6
381 pmulhrsw m11, m12 ; m11=t5
382 VP9_IDCT8_1D_FINALIZE
383%endmacro
384
385%macro VP9_IDCT8_4x4_1D 0
386 pmulhrsw m0, m12 ; m0=t1a/t0a
387 pmulhrsw m10, m2, [pw_15137x2] ; m10=t3a
388 pmulhrsw m2, [pw_6270x2] ; m2=t2a
389 pmulhrsw m11, m1, [pw_16069x2] ; m11=t7a
390 pmulhrsw m1, [pw_3196x2] ; m1=t4a
391 pmulhrsw m9, m3, [pw_9102x2] ; m9=-t5a
392 pmulhrsw m3, [pw_13623x2] ; m3=t6a
393 psubw m8, m0, m10 ; m8=t0a-t3a (t3)
394 paddw m10, m0 ; m10=t0a+t3a (t0)
395 SUMSUB_BA w, 2, 0, 4 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
396 SUMSUB_BA w, 9, 1, 4 ; m1=t4a+t5a (t4), m9=t4a-t5a (t5a)
397 SWAP 1, 9
398 SUMSUB_BA w, 3, 11, 4 ; m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
399 SUMSUB_BA w, 1, 11, 4 ; m1=t6a+t5a (t6), m11=t6a-t5a (t5)
400 pmulhrsw m1, m12 ; m1=t6
401 pmulhrsw m11, m12 ; m11=t5
402 VP9_IDCT8_1D_FINALIZE
403%endmacro
404
405; TODO: many of the t* copies could probably be removed and merged with the
406; following SUMSUBs in VP9_IDCT8_1D_FINALIZE when using AVX
407%macro VP9_IDCT8_2x2_1D 0
408 pmulhrsw m0, m12 ; m0=t0
409 mova m3, m1
410 pmulhrsw m1, m6 ; m1=t4
411 pmulhrsw m3, m7 ; m3=t7
412 mova m2, m0 ; m2=t1
413 mova m10, m0 ; m10=t2
414 mova m8, m0 ; m8=t3
415 mova m11, m3 ; t5 = t7a ...
416 mova m9, m3 ; t6 = t7a ...
417 psubw m11, m1 ; t5 = t7a - t4a
418 paddw m9, m1 ; t6 = t7a + t4a
419 pmulhrsw m11, m12 ; m11=t5
420 pmulhrsw m9, m12 ; m9=t6
421 SWAP 0, 10
422 SWAP 9, 1
423 VP9_IDCT8_1D_FINALIZE
424%endmacro
425
426%macro VP9_IDCT8_WRITEOUT 0
427 mova m5, [pw_1024]
428 pmulhrsw m0, m5 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
429 pmulhrsw m1, m5
430 VP9_STORE_2X 0, 1, 6, 7, 4
431 lea dstq, [dstq+2*strideq]
432 pmulhrsw m2, m5
433 pmulhrsw m3, m5
434 VP9_STORE_2X 2, 3, 6, 7, 4
435 lea dstq, [dstq+2*strideq]
436 pmulhrsw m8, m5
437 pmulhrsw m9, m5
438 VP9_STORE_2X 8, 9, 6, 7, 4
439 lea dstq, [dstq+2*strideq]
440 pmulhrsw m10, m5
441 pmulhrsw m11, m5
442 VP9_STORE_2X 10, 11, 6, 7, 4
443%endmacro
444
445%macro VP9_IDCT_IDCT_8x8_ADD_XMM 1
446INIT_XMM %1
447cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
448
449 mova m12, [pw_11585x2] ; often used
450
451 cmp eobd, 12 ; top left half or less
452 jg .idctfull
453
454 cmp eobd, 3 ; top left corner or less
455 jg .idcthalf
456
457 cmp eobd, 1 ; faster path for when only DC is set
458 jne .idcttopleftcorner
459
460 movd m0, [blockq]
461 pmulhrsw m0, m12
462 pmulhrsw m0, m12
463 SPLATW m0, m0, 0
464 pxor m4, m4
465 movd [blockq], m4
466 mova m5, [pw_1024]
467 pmulhrsw m0, m5 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
468 VP9_STORE_2X 0, 0, 6, 7, 4
469 lea dstq, [dstq+2*strideq]
470 VP9_STORE_2X 0, 0, 6, 7, 4
471 lea dstq, [dstq+2*strideq]
472 VP9_STORE_2X 0, 0, 6, 7, 4
473 lea dstq, [dstq+2*strideq]
474 VP9_STORE_2X 0, 0, 6, 7, 4
475 RET
476
477; faster path for when only the top-left corner is set (3 inputs: the DC coefficient,
478; the one to its right and the one below it). Note: this also works for a 2x2 block
479.idcttopleftcorner:
480 movd m0, [blockq+0]
481 movd m1, [blockq+16]
482 mova m6, [pw_3196x2]
483 mova m7, [pw_16069x2]
484 VP9_IDCT8_2x2_1D
485 TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
486 VP9_IDCT8_2x2_1D
487 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
488 movd [blockq+ 0], m4
489 movd [blockq+16], m4
490 VP9_IDCT8_WRITEOUT
491 RET
492
493.idcthalf:
494 movh m0, [blockq + 0]
495 movh m1, [blockq +16]
496 movh m2, [blockq +32]
497 movh m3, [blockq +48]
498 VP9_IDCT8_4x4_1D
499 TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
500 VP9_IDCT8_4x4_1D
501 pxor m4, m4
502 movh [blockq+ 0], m4
503 movh [blockq+16], m4
504 movh [blockq+32], m4
505 movh [blockq+48], m4
506 VP9_IDCT8_WRITEOUT
507 RET
508
509.idctfull: ; generic full 8x8 idct/idct
510 mova m0, [blockq+ 0] ; IN(0)
511 mova m1, [blockq+ 16] ; IN(1)
512 mova m2, [blockq+ 32] ; IN(2)
513 mova m3, [blockq+ 48] ; IN(3)
514 mova m8, [blockq+ 64] ; IN(4)
515 mova m9, [blockq+ 80] ; IN(5)
516 mova m10, [blockq+ 96] ; IN(6)
517 mova m11, [blockq+112] ; IN(7)
518 mova m7, [pd_8192] ; rounding
519 VP9_IDCT8_1D
520 TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
521 VP9_IDCT8_1D
522
523 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
524 ZERO_BLOCK blockq, 16, 8, m4
525 VP9_IDCT8_WRITEOUT
526 RET
527%endmacro
528
529VP9_IDCT_IDCT_8x8_ADD_XMM ssse3
530VP9_IDCT_IDCT_8x8_ADD_XMM avx
531
532;---------------------------------------------------------------------------------------------
533; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
534;---------------------------------------------------------------------------------------------
535
536%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/8/9/10/11
537 VP9_UNPACK_MULSUB_2D_4X 11, 0, 4, 5, 16305, 1606 ; m11/4=t1[d], m0/5=t0[d]
538 VP9_UNPACK_MULSUB_2D_4X 3, 8, 6, 13, 10394, 12665 ; m3/6=t5[d], m8/13=t4[d]
539 VP9_RND_SH_SUMSUB_BA 8, 0, 13, 5, 14, m7 ; m8=t0[w], m0=t4[w]
540 VP9_RND_SH_SUMSUB_BA 3, 11, 6, 4, 14, m7 ; m3=t1[w], m11=t5[w]
541
542 VP9_UNPACK_MULSUB_2D_4X 9, 2, 4, 5, 14449, 7723 ; m9/4=t3[d], m2/5=t2[d]
543 VP9_UNPACK_MULSUB_2D_4X 1, 10, 6, 13, 4756, 15679 ; m1/6=t7[d], m10/13=t6[d]
544 VP9_RND_SH_SUMSUB_BA 10, 2, 13, 5, 14, m7 ; m10=t2[w], m2=t6[w]
545 VP9_RND_SH_SUMSUB_BA 1, 9, 6, 4, 14, m7 ; m1=t3[w], m9=t7[w]
546
547 ; m8=t0, m3=t1, m10=t2, m1=t3, m0=t4, m11=t5, m2=t6, m9=t7
548
549 VP9_UNPACK_MULSUB_2D_4X 0, 11, 4, 5, 15137, 6270 ; m0/4=t5[d], m11/5=t4[d]
550 VP9_UNPACK_MULSUB_2D_4X 9, 2, 6, 13, 6270, 15137 ; m9/6=t6[d], m2/13=t7[d]
551 VP9_RND_SH_SUMSUB_BA 9, 11, 6, 5, 14, m7
552 psignw m9, [pw_m1] ; m9=out1[w], m11=t6[w]
553 VP9_RND_SH_SUMSUB_BA 2, 0, 13, 4, 14, m7 ; m2=out6[w], m0=t7[w]
554
555 SUMSUB_BA w, 10, 8, 14 ; m10=out0[w], m8=t2[w]
556 SUMSUB_BA w, 1, 3, 14
557 psignw m1, [pw_m1] ; m1=out7[w], m3=t3[w]
558
559 ; m10=out0, m9=out1, m8=t2, m3=t3, m11=t6, m0=t7, m2=out6, m1=out7
560
561 SUMSUB_BA w, 3, 8, 4
562 SUMSUB_BA w, 0, 11, 5
563 pmulhrsw m3, m12
564 pmulhrsw m11, m12
565 pmulhrsw m8, m12 ; out4
566 pmulhrsw m0, m12 ; out2
567 psignw m3, [pw_m1] ; out3
568 psignw m11, [pw_m1] ; out5
569
570 ; m10=out0, m9=out1, m0=out2, m3=out3, m8=out4, m11=out5, m2=out6, m1=out7
571
572 SWAP 0, 10, 2
573 SWAP 11, 1, 9
574%endmacro
575
576%macro IADST8_FN 5
577INIT_XMM %5
578cglobal vp9_%1_%3_8x8_add, 3, 3, 15, dst, stride, block, eob
579 mova m0, [blockq+ 0] ; IN(0)
580 mova m1, [blockq+ 16] ; IN(1)
581 mova m2, [blockq+ 32] ; IN(2)
582 mova m3, [blockq+ 48] ; IN(3)
583 mova m8, [blockq+ 64] ; IN(4)
584 mova m9, [blockq+ 80] ; IN(5)
585 mova m10, [blockq+ 96] ; IN(6)
586 mova m11, [blockq+112] ; IN(7)
587
588 mova m12, [pw_11585x2] ; often used
589 mova m7, [pd_8192] ; rounding
590 VP9_%2_1D
591 TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
592 VP9_%4_1D
593
594 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
595 ZERO_BLOCK blockq, 16, 8, m4
596 VP9_IDCT8_WRITEOUT
597 RET
598%endmacro
599
600IADST8_FN idct, IDCT8, iadst, IADST8, ssse3
601IADST8_FN idct, IDCT8, iadst, IADST8, avx
602IADST8_FN iadst, IADST8, idct, IDCT8, ssse3
603IADST8_FN iadst, IADST8, idct, IDCT8, avx
604IADST8_FN iadst, IADST8, iadst, IADST8, ssse3
605IADST8_FN iadst, IADST8, iadst, IADST8, avx
606
607;---------------------------------------------------------------------------------------------
608; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
609;---------------------------------------------------------------------------------------------
610
611; at the end of this macro, m7 is stored in stack_scratch
612; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
613; the following sumsubs have not been done yet:
614; SUMSUB_BA w, 6, 9, 15 ; t6, t9
615; SUMSUB_BA w, 7, 8, 15 ; t7, t8
616%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
617%if %2 <= 4
618 mova m3, [%1+ 1*%3] ; IN(1)
619 mova m12, [%1+ 2*%3] ; IN(2)
620 mova m0, [%1+ 3*%3] ; IN(3)
621
622 pmulhrsw m15, m12, [pw_16069x2] ; t6-7
623 pmulhrsw m12, [pw_3196x2] ; t4-5
624 pmulhrsw m4, m3, [pw_16305x2] ; t14-15
625 pmulhrsw m3, [pw_1606x2] ; t8-9
626 pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
627 pmulhrsw m0, [pw_15679x2] ; t12-13
628
629 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
630 ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
631
632 paddw m14, m15, m12
633 psubw m13, m15, m12
634 pmulhrsw m13, [pw_11585x2] ; t5
635 pmulhrsw m14, [pw_11585x2] ; t6
636
637 VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
638 VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
639
640 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
641 ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
642%else
643 mova m5, [%1+ 1*%3] ; IN(1)
644 mova m14, [%1+ 2*%3] ; IN(2)
645 mova m6, [%1+ 3*%3] ; IN(3)
646 mova m9, [%1+ 4*%3] ; IN(4)
647 mova m7, [%1+ 5*%3] ; IN(5)
648 mova m15, [%1+ 6*%3] ; IN(6)
649 mova m4, [%1+ 7*%3] ; IN(7)
650%if %2 <= 8
651 pmulhrsw m8, m9, [pw_15137x2] ; t3
652 pmulhrsw m9, [pw_6270x2] ; t2
653 pmulhrsw m13, m14, [pw_16069x2] ; t7
654 pmulhrsw m14, [pw_3196x2] ; t4
655 pmulhrsw m12, m15, [pw_m9102x2] ; t5
656 pmulhrsw m15, [pw_13623x2] ; t6
657 pmulhrsw m2, m5, [pw_16305x2] ; t15
658 pmulhrsw m5, [pw_1606x2] ; t8
659 pmulhrsw m3, m4, [pw_m10394x2] ; t9
660 pmulhrsw m4, [pw_12665x2] ; t14
661 pmulhrsw m0, m7, [pw_14449x2] ; t13
662 pmulhrsw m7, [pw_7723x2] ; t10
663 pmulhrsw m1, m6, [pw_m4756x2] ; t11
664 pmulhrsw m6, [pw_15679x2] ; t12
665%else
666 mova m3, [%1+ 9*%3] ; IN(9)
667 mova m12, [%1+10*%3] ; IN(10)
668 mova m0, [%1+11*%3] ; IN(11)
669 mova m8, [%1+12*%3] ; IN(12)
670 mova m1, [%1+13*%3] ; IN(13)
671 mova m13, [%1+14*%3] ; IN(14)
672 mova m2, [%1+15*%3] ; IN(15)
673
674 ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
675 ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
676
677 VP9_UNPACK_MULSUB_2W_4X 9, 8, 15137, 6270, [pd_8192], 10, 11 ; t2, t3
678 VP9_UNPACK_MULSUB_2W_4X 14, 13, 16069, 3196, [pd_8192], 10, 11 ; t4, t7
679 VP9_UNPACK_MULSUB_2W_4X 12, 15, 9102, 13623, [pd_8192], 10, 11 ; t5, t6
680 VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 10, 11 ; t8, t15
681 VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 10, 11 ; t9, t14
682 VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 10, 11 ; t10, t13
683 VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 10, 11 ; t11, t12
684%endif
685
686 ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
687 ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
688
689 SUMSUB_BA w, 12, 14, 10 ; t4, t5
690 SUMSUB_BA w, 15, 13, 10 ; t7, t6
691 SUMSUB_BA w, 3, 5, 10 ; t8, t9
692 SUMSUB_BA w, 7, 1, 10 ; t11, t10
693 SUMSUB_BA w, 0, 6, 10 ; t12, t13
694 SUMSUB_BA w, 4, 2, 10 ; t15, t14
695
696 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
697 ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
698
699 SUMSUB_BA w, 14, 13, 10
700 pmulhrsw m13, [pw_11585x2] ; t5
701 pmulhrsw m14, [pw_11585x2] ; t6
702 VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
703 VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
704%endif
705
706 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
707 ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
708
709 SUMSUB_BA w, 7, 3, 10 ; t8, t11
710 SUMSUB_BA w, 6, 2, 10 ; t9, t10
711 SUMSUB_BA w, 0, 4, 10 ; t15, t12
712 SUMSUB_BA w, 1, 5, 10 ; t14, t13
713
714 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
715 ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
716
717 SUMSUB_BA w, 2, 5, 10
718 SUMSUB_BA w, 3, 4, 10
719 pmulhrsw m5, [pw_11585x2] ; t10
720 pmulhrsw m4, [pw_11585x2] ; t11
721 pmulhrsw m3, [pw_11585x2] ; t12
722 pmulhrsw m2, [pw_11585x2] ; t13
723
724 ; backup first register
725 mova [%4], m7
726
727 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
728 ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
729
730 ; from load/start
731%if %2 <= 4
732 mova m11, [%1+ 0*%3] ; IN(0)
733 pmulhrsw m11, [pw_11585x2] ; t0-t3
734
735 psubw m8, m11, m15
736 paddw m15, m11
737 psubw m9, m11, m14
738 paddw m14, m11
739 psubw m10, m11, m13
740 paddw m13, m11
741%else
742 mova m10, [%1+ 0*%3] ; IN(0)
743%if %2 <= 8
744 pmulhrsw m10, [pw_11585x2] ; t0 and t1
745 psubw m11, m10, m8
746 paddw m8, m10
747%else
748 mova m11, [%1+ 8*%3] ; IN(8)
749
750 ; from 3 stages back
751 SUMSUB_BA w, 11, 10, 7
752 pmulhrsw m11, [pw_11585x2] ; t0
753 pmulhrsw m10, [pw_11585x2] ; t1
754
755 ; from 2 stages back
756 SUMSUB_BA w, 8, 11, 7 ; t0, t3
757%endif
758 SUMSUB_BA w, 9, 10, 7 ; t1, t2
759
760 ; from 1 stage back
761 SUMSUB_BA w, 15, 8, 7 ; t0, t7
762 SUMSUB_BA w, 14, 9, 7 ; t1, t6
763 SUMSUB_BA w, 13, 10, 7 ; t2, t5
764%endif
765 SUMSUB_BA w, 12, 11, 7 ; t3, t4
766
767 SUMSUB_BA w, 0, 15, 7 ; t0, t15
768 SUMSUB_BA w, 1, 14, 7 ; t1, t14
769 SUMSUB_BA w, 2, 13, 7 ; t2, t13
770 SUMSUB_BA w, 3, 12, 7 ; t3, t12
771 SUMSUB_BA w, 4, 11, 7 ; t4, t11
772 SUMSUB_BA w, 5, 10, 7 ; t5, t10
773%endmacro
774
775%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
776 VP9_IDCT16_1D_START %1, %3, 32, tmpq+32
777
778%if %2 == 1
779 ; backup a different register
780 mova [tmpq+16], m15
781 mova m7, [tmpq+32]
782
783 SUMSUB_BA w, 6, 9, 15 ; t6, t9
784 SUMSUB_BA w, 7, 8, 15 ; t7, t8
785
786 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
787 mova [tmpq+ 0], m0
788 mova [tmpq+ 32], m1
789 mova [tmpq+ 64], m2
790 mova [tmpq+ 96], m3
791 mova [tmpq+128], m4
792 mova [tmpq+160], m5
793 mova [tmpq+192], m6
794 mova [tmpq+224], m7
795
796 mova m15, [tmpq+16]
797 TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
798 mova [tmpq+ 16], m8
799 mova [tmpq+ 48], m9
800 mova [tmpq+ 80], m10
801 mova [tmpq+112], m11
802 mova [tmpq+144], m12
803 mova [tmpq+176], m13
804 mova [tmpq+208], m14
805 mova [tmpq+240], m15
806%else ; %2 == 2
807 ; backup more registers
808 mova [tmpq+64], m8
809 mova [tmpq+96], m9
810
811 pxor m7, m7
812 pmulhrsw m0, [pw_512]
813 pmulhrsw m1, [pw_512]
814 VP9_STORE_2X 0, 1, 8, 9, 7
815 lea dstq, [dstq+strideq*2]
816 pmulhrsw m2, [pw_512]
817 pmulhrsw m3, [pw_512]
818 VP9_STORE_2X 2, 3, 8, 9, 7
819 lea dstq, [dstq+strideq*2]
820 pmulhrsw m4, [pw_512]
821 pmulhrsw m5, [pw_512]
822 VP9_STORE_2X 4, 5, 8, 9, 7
823 lea dstq, [dstq+strideq*2]
824
825 ; restore from cache
826 SWAP 0, 7 ; move zero from m7 to m0
827 mova m7, [tmpq+32]
828 mova m8, [tmpq+64]
829 mova m9, [tmpq+96]
830
831 SUMSUB_BA w, 6, 9, 1 ; t6, t9
832 SUMSUB_BA w, 7, 8, 1 ; t7, t8
833
834 pmulhrsw m6, [pw_512]
835 pmulhrsw m7, [pw_512]
836 VP9_STORE_2X 6, 7, 1, 2, 0
837 lea dstq, [dstq+strideq*2]
838 pmulhrsw m8, [pw_512]
839 pmulhrsw m9, [pw_512]
840 VP9_STORE_2X 8, 9, 1, 2, 0
841 lea dstq, [dstq+strideq*2]
842 pmulhrsw m10, [pw_512]
843 pmulhrsw m11, [pw_512]
844 VP9_STORE_2X 10, 11, 1, 2, 0
845 lea dstq, [dstq+strideq*2]
846 pmulhrsw m12, [pw_512]
847 pmulhrsw m13, [pw_512]
848 VP9_STORE_2X 12, 13, 1, 2, 0
849 lea dstq, [dstq+strideq*2]
850 pmulhrsw m14, [pw_512]
851 pmulhrsw m15, [pw_512]
852 VP9_STORE_2X 14, 15, 1, 2, 0
853%endif ; %2 == 1/2
854%endmacro
855
856%macro VP9_STORE_2XFULL 6-7 strideq ; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
857 mova m%3, [dstq]
858 mova m%5, [dstq+%7]
859 punpcklbw m%2, m%3, m%6
860 punpckhbw m%3, m%6
861 punpcklbw m%4, m%5, m%6
862 punpckhbw m%5, m%6
863 paddw m%2, m%1
864 paddw m%3, m%1
865 paddw m%4, m%1
866 paddw m%5, m%1
867 packuswb m%2, m%3
868 packuswb m%4, m%5
869 mova [dstq], m%2
870 mova [dstq+%7], m%4
871%endmacro
872
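; The 16x16 transform runs in two passes over two 8-column slices: pass 1 does the
; first 1-D transform straight from blockq and transposes into the 512-byte stack
; buffer (tmpq); pass 2 does the second 1-D transform from that buffer and adds the
; result to dst with the final (x+32)>>6 rounding (pmulhrsw by pw_512).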
873%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
874INIT_XMM %1
875cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
876 ; 2x2=eob=3, 4x4=eob=10
877 cmp eobd, 38
878 jg .idctfull
879 cmp eobd, 1 ; faster path for when only DC is set
880 jne .idct8x8
881
882 ; dc-only
883 movd m0, [blockq]
884 mova m1, [pw_11585x2]
885 pmulhrsw m0, m1
886 pmulhrsw m0, m1
887 SPLATW m0, m0, q0000
888 pmulhrsw m0, [pw_512]
889 pxor m5, m5
890 movd [blockq], m5
891%rep 7
892 VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
893 lea dstq, [dstq+2*strideq]
894%endrep
895 VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
896 RET
897
898 DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
899.idct8x8:
900 mov tmpq, rsp
901 VP9_IDCT16_1D blockq, 1, 8
902
903 mov cntd, 2
904 mov dst_bakq, dstq
905.loop2_8x8:
906 VP9_IDCT16_1D tmpq, 2, 8
907 lea dstq, [dst_bakq+8]
908 add tmpq, 16
909 dec cntd
910 jg .loop2_8x8
911
912 ; at the end of the loop, m0 should still be zero
913 ; use that to zero out block coefficients
914 ZERO_BLOCK blockq, 32, 8, m0
915 RET
916
917.idctfull:
918 mov cntd, 2
919 mov tmpq, rsp
920.loop1_full:
921 VP9_IDCT16_1D blockq, 1
922 add blockq, 16
923 add tmpq, 256
924 dec cntd
925 jg .loop1_full
926 sub blockq, 32
927
928 mov cntd, 2
929 mov tmpq, rsp
930 mov dst_bakq, dstq
931.loop2_full:
932 VP9_IDCT16_1D tmpq, 2
933 lea dstq, [dst_bakq+8]
934 add tmpq, 16
935 dec cntd
936 jg .loop2_full
937
938 ; at the end of the loop, m0 should still be zero
939 ; use that to zero out block coefficients
940 ZERO_BLOCK blockq, 32, 16, m0
941 RET
942%endmacro
943
944VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
945VP9_IDCT_IDCT_16x16_ADD_XMM avx
946
947;---------------------------------------------------------------------------------------------
948; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
949;---------------------------------------------------------------------------------------------
950
951%macro VP9_IADST16_1D 2 ; src, pass
952%assign %%str 16*%2
953 mova m0, [%1+ 0*32] ; in0
954 mova m1, [%1+15*32] ; in15
955 mova m8, [%1+ 7*32] ; in7
956 mova m9, [%1+ 8*32] ; in8
957
958 VP9_UNPACK_MULSUB_2D_4X 1, 0, 2, 3, 16364, 804 ; m1/2=t1[d], m0/3=t0[d]
959 VP9_UNPACK_MULSUB_2D_4X 8, 9, 11, 10, 11003, 12140 ; m8/11=t9[d], m9/10=t8[d]
960 VP9_RND_SH_SUMSUB_BA 9, 0, 10, 3, 4, [pd_8192] ; m9=t0[w], m0=t8[w]
961 VP9_RND_SH_SUMSUB_BA 8, 1, 11, 2, 4, [pd_8192] ; m8=t1[w], m1=t9[w]
962
963 mova m11, [%1+ 2*32] ; in2
964 mova m10, [%1+13*32] ; in13
965 mova m3, [%1+ 5*32] ; in5
966 mova m2, [%1+10*32] ; in10
967
968 VP9_UNPACK_MULSUB_2D_4X 10, 11, 6, 7, 15893, 3981 ; m10/6=t3[d], m11/7=t2[d]
969 VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
970 VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; m2=t2[w], m11=t10[w]
971 VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w]
972
973 mova [tmpq+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
974 mova m4, [%1+ 4*32] ; in4
975 mova m5, [%1+11*32] ; in11
976 mova m12, [%1+ 3*32] ; in3
977 mova m13, [%1+12*32] ; in12
978
979 VP9_UNPACK_MULSUB_2D_4X 5, 4, 7, 6, 14811, 7005 ; m5/7=t5[d], m4/6=t4[d]
980 VP9_UNPACK_MULSUB_2D_4X 12, 13, 14, 15, 5520, 15426 ; m12/14=t13[d], m13/15=t12[d]
981 VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w]
982 VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w]
983
984 mova [tmpq+ 2*%%str], m8 ; t1:m8->r2
985 mova [tmpq+ 3*%%str], m2 ; t2:m2->r3
986 mova [tmpq+ 4*%%str], m3 ; t3:m3->r4
987 mova [tmpq+ 5*%%str], m13 ; t4:m13->r5
988 mova m2, [%1+ 6*32] ; in6
989 mova m3, [%1+ 9*32] ; in9
990 mova m8, [%1+ 1*32] ; in1
991 mova m9, [%1+14*32] ; in14
992
993 VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
994 VP9_UNPACK_MULSUB_2D_4X 8, 9, 13, 14, 2404, 16207 ; m8/13=t15[d], m9/14=t14[d]
995 VP9_RND_SH_SUMSUB_BA 9, 2, 14, 6, 15, [pd_8192] ; m9=t6[w], m2=t14[w]
996 VP9_RND_SH_SUMSUB_BA 8, 3, 13, 7, 15, [pd_8192] ; m8=t7[w], m3=t15[w]
997
998 ; r0=t0, r2=t1, r3=t2, r4=t3, r5=t4, m12=t5, m9=t6, m8=t7
999 ; m0=t8, m1=t9, m11=t10, m10=t11, m4=t12, m5=t13, m2=t14, m3=t15
1000
1001 ; handle t8-15 first
1002 VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
1003 VP9_UNPACK_MULSUB_2D_4X 5, 4, 13, 14, 3196, 16069 ; m5/13=t12[d], m4/14=t13[d]
1004 VP9_RND_SH_SUMSUB_BA 5, 1, 13, 7, 15, [pd_8192] ; m5=t8[w], m1=t12[w]
1005 VP9_RND_SH_SUMSUB_BA 4, 0, 14, 6, 15, [pd_8192] ; m4=t9[w], m0=t13[w]
1006
1007 VP9_UNPACK_MULSUB_2D_4X 11, 10, 6, 7, 9102, 13623 ; m11/6=t11[d], m10/7=t10[d]
1008 VP9_UNPACK_MULSUB_2D_4X 3, 2, 13, 14, 13623, 9102 ; m3/13=t14[d], m2/14=t15[d]
1009 VP9_RND_SH_SUMSUB_BA 3, 10, 13, 7, 15, [pd_8192] ; m3=t10[w], m10=t14[w]
1010 VP9_RND_SH_SUMSUB_BA 2, 11, 14, 6, 15, [pd_8192] ; m2=t11[w], m11=t15[w]
1011
1012 ; m5=t8, m4=t9, m3=t10, m2=t11, m1=t12, m0=t13, m10=t14, m11=t15
1013
1014 VP9_UNPACK_MULSUB_2D_4X 1, 0, 6, 7, 15137, 6270 ; m1/6=t13[d], m0/7=t12[d]
1015 VP9_UNPACK_MULSUB_2D_4X 11, 10, 13, 14, 6270, 15137 ; m11/13=t14[d], m10/14=t15[d]
1016 VP9_RND_SH_SUMSUB_BA 11, 0, 13, 7, 15, [pd_8192] ; m11=out2[w], m0=t14[w]
1017 VP9_RND_SH_SUMSUB_BA 10, 1, 14, 6, 15, [pd_8192]
1018 psignw m10, [pw_m1] ; m10=out13[w], m1=t15[w]
1019
1020 SUMSUB_BA w, 3, 5, 15
1021 psignw m3, [pw_m1] ; m3=out1[w], m5=t10[w]
1022 SUMSUB_BA w, 2, 4, 15 ; m2=out14[w], m4=t11[w]
1023
1024 SUMSUB_BA w, 5, 4, 15
1025 pmulhrsw m5, [pw_11585x2] ; m5=out6[w]
1026 pmulhrsw m4, [pw_11585x2] ; m4=out9[w]
1027 SUMSUB_BA w, 1, 0, 15
1028 pmulhrsw m1, [pw_m11585x2] ; m1=out5[w]
1029 pmulhrsw m0, [pw_11585x2] ; m0=out10[w]
1030
1031 ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
1032
1033 mova m6, [tmpq+ 0*%%str]
1034 mova m7, [tmpq+ 2*%%str]
1035 mova m13, [tmpq+ 3*%%str]
1036 mova m14, [tmpq+ 4*%%str]
1037 mova m15, [tmpq+ 5*%%str]
1038 mova [tmpq+ 8*%%str], m5
1039 mova [tmpq+ 9*%%str], m4
1040 mova [tmpq+10*%%str], m0
1041 mova [tmpq+11*%%str], m10
1042 mova [tmpq+12*%%str], m2
1043
1044 ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
1045 ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
1046
1047 SUMSUB_BA w, 15, 6, 0 ; m15=t0[w], m6=t4[w]
1048 SUMSUB_BA w, 12, 7, 0 ; m12=t1[w], m7=t5[w]
1049 SUMSUB_BA w, 9, 13, 0 ; m9=t2[w], m13=t6[w]
1050 SUMSUB_BA w, 8, 14, 0 ; m8=t3[w], m14=t7[w]
1051
1052 VP9_UNPACK_MULSUB_2D_4X 6, 7, 0, 2, 15137, 6270 ; m6/0=t5[d], m7/2=t4[d]
1053 VP9_UNPACK_MULSUB_2D_4X 14, 13, 4, 5, 6270, 15137 ; m14/4=t6[d], m13/5=t7[d]
1054 VP9_RND_SH_SUMSUB_BA 14, 7, 4, 2, 10, [pd_8192]
1055 psignw m14, [pw_m1] ; m14=out3[w], m7=t6[w]
1056 VP9_RND_SH_SUMSUB_BA 13, 6, 5, 0, 10, [pd_8192] ; m13=out12[w], m6=t7[w]
1057 SUMSUB_BA w, 9, 15, 10 ; m9=out0[w], m15=t2[w]
1058 SUMSUB_BA w, 8, 12, 10
1059 psignw m8, [pw_m1] ; m8=out15[w], m12=t3[w]
1060
1061 SUMSUB_BA w, 12, 15, 10
1062 pmulhrsw m12, [pw_m11585x2] ; m12=out7[w]
1063 pmulhrsw m15, [pw_11585x2] ; m15=out8[w]
1064 SUMSUB_BA w, 7, 6, 10
1065 pmulhrsw m7, [pw_11585x2] ; m7=out4[w]
1066 pmulhrsw m6, [pw_11585x2] ; m6=out11[w]
1067
1068 ; m9=out0, m14=out3, m7=out4, m12=out7, m15=out8, m6=out11, m13=out12, m8=out15
1069 ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
1070
1071%if %2 == 1
1072 mova m0, [tmpq+ 8*%%str]
1073 TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2
1074 mova [tmpq+ 0*16], m9
1075 mova [tmpq+ 2*16], m3
1076 mova [tmpq+ 4*16], m11
1077 mova [tmpq+ 6*16], m14
1078 mova m9, [tmpq+ 9*%%str]
1079 mova m3, [tmpq+10*%%str]
1080 mova m11, [tmpq+11*%%str]
1081 mova m14, [tmpq+12*%%str]
1082 mova [tmpq+ 8*16], m7
1083 mova [tmpq+10*16], m1
1084 mova [tmpq+12*16], m0
1085 mova [tmpq+14*16], m12
1086
1087 TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2
1088 mova [tmpq+ 1*16], m15
1089 mova [tmpq+ 3*16], m9
1090 mova [tmpq+ 5*16], m3
1091 mova [tmpq+ 7*16], m6
1092 mova [tmpq+ 9*16], m13
1093 mova [tmpq+11*16], m11
1094 mova [tmpq+13*16], m14
1095 mova [tmpq+15*16], m8
1096%else
1097 mova m5, [tmpq+ 8*%%str]
1098 pxor m0, m0
1099
1100 pmulhrsw m9, [pw_512]
1101 pmulhrsw m3, [pw_512]
1102 VP9_STORE_2X 9, 3, 2, 4, 0
1103 lea dstq, [dstq+strideq*2]
1104 pmulhrsw m11, [pw_512]
1105 pmulhrsw m14, [pw_512]
1106 VP9_STORE_2X 11, 14, 2, 4, 0
1107 lea dstq, [dstq+strideq*2]
1108 pmulhrsw m7, [pw_512]
1109 pmulhrsw m1, [pw_512]
1110 VP9_STORE_2X 7, 1, 2, 4, 0
1111 lea dstq, [dstq+strideq*2]
1112 pmulhrsw m5, [pw_512]
1113 pmulhrsw m12, [pw_512]
1114 VP9_STORE_2X 5, 12, 2, 4, 0
1115 lea dstq, [dstq+strideq*2]
1116
1117 mova m9, [tmpq+ 9*%%str]
1118 mova m3, [tmpq+10*%%str]
1119 mova m11, [tmpq+11*%%str]
1120 mova m14, [tmpq+12*%%str]
1121
1122 pmulhrsw m15, [pw_512]
1123 pmulhrsw m9, [pw_512]
1124 VP9_STORE_2X 15, 9, 2, 4, 0
1125 lea dstq, [dstq+strideq*2]
1126 pmulhrsw m3, [pw_512]
1127 pmulhrsw m6, [pw_512]
1128 VP9_STORE_2X 3, 6, 2, 4, 0
1129 lea dstq, [dstq+strideq*2]
1130 pmulhrsw m13, [pw_512]
1131 pmulhrsw m11, [pw_512]
1132 VP9_STORE_2X 13, 11, 2, 4, 0
1133 lea dstq, [dstq+strideq*2]
1134 pmulhrsw m14, [pw_512]
1135 pmulhrsw m8, [pw_512]
1136 VP9_STORE_2X 14, 8, 2, 4, 0
1137%endif
1138%endmacro
1139
1140%macro IADST16_FN 5
1141INIT_XMM %5
1142cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
1143 mov cntd, 2
1144 mov tmpq, rsp
1145.loop1_full:
1146 VP9_%2_1D blockq, 1
1147 add blockq, 16
1148 add tmpq, 256
1149 dec cntd
1150 jg .loop1_full
1151 sub blockq, 32
1152
1153 mov cntd, 2
1154 mov tmpq, rsp
1155 mov dst_bakq, dstq
1156.loop2_full:
1157 VP9_%4_1D tmpq, 2
1158 lea dstq, [dst_bakq+8]
1159 add tmpq, 16
1160 dec cntd
1161 jg .loop2_full
1162
1163 ; at the end of the loop, m0 should still be zero
1164 ; use that to zero out block coefficients
1165 ZERO_BLOCK blockq, 32, 16, m0
1166 RET
1167%endmacro
1168
1169IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
1170IADST16_FN idct, IDCT16, iadst, IADST16, avx
1171IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
1172IADST16_FN iadst, IADST16, idct, IDCT16, avx
1173IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
1174IADST16_FN iadst, IADST16, iadst, IADST16, avx
1175
1176;---------------------------------------------------------------------------------------------
1177; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
1178;---------------------------------------------------------------------------------------------
1179
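; The 32-point 1-D transform reuses VP9_IDCT16_1D_START on the even input rows
; (stride 64*2) for t0-t15, then computes t16-t31 from the odd rows; both halves are
; kept interleaved in the stack buffer until the final sumsub/store stage.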
1180%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
1181%assign %%str 16*%2*%2
1182 ; first do t0-15, this can be done identical to idct16x16
1183 VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str
1184
1185 ; backup a different register
1186 mova [tmpq+30*%%str], m15 ; t15
1187 mova m7, [tmpq+ 4*%%str]
1188
1189 SUMSUB_BA w, 6, 9, 15 ; t6, t9
1190 SUMSUB_BA w, 7, 8, 15 ; t7, t8
1191
1192 ; store everything on stack to make space available for t16-31
1193 ; we store interleaved with the output of the second half (t16-31)
1194 ; so we don't need to allocate extra stack space
1195 mova [tmpq+ 0*%%str], m0 ; t0
1196 mova [tmpq+ 4*%%str], m1 ; t1
1197 mova [tmpq+ 8*%%str], m2 ; t2
1198 mova [tmpq+12*%%str], m3 ; t3
1199 mova [tmpq+16*%%str], m4 ; t4
1200 mova [tmpq+20*%%str], m5 ; t5
1201 mova [tmpq+24*%%str], m6 ; t6
1202 mova [tmpq+28*%%str], m7 ; t7
1203 mova [tmpq+ 2*%%str], m8 ; t8
1204 mova [tmpq+ 6*%%str], m9 ; t9
1205 mova [tmpq+10*%%str], m10 ; t10
1206 mova [tmpq+14*%%str], m11 ; t11
1207 mova [tmpq+18*%%str], m12 ; t12
1208 mova [tmpq+22*%%str], m13 ; t13
1209 mova [tmpq+26*%%str], m14 ; t14
1210
1211 ; then, secondly, do t16-31
1212%if %3 <= 8
1213 mova m4, [%1+ 1*64]
1214 mova m3, [%1+ 3*64]
1215 mova m0, [%1+ 5*64]
1216 mova m7, [%1+ 7*64]
1217
1218 pmulhrsw m11, m4, [pw_16364x2] ;t31
1219 pmulhrsw m4, [pw_804x2] ;t16
1220 pmulhrsw m8, m7, [pw_m5520x2] ;t19
1221 pmulhrsw m7, [pw_15426x2] ;t28
1222 pmulhrsw m15, m0, [pw_15893x2] ;t27
1223 pmulhrsw m0, [pw_3981x2] ;t20
1224 pmulhrsw m12, m3, [pw_m2404x2] ;t23
1225 pmulhrsw m3, [pw_16207x2] ;t24
1226
1227 ; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23,
1228 ; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31
1229
1230 VP9_UNPACK_MULSUB_2W_4X 5, 10, 11, 4, 16069, 3196, [pd_8192], 6, 9 ; t17, t30
1231 VP9_UNPACK_MULSUB_2W_4X 9, 6, 7, 8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29
1232 ; from 1 stage forward
1233 SUMSUB_BA w, 8, 4, 1
1234 ; temporary storage
1235 mova [tmpq+17*%%str], m8 ; t16
1236 mova [tmpq+21*%%str], m4 ; t19
1237 VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
1238 VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
1239
1240 ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
1241 ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
1242%else
1243 mova m10, [%1+ 1*64]
1244 mova m13, [%1+ 3*64]
1245 mova m14, [%1+ 5*64]
1246 mova m9, [%1+ 7*64]
1247 mova m8, [%1+ 9*64]
1248 mova m15, [%1+11*64]
1249 mova m12, [%1+13*64]
1250 mova m11, [%1+15*64]
1251%if %3 <= 16
1252 pmulhrsw m5, m10, [pw_16364x2]
1253 pmulhrsw m10, [pw_804x2]
1254 pmulhrsw m4, m11, [pw_m11003x2]
1255 pmulhrsw m11, [pw_12140x2]
1256 pmulhrsw m7, m8, [pw_14811x2]
1257 pmulhrsw m8, [pw_7005x2]
1258 pmulhrsw m6, m9, [pw_m5520x2]
1259 pmulhrsw m9, [pw_15426x2]
1260 pmulhrsw m1, m14, [pw_15893x2]
1261 pmulhrsw m14, [pw_3981x2]
1262 pmulhrsw m0, m15, [pw_m8423x2]
1263 pmulhrsw m15, [pw_14053x2]
1264%else
1265 mova m4, [%1+17*64]
1266 mova m0, [%1+21*64]
1267 mova m7, [%1+23*64]
1268 mova m6, [%1+25*64]
1269 mova m1, [%1+27*64]
1270 mova m5, [%1+31*64]
1271
1272 ; m10=in1, m4=in17, m8=in9, m6=in25, m14=in5, m0=in21, m12=in13, m2=in29,
1273 ; m13=in3, m3=in19, m15=in11, m1=in27, m9=in7, m7=in23, m11=in15, m5=in31
1274
1275 VP9_UNPACK_MULSUB_2W_4X 10, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31
1276 VP9_UNPACK_MULSUB_2W_4X 4, 11, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
1277 VP9_UNPACK_MULSUB_2W_4X 8, 7, 14811, 7005, [pd_8192], 2, 3 ; t18, t29
1278 VP9_UNPACK_MULSUB_2W_4X 6, 9, 5520, 15426, [pd_8192], 2, 3 ; t19, t28
1279 VP9_UNPACK_MULSUB_2W_4X 14, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27
1280 VP9_UNPACK_MULSUB_2W_4X 0, 15, 8423, 14053, [pd_8192], 2, 3 ; t21, t26
1281%endif
1282
1283 ; from 1 stage forward
1284 SUMSUB_BA w, 4, 10, 2
1285 SUMSUB_BA w, 8, 6, 2
1286 ; from 2 stages forward
1287 SUMSUB_BA w, 8, 4, 2
1288 ; temporary storage
1289 mova [tmpq+17*%%str], m8 ; t16
1290 mova [tmpq+21*%%str], m4 ; t19
1291%if %3 <= 16
1292 pmulhrsw m3, m12, [pw_13160x2]
1293 pmulhrsw m12, [pw_9760x2]
1294 pmulhrsw m2, m13, [pw_m2404x2]
1295 pmulhrsw m13, [pw_16207x2]
1296%else
1297 mova m2, [%1+29*64]
1298 mova m3, [%1+19*64]
1299 VP9_UNPACK_MULSUB_2W_4X 12, 3, 13160, 9760, [pd_8192], 4, 8 ; t22, t25
1300 VP9_UNPACK_MULSUB_2W_4X 2, 13, 2404, 16207, [pd_8192], 4, 8 ; t23, t24
1301%endif
1302
1303 ; m10=t16, m4=t17, m8=t18, m6=t19, m14=t20, m0=t21, m12=t22, m2=t23,
1304 ; m13=t24, m3=t25, m15=t26, m1=t27, m9=t28, m7=t29, m11=t30, m5=t31
1305
1306 SUMSUB_BA w, 0, 14, 4
1307 SUMSUB_BA w, 12, 2, 4
1308 SUMSUB_BA w, 3, 13, 4
1309 SUMSUB_BA w, 15, 1, 4
1310 SUMSUB_BA w, 7, 9, 4
1311 SUMSUB_BA w, 11, 5, 4
1312
1313 ; m4=t16, m10=t17, m6=t18, m8=t19, m0=t20, m14=t21, m2=t22, m12=t23,
1314 ; m3=t24, m13=t25, m1=t26, m15=t27, m7=t28, m9=t29, m5=t30, m11=t31
1315
1316 VP9_UNPACK_MULSUB_2W_4X 5, 10, 16069, 3196, [pd_8192], 4, 8 ; t17, t30
1317 VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
1318 VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
1319 VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
1320%endif
1321
1322 ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
1323 ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
1324
1325 SUMSUB_BA w, 9, 5, 4
1326 SUMSUB_BA w, 1, 13, 4
1327 SUMSUB_BA w, 0, 12, 4
1328 SUMSUB_BA w, 15, 3, 4
1329 SUMSUB_BA w, 14, 2, 4
1330 SUMSUB_BA w, 6, 10, 4
1331 SUMSUB_BA w, 7, 11, 4
1332
1333 ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
1334 ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
1335
1336 mova m8, [tmpq+17*%%str] ; t16
1337 ; from 2 stages forward
1338 SUMSUB_BA w, 0, 8, 4
1339 SUMSUB_BA w, 15, 7, 4
1340 ; from 3 stages forward
1341 SUMSUB_BA w, 8, 7, 4
1342 pmulhrsw m7, [pw_11585x2]
1343 pmulhrsw m8, [pw_11585x2]
1344 ; store t16/t23
1345 mova [tmpq+ 1*%%str], m0 ; t16
1346 mova [tmpq+29*%%str], m7 ; t23
1347
1348 mova m4, [tmpq+21*%%str] ; t19
1349 VP9_UNPACK_MULSUB_2W_4X 10, 5, 15137, 6270, [pd_8192], 0, 7 ; t18, t29
1350 VP9_UNPACK_MULSUB_2W_4X 11, 4, 15137, 6270, [pd_8192], 0, 7 ; t19, t28
1351 VP9_UNPACK_MULSUB_2W_4X 3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27
1352 VP9_UNPACK_MULSUB_2W_4X 2, 13, 6270, m15137, [pd_8192], 0, 7 ; t21, t26
1353
1354 ; m8=t16, m9=t17, m10=t18, m11=t19, m3=t20, m2=t21, m1=t22, m0=t23,
1355 ; m15=t24, m14=t25, m13=t26, m12=t27, m4=t28, m5=t29, m6=t30, m7=t31
1356
1357 SUMSUB_BA w, 1, 9, 0
1358 SUMSUB_BA w, 2, 10, 0
1359 SUMSUB_BA w, 3, 11, 0
1360 SUMSUB_BA w, 12, 4, 0
1361 SUMSUB_BA w, 13, 5, 0
1362 SUMSUB_BA w, 14, 6, 0
1363
1364 ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
1365 ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
1366
1367 SUMSUB_BA w, 9, 6, 0
1368 SUMSUB_BA w, 10, 5, 0
1369 SUMSUB_BA w, 11, 4, 0
1370
1371 pmulhrsw m6, [pw_11585x2]
1372 pmulhrsw m9, [pw_11585x2]
1373 pmulhrsw m5, [pw_11585x2]
1374 pmulhrsw m10, [pw_11585x2]
1375 pmulhrsw m4, [pw_11585x2]
1376 pmulhrsw m11, [pw_11585x2]
1377
1378 ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
1379 ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
1380
1381 ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for
1382 ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for
1383 ; final sumsub of pass 2
1384 mova [tmpq+ 5*%%str], m1 ; t17
1385 mova [tmpq+ 9*%%str], m2 ; t18
1386 mova [tmpq+13*%%str], m3 ; t19
1387
1388 ; then do final pass to sumsub+store the two halves
1389%if %2 == 1
1390 mova [tmpq+17*%%str], m4 ; t20
1391 mova [tmpq+21*%%str], m5 ; t21
1392 mova [tmpq+25*%%str], m6 ; t22
1393
1394 mova m0, [tmpq+ 0*%%str] ; t0
1395 mova m1, [tmpq+ 4*%%str] ; t1
1396 mova m2, [tmpq+ 8*%%str] ; t2
1397 mova m3, [tmpq+12*%%str] ; t3
1398 mova m4, [tmpq+16*%%str] ; t4
1399 mova m5, [tmpq+20*%%str] ; t5
1400 mova m6, [tmpq+24*%%str] ; t6
1401
1402 SUMSUB_BA w, 15, 0, 7
1403 mova [tmpq+ 3*%%str], m0 ; t15
1404 mova m7, [tmpq+28*%%str] ; t7
1405 SUMSUB_BA w, 14, 1, 0
1406 SUMSUB_BA w, 13, 2, 0
1407 SUMSUB_BA w, 12, 3, 0
1408 SUMSUB_BA w, 11, 4, 0
1409 SUMSUB_BA w, 10, 5, 0
1410 SUMSUB_BA w, 9, 6, 0
1411 SUMSUB_BA w, 8, 7, 0
1412
1413 TRANSPOSE8x8W 15, 14, 13, 12, 11, 10, 9, 8, 0
1414 mova [tmpq+ 0*%%str], m15
1415 mova [tmpq+ 4*%%str], m14
1416 mova [tmpq+ 8*%%str], m13
1417 mova [tmpq+12*%%str], m12
1418 mova [tmpq+16*%%str], m11
1419 mova [tmpq+20*%%str], m10
1420 mova [tmpq+24*%%str], m9
1421 mova [tmpq+28*%%str], m8
1422
1423 mova m0, [tmpq+ 3*%%str] ; t15
1424 TRANSPOSE8x8W 7, 6, 5, 4, 3, 2, 1, 0, 8
1425 mova [tmpq+ 3*%%str], m7
1426 mova [tmpq+ 7*%%str], m6
1427 mova [tmpq+11*%%str], m5
1428 mova [tmpq+15*%%str], m4
1429 mova [tmpq+19*%%str], m3
1430 mova [tmpq+23*%%str], m2
1431 mova [tmpq+27*%%str], m1
1432 mova [tmpq+31*%%str], m0
1433
1434 mova m15, [tmpq+ 2*%%str] ; t8
1435 mova m14, [tmpq+ 6*%%str] ; t9
1436 mova m13, [tmpq+10*%%str] ; t10
1437 mova m12, [tmpq+14*%%str] ; t11
1438 mova m11, [tmpq+18*%%str] ; t12
1439 mova m10, [tmpq+22*%%str] ; t13
1440 mova m9, [tmpq+26*%%str] ; t14
1441 mova m8, [tmpq+30*%%str] ; t15
1442 mova m7, [tmpq+ 1*%%str] ; t16
1443 mova m6, [tmpq+ 5*%%str] ; t17
1444 mova m5, [tmpq+ 9*%%str] ; t18
1445 mova m4, [tmpq+13*%%str] ; t19
1446 mova m3, [tmpq+17*%%str] ; t20
1447 mova m2, [tmpq+21*%%str] ; t21
1448 mova m1, [tmpq+25*%%str] ; t22
1449
1450 SUMSUB_BA w, 7, 8, 0
1451 mova [tmpq+ 2*%%str], m8
1452 mova m0, [tmpq+29*%%str] ; t23
1453 SUMSUB_BA w, 6, 9, 8
1454 SUMSUB_BA w, 5, 10, 8
1455 SUMSUB_BA w, 4, 11, 8
1456 SUMSUB_BA w, 3, 12, 8
1457 SUMSUB_BA w, 2, 13, 8
1458 SUMSUB_BA w, 1, 14, 8
1459 SUMSUB_BA w, 0, 15, 8
1460
1461 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
1462 mova [tmpq+ 1*%%str], m0
1463 mova [tmpq+ 5*%%str], m1
1464 mova [tmpq+ 9*%%str], m2
1465 mova [tmpq+13*%%str], m3
1466 mova [tmpq+17*%%str], m4
1467 mova [tmpq+21*%%str], m5
1468 mova [tmpq+25*%%str], m6
1469 mova [tmpq+29*%%str], m7
1470
1471 mova m8, [tmpq+ 2*%%str]
1472 TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
1473 mova [tmpq+ 2*%%str], m8
1474 mova [tmpq+ 6*%%str], m9
1475 mova [tmpq+10*%%str], m10
1476 mova [tmpq+14*%%str], m11
1477 mova [tmpq+18*%%str], m12
1478 mova [tmpq+22*%%str], m13
1479 mova [tmpq+26*%%str], m14
1480 mova [tmpq+30*%%str], m15
1481%else
1482 ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
1483 ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
1484 ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
1485 ; t20-22 is in m4-6
1486 ; t24-31 is in m8-15
1487 pxor m7, m7
1488
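; %%STORE_2X2 finishes the last sumsub stage and stores four rows per call: two near
; the top of the block (via dstq, stepping down by stride*2) and the two mirrored rows
; near the bottom (via dst_endq, stepping up), matching the tN/t(31-N) output symmetry.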
1489%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
1490 SUMSUB_BA w, %4, %1, %5
1491 SUMSUB_BA w, %3, %2, %5
1492 pmulhrsw m%4, [pw_512]
1493 pmulhrsw m%3, [pw_512]
1494 VP9_STORE_2X %4, %3, %5, %6, %7
1495%if %8 == 1
1496 add dstq, stride2q
1497%endif
1498 pmulhrsw m%2, [pw_512]
1499 pmulhrsw m%1, [pw_512]
1500 VP9_STORE_2X %2, %1, %5, %6, %7, dst_endq
1501%if %8 == 1
1502 sub dst_endq, stride2q
1503%endif
1504%endmacro
1505
1506 ; store t0-1 and t30-31
1507 mova m0, [tmpq+ 0*%%str]
1508 mova m1, [tmpq+ 4*%%str]
1509 %%STORE_2X2 0, 1, 14, 15, 2, 3, 7
1510
1511 ; store t2-3 and t28-29
1512 mova m0, [tmpq+ 8*%%str]
1513 mova m1, [tmpq+12*%%str]
1514 %%STORE_2X2 0, 1, 12, 13, 2, 3, 7
1515
1516 ; store t4-5 and t26-27
1517 mova m0, [tmpq+16*%%str]
1518 mova m1, [tmpq+20*%%str]
1519 %%STORE_2X2 0, 1, 10, 11, 2, 3, 7
1520
1521 ; store t6-7 and t24-25
1522 mova m0, [tmpq+24*%%str]
1523 mova m1, [tmpq+28*%%str]
1524 %%STORE_2X2 0, 1, 8, 9, 2, 3, 7
1525
1526 ; store t8-9 and t22-23
1527 mova m0, [tmpq+ 2*%%str]
1528 mova m1, [tmpq+ 6*%%str]
1529 mova m8, [tmpq+29*%%str]
1530 %%STORE_2X2 0, 1, 6, 8, 2, 3, 7
1531
1532 ; store t10-11 and t20-21
1533 mova m0, [tmpq+10*%%str]
1534 mova m1, [tmpq+14*%%str]
1535 %%STORE_2X2 0, 1, 4, 5, 2, 3, 7
1536
1537 ; store t12-13 and t18-19
1538 mova m0, [tmpq+18*%%str]
1539 mova m1, [tmpq+22*%%str]
1540 mova m5, [tmpq+13*%%str]
1541 mova m4, [tmpq+ 9*%%str]
1542 %%STORE_2X2 0, 1, 4, 5, 2, 3, 7
1543
1544 ; store t14-17
1545 mova m0, [tmpq+26*%%str]
1546 mova m1, [tmpq+30*%%str]
1547 mova m5, [tmpq+ 5*%%str]
1548 mova m4, [tmpq+ 1*%%str]
1549 %%STORE_2X2 0, 1, 4, 5, 2, 3, 7, 0
1550%endif
1551%endmacro
1552
1553%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
1554INIT_XMM %1
1555cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
1556 cmp eobd, 135
1557 jg .idctfull
1558 cmp eobd, 34
1559 jg .idct16x16
1560 cmp eobd, 1
1561 jg .idct8x8
1562
1563 ; dc-only case
1564 movd m0, [blockq]
1565 mova m1, [pw_11585x2]
1566 pmulhrsw m0, m1
1567 pmulhrsw m0, m1
1568 SPLATW m0, m0, q0000
1569 pmulhrsw m0, [pw_512]
1570 pxor m5, m5
1571 movd [blockq], m5
1572 DEFINE_ARGS dst, stride, block, cnt
1573%rep 31
1574 VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
1575 add dstq, strideq
1576%endrep
1577 VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
1578 RET
1579
1580 DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
1581.idct8x8:
1582 mov tmpq, rsp
1583 VP9_IDCT32_1D blockq, 1, 8
1584
1585 mov stride30q, strideq ; stride
1586 lea stride2q, [strideq*2] ; stride*2
1587 shl stride30q, 5 ; stride*32
1588 mov cntd, 4
1589 sub stride30q, stride2q ; stride*30
1590.loop2_8x8:
1591 mov dstq, dst_bakq
1592 lea dst_endq, [dst_bakq+stride30q]
1593 VP9_IDCT32_1D tmpq, 2, 8
1594 add dst_bakq, 8
1595 add tmpq, 16
1596 dec cntd
1597 jg .loop2_8x8
1598
1599 ; at the end of the loop, m7 should still be zero
1600 ; use that to zero out block coefficients
1601 ZERO_BLOCK blockq, 64, 8, m7
1602 RET
1603
1604.idct16x16:
1605 mov cntd, 2
1606 mov tmpq, rsp
1607.loop1_16x16:
1608 VP9_IDCT32_1D blockq, 1, 16
1609 add blockq, 16
1610 add tmpq, 512
1611 dec cntd
1612 jg .loop1_16x16
1613 sub blockq, 32
1614
1615 mov stride30q, strideq ; stride
1616 lea stride2q, [strideq*2] ; stride*2
1617 shl stride30q, 5 ; stride*32
1618 mov cntd, 4
1619 mov tmpq, rsp
1620 sub stride30q, stride2q ; stride*30
1621.loop2_16x16:
1622 mov dstq, dst_bakq
1623 lea dst_endq, [dst_bakq+stride30q]
1624 VP9_IDCT32_1D tmpq, 2, 16
1625 add dst_bakq, 8
1626 add tmpq, 16
1627 dec cntd
1628 jg .loop2_16x16
1629
1630 ; at the end of the loop, m7 should still be zero
1631 ; use that to zero out block coefficients
1632 ZERO_BLOCK blockq, 64, 16, m7
1633 RET
1634
1635.idctfull:
1636 mov cntd, 4
1637 mov tmpq, rsp
1638.loop1_full:
1639 VP9_IDCT32_1D blockq, 1
1640 add blockq, 16
1641 add tmpq, 512
1642 dec cntd
1643 jg .loop1_full
1644 sub blockq, 64
1645
1646 mov stride30q, strideq ; stride
1647 lea stride2q, [strideq*2] ; stride*2
1648 shl stride30q, 5 ; stride*32
1649 mov cntd, 4
1650 mov tmpq, rsp
1651 sub stride30q, stride2q ; stride*30
1652.loop2_full:
1653 mov dstq, dst_bakq
1654 lea dst_endq, [dst_bakq+stride30q]
1655 VP9_IDCT32_1D tmpq, 2
1656 add dst_bakq, 8
1657 add tmpq, 16
1658 dec cntd
1659 jg .loop2_full
1660
1661 ; at the end of the loop, m7 should still be zero
1662 ; use that to zero out block coefficients
1663 ZERO_BLOCK blockq, 64, 32, m7
1664 RET
1665%endmacro
1666
1667VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
1668VP9_IDCT_IDCT_32x32_ADD_XMM avx
1669
1670%endif ; x86-64