; Imported Upstream version 1.4
; [deb_x265.git] / source / common / x86 / dct8.asm
; CommitLineData 72b9787e JB
1;*****************************************************************************
2;* Copyright (C) 2013 x265 project
3;*
4;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
6;* Li Cao <li@multicorewareinc.com>
7;* Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
8;*
9;* This program is free software; you can redistribute it and/or modify
10;* it under the terms of the GNU General Public License as published by
11;* the Free Software Foundation; either version 2 of the License, or
12;* (at your option) any later version.
13;*
14;* This program is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17;* GNU General Public License for more details.
18;*
19;* You should have received a copy of the GNU General Public License
20;* along with this program; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22;*
23;* This program is also available under a commercial proprietary license.
24;* For more information, contact us at license @ x265.com.
25;*****************************************************************************/
26
27;TO-DO : Further optimize the routines.
28
29%include "x86inc.asm"
30%include "x86util.asm"
31SECTION_RODATA 32
; 8-point forward DCT coefficient matrix (HEVC transform); one dw row per
; output frequency, applied by the dct8 routine below.
32tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
33 dw 89, 75, 50, 18, -18, -50, -75, -89
34 dw 83, 36, -36, -83, -83, -36, 36, 83
35 dw 75, -18, -89, -50, 50, 89, 18, -75
36 dw 64, -64, -64, 64, 64, -64, -64, 64
37 dw 50, -89, 18, 75, -75, -18, 89, -50
38 dw 36, -83, 83, -36, -36, 83, -83, 36
39 dw 18, -50, 75, -89, 89, -75, 50, -18
40
; pshufb control: swaps the words within each dword pair and reverses word
; order within each qdq half (used by the dct8/dct16 butterflies).
41dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
42
; 16-point forward DCT coefficients, first half of the rows.
43tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
44 dw 90, 87, 80, 70, 57, 43, 25, 9
45 dw 89, 75, 50, 18, -18, -50, -75, -89
46 dw 87, 57, 9, -43, -80, -90, -70, -25
47 dw 83, 36, -36, -83, -83, -36, 36, 83
48 dw 80, 9, -70, -87, -25, 57, 90, 43
49 dw 75, -18, -89, -50, 50, 89, 18, -75
50 dw 70, -43, -87, 9, 90, 25, -80, -57
51 dw 64, -64, -64, 64, 64, -64, -64, 64
52 dw 57, -80, -25, 90, -9, -87, 43, 70
53 dw 50, -89, 18, 75, -75, -18, 89, -50
54 dw 43, -90, 57, 25, -87, 70, 9, -80
55 dw 36, -83, 83, -36, -36, 83, -83, 36
56 dw 25, -70, 90, -80, 43, 9, -57, 87
57 dw 18, -50, 75, -89, 89, -75, 50, -18
58 dw 9, -25, 43, -57, 70, -80, 87, -90
59
60
; 16-point forward DCT coefficients, second half (sign-mirrored columns).
61tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
62 dw -9, -25, -43, -57, -70, -80, -87, -90
63 dw -89, -75, -50, -18, 18, 50, 75, 89
64 dw 25, 70, 90, 80, 43, -9, -57, -87
65 dw 83, 36, -36, -83, -83, -36, 36, 83
66 dw -43, -90, -57, 25, 87, 70, -9, -80
67 dw -75, 18, 89, 50, -50, -89, -18, 75
68 dw 57, 80, -25, -90, -9, 87, 43, -70
69 dw 64, -64, -64, 64, 64, -64, -64, 64
70 dw -70, -43, 87, 9, -90, 25, 80, -57
71 dw -50, 89, -18, -75, 75, 18, -89, 50
72 dw 80, -9, -70, 87, -25, -57, 90, -43
73 dw 36, -83, 83, -36, -36, 83, -83, 36
74 dw -87, 57, -9, -43, 80, -90, 70, -25
75 dw -18, 50, -75, 89, -89, 75, -50, 18
76 dw 90, -87, 80, -70, 57, -43, 25, -9
77
; word-reversal shuffle within a 16-byte lane
78dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
79
; interleave words from opposite ends of a 16-byte lane
80dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
81
; 32-point forward DCT coefficients, first half of the rows.
82tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
83 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
84 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
85 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
86 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
87 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
88 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
89 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
90 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
91 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
92 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
93 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
94 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
95 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
96 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
97 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
98 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
99 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
100 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
101 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
102 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
103 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
104 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
105 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
106 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
107 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
108 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
109 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
110 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
111 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
112 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
113 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
114
; 32-point forward DCT coefficients, second half (sign-mirrored columns).
115tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
116 dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
117 dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
118 dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
119 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
120 dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
121 dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
122 dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
123 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
124 dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
125 dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
126 dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
127 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
128 dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
129 dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
130 dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
131 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
132 dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
133 dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
134 dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
135 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
136 dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
137 dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
138 dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
139 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
140 dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
141 dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
142 dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
143 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
144 dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
145 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
146 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
147
; 8-point inverse DCT, even-part coefficient pairs for AVX2 pmaddwd
148avx2_idct8_1: times 4 dw 64, 83, 64, 36
149 times 4 dw 64, 36, -64, -83
150 times 4 dw 64, -36, -64, 83
151 times 4 dw 64, -83, 64, -36
152
; 8-point inverse DCT, odd-part coefficient rows for AVX2
153avx2_idct8_2: times 4 dw 89, 75, 50, 18
154 times 4 dw 75, -18, -89, -50
155 times 4 dw 50, -89, 18, 75
156 times 4 dw 18, -50, 75, -89
157
; vpermd control: gather even dwords then odd dwords
158idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7
159
160idct8_shuf2: times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
161
; dword-reversal shuffle within each 16-byte lane
162idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
163
; 16-point inverse DCT, odd-row coefficients
164tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9
165 dw 87, 57, 9, -43, -80, -90, -70, -25
166 dw 80, 9, -70, -87, -25, 57, 90, 43
167 dw 70, -43, -87, 9, 90, 25, -80, -57
168 dw 57, -80, -25, 90, -9, -87, 43, 70
169 dw 43, -90, 57, 25, -87, 70, 9, -80
170 dw 25, -70, 90, -80, 43, 9, -57, 87
171 dw 9, -25, 43, -57, 70, -80, 87, -90
172
; 16-point inverse DCT, even-row coefficients
173tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18
174 dw 64, 75, 36, -18, -64, -89, -83, -50
175 dw 64, 50, -36, -89, -64, 18, 83, 75
176 dw 64, 18, -83, -50, 64, 75, -36, -89
177 dw 64, -18, -83, 50, 64, -75, -36, 89
178 dw 64, -50, -36, 89, -64, -18, 83, -75
179 dw 64, -75, 36, 18, -64, 89, -83, 50
180 dw 64, -89, 83, -75, 64, -50, 36, -18
181
182idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7
183
184idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
185
; 32-point inverse DCT, odd-row coefficients
186tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
187 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
188 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
189 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
190 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
191 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
192 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
193 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
194 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
195 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
196 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
197 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
198 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
199 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
200 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
201 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
202
203
; 32-point inverse DCT, 8-point even-part coefficients (same rows as tab_idct16_2)
204tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18
205 dw 64, 75, 36, -18, -64, -89, -83, -50
206 dw 64, 50, -36, -89, -64, 18, 83, 75
207 dw 64, 18, -83, -50, 64, 75, -36, -89
208 dw 64, -18, -83, 50, 64, -75, -36, 89
209 dw 64, -50, -36, 89, -64, -18, 83, -75
210 dw 64, -75, 36, 18, -64, 89, -83, 50
211 dw 64, -89, 83, -75, 64, -50, 36, -18
212
213
; 32-point inverse DCT, 16-point odd-part coefficients (same rows as tab_idct16_1)
214tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9
215 dw 87, 57, 9, -43, -80, -90, -70, -25
216 dw 80, 9, -70, -87, -25, 57, 90, 43
217 dw 70, -43, -87, 9, 90, 25, -80, -57
218 dw 57, -80, -25, 90, -9, -87, 43, 70
219 dw 43, -90, 57, 25, -87, 70, 9, -80
220 dw 25, -70, 90, -80, 43, 9, -57, 87
221 dw 9, -25, 43, -57, 70, -80, 87, -90
222
; 32-point inverse DCT, 16-point even-part coefficients
223tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
224 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
225 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
226 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
227 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
228 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
229 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
230 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
231 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
232 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
233 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
234 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
235 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
236 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
237 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
238 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
239
; 4-point forward DCT coefficients packed for AVX2 (even half | odd half per 256-bit row)
240avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
241 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
242
; 4-point inverse DCT coefficients for AVX2 (same layout as avx2_dct4)
243avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
244 dw 83, 36, 83, 36, 83, 36, 83, 36, 36 ,-83, 36, -83, 36 ,-83, 36, -83
245
246avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
247
; exported: shared with other translation units (hence 'const')
248const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
249
250idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
251
; 4-point DCT coefficient pairs, broadcast across the register
252tab_dct4: times 4 dw 64, 64
253 times 4 dw 83, 36
254 times 4 dw 64, -64
255 times 4 dw 36, -83
256
257dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
258
; 4-point forward DST (HEVC intra 4x4 luma) coefficient rows
259tab_dst4: times 2 dw 29, 55, 74, 84
260 times 2 dw 74, 74, 0, -74
261 times 2 dw 84, -29, -74, 55
262 times 2 dw 55, -84, 74, -29
263
; 4-point inverse DST coefficient pairs
264tab_idst4: times 4 dw 29, +84
265 times 4 dw +74, +55
266 times 4 dw 55, -29
267 times 4 dw +74, -84
268 times 4 dw 74, -74
269 times 4 dw 0, +74
270 times 4 dw 84, +55
271 times 4 dw -74, -29
272
; dct8 pass-1 odd-part coefficient pairs (consumed via pmaddwd + phaddd)
273tab_dct8_1: times 2 dw 89, 50, 75, 18
274 times 2 dw 75, -89, -18, -50
275 times 2 dw 50, 18, -89, 75
276 times 2 dw 18, 75, -50, -89
277
; dct8 pass-2 coefficients as dwords (consumed via pmulld)
278tab_dct8_2: times 2 dd 83, 36
279 times 2 dd 36, 83
280 times 1 dd 89, 75, 50, 18
281 times 1 dd 75, -18, -89, -50
282 times 1 dd 50, -89, 18, 75
283 times 1 dd 18, -50, 75, -89
284
; idct8 pass-1 odd-part coefficient pairs
285tab_idct8_3: times 4 dw 89, 75
286 times 4 dw 50, 18
287 times 4 dw 75, -18
288 times 4 dw -89, -50
289 times 4 dw 50, -89
290 times 4 dw 18, 75
291 times 4 dw 18, -50
292 times 4 dw 75, -89
293
; pshufb control: interleave low/high words (0,4,1,5,2,6,3,7 word order)
294pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
295
; pshufb control: extract the even-indexed words, duplicated in both halves
296pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13
297
; idct8 pass-2 even-part coefficients, single row
298tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36
299
; idct8 pass-2 odd-part coefficients
300tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
301 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
302
; pshufb control: extract the odd-indexed words, duplicated in both halves
303pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
304
305SECTION .text
306cextern pd_1
307cextern pd_2
308cextern pd_4
309cextern pd_8
310cextern pd_16
311cextern pd_32
312cextern pd_64
313cextern pd_128
314cextern pd_256
315cextern pd_512
316cextern pd_1024
317cextern pd_2048
318cextern pw_ppppmmmm
319
320;------------------------------------------------------
321;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
322;------------------------------------------------------
; Forward 4x4 DCT, SSE2.
; In:  r0 = src (int16_t residual), r1 = dst (int32_t coeffs), r2 = stride in
;      pixels (doubled below to a byte stride).
; Pass 1 shifts by DCT_SHIFT (bit-depth dependent); pass 2 rounds with
; pd_128 and shifts by 8 (the standard HEVC second-pass shift for N=4).
323INIT_XMM sse2
324cglobal dct4, 3, 4, 8
325%if BIT_DEPTH == 10
326 %define DCT_SHIFT 3
327 mova m7, [pd_4]
328%elif BIT_DEPTH == 8
329 %define DCT_SHIFT 1
330 mova m7, [pd_1]
331%else
332 %error Unsupported BIT_DEPTH!
333%endif
; convert pixel stride to byte stride (int16_t elements)
334 add r2d, r2d
335 lea r3, [tab_dct4]
336
337 mova m4, [r3 + 0 * 16]
338 mova m5, [r3 + 1 * 16]
339 mova m6, [r3 + 2 * 16]
; load rows 0..1 and pre-shuffle so butterflies line up column-wise
340 movh m0, [r0 + 0 * r2]
341 movh m1, [r0 + 1 * r2]
342 punpcklqdq m0, m1
343 pshufd m0, m0, 0xD8
344 pshufhw m0, m0, 0xB1
345
346 lea r0, [r0 + 2 * r2]
347 movh m1, [r0]
348 movh m2, [r0 + r2]
349 punpcklqdq m1, m2
350 pshufd m1, m1, 0xD8
351 pshufhw m1, m1, 0xB1
352
353 punpcklqdq m2, m0, m1
354 punpckhqdq m0, m1
355
; pass 1: butterfly (sum/diff), multiply-accumulate with coefficient rows,
; round (m7) and shift
356 paddw m1, m2, m0
357 psubw m2, m0
358 pmaddwd m0, m1, m4
359 paddd m0, m7
360 psrad m0, DCT_SHIFT
361 pmaddwd m3, m2, m5
362 paddd m3, m7
363 psrad m3, DCT_SHIFT
364 packssdw m0, m3
365 pshufd m0, m0, 0xD8
366 pshufhw m0, m0, 0xB1
367 pmaddwd m1, m6
368 paddd m1, m7
369 psrad m1, DCT_SHIFT
370 pmaddwd m2, [r3 + 3 * 16]
371 paddd m2, m7
372 psrad m2, DCT_SHIFT
373 packssdw m1, m2
374 pshufd m1, m1, 0xD8
375 pshufhw m1, m1, 0xB1
376
; transpose intermediate for pass 2
377 punpcklqdq m2, m0, m1
378 punpckhqdq m0, m1
379
; pass-2 rounding constant (1 << (8-1))
380 mova m7, [pd_128]
381
382 pmaddwd m1, m2, m4
383 pmaddwd m3, m0, m4
384 paddd m1, m3
385 paddd m1, m7
386 psrad m1, 8
387 movu [r1 + 0 * 16], m1
388
389 pmaddwd m1, m2, m5
390 pmaddwd m3, m0, m5
391 psubd m1, m3
392 paddd m1, m7
393 psrad m1, 8
394 movu [r1 + 1 * 16], m1
395
396 pmaddwd m1, m2, m6
397 pmaddwd m3, m0, m6
398 paddd m1, m3
399 paddd m1, m7
400 psrad m1, 8
401 movu [r1 + 2 * 16], m1
402
403 pmaddwd m2, [r3 + 3 * 16]
404 pmaddwd m0, [r3 + 3 * 16]
405 psubd m2, m0
406 paddd m2, m7
407 psrad m2, 8
408 movu [r1 + 3 * 16], m2
409 RET
410
411; DCT 4x4
412;
413; Input parameters:
414; - r0: source
415; - r1: destination
416; - r2: source stride
; AVX2 variant of the forward 4x4 DCT: both passes are done on a single ymm
; register holding all 16 input samples; output is written as four 16-byte
; rows of int32_t coefficients.
417INIT_YMM avx2
418cglobal dct4, 3, 4, 8, src, dst, srcStride
419%if BIT_DEPTH == 10
420 %define DCT_SHIFT 3
421 vbroadcasti128 m7, [pd_4]
422%elif BIT_DEPTH == 8
423 %define DCT_SHIFT 1
424 vbroadcasti128 m7, [pd_1]
425%else
426 %error Unsupported BIT_DEPTH!
427%endif
; stride in int16_t elements -> bytes
428 add r2d, r2d
429 lea r3, [avx2_dct4]
430
431 vbroadcasti128 m4, [dct4_shuf]
432 mova m5, [r3]
433 mova m6, [r3 + 32]
; gather the 4x4 block into one ymm (rows 0/1 in xm0, rows 2/3 in xm1)
434 movq xm0, [r0]
435 movhps xm0, [r0 + r2]
436 lea r0, [r0 + 2 * r2]
437 movq xm1, [r0]
438 movhps xm1, [r0 + r2]
439
440 vinserti128 m0, m0, xm1, 1
441 pshufb m0, m4
; split into the two butterfly operands via 64-bit lane permutes
442 vpermq m1, m0, 11011101b
443 vpermq m0, m0, 10001000b
444 paddw m2, m0, m1
445 psubw m0, m1
446
; pass 1: rounded multiply-accumulate, shift by DCT_SHIFT
447 pmaddwd m2, m5
448 paddd m2, m7
449 psrad m2, DCT_SHIFT
450
451 pmaddwd m0, m6
452 paddd m0, m7
453 psrad m0, DCT_SHIFT
454
455 packssdw m2, m0
456 pshufb m2, m4
457 vpermq m1, m2, 11011101b
458 vpermq m2, m2, 10001000b
; pass-2 rounding constant (shift is fixed at 8 for N=4)
459 vbroadcasti128 m7, [pd_128]
460
461 pmaddwd m0, m2, m5
462 pmaddwd m3, m1, m5
463 paddd m3, m0
464 paddd m3, m7
465 psrad m3, 8
466
467 pmaddwd m2, m6
468 pmaddwd m1, m6
469 psubd m2, m1
470 paddd m2, m7
471 psrad m2, 8
472
; scatter rows: low/high xmm halves of m3 and m2 are rows 0/2 and 1/3
473 movu [r1], xm3
474 movu [r1 + mmsize/2], m2
475 vextracti128 [r1 + mmsize], m3, 1
476 vextracti128 [r1 + mmsize + mmsize/2], m2, 1
477 RET
478
479;-------------------------------------------------------
480;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
481;-------------------------------------------------------
; Inverse 4x4 DCT, SSE2.
; In:  r0 = src (int32_t coeffs), r1 = dst (int16_t residual), r2 = stride
;      in pixels (doubled to bytes below).
; Pass 1 rounds with pd_64 and shifts by 7; pass 2 uses the bit-depth
; dependent IDCT4_OFFSET/IDCT4_SHIFT pair.
482INIT_XMM sse2
483cglobal idct4, 3, 4, 7
484%if BIT_DEPTH == 8
485 %define IDCT4_OFFSET [pd_2048]
486 %define IDCT4_SHIFT 12
487%elif BIT_DEPTH == 10
488 %define IDCT4_OFFSET [pd_512]
489 %define IDCT4_SHIFT 10
490%else
491 %error Unsupported BIT_DEPTH!
492%endif
493 add r2d, r2d
494 lea r3, [tab_dct4]
495
; pass-1 rounding constant: 1 << (7-1)
496 mova m6, [pd_64]
497
; load coefficients, narrowing int32 -> int16 (values fit after forward scale)
498 movu m0, [r0 + 0 * 16]
499 movu m1, [r0 + 1 * 16]
500 packssdw m0, m1
501
502 movu m1, [r0 + 2 * 16]
503 movu m2, [r0 + 3 * 16]
504 packssdw m1, m2
505
506 punpcklwd m2, m0, m1
507 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
508 paddd m3, m6
509
510 pmaddwd m2, [r3 + 2 * 16] ; m2 = E2
511 paddd m2, m6
512
513 punpckhwd m0, m1
514 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
515 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
516
; butterfly: rows = (E +/- O) >> 7
517 paddd m4, m3, m1
518 psrad m4, 7 ; m4 = m128iA
519 paddd m5, m2, m0
520 psrad m5, 7
521 packssdw m4, m5 ; m4 = m128iA
522
523 psubd m2, m0
524 psrad m2, 7
525 psubd m3, m1
526 psrad m3, 7
527 packssdw m2, m3 ; m2 = m128iD
528
; transpose the 4x4 intermediate for pass 2
529 punpcklwd m1, m4, m2 ; m1 = S0
530 punpckhwd m4, m2 ; m4 = S8
531
532 punpcklwd m0, m1, m4 ; m0 = m128iA
533 punpckhwd m1, m4 ; m1 = m128iD
534
535 mova m6, IDCT4_OFFSET
536
537 punpcklwd m2, m0, m1
538 pmaddwd m3, m2, [r3 + 0 * 16]
539 paddd m3, m6 ; m3 = E1
540
541 pmaddwd m2, [r3 + 2 * 16]
542 paddd m2, m6 ; m2 = E2
543
544 punpckhwd m0, m1
545 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
546 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
547
548 paddd m4, m3, m1
549 psrad m4, IDCT4_SHIFT ; m4 = m128iA
550 paddd m5, m2, m0
551 psrad m5, IDCT4_SHIFT
552 packssdw m4, m5 ; m4 = m128iA
553
554 psubd m2, m0
555 psrad m2, IDCT4_SHIFT
556 psubd m3, m1
557 psrad m3, IDCT4_SHIFT
558 packssdw m2, m3 ; m2 = m128iD
559
; final transpose, then store one 8-byte row per destination line
560 punpcklwd m1, m4, m2
561 punpckhwd m4, m2
562
563 punpcklwd m0, m1, m4
564 movlps [r1 + 0 * r2], m0
565 movhps [r1 + 1 * r2], m0
566
567 punpckhwd m1, m4
568 movlps [r1 + 2 * r2], m1
569 lea r1, [r1 + 2 * r2]
570 movhps [r1 + r2], m1
571
572 RET
573
574;------------------------------------------------------
575;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
576;------------------------------------------------------
; Forward 4x4 DST (HEVC intra 4x4 luma), SSSE3.
; On x86-64 all four coefficient rows are cached in registers (m6-m9);
; on 32-bit, coef2/coef3 stay as memory operands to fit in 8 xmm regs.
577INIT_XMM ssse3
578%if ARCH_X86_64
579cglobal dst4, 3, 4, 8+2
580 %define coef2 m8
581 %define coef3 m9
582%else ; ARCH_X86_64 = 0
583cglobal dst4, 3, 4, 8
584 %define coef2 [r3 + 2 * 16]
585 %define coef3 [r3 + 3 * 16]
586%endif ; ARCH_X86_64
587%define coef0 m6
588%define coef1 m7
589
; pass-1 shift and rounding constant depend on bit depth
590%if BIT_DEPTH == 8
591 %define DST_SHIFT 1
592 mova m5, [pd_1]
593%elif BIT_DEPTH == 10
594 %define DST_SHIFT 3
595 mova m5, [pd_4]
596%endif
597 add r2d, r2d
598 lea r3, [tab_dst4]
599 mova coef0, [r3 + 0 * 16]
600 mova coef1, [r3 + 1 * 16]
601%if ARCH_X86_64
602 mova coef2, [r3 + 2 * 16]
603 mova coef3, [r3 + 3 * 16]
604%endif
605 movh m0, [r0 + 0 * r2] ; load
606 movh m1, [r0 + 1 * r2]
607 punpcklqdq m0, m1
608 lea r0, [r0 + 2 * r2]
609 movh m1, [r0]
610 movh m2, [r0 + r2]
611 punpcklqdq m1, m2
; pass 1: row-wise multiply-accumulate (phaddd folds the dword pairs),
; round with m5, shift by DST_SHIFT
612 pmaddwd m2, m0, coef0 ; DST1
613 pmaddwd m3, m1, coef0
614 phaddd m2, m3
615 paddd m2, m5
616 psrad m2, DST_SHIFT
617 pmaddwd m3, m0, coef1
618 pmaddwd m4, m1, coef1
619 phaddd m3, m4
620 paddd m3, m5
621 psrad m3, DST_SHIFT
622 packssdw m2, m3 ; m2 = T70
623 pmaddwd m3, m0, coef2
624 pmaddwd m4, m1, coef2
625 phaddd m3, m4
626 paddd m3, m5
627 psrad m3, DST_SHIFT
628 pmaddwd m0, coef3
629 pmaddwd m1, coef3
630 phaddd m0, m1
631 paddd m0, m5
632 psrad m0, DST_SHIFT
633 packssdw m3, m0 ; m3 = T71
; pass-2 rounding constant; pass-2 shift is fixed at 8
634 mova m5, [pd_128]
635
636 pmaddwd m0, m2, coef0 ; DST2
637 pmaddwd m1, m3, coef0
638 phaddd m0, m1
639 paddd m0, m5
640 psrad m0, 8
641 movu [r1 + 0 * 16], m0
642
643 pmaddwd m0, m2, coef1
644 pmaddwd m1, m3, coef1
645 phaddd m0, m1
646 paddd m0, m5
647 psrad m0, 8
648 movu [r1 + 1 * 16], m0
649
650 pmaddwd m0, m2, coef2
651 pmaddwd m1, m3, coef2
652 phaddd m0, m1
653 paddd m0, m5
654 psrad m0, 8
655 movu [r1 + 2 * 16], m0
656
657 pmaddwd m2, coef3
658 pmaddwd m3, coef3
659 phaddd m2, m3
660 paddd m2, m5
661 psrad m2, 8
662 movu [r1 + 3 * 16], m2
663
664 RET
665
666;-------------------------------------------------------
667;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
668;-------------------------------------------------------
; Inverse 4x4 DST, SSE2. Same two-pass structure as idct4: pass 1 rounds
; with pd_64 and shifts by 7, pass 2 uses the bit-depth dependent
; offset (m6) and IDCT4_SHIFT.
669INIT_XMM sse2
670cglobal idst4, 3, 4, 7
671%if BIT_DEPTH == 8
672 mova m6, [pd_2048]
673 %define IDCT4_SHIFT 12
674%elif BIT_DEPTH == 10
675 mova m6, [pd_512]
676 %define IDCT4_SHIFT 10
677%else
678 %error Unsupported BIT_DEPTH!
679%endif
680 add r2d, r2d
681 lea r3, [tab_idst4]
; pass-1 rounding constant: 1 << (7-1)
682 mova m5, [pd_64]
683
; narrow int32 coefficients to int16 for pmaddwd
684 movu m0, [r0 + 0 * 16]
685 movu m1, [r0 + 1 * 16]
686 packssdw m0, m1
687
688 movu m1, [r0 + 2 * 16]
689 movu m2, [r0 + 3 * 16]
690 packssdw m1, m2
691
692 punpcklwd m2, m0, m1 ; m2 = m128iAC
693 punpckhwd m0, m1 ; m0 = m128iBD
694
695 pmaddwd m1, m2, [r3 + 0 * 16]
696 pmaddwd m3, m0, [r3 + 1 * 16]
697 paddd m1, m3
698 paddd m1, m5
699 psrad m1, 7 ; m1 = S0
700
701 pmaddwd m3, m2, [r3 + 2 * 16]
702 pmaddwd m4, m0, [r3 + 3 * 16]
703 paddd m3, m4
704 paddd m3, m5
705 psrad m3, 7 ; m3 = S8
706 packssdw m1, m3 ; m1 = m128iA
707
708 pmaddwd m3, m2, [r3 + 4 * 16]
709 pmaddwd m4, m0, [r3 + 5 * 16]
710 paddd m3, m4
711 paddd m3, m5
712 psrad m3, 7 ; m3 = S0
713
714 pmaddwd m2, [r3 + 6 * 16]
715 pmaddwd m0, [r3 + 7 * 16]
716 paddd m2, m0
717 paddd m2, m5
718 psrad m2, 7 ; m2 = S8
719 packssdw m3, m2 ; m3 = m128iD
720
; transpose the intermediate 4x4, then run the same MAC network for pass 2
721 punpcklwd m0, m1, m3
722 punpckhwd m1, m3
723
724 punpcklwd m2, m0, m1
725 punpckhwd m0, m1
726 punpcklwd m1, m2, m0
727 punpckhwd m2, m0
728 pmaddwd m0, m1, [r3 + 0 * 16]
729 pmaddwd m3, m2, [r3 + 1 * 16]
730 paddd m0, m3
731 paddd m0, m6
732 psrad m0, IDCT4_SHIFT ; m0 = S0
733 pmaddwd m3, m1, [r3 + 2 * 16]
734 pmaddwd m4, m2, [r3 + 3 * 16]
735 paddd m3, m4
736 paddd m3, m6
737 psrad m3, IDCT4_SHIFT ; m3 = S8
738 packssdw m0, m3 ; m0 = m128iA
739 pmaddwd m3, m1, [r3 + 4 * 16]
740 pmaddwd m4, m2, [r3 + 5 * 16]
741 paddd m3, m4
742 paddd m3, m6
743 psrad m3, IDCT4_SHIFT ; m3 = S0
744 pmaddwd m1, [r3 + 6 * 16]
745 pmaddwd m2, [r3 + 7 * 16]
746 paddd m1, m2
747 paddd m1, m6
748 psrad m1, IDCT4_SHIFT ; m1 = S8
749 packssdw m3, m1 ; m3 = m128iD
; final transpose and store: one 8-byte row per destination line
750 punpcklwd m1, m0, m3
751 punpckhwd m0, m3
752
753 punpcklwd m2, m1, m0
754 movlps [r1 + 0 * r2], m2
755 movhps [r1 + 1 * r2], m2
756
757 punpckhwd m1, m0
758 movlps [r1 + 2 * r2], m1
759 lea r1, [r1 + 2 * r2]
760 movhps [r1 + r2], m1
761 RET
762
763
764;-------------------------------------------------------
765; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
766;-------------------------------------------------------
; Forward 8x8 DCT, SSE4. Pass 1 processes the 8 input rows in two groups of
; four (the %rep 2 below), spilling transposed intermediates to the stack;
; pass 2 reads them back and writes the final int32 coefficients.
767INIT_XMM sse4
768cglobal dct8, 3,6,7,0-16*mmsize
769 ;------------------------
770 ; Stack Mapping(dword)
771 ;------------------------
772 ; Row0[0-3] Row1[0-3]
773 ; ...
774 ; Row6[0-3] Row7[0-3]
775 ; Row0[0-3] Row7[0-3]
776 ; ...
777 ; Row6[4-7] Row7[4-7]
778 ;------------------------
779%if BIT_DEPTH == 10
780 %define DCT_SHIFT 4
781 mova m6, [pd_8]
782%elif BIT_DEPTH == 8
783 %define DCT_SHIFT 2
784 mova m6, [pd_2]
785%else
786 %error Unsupported BIT_DEPTH!
787%endif
788
; r2 = byte stride, r3 = 3*stride, r5 = stack spill cursor
789 add r2, r2
790 lea r3, [r2 * 3]
791 mov r5, rsp
792%assign x 0
793%rep 2
794 movu m0, [r0]
795 movu m1, [r0 + r2]
796 movu m2, [r0 + r2 * 2]
797 movu m3, [r0 + r3]
798
; 4x8 transpose of the loaded rows
799 punpcklwd m4, m0, m1
800 punpckhwd m0, m1
801 punpcklwd m5, m2, m3
802 punpckhwd m2, m3
803 punpckldq m1, m4, m5 ; m1 = [1 0]
804 punpckhdq m4, m5 ; m4 = [3 2]
805 punpckldq m3, m0, m2
806 punpckhdq m0, m2
807 pshufd m2, m3, 0x4E ; m2 = [4 5]
808 pshufd m0, m0, 0x4E ; m0 = [6 7]
809
; butterfly: sums feed the even rows, diffs feed the odd rows
810 paddw m3, m1, m0
811 psubw m1, m0 ; m1 = [d1 d0]
812 paddw m0, m4, m2
813 psubw m4, m2 ; m4 = [d3 d2]
814 punpcklqdq m2, m3, m0 ; m2 = [s2 s0]
815 punpckhqdq m3, m0
816 pshufd m3, m3, 0x4E ; m3 = [s1 s3]
817
818 punpcklwd m0, m1, m4 ; m0 = [d2/d0]
819 punpckhwd m1, m4 ; m1 = [d3/d1]
820 punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0]
821 punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0]
822
; odd rows 1,3,5,7 (second iteration stores them dword-reversed so pass 2
; reads both halves in matching column order)
823 ; odd
824 lea r4, [tab_dct8_1]
825 pmaddwd m1, m4, [r4 + 0*16]
826 pmaddwd m5, m0, [r4 + 0*16]
827 phaddd m1, m5
828 paddd m1, m6
829 psrad m1, DCT_SHIFT
830 %if x == 1
831 pshufd m1, m1, 0x1B
832 %endif
833 mova [r5 + 1*2*mmsize], m1 ; Row 1
834
835 pmaddwd m1, m4, [r4 + 1*16]
836 pmaddwd m5, m0, [r4 + 1*16]
837 phaddd m1, m5
838 paddd m1, m6
839 psrad m1, DCT_SHIFT
840 %if x == 1
841 pshufd m1, m1, 0x1B
842 %endif
843 mova [r5 + 3*2*mmsize], m1 ; Row 3
844
845 pmaddwd m1, m4, [r4 + 2*16]
846 pmaddwd m5, m0, [r4 + 2*16]
847 phaddd m1, m5
848 paddd m1, m6
849 psrad m1, DCT_SHIFT
850 %if x == 1
851 pshufd m1, m1, 0x1B
852 %endif
853 mova [r5 + 5*2*mmsize], m1 ; Row 5
854
855 pmaddwd m4, [r4 + 3*16]
856 pmaddwd m0, [r4 + 3*16]
857 phaddd m4, m0
858 paddd m4, m6
859 psrad m4, DCT_SHIFT
860 %if x == 1
861 pshufd m4, m4, 0x1B
862 %endif
863 mova [r5 + 7*2*mmsize], m4; Row 7
864
; even rows 0,2,4,6 via the 4-point DCT on (EE, EO)
865 ; even
866 lea r4, [tab_dct4]
867 paddw m0, m2, m3 ; m0 = [EE1 EE0]
868 pshufb m0, [pb_unpackhlw1]
869 psubw m2, m3 ; m2 = [EO1 EO0]
870 psignw m2, [pw_ppppmmmm]
871 pshufb m2, [pb_unpackhlw1]
872 pmaddwd m3, m0, [r4 + 0*16]
873 paddd m3, m6
874 psrad m3, DCT_SHIFT
875 %if x == 1
876 pshufd m3, m3, 0x1B
877 %endif
878 mova [r5 + 0*2*mmsize], m3 ; Row 0
879 pmaddwd m0, [r4 + 2*16]
880 paddd m0, m6
881 psrad m0, DCT_SHIFT
882 %if x == 1
883 pshufd m0, m0, 0x1B
884 %endif
885 mova [r5 + 4*2*mmsize], m0 ; Row 4
886 pmaddwd m3, m2, [r4 + 1*16]
887 paddd m3, m6
888 psrad m3, DCT_SHIFT
889 %if x == 1
890 pshufd m3, m3, 0x1B
891 %endif
892 mova [r5 + 2*2*mmsize], m3 ; Row 2
893 pmaddwd m2, [r4 + 3*16]
894 paddd m2, m6
895 psrad m2, DCT_SHIFT
896 %if x == 1
897 pshufd m2, m2, 0x1B
898 %endif
899 mova [r5 + 6*2*mmsize], m2 ; Row 6
900
; advance to the next 4 source rows / next spill column
901 %if x != 1
902 lea r0, [r0 + r2 * 4]
903 add r5, mmsize
904 %endif
905%assign x x+1
906%endrep
907
; pass 2: two outer iterations (r2 as counter), rounding 1<<(9-1), shift 9
908 mov r2, 2
909 mov r0, rsp ; r0 = pointer to Low Part
910 lea r4, [tab_dct8_2]
911 mova m6, [pd_256]
912
913.pass2:
914%rep 2
915 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0]
916 mova m1, [r0 + 1*2*mmsize]
917 paddd m2, m0, [r0 + (0*2+1)*mmsize]
918 pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0]
919 paddd m3, m1, [r0 + (1*2+1)*mmsize]
920 pshufd m3, m3, 0x9C ; m3 = ^^
921 psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0]
922 psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^
923
924 ; even
925 phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0]
926 phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0]
927
928 pslld m4, 6 ; m4 = [64*EE1 64*EE0]
929 pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0]
930 pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0]
931
932 phaddd m3, m4, m5 ; m3 = [Row2 Row0]
933 paddd m3, m6
934 psrad m3, 9
935 phsubd m4, m2 ; m4 = [Row6 Row4]
936 paddd m4, m6
937 psrad m4, 9
938 movh [r1 + 0*2*mmsize], m3
939 movhps [r1 + 2*2*mmsize], m3
940 movh [r1 + 4*2*mmsize], m4
941 movhps [r1 + 6*2*mmsize], m4
942
943 ; odd
944 pmulld m2, m0, [r4 + 2*16]
945 pmulld m3, m1, [r4 + 2*16]
946 pmulld m4, m0, [r4 + 3*16]
947 pmulld m5, m1, [r4 + 3*16]
948 phaddd m2, m3
949 phaddd m4, m5
950 phaddd m2, m4 ; m2 = [Row3 Row1]
951 paddd m2, m6
952 psrad m2, 9
953 movh [r1 + 1*2*mmsize], m2
954 movhps [r1 + 3*2*mmsize], m2
955
956 pmulld m2, m0, [r4 + 4*16]
957 pmulld m3, m1, [r4 + 4*16]
958 pmulld m4, m0, [r4 + 5*16]
959 pmulld m5, m1, [r4 + 5*16]
960 phaddd m2, m3
961 phaddd m4, m5
962 phaddd m2, m4 ; m2 = [Row7 Row5]
963 paddd m2, m6
964 psrad m2, 9
965 movh [r1 + 5*2*mmsize], m2
966 movhps [r1 + 7*2*mmsize], m2
967
968 add r1, mmsize/2
969 add r0, 2*2*mmsize
970%endrep
971
972 dec r2
973 jnz .pass2
974 RET
975
976;-------------------------------------------------------
977; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
978;-------------------------------------------------------
979INIT_XMM ssse3
980
; idct8 pass-1 helper (internal subroutine, called twice by idct8: once per
; half of the coefficient columns). NOTE(review): "patial" is a long-standing
; typo for "partial"; the label may be referenced outside this chunk, so it
; is kept as-is.
; In:  r0 = int32 coeff column base, r4 = tab_idct8_3, r5 = output scratch,
;      r6 = tab_dct4.
; Out: 8 transposed rows of 16-bit intermediates at [r5 + k*16],
;      rounded (pd_64) and shifted by 7.
; Clobbers m0-m7, flags.
981cglobal patial_butterfly_inverse_internal_pass1
; even part: columns 0,4,2,6 -> EE[0..1], EO[0..1] -> E[0..3]
982 movu m0, [r0]
983 movu m1, [r0 + 4 * 32]
984 movu m2, [r0 + 2 * 32]
985 movu m3, [r0 + 6 * 32]
986 packssdw m0, m2
987 packssdw m1, m3
988 punpckhwd m2, m0, m1 ; [2 6]
989 punpcklwd m0, m1 ; [0 4]
990 pmaddwd m1, m0, [r6] ; EE[0]
991 pmaddwd m0, [r6 + 32] ; EE[1]
992 pmaddwd m3, m2, [r6 + 16] ; EO[0]
993 pmaddwd m2, [r6 + 48] ; EO[1]
994
995 paddd m4, m1, m3 ; E[0]
996 psubd m1, m3 ; E[3]
997 paddd m3, m0, m2 ; E[1]
998 psubd m0, m2 ; E[2]
999
1000 ;E[K] = E[k] + add
1001 mova m5, [pd_64]
1002 paddd m0, m5
1003 paddd m1, m5
1004 paddd m3, m5
1005 paddd m4, m5
1006
; odd part: columns 1,3,5,7
1007 movu m2, [r0 + 32]
1008 movu m5, [r0 + 5 * 32]
1009 packssdw m2, m5
1010 movu m5, [r0 + 3 * 32]
1011 movu m6, [r0 + 7 * 32]
1012 packssdw m5, m6
1013 punpcklwd m6, m2, m5 ;[1 3]
1014 punpckhwd m2, m5 ;[5 7]
1015
1016 pmaddwd m5, m6, [r4]
1017 pmaddwd m7, m2, [r4 + 16]
1018 paddd m5, m7 ; O[0]
1019
; row k = (E[k]+O[k])>>7, row 7-k = (E[k]-O[k])>>7; stored transposed
1020 paddd m7, m4, m5
1021 psrad m7, 7
1022
1023 psubd m4, m5
1024 psrad m4, 7
1025
1026 packssdw m7, m4
1027 movh [r5 + 0 * 16], m7
1028 movhps [r5 + 7 * 16], m7
1029
1030 pmaddwd m5, m6, [r4 + 32]
1031 pmaddwd m4, m2, [r4 + 48]
1032 paddd m5, m4 ; O[1]
1033
1034 paddd m4, m3, m5
1035 psrad m4, 7
1036
1037 psubd m3, m5
1038 psrad m3, 7
1039
1040 packssdw m4, m3
1041 movh [r5 + 1 * 16], m4
1042 movhps [r5 + 6 * 16], m4
1043
1044 pmaddwd m5, m6, [r4 + 64]
1045 pmaddwd m4, m2, [r4 + 80]
1046 paddd m5, m4 ; O[2]
1047
1048 paddd m4, m0, m5
1049 psrad m4, 7
1050
1051 psubd m0, m5
1052 psrad m0, 7
1053
1054 packssdw m4, m0
1055 movh [r5 + 2 * 16], m4
1056 movhps [r5 + 5 * 16], m4
1057
1058 pmaddwd m5, m6, [r4 + 96]
1059 pmaddwd m4, m2, [r4 + 112]
1060 paddd m5, m4 ; O[3]
1061
1062 paddd m4, m1, m5
1063 psrad m4, 7
1064
1065 psubd m1, m5
1066 psrad m1, 7
1067
1068 packssdw m4, m1
1069 movh [r5 + 3 * 16], m4
1070 movhps [r5 + 4 * 16], m4
1071
1072 ret
1073
; Process one 8-wide row of idct8 pass 2.
; %1 holds the input row (and receives the packed 16-bit result).
; Uses: m6 = rounding constant (set by caller), r4 = tab_idct8_2,
;       r6 = pb_idct8odd. Clobbers m4, m5.
1074%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
1075%if BIT_DEPTH == 10
1076 %define IDCT_SHIFT 10
1077%elif BIT_DEPTH == 8
1078 %define IDCT_SHIFT 12
1079%else
1080 %error Unsupported BIT_DEPTH!
1081%endif
; even part from the even-indexed words
1082 pshufb m4, %1, [pb_idct8even]
1083 pmaddwd m4, [tab_idct8_1]
; NOTE(review): m5 is consumed here before being written in this macro —
; phsubd folds m4's own pairs via the following shuffle/hadd sequence.
1084 phsubd m5, m4
1085 pshufd m4, m4, 0x4E
1086 phaddd m4, m4
1087 punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3]
1088 paddd m4, m6
1089
; odd part from the odd-indexed words
1090 pshufb %1, %1, [r6]
1091 pmaddwd m5, %1, [r4]
1092 pmaddwd %1, [r4 + 16]
1093 phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3]
1094
; out = (E +/- O) >> IDCT_SHIFT, second half reversed to restore order
1095 paddd %1, m4, m5
1096 psrad %1, IDCT_SHIFT
1097
1098 psubd m4, m5
1099 psrad m4, IDCT_SHIFT
1100 pshufd m4, m4, 0x1B
1101
1102 packssdw %1, m4
1103%undef IDCT_SHIFT
1104%endmacro
1105
;------------------------------------------------------------------------
; patial_butterfly_inverse_internal_pass2   (sic, see pass1)
; Second (horizontal) pass of the SSSE3 8x8 IDCT: transforms four rows
; of the 16-bit intermediate at r5 and stores them to the int16_t output
; at r1 with byte stride r2 (r3 = 3 * r2 precomputed by the caller).
; Expects m6 / r4 / r6 prepared as PARTIAL_BUTTERFLY_PROCESS_ROW needs.
; Internal helper: plain ret, no ABI prologue.
;------------------------------------------------------------------------
cglobal patial_butterfly_inverse_internal_pass2

    mova        m0, [r5]
    PARTIAL_BUTTERFLY_PROCESS_ROW m0
    movu        [r1], m0

    mova        m2, [r5 + 16]
    PARTIAL_BUTTERFLY_PROCESS_ROW m2
    movu        [r1 + r2], m2

    mova        m1, [r5 + 32]
    PARTIAL_BUTTERFLY_PROCESS_ROW m1
    movu        [r1 + 2 * r2], m1

    mova        m3, [r5 + 48]
    PARTIAL_BUTTERFLY_PROCESS_ROW m3
    movu        [r1 + r3], m3

    ret
1125
;------------------------------------------------------------------------
; void idct8(const int32_t *src, int16_t *dst, intptr_t dstStride)
; SSSE3 8x8 inverse DCT driver: runs the column pass twice (columns 0-3
; then 4-7) into a 64-byte-aligned stack buffer, then the row pass twice
; (rows 0-3 then 4-7) writing the 16-bit residual to dst.
;------------------------------------------------------------------------
cglobal idct8, 3,7,8 ;,0-16*mmsize
    ; alignment stack to 64-bytes
    mov         r5, rsp
    sub         rsp, 16*mmsize + gprsize
    and         rsp, ~(64-1)
    mov         [rsp + 16*mmsize], r5   ; stash original rsp above buffer
    mov         r5, rsp                 ; r5 = intermediate buffer

    lea         r4, [tab_idct8_3]
    lea         r6, [tab_dct4]

    call        patial_butterfly_inverse_internal_pass1

    add         r0, 16                  ; next four input columns
    add         r5, 8                   ; right half of each output row

    call        patial_butterfly_inverse_internal_pass1

    ; m6 = second-pass rounding add = 1 << (IDCT_SHIFT - 1)
%if BIT_DEPTH == 10
    mova        m6, [pd_512]
%elif BIT_DEPTH == 8
    mova        m6, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
    add         r2, r2                  ; pixel stride -> byte stride (int16)
    lea         r3, [r2 * 3]
    lea         r4, [tab_idct8_2]
    lea         r6, [pb_idct8odd]
    sub         r5, 8                   ; back to start of buffer

    call        patial_butterfly_inverse_internal_pass2

    lea         r1, [r1 + 4 * r2]       ; rows 4..7
    add         r5, 64

    call        patial_butterfly_inverse_internal_pass2

    ; restore origin stack pointer
    mov         rsp, [rsp + 16*mmsize]
    RET
1167
1168
1169;-----------------------------------------------------------------------------
1170; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
1171;-----------------------------------------------------------------------------
1172INIT_XMM sse4
;------------------------------------------------------------------------
; void denoise_dct(int32_t *dct, uint32_t *sum, const uint16_t *offset,
;                  int size)
; Per coefficient: accumulate |dct[i]| into sum[i], subtract offset[i]
; from the magnitude, clamp the result at zero, then restore the
; original sign back into dct[i].  Handles 4 coefficients per iteration;
; size is assumed to be a multiple of 4.
;------------------------------------------------------------------------
cglobal denoise_dct, 4, 4, 6
    pxor        m0, m0                  ; m0 = constant zero
    shr         r3d, 2                  ; iteration count = size / 4
.loop:
    mova        m1, [r0]                ; m1 = dct[i..i+3]
    pabsd       m2, m1                  ; m2 = |dct|
    mova        m3, [r1]
    paddd       m3, m2
    mova        [r1], m3                ; sum[i] += |dct[i]|
    pmovzxwd    m4, [r2]                ; widen 4 u16 offsets to dwords
    psubd       m2, m4                  ; m2 = |dct| - offset
    pcmpgtd     m5, m2, m0              ; lane mask: result > 0 ?
    pand        m5, m2                  ; negative lanes -> 0
    psignd      m5, m1                  ; reapply original sign
    mova        [r0], m5
    add         r0, 16
    add         r1, 16
    add         r2, 8
    dec         r3d
    jnz .loop
    RET
1194
1195INIT_YMM avx2
;------------------------------------------------------------------------
; void denoise_dct(int32_t *dct, uint32_t *sum, const uint16_t *offset,
;                  int size)  — AVX2 variant
; Same algorithm as the SSE4 version but 8 coefficients per iteration
; using 256-bit registers; size is assumed to be a multiple of 8.
; Unaligned loads/stores are used since dct/sum need not be 32-aligned.
;------------------------------------------------------------------------
cglobal denoise_dct, 4, 4, 6
    pxor        m0, m0                  ; m0 = constant zero
    shr         r3d, 3                  ; iteration count = size / 8
.loop:
    movu        m1, [r0]                ; m1 = dct[i..i+7]
    pabsd       m2, m1                  ; m2 = |dct|
    movu        m3, [r1]
    paddd       m3, m2
    movu        [r1], m3                ; sum[i] += |dct[i]|
    pmovzxwd    m4, [r2]                ; widen 8 u16 offsets to dwords
    psubd       m2, m4                  ; m2 = |dct| - offset
    pcmpgtd     m5, m2, m0              ; lane mask: result > 0 ?
    pand        m5, m2                  ; negative lanes -> 0
    psignd      m5, m1                  ; reapply original sign
    movu        [r0], m5
    add         r0, 32
    add         r1, 32
    add         r2, 16
    dec         r3d
    jnz .loop
    RET
1217%if ARCH_X86_64 == 1
;------------------------------------------------------------------------
; DCT8_PASS_1  %1 = byte offset of the tab_dct8 row (from r6)
;              %2 = byte offset of the output row in the stack buffer (r5)
;              %3, %4 = register numbers of the butterfly sum/diff inputs
; Computes one 8-wide output row of the first DCT8 pass.
; Expects m5 = rounding constant; clobbers m0, m2.
;------------------------------------------------------------------------
%macro DCT8_PASS_1 4
    vpbroadcastq    m0, [r6 + %1]       ; 4 dct coefficients, replicated
    pmaddwd         m2, m%3, m0
    pmaddwd         m0, m%4
    phaddd          m2, m0
    paddd           m2, m5              ; + rounding add
    psrad           m2, DCT_SHIFT
    packssdw        m2, m2
    vpermq          m2, m2, 0x08        ; gather both lanes into low xmm
    mova            [r5 + %2], xm2
%endmacro
1229
;------------------------------------------------------------------------
; DCT8_PASS_2  %1 = byte offset of the tab_dct8 row (from r6)
; Computes one 8-wide int32 output row of the second DCT8 pass from the
; transposed intermediate rows held in m0-m3.
; Expects m5 = rounding constant; result left in m6; clobbers m7-m9.
;------------------------------------------------------------------------
%macro DCT8_PASS_2 1
    vbroadcasti128  m4, [r6 + %1]
    pmaddwd         m6, m0, m4
    pmaddwd         m7, m1, m4
    pmaddwd         m8, m2, m4
    pmaddwd         m9, m3, m4
    phaddd          m6, m7
    phaddd          m8, m9
    phaddd          m6, m8              ; 8 dot products, one per column
    paddd           m6, m5              ; + rounding add
    psrad           m6, DCT_SHIFT2
%endmacro
1242
;------------------------------------------------------------------------
; void dct8(const int16_t *src, int32_t *dst, intptr_t srcStride)
; AVX2 8x8 forward DCT.  Pass 1 butterflies the 8 input rows (loaded two
; rows per ymm) into a 16-bit intermediate on the stack; pass 2 produces
; the final int32 coefficients (32-byte output rows).
; DCT_SHIFT depends on BIT_DEPTH; DCT_SHIFT2 = 9 for both depths.
;------------------------------------------------------------------------
INIT_YMM avx2
cglobal dct8, 3, 7, 10, 0-8*16
%if BIT_DEPTH == 10
    %define         DCT_SHIFT 4
    vbroadcasti128  m5, [pd_8]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT 2
    vbroadcasti128  m5, [pd_2]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define DCT_SHIFT2 9

    add             r2d, r2d            ; pixel stride -> byte stride (int16 src)
    lea             r3, [r2 * 3]
    lea             r4, [r0 + r2 * 4]   ; r4 = src rows 4..7
    mov             r5, rsp             ; r5 = 16-bit intermediate buffer
    lea             r6, [tab_dct8]
    mova            m6, [dct8_shuf]     ; word-reversal shuffle for butterflies

    ;pass1: each ymm = one row from the top half + one from the bottom half
    mova            xm0, [r0]
    vinserti128     m0, m0, [r4], 1
    mova            xm1, [r0 + r2]
    vinserti128     m1, m1, [r4 + r2], 1
    mova            xm2, [r0 + r2 * 2]
    vinserti128     m2, m2, [r4 + r2 * 2], 1
    mova            xm3, [r0 + r3]
    vinserti128     m3, m3, [r4 + r3], 1

    punpcklqdq      m4, m0, m1
    punpckhqdq      m0, m1
    punpcklqdq      m1, m2, m3
    punpckhqdq      m2, m3

    pshufb          m0, m6              ; reverse right halves for the butterfly
    pshufb          m2, m6

    paddw           m3, m4, m0          ; even part (sums)
    paddw           m7, m1, m2

    psubw           m4, m0              ; odd part (differences)
    psubw           m1, m2

    ; output rows interleaved: even table rows use sums, odd rows use diffs
    DCT8_PASS_1     0 * 16, 0 * 16, 3, 7
    DCT8_PASS_1     1 * 16, 2 * 16, 4, 1
    DCT8_PASS_1     2 * 16, 4 * 16, 3, 7
    DCT8_PASS_1     3 * 16, 6 * 16, 4, 1
    DCT8_PASS_1     4 * 16, 1 * 16, 3, 7
    DCT8_PASS_1     5 * 16, 3 * 16, 4, 1
    DCT8_PASS_1     6 * 16, 5 * 16, 3, 7
    DCT8_PASS_1     7 * 16, 7 * 16, 4, 1

    ;pass2: output rows are 32 bytes (8 x int32)
    mov             r2d, 32
    lea             r3, [r2 * 3]
    lea             r4, [r1 + r2 * 4]
    vbroadcasti128  m5, [pd_256]        ; pass-2 rounding, 1 << (DCT_SHIFT2-1)

    mova            m0, [r5]
    mova            m1, [r5 + 32]
    mova            m2, [r5 + 64]
    mova            m3, [r5 + 96]

    DCT8_PASS_2     0 * 16
    movu            [r1], m6
    DCT8_PASS_2     1 * 16
    movu            [r1 + r2], m6
    DCT8_PASS_2     2 * 16
    movu            [r1 + r2 * 2], m6
    DCT8_PASS_2     3 * 16
    movu            [r1 + r3], m6
    DCT8_PASS_2     4 * 16
    movu            [r4], m6
    DCT8_PASS_2     5 * 16
    movu            [r4 + r2], m6
    DCT8_PASS_2     6 * 16
    movu            [r4 + r2 * 2], m6
    DCT8_PASS_2     7 * 16
    movu            [r4 + r3], m6
    RET
1324
;------------------------------------------------------------------------
; DCT16_PASS_1_E  %1 = byte offset into tab_dct16_1 (from r7, biased +8*16)
;                 %2 = output byte offset in the stack buffer (r5)
; Even-coefficient row of the first DCT16 pass: inputs are the hadd/hsub
; results in m0 and m2.  Expects m9 = rounding constant; clobbers m4, m6.
;------------------------------------------------------------------------
%macro DCT16_PASS_1_E 2
    vpbroadcastq    m7, [r7 + %1]

    pmaddwd         m4, m0, m7
    pmaddwd         m6, m2, m7
    phaddd          m4, m6

    paddd           m4, m9              ; + rounding add
    psrad           m4, DCT_SHIFT

    packssdw        m4, m4
    vpermq          m4, m4, 0x08        ; compact both lanes into low xmm

    mova            [r5 + %2], xm4
%endmacro
1340
;------------------------------------------------------------------------
; DCT16_PASS_1_O  %1 = byte offset into tab_dct16_1 (from r7, biased +8*16)
;                 %2 = output byte offset in the stack buffer (r5)
; Odd-coefficient row of the first DCT16 pass: inputs are the difference
; vectors in m0, m2, m4, m6.  Expects m9 = rounding constant;
; clobbers m10-m12.
;------------------------------------------------------------------------
%macro DCT16_PASS_1_O 2
    vbroadcasti128  m7, [r7 + %1]

    pmaddwd         m10, m0, m7
    pmaddwd         m11, m2, m7
    phaddd          m10, m11            ; [d0 d0 d1 d1 d4 d4 d5 d5]

    pmaddwd         m11, m4, m7
    pmaddwd         m12, m6, m7
    phaddd          m11, m12            ; [d2 d2 d3 d3 d6 d6 d7 d7]

    phaddd          m10, m11            ; [d0 d1 d2 d3 d4 d5 d6 d7]

    paddd           m10, m9             ; + rounding add
    psrad           m10, DCT_SHIFT

    packssdw        m10, m10            ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
    vpermq          m10, m10, 0x08

    mova            [r5 + %2], xm10
%endmacro
1362
;------------------------------------------------------------------------
; DCT16_PASS_2  %1 = byte offset into tab_dct16_1 / tab_dct16_2
;                    (from r7 / r8, both biased by +8*16)
; Second DCT16 pass: one 16-wide int32 output row from intermediate rows
; m0-m7 (low/high halves paired).  Expects m9 = rounding constant;
; result left in m10; clobbers m8, m11-m13.
;------------------------------------------------------------------------
%macro DCT16_PASS_2 1
    vbroadcasti128  m8, [r7 + %1]
    vbroadcasti128  m13, [r8 + %1]

    pmaddwd         m10, m0, m8
    pmaddwd         m11, m1, m13
    paddd           m10, m11

    pmaddwd         m11, m2, m8
    pmaddwd         m12, m3, m13
    paddd           m11, m12
    phaddd          m10, m11

    pmaddwd         m11, m4, m8
    pmaddwd         m12, m5, m13
    paddd           m11, m12

    pmaddwd         m12, m6, m8
    pmaddwd         m13, m7, m13
    paddd           m12, m13
    phaddd          m11, m12

    phaddd          m10, m11
    paddd           m10, m9             ; + rounding add
    psrad           m10, DCT_SHIFT2
%endmacro
;------------------------------------------------------------------------
; void dct16(const int16_t *src, int32_t *dst, intptr_t srcStride)
; AVX2 16x16 forward DCT.  Pass 1 runs twice (8 rows each) into a 16-bit
; intermediate on the stack; pass 2 runs twice (8 output columns each)
; producing 64-byte int32 output rows.  DCT_SHIFT depends on BIT_DEPTH;
; DCT_SHIFT2 = 10 for both depths.
;------------------------------------------------------------------------
INIT_YMM avx2
cglobal dct16, 3, 9, 15, 0-16*mmsize
%if BIT_DEPTH == 10
    %define         DCT_SHIFT 5
    vbroadcasti128  m9, [pd_16]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT 3
    vbroadcasti128  m9, [pd_4]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define DCT_SHIFT2 10

    add             r2d, r2d            ; pixel stride -> byte stride (int16 src)

    mova            m13, [dct16_shuf1]
    mova            m14, [dct16_shuf2]
    lea             r7, [tab_dct16_1 + 8 * 16]  ; bias so offsets can be +/-
    lea             r8, [tab_dct16_2 + 8 * 16]
    lea             r3, [r2 * 3]
    mov             r5, rsp
    mov             r4d, 2              ; Each iteration process 8 rows, so 16/8 iterations

.pass1:
    lea             r6, [r0 + r2 * 4]

    movu            m2, [r0]
    movu            m1, [r6]
    vperm2i128      m0, m2, m1, 0x20    ; [row0lo  row4lo]
    vperm2i128      m1, m2, m1, 0x31    ; [row0hi  row4hi]

    movu            m4, [r0 + r2]
    movu            m3, [r6 + r2]
    vperm2i128      m2, m4, m3, 0x20    ; [row1lo  row5lo]
    vperm2i128      m3, m4, m3, 0x31    ; [row1hi  row5hi]

    movu            m6, [r0 + r2 * 2]
    movu            m5, [r6 + r2 * 2]
    vperm2i128      m4, m6, m5, 0x20    ; [row2lo  row6lo]
    vperm2i128      m5, m6, m5, 0x31    ; [row2hi  row6hi]

    movu            m8, [r0 + r3]
    movu            m7, [r6 + r3]
    vperm2i128      m6, m8, m7, 0x20    ; [row3lo  row7lo]
    vperm2i128      m7, m8, m7, 0x31    ; [row3hi  row7hi]

    ; reverse each high half so sums/diffs form the 16-point butterfly
    pshufb          m1, m13
    pshufb          m3, m13
    pshufb          m5, m13
    pshufb          m7, m13

    paddw           m8, m0, m1          ;E
    psubw           m0, m1              ;O

    paddw           m1, m2, m3          ;E
    psubw           m2, m3              ;O

    paddw           m3, m4, m5          ;E
    psubw           m4, m5              ;O

    paddw           m5, m6, m7          ;E
    psubw           m6, m7              ;O

    ; odd output rows (1,3,...,15) from the difference vectors
    DCT16_PASS_1_O  -7 * 16, 1 * 32
    DCT16_PASS_1_O  -5 * 16, 3 * 32
    DCT16_PASS_1_O  -3 * 16, 1 * 32 + 16
    DCT16_PASS_1_O  -1 * 16, 3 * 32 + 16
    DCT16_PASS_1_O  1 * 16, 5 * 32
    DCT16_PASS_1_O  3 * 16, 7 * 32
    DCT16_PASS_1_O  5 * 16, 5 * 32 + 16
    DCT16_PASS_1_O  7 * 16, 7 * 32 + 16

    ; even rows: second-level butterfly via horizontal add/sub
    pshufb          m8, m14
    pshufb          m1, m14
    phaddw          m0, m8, m1

    pshufb          m3, m14
    pshufb          m5, m14
    phaddw          m2, m3, m5

    DCT16_PASS_1_E  -8 * 16, 0 * 32
    DCT16_PASS_1_E  -4 * 16, 0 * 32 + 16
    DCT16_PASS_1_E  0 * 16, 4 * 32
    DCT16_PASS_1_E  4 * 16, 4 * 32 + 16

    phsubw          m0, m8, m1
    phsubw          m2, m3, m5

    DCT16_PASS_1_E  -6 * 16, 2 * 32
    DCT16_PASS_1_E  -2 * 16, 2 * 32 + 16
    DCT16_PASS_1_E  2 * 16, 6 * 32
    DCT16_PASS_1_E  6 * 16, 6 * 32 + 16

    lea             r0, [r0 + 8 * r2]   ; next 8 source rows
    add             r5, 256

    dec             r4d
    jnz             .pass1

    mov             r5, rsp
    mov             r4d, 2
    mov             r2d, 64             ; output rows are 64 bytes (16 x int32)
    lea             r3, [r2 * 3]
    vbroadcasti128  m9, [pd_512]        ; pass-2 rounding, 1 << (DCT_SHIFT2-1)

.pass2:
    mova            m0, [r5 + 0 * 32]   ; [row0lo  row4lo]
    mova            m1, [r5 + 8 * 32]   ; [row0hi  row4hi]

    mova            m2, [r5 + 1 * 32]   ; [row1lo  row5lo]
    mova            m3, [r5 + 9 * 32]   ; [row1hi  row5hi]

    mova            m4, [r5 + 2 * 32]   ; [row2lo  row6lo]
    mova            m5, [r5 + 10 * 32]  ; [row2hi  row6hi]

    mova            m6, [r5 + 3 * 32]   ; [row3lo  row7lo]
    mova            m7, [r5 + 11 * 32]  ; [row3hi  row7hi]

    DCT16_PASS_2    -8 * 16
    movu            [r1], m10
    DCT16_PASS_2    -7 * 16
    movu            [r1 + r2], m10
    DCT16_PASS_2    -6 * 16
    movu            [r1 + r2 * 2], m10
    DCT16_PASS_2    -5 * 16
    movu            [r1 + r3], m10

    lea             r6, [r1 + r2 * 4]
    DCT16_PASS_2    -4 * 16
    movu            [r6], m10
    DCT16_PASS_2    -3 * 16
    movu            [r6 + r2], m10
    DCT16_PASS_2    -2 * 16
    movu            [r6 + r2 * 2], m10
    DCT16_PASS_2    -1 * 16
    movu            [r6 + r3], m10

    lea             r6, [r6 + r2 * 4]
    DCT16_PASS_2    0 * 16
    movu            [r6], m10
    DCT16_PASS_2    1 * 16
    movu            [r6 + r2], m10
    DCT16_PASS_2    2 * 16
    movu            [r6 + r2 * 2], m10
    DCT16_PASS_2    3 * 16
    movu            [r6 + r3], m10

    lea             r6, [r6 + r2 * 4]
    DCT16_PASS_2    4 * 16
    movu            [r6], m10
    DCT16_PASS_2    5 * 16
    movu            [r6 + r2], m10
    DCT16_PASS_2    6 * 16
    movu            [r6 + r2 * 2], m10
    DCT16_PASS_2    7 * 16
    movu            [r6 + r3], m10

    add             r1, 32              ; next 8 output columns
    add             r5, 128

    dec             r4d
    jnz             .pass2
    RET
1552
;------------------------------------------------------------------------
; DCT32_PASS_1  %1 = byte offset into tab_dct32_1 (from r7)
;               %2 = output byte offset in the stack buffer (r5)
;               %3, %4 = register numbers of the even-part inputs
; One output row of the first DCT32 pass: combines the even part (m%3,
; m%4) with the odd part held in m4-m7, then rounds, shifts, narrows and
; scatters the 8 words into the column-major intermediate (two 64-byte
; apart qwords).  Expects m9 = rounding constant; clobbers m8, m10-m14.
;------------------------------------------------------------------------
%macro DCT32_PASS_1 4
    vbroadcasti128  m8, [r7 + %1]

    pmaddwd         m11, m%3, m8
    pmaddwd         m12, m%4, m8
    phaddd          m11, m12

    vbroadcasti128  m8, [r7 + %1 + 32]
    vbroadcasti128  m10, [r7 + %1 + 48]
    pmaddwd         m12, m5, m8
    pmaddwd         m13, m6, m10
    phaddd          m12, m13

    pmaddwd         m13, m4, m8
    pmaddwd         m14, m7, m10
    phaddd          m13, m14

    phaddd          m12, m13

    phaddd          m11, m12
    paddd           m11, m9             ; + rounding add
    psrad           m11, DCT_SHIFT

    vpermq          m11, m11, 0xD8
    packssdw        m11, m11
    movq            [r5 + %2], xm11
    vextracti128    xm10, m11, 1
    movq            [r5 + %2 + 64], xm10
%endmacro
1582
;------------------------------------------------------------------------
; DCT32_PASS_2  %1 = byte offset into tab_dct32_1 / tab_dct32_2
;                    (from r7 / r8)
; Second DCT32 pass: one 4-wide partial output row from the intermediate
; rows m0-m7.  Expects xm9 = rounding constant; result left in xm11;
; clobbers m8, m10, m12-m14.
;------------------------------------------------------------------------
%macro DCT32_PASS_2 1
    mova            m8, [r7 + %1]
    mova            m10, [r8 + %1]
    pmaddwd         m11, m0, m8
    pmaddwd         m12, m1, m10
    paddd           m11, m12

    pmaddwd         m12, m2, m8
    pmaddwd         m13, m3, m10
    paddd           m12, m13

    phaddd          m11, m12

    pmaddwd         m12, m4, m8
    pmaddwd         m13, m5, m10
    paddd           m12, m13

    pmaddwd         m13, m6, m8
    pmaddwd         m14, m7, m10
    paddd           m13, m14

    phaddd          m12, m13

    phaddd          m11, m12
    vextracti128    xm10, m11, 1
    paddd           xm11, xm10          ; fold the two 128-bit lanes

    paddd           xm11, xm9           ; + rounding add
    psrad           xm11, DCT_SHIFT2

%endmacro
1614
;------------------------------------------------------------------------
; void dct32(const int16_t *src, int32_t *dst, intptr_t srcStride)
; AVX2 32x32 forward DCT.  Pass 1 processes 4 source rows per iteration
; (8 iterations) into a word intermediate on the stack; pass 2 emits 4
; int32 coefficients per macro call, 8 column groups total.
; DCT_SHIFT depends on BIT_DEPTH; DCT_SHIFT2 = 11 for both depths.
;------------------------------------------------------------------------
INIT_YMM avx2
cglobal dct32, 3, 9, 16, 0-64*mmsize
%if BIT_DEPTH == 10
    %define         DCT_SHIFT 6
    vpbroadcastq    m9, [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT 4
    vpbroadcastq    m9, [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define DCT_SHIFT2 11

    add             r2d, r2d            ; pixel stride -> byte stride (int16 src)

    lea             r7, [tab_dct32_1]
    lea             r8, [tab_dct32_2]
    lea             r3, [r2 * 3]
    mov             r5, rsp
    mov             r4d, 8              ; 8 iterations x 4 rows = 32 rows
    mova            m15, [dct16_shuf1]

.pass1:
    ; row pair 0/2: fold src[16..31] (reversed) onto src[0..15]
    movu            m2, [r0]
    movu            m1, [r0 + 32]
    pshufb          m1, m15
    vpermq          m1, m1, 0x4E
    psubw           m7, m2, m1
    paddw           m2, m1

    movu            m1, [r0 + r2 * 2]
    movu            m0, [r0 + r2 * 2 + 32]
    pshufb          m0, m15
    vpermq          m0, m0, 0x4E
    psubw           m8, m1, m0
    paddw           m1, m0
    vperm2i128      m0, m2, m1, 0x20    ; [row0lo  row2lo] for E
    vperm2i128      m3, m2, m1, 0x31    ; [row0hi  row2hi] for E
    pshufb          m3, m15
    psubw           m1, m0, m3          ; second-level butterfly of the even part
    paddw           m0, m3

    vperm2i128      m5, m7, m8, 0x20    ; [row0lo  row2lo] for O
    vperm2i128      m6, m7, m8, 0x31    ; [row0hi  row2hi] for O


    ; row pair 1/3: same folding
    movu            m4, [r0 + r2]
    movu            m2, [r0 + r2 + 32]
    pshufb          m2, m15
    vpermq          m2, m2, 0x4E
    psubw           m10, m4, m2
    paddw           m4, m2

    movu            m3, [r0 + r3]
    movu            m2, [r0 + r3 + 32]
    pshufb          m2, m15
    vpermq          m2, m2, 0x4E
    psubw           m11, m3, m2
    paddw           m3, m2
    vperm2i128      m2, m4, m3, 0x20    ; [row1lo  row3lo] for E
    vperm2i128      m8, m4, m3, 0x31    ; [row1hi  row3hi] for E
    pshufb          m8, m15
    psubw           m3, m2, m8
    paddw           m2, m8

    vperm2i128      m4, m10, m11, 0x20  ; [row1lo  row3lo] for O
    vperm2i128      m7, m10, m11, 0x31  ; [row1hi  row3hi] for O


    ; 16 output rows per iteration, alternating even/odd table rows
    DCT32_PASS_1    0 * 32,  0 * 64, 0, 2
    DCT32_PASS_1    2 * 32,  2 * 64, 1, 3
    DCT32_PASS_1    4 * 32,  4 * 64, 0, 2
    DCT32_PASS_1    6 * 32,  6 * 64, 1, 3
    DCT32_PASS_1    8 * 32,  8 * 64, 0, 2
    DCT32_PASS_1    10 * 32, 10 * 64, 1, 3
    DCT32_PASS_1    12 * 32, 12 * 64, 0, 2
    DCT32_PASS_1    14 * 32, 14 * 64, 1, 3
    DCT32_PASS_1    16 * 32, 16 * 64, 0, 2
    DCT32_PASS_1    18 * 32, 18 * 64, 1, 3
    DCT32_PASS_1    20 * 32, 20 * 64, 0, 2
    DCT32_PASS_1    22 * 32, 22 * 64, 1, 3
    DCT32_PASS_1    24 * 32, 24 * 64, 0, 2
    DCT32_PASS_1    26 * 32, 26 * 64, 1, 3
    DCT32_PASS_1    28 * 32, 28 * 64, 0, 2
    DCT32_PASS_1    30 * 32, 30 * 64, 1, 3

    add             r5, 8               ; next 4-column slot in intermediate
    lea             r0, [r0 + r2 * 4]

    dec             r4d
    jnz             .pass1

    mov             r2d, 128            ; output rows are 128 bytes (32 x int32)
    lea             r3, [r2 * 3]
    mov             r5, rsp
    mov             r4d, 8
    vpbroadcastq    m9, [pd_1024]       ; pass-2 rounding, 1 << (DCT_SHIFT2-1)

.pass2:
    mova            m0, [r5 + 0 * 64]
    mova            m1, [r5 + 0 * 64 + 32]

    mova            m2, [r5 + 1 * 64]
    mova            m3, [r5 + 1 * 64 + 32]

    mova            m4, [r5 + 2 * 64]
    mova            m5, [r5 + 2 * 64 + 32]

    mova            m6, [r5 + 3 * 64]
    mova            m7, [r5 + 3 * 64 + 32]

    DCT32_PASS_2    0 * 32
    movu            [r1], xm11
    DCT32_PASS_2    1 * 32
    movu            [r1 + r2], xm11
    DCT32_PASS_2    2 * 32
    movu            [r1 + r2 * 2], xm11
    DCT32_PASS_2    3 * 32
    movu            [r1 + r3], xm11

    lea             r6, [r1 + r2 * 4]
    DCT32_PASS_2    4 * 32
    movu            [r6], xm11
    DCT32_PASS_2    5 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    6 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    7 * 32
    movu            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    8 * 32
    movu            [r6], xm11
    DCT32_PASS_2    9 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    10 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    11 * 32
    movu            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    12 * 32
    movu            [r6], xm11
    DCT32_PASS_2    13 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    14 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    15 * 32
    movu            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    16 * 32
    movu            [r6], xm11
    DCT32_PASS_2    17 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    18 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    19 * 32
    movu            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    20 * 32
    movu            [r6], xm11
    DCT32_PASS_2    21 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    22 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    23 * 32
    movu            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    24 * 32
    movu            [r6], xm11
    DCT32_PASS_2    25 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    26 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    27 * 32
    movu            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    28 * 32
    movu            [r6], xm11
    DCT32_PASS_2    29 * 32
    movu            [r6 + r2], xm11
    DCT32_PASS_2    30 * 32
    movu            [r6 + r2 * 2], xm11
    DCT32_PASS_2    31 * 32
    movu            [r6 + r3], xm11

    add             r5, 256             ; next 8 intermediate rows
    add             r1, 16              ; next 4 output columns

    dec             r4d
    jnz             .pass2
    RET
1811
;------------------------------------------------------------------------
; IDCT8_PASS_1  %1 = byte offset into avx2_idct8_1 / avx2_idct8_2
;                    (from r5 / r6)
; First pass of the AVX2 8x8 IDCT: combines even (m4 = [0 2], m0 = [4 6])
; and odd (m1 = [1 3], m2 = [5 7]) interleaved inputs into two output
; register pairs.  Expects m11 = rounding constant (pd_64); results left
; in m3 and m6; clobbers m5, m7-m10.
;------------------------------------------------------------------------
%macro IDCT8_PASS_1 1
    vpbroadcastd    m7, [r5 + %1]
    vpbroadcastd    m10, [r5 + %1 + 4]
    pmaddwd         m5, m4, m7
    pmaddwd         m6, m0, m10
    paddd           m5, m6              ; even part E

    vpbroadcastd    m7, [r6 + %1]
    vpbroadcastd    m10, [r6 + %1 + 4]
    pmaddwd         m6, m1, m7
    pmaddwd         m3, m2, m10
    paddd           m6, m3              ; odd part O

    paddd           m3, m5, m6          ; (E + O + round) >> shift
    paddd           m3, m11
    psrad           m3, IDCT_SHIFT1

    psubd           m5, m6              ; (E - O + round) >> shift
    paddd           m5, m11
    psrad           m5, IDCT_SHIFT1

    ; second pair of columns from the next table row
    vpbroadcastd    m7, [r5 + %1 + 32]
    vpbroadcastd    m10, [r5 + %1 + 36]
    pmaddwd         m6, m4, m7
    pmaddwd         m8, m0, m10
    paddd           m6, m8

    vpbroadcastd    m7, [r6 + %1 + 32]
    vpbroadcastd    m10, [r6 + %1 + 36]
    pmaddwd         m8, m1, m7
    pmaddwd         m9, m2, m10
    paddd           m8, m9

    paddd           m9, m6, m8
    paddd           m9, m11
    psrad           m9, IDCT_SHIFT1

    psubd           m6, m8
    paddd           m6, m11
    psrad           m6, IDCT_SHIFT1

    packssdw        m3, m9
    vpermq          m3, m3, 0xD8

    packssdw        m6, m5
    vpermq          m6, m6, 0xD8
%endmacro
1859
;------------------------------------------------------------------------
; IDCT8_PASS_2  (no arguments)
; Second pass of the AVX2 8x8 IDCT: transforms four rows of the 16-bit
; intermediate held in m0/m1 using the even table at r5 (avx2_idct8_1)
; and odd table at r6 (avx2_idct8_2).  Expects m12 = rounding constant;
; results left in m8 and m9 (two rows each); clobbers m2, m3, m5-m7.
;------------------------------------------------------------------------
%macro IDCT8_PASS_2 0
    punpcklqdq      m2, m0, m1          ; even input halves
    punpckhqdq      m0, m1              ; odd input halves

    ; even part: 4 dot products per row
    pmaddwd         m3, m2, [r5]
    pmaddwd         m5, m2, [r5 + 32]
    pmaddwd         m6, m2, [r5 + 64]
    pmaddwd         m7, m2, [r5 + 96]
    phaddd          m3, m5
    phaddd          m6, m7
    pshufb          m3, [idct8_shuf2]
    pshufb          m6, [idct8_shuf2]
    punpcklqdq      m7, m3, m6
    punpckhqdq      m3, m6

    ; odd part
    pmaddwd         m5, m0, [r6]
    pmaddwd         m6, m0, [r6 + 32]
    pmaddwd         m8, m0, [r6 + 64]
    pmaddwd         m9, m0, [r6 + 96]
    phaddd          m5, m6
    phaddd          m8, m9
    pshufb          m5, [idct8_shuf2]
    pshufb          m8, [idct8_shuf2]
    punpcklqdq      m6, m5, m8
    punpckhqdq      m5, m8

    ; out = (E +/- O + round) >> shift, mirrored half reversed
    paddd           m8, m7, m6
    paddd           m8, m12
    psrad           m8, IDCT_SHIFT2

    psubd           m7, m6
    paddd           m7, m12
    psrad           m7, IDCT_SHIFT2

    pshufb          m7, [idct8_shuf3]
    packssdw        m8, m7

    paddd           m9, m3, m5
    paddd           m9, m12
    psrad           m9, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m12
    psrad           m3, IDCT_SHIFT2

    pshufb          m3, [idct8_shuf3]
    packssdw        m9, m3
%endmacro
1908
;------------------------------------------------------------------------
; void idct8(const int32_t *src, int16_t *dst, intptr_t dstStride)
; AVX2 8x8 inverse DCT: one column pass into an 8x16-byte stack buffer,
; then two row-pass invocations writing the 16-bit residual to dst.
; IDCT_SHIFT1 = 7; IDCT_SHIFT2 = 12 (8-bit) or 10 (10-bit).
;------------------------------------------------------------------------
INIT_YMM avx2
cglobal idct8, 3, 7, 13, 0-8*16
%if BIT_DEPTH == 10
    %define         IDCT_SHIFT2 10
    vpbroadcastd    m12, [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2 12
    vpbroadcastd    m12, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define IDCT_SHIFT1 7

    vbroadcasti128  m11, [pd_64]        ; pass-1 rounding, 1 << (IDCT_SHIFT1-1)

    mov             r4, rsp
    lea             r5, [avx2_idct8_1]
    lea             r6, [avx2_idct8_2]

    ;pass1: load 32-bit rows, narrow, and interleave even/odd row pairs
    mova            m0, [r0 + 0 * 32]
    mova            m1, [r0 + 4 * 32]
    packssdw        m0, m1              ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
    mova            m1, [r0 + 2 * 32]
    mova            m2, [r0 + 6 * 32]
    packssdw        m1, m2              ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
    mova            m2, [r0 + 1 * 32]
    mova            m3, [r0 + 5 * 32]
    packssdw        m2, m3              ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
    mova            m3, [r0 + 3 * 32]
    mova            m4, [r0 + 7 * 32]
    packssdw        m3, m4              ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]

    mova            m5, [idct8_shuf1]

    punpcklwd       m4, m0, m1          ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
    punpckhwd       m0, m1              ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
    vpermd          m4, m5, m4
    vpermd          m0, m5, m0

    punpcklwd       m1, m2, m3          ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
    punpckhwd       m2, m3              ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
    vpermd          m1, m5, m1
    vpermd          m2, m5, m2

    IDCT8_PASS_1    0
    mova            [r4], m3
    mova            [r4 + 96], m6

    IDCT8_PASS_1    64
    mova            [r4 + 32], m3
    mova            [r4 + 64], m6

    ;pass2
    add             r2d, r2d            ; pixel stride -> byte stride (int16)
    lea             r3, [r2 * 3]

    mova            m0, [r4]
    mova            m1, [r4 + 32]
    IDCT8_PASS_2

    vextracti128    xm3, m8, 1
    mova            [r1], xm8
    mova            [r1 + r2], xm3
    vextracti128    xm3, m9, 1
    mova            [r1 + r2 * 2], xm9
    mova            [r1 + r3], xm3

    lea             r1, [r1 + r2 * 4]   ; rows 4..7
    mova            m0, [r4 + 64]
    mova            m1, [r4 + 96]
    IDCT8_PASS_2

    vextracti128    xm3, m8, 1
    mova            [r1], xm8
    mova            [r1 + r2], xm3
    vextracti128    xm3, m9, 1
    mova            [r1 + r2 * 2], xm9
    mova            [r1 + r3], xm3
    RET
1989
;------------------------------------------------------------------------
; IDCT_PASS1  %1 = table row index (0,2,4,6)
;             %2 = mirrored row index (14,12,10,8)
; First pass of the AVX2 16x16 IDCT: produces output rows %1/%1+1 and
; their mirrored counterparts %2/%2+1 into the stack buffer at r3.
; Inputs: even rows in m0, m7, m6, m8; odd rows in m1, m3, m4, m2
; (transposed by the caller).  Expects m14 = rounding constant (pd_64);
; clobbers m5, m9-m13.
;------------------------------------------------------------------------
%macro IDCT_PASS1 2
    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]

    ; even part E for row %1
    pmaddwd         m9, m0, m5
    pmaddwd         m10, m7, m5
    phaddd          m9, m10

    pmaddwd         m10, m6, m5
    pmaddwd         m11, m8, m5
    phaddd          m10, m11

    phaddd          m9, m10
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]

    ; odd part O for row %1
    pmaddwd         m10, m1, m5
    pmaddwd         m11, m3, m5
    phaddd          m10, m11

    pmaddwd         m11, m4, m5
    pmaddwd         m12, m2, m5
    phaddd          m11, m12

    phaddd          m10, m11

    ; row %1 = (E + O) >> shift, row %2+1 = (E - O) >> shift
    paddd           m11, m9, m10
    paddd           m11, m14
    psrad           m11, IDCT_SHIFT1

    psubd           m9, m10
    paddd           m9, m14
    psrad           m9, IDCT_SHIFT1

    ; same for the next table row (%1 + 1)
    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]

    pmaddwd         m10, m0, m5
    pmaddwd         m12, m7, m5
    phaddd          m10, m12

    pmaddwd         m12, m6, m5
    pmaddwd         m13, m8, m5
    phaddd          m12, m13

    phaddd          m10, m12
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16 + 16]

    pmaddwd         m12, m1, m5
    pmaddwd         m13, m3, m5
    phaddd          m12, m13

    pmaddwd         m13, m4, m5
    pmaddwd         m5, m2
    phaddd          m13, m5

    phaddd          m12, m13

    paddd           m5, m10, m12
    paddd           m5, m14
    psrad           m5, IDCT_SHIFT1

    psubd           m10, m12
    paddd           m10, m14
    psrad           m10, IDCT_SHIFT1

    ; narrow, reorder, and store the four result rows
    packssdw        m11, m5
    packssdw        m9, m10

    mova            m10, [idct16_shuff]
    mova            m5, [idct16_shuff1]

    vpermd          m12, m10, m11
    vpermd          m13, m5, m9
    mova            [r3 + %1 * 16 * 2], xm12
    mova            [r3 + %2 * 16 * 2], xm13
    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
%endmacro
2066
2067;-------------------------------------------------------
2068; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
2069;-------------------------------------------------------
;------------------------------------------------------------------------
; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
; AVX2 16x16 inverse DCT.  Pass 1 runs twice (left/right 8 columns),
; transposing via word/dword/qword unpacks and writing a 16-bit
; intermediate to the stack; pass 2 transforms two rows per iteration
; (8 iterations) into the int16 residual at dst.
; IDCT_SHIFT1 = 7; IDCT_SHIFT2 = 12 (8-bit) or 10 (10-bit).
;------------------------------------------------------------------------
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
%if BIT_DEPTH == 10
    %define         IDCT_SHIFT2 10
    vpbroadcastd    m15, [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2 12
    vpbroadcastd    m15, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define IDCT_SHIFT1 7

    vbroadcasti128  m14, [pd_64]        ; pass-1 rounding, 1 << (IDCT_SHIFT1-1)

    add             r2d, r2d            ; pixel stride -> byte stride (int16 dst)
    mov             r3, rsp
    mov             r4d, 2              ; two iterations of 8 columns each

.pass1:
    ; load 16 int32 rows (64-byte stride), narrowing pairs to 16 bit
    movu            m0, [r0 + 0 * 64]
    movu            m1, [r0 + 8 * 64]
    packssdw        m0, m1              ;[0L 8L 0H 8H]

    movu            m1, [r0 + 1 * 64]
    movu            m2, [r0 + 9 * 64]
    packssdw        m1, m2              ;[1L 9L 1H 9H]

    movu            m2, [r0 + 2 * 64]
    movu            m3, [r0 + 10 * 64]
    packssdw        m2, m3              ;[2L 10L 2H 10H]

    movu            m3, [r0 + 3 * 64]
    movu            m4, [r0 + 11 * 64]
    packssdw        m3, m4              ;[3L 11L 3H 11H]

    movu            m4, [r0 + 4 * 64]
    movu            m5, [r0 + 12 * 64]
    packssdw        m4, m5              ;[4L 12L 4H 12H]

    movu            m5, [r0 + 5 * 64]
    movu            m6, [r0 + 13 * 64]
    packssdw        m5, m6              ;[5L 13L 5H 13H]

    movu            m6, [r0 + 6 * 64]
    movu            m7, [r0 + 14 * 64]
    packssdw        m6, m7              ;[6L 14L 6H 14H]

    movu            m7, [r0 + 7 * 64]
    movu            m8, [r0 + 15 * 64]
    packssdw        m7, m8              ;[7L 15L 7H 15H]

    ; transpose: words -> dwords -> qwords
    punpckhwd       m8, m0, m2          ;[8 10]
    punpcklwd       m0, m2              ;[0 2]

    punpckhwd       m2, m1, m3          ;[9 11]
    punpcklwd       m1, m3              ;[1 3]

    punpckhwd       m3, m4, m6          ;[12 14]
    punpcklwd       m4, m6              ;[4 6]

    punpckhwd       m6, m5, m7          ;[13 15]
    punpcklwd       m5, m7              ;[5 7]

    punpckhdq       m7, m0, m4          ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
    punpckldq       m0, m4              ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]

    punpckhdq       m4, m8, m3          ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
    punpckldq       m8, m3              ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]

    punpckhdq       m3, m1, m5          ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
    punpckldq       m1, m5              ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]

    punpckhdq       m5, m2, m6          ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
    punpckldq       m2, m6              ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]

    punpckhqdq      m6, m0, m8          ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
    punpcklqdq      m0, m8              ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]

    punpckhqdq      m8, m7, m4          ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
    punpcklqdq      m7, m4              ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]

    punpckhqdq      m4, m1, m2          ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
    punpcklqdq      m1, m2              ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]

    punpckhqdq      m2, m3, m5          ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
    punpcklqdq      m3, m5              ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]

    IDCT_PASS1      0, 14
    IDCT_PASS1      2, 12
    IDCT_PASS1      4, 10
    IDCT_PASS1      6, 8

    add             r0, 32              ; right 8 input columns
    add             r3, 16
    dec             r4d
    jnz             .pass1

    mov             r3, rsp
    mov             r4d, 8              ; 8 iterations x 2 rows = 16 rows
    lea             r5, [tab_idct16_2]
    lea             r6, [tab_idct16_1]

    ; preload the even-part coefficient rows (m14 is reloaded per use)
    vbroadcasti128  m7, [r5]
    vbroadcasti128  m8, [r5 + 16]
    vbroadcasti128  m9, [r5 + 32]
    vbroadcasti128  m10, [r5 + 48]
    vbroadcasti128  m11, [r5 + 64]
    vbroadcasti128  m12, [r5 + 80]
    vbroadcasti128  m13, [r5 + 96]

.pass2:
    movu            m1, [r3]
    vpermq          m0, m1, 0xD8

    ; even part E
    pmaddwd         m1, m0, m7
    pmaddwd         m2, m0, m8
    phaddd          m1, m2

    pmaddwd         m2, m0, m9
    pmaddwd         m3, m0, m10
    phaddd          m2, m3

    phaddd          m1, m2

    pmaddwd         m2, m0, m11
    pmaddwd         m3, m0, m12
    phaddd          m2, m3

    vbroadcasti128  m14, [r5 + 112]
    pmaddwd         m3, m0, m13
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    phaddd          m2, m3

    ; odd part O
    movu            m3, [r3 + 32]
    vpermq          m0, m3, 0xD8

    vbroadcasti128  m14, [r6]
    pmaddwd         m3, m0, m14
    vbroadcasti128  m14, [r6 + 16]
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 32]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 48]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 64]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 80]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    vbroadcasti128  m14, [r6 + 96]
    pmaddwd         m6, m0, m14
    vbroadcasti128  m14, [r6 + 112]
    pmaddwd         m0, m14
    phaddd          m6, m0

    phaddd          m4, m6

    ; out = (E +/- O + round) >> shift
    paddd           m5, m1, m3
    paddd           m5, m15
    psrad           m5, IDCT_SHIFT2

    psubd           m1, m3
    paddd           m1, m15
    psrad           m1, IDCT_SHIFT2

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    packssdw        m5, m6
    packssdw        m1, m2
    pshufb          m2, m1, [dct16_shuf1]   ; reverse the mirrored half

    mova            [r1], xm5
    mova            [r1 + 16], xm2
    vextracti128    [r1 + r2], m5, 1
    vextracti128    [r1 + r2 + 16], m2, 1

    lea             r1, [r1 + 2 * r2]
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET
2267
;------------------------------------------------------------------------
; IDCT32_PASS1  %1 = output row index (0..7)
; First pass of the AVX2 32x32 IDCT: produces four scattered dwords per
; source column for rows %1, 15-%1, 16+%1 and 31-%1 of the intermediate
; buffer (r3 = forward base, r4 = mirrored base, both 64-byte rows).
; Inputs: odd coefficients in m4, m8, m2, m1; even in m0, m7, m5, m6
; (transposed by the caller).  Expects m15 = rounding constant (pd_64);
; clobbers m3, m9-m13.
;------------------------------------------------------------------------
%macro IDCT32_PASS1 1
    vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
    vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
    pmaddwd         m9, m4, m3
    pmaddwd         m10, m8, m13
    phaddd          m9, m10

    pmaddwd         m10, m2, m3
    pmaddwd         m11, m1, m13
    phaddd          m10, m11

    phaddd          m9, m10

    ; mirrored odd row 15-%1
    vbroadcasti128  m3, [tab_idct32_1 + (15 - %1) * 32]
    vbroadcasti128  m13, [tab_idct32_1 + (15- %1) * 32 + 16]
    pmaddwd         m10, m4, m3
    pmaddwd         m11, m8, m13
    phaddd          m10, m11

    pmaddwd         m11, m2, m3
    pmaddwd         m12, m1, m13
    phaddd          m11, m12

    phaddd          m10, m11
    phaddd          m9, m10             ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]

    ; even part: 16-point even sub-transform
    vbroadcasti128  m3, [tab_idct32_2 + %1 * 16]
    pmaddwd         m10, m0, m3
    pmaddwd         m11, m7, m3
    phaddd          m10, m11
    phaddd          m10, m10

    vbroadcasti128  m3, [tab_idct32_3 + %1 * 16]
    pmaddwd         m11, m5, m3
    pmaddwd         m12, m6, m3
    phaddd          m11, m12
    phaddd          m11, m11

    paddd           m12, m10, m11       ;[row0a0 row2a0 NIL NIL row1sa0 row3a0 NIL NIL]
    psubd           m10, m11            ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]

    punpcklqdq      m12, m10            ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
    paddd           m10, m9, m12        ; (E + O + round) >> shift
    paddd           m10, m15
    psrad           m10, IDCT_SHIFT1

    psubd           m12, m9             ; (E - O + round) >> shift
    paddd           m12, m15
    psrad           m12, IDCT_SHIFT1

    ; scatter the 8 results to their transposed positions
    packssdw        m10, m12
    vextracti128    xm12, m10, 1
    movd            [r3 + %1 * 64], xm10
    movd            [r3 + 32 + %1 * 64], xm12
    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4+ 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 *64], xm10, 3
    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
%endmacro
2329
2330;-------------------------------------------------------
2331; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
2332;-------------------------------------------------------
2333
2334; TODO: Reduce PHADDD instruction by PADDD
2335
2336INIT_YMM avx2
2337cglobal idct32, 3, 6, 16, 0-32*64
2338
2339%define IDCT_SHIFT1 7
2340
2341 vbroadcasti128 m15, [pd_64]
2342
2343 mov r3, rsp
2344 lea r4, [r3 + 15 * 64]
2345 mov r5d, 8
2346
2347.pass1:
2348 movu xm0, [r0 + 2 * 128]
2349 movu xm1, [r0 + 18 * 128]
2350 vinserti128 m0, m0, [r0 + 0 * 128], 1
2351 vinserti128 m1, m1, [r0 + 16 * 128], 1
2352
2353 packssdw m0, m1 ;[2 18 0 16]
2354
2355 movu xm1, [r0 + 1 * 128]
2356 movu xm2, [r0 + 9 * 128]
2357 vinserti128 m1, m1, [r0 + 17 * 128], 1
2358 vinserti128 m2, m2, [r0 + 25 * 128], 1
2359 packssdw m1, m2 ;[1 9 17 25]
2360
2361 movu xm2, [r0 + 6 * 128]
2362 movu xm3, [r0 + 22 * 128]
2363 vinserti128 m2, m2, [r0 + 4 * 128], 1
2364 vinserti128 m3, m3, [r0 + 20 * 128], 1
2365 packssdw m2, m3 ;[6 22 4 20]
2366
2367 movu xm3, [r0 + 3 * 128]
2368 movu xm4, [r0 + 11 * 128]
2369 vinserti128 m3, m3, [r0 + 19 * 128], 1
2370 vinserti128 m4, m4, [r0 + 27 * 128], 1
2371 packssdw m3, m4 ;[3 11 19 27]
2372
2373 movu xm4, [r0 + 10 * 128]
2374 movu xm5, [r0 + 26 * 128]
2375 vinserti128 m4, m4, [r0 + 8 * 128], 1
2376 vinserti128 m5, m5, [r0 + 24 * 128], 1
2377 packssdw m4, m5 ;[10 26 8 24]
2378
2379 movu xm5, [r0 + 5 * 128]
2380 movu xm6, [r0 + 13 * 128]
2381 vinserti128 m5, m5, [r0 + 21 * 128], 1
2382 vinserti128 m6, m6, [r0 + 29 * 128], 1
2383 packssdw m5, m6 ;[5 13 21 29]
2384
2385 movu xm6, [r0 + 14 * 128]
2386 movu xm7, [r0 + 30 * 128]
2387 vinserti128 m6, m6, [r0 + 12 * 128], 1
2388 vinserti128 m7, m7, [r0 + 28 * 128], 1
2389 packssdw m6, m7 ;[14 30 12 28]
2390
2391 movu xm7, [r0 + 7 * 128]
2392 movu xm8, [r0 + 15 * 128]
2393 vinserti128 m7, m7, [r0 + 23 * 128], 1
2394 vinserti128 m8, m8, [r0 + 31 * 128], 1
2395 packssdw m7, m8 ;[7 15 23 31]
2396
2397 punpckhwd m8, m0, m2 ;[18 22 16 20]
2398 punpcklwd m0, m2 ;[2 6 0 4]
2399
2400 punpckhwd m2, m1, m3 ;[9 11 25 27]
2401 punpcklwd m1, m3 ;[1 3 17 19]
2402
2403 punpckhwd m3, m4, m6 ;[26 30 24 28]
2404 punpcklwd m4, m6 ;[10 14 8 12]
2405
2406 punpckhwd m6, m5, m7 ;[13 15 29 31]
2407 punpcklwd m5, m7 ;[5 7 21 23]
2408
2409 punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
2410 punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
2411
2412 punpckhdq m4, m8, m3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
2413 punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
2414
2415 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
2416 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
2417
2418 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
2419 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
2420
2421 punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
2422 punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
2423
2424 punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
2425 punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
2426
2427 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
2428 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
2429
2430 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
2431 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
2432
2433 vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
2434 vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]
2435
2436 vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
2437 vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]
2438
2439 vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
2440 vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]
2441
2442 vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
2443 vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]
2444
2445 IDCT32_PASS1 0
2446 IDCT32_PASS1 1
2447 IDCT32_PASS1 2
2448 IDCT32_PASS1 3
2449 IDCT32_PASS1 4
2450 IDCT32_PASS1 5
2451 IDCT32_PASS1 6
2452 IDCT32_PASS1 7
2453
2454 add r0, 16
2455 add r3, 4
2456 add r4, 4
2457 dec r5d
2458 jnz .pass1
2459
2460%if BIT_DEPTH == 10
2461 %define IDCT_SHIFT2 10
2462 vpbroadcastd m15, [pd_512]
2463%elif BIT_DEPTH == 8
2464 %define IDCT_SHIFT2 12
2465 vpbroadcastd m15, [pd_2048]
2466%else
2467 %error Unsupported BIT_DEPTH!
2468%endif
2469
2470 mov r3, rsp
2471 add r2d, r2d
2472 mov r4d, 32
2473
2474 mova m7, [tab_idct32_4]
2475 mova m8, [tab_idct32_4 + 32]
2476 mova m9, [tab_idct32_4 + 64]
2477 mova m10, [tab_idct32_4 + 96]
2478 mova m11, [tab_idct32_4 + 128]
2479 mova m12, [tab_idct32_4 + 160]
2480 mova m13, [tab_idct32_4 + 192]
2481 mova m14, [tab_idct32_4 + 224]
2482.pass2:
2483 movu m0, [r3]
2484 movu m1, [r3 + 32]
2485
2486 pmaddwd m2, m0, m7
2487 pmaddwd m3, m0, m8
2488 phaddd m2, m3
2489
2490 pmaddwd m3, m0, m9
2491 pmaddwd m4, m0, m10
2492 phaddd m3, m4
2493
2494 phaddd m2, m3
2495
2496 pmaddwd m3, m0, m11
2497 pmaddwd m4, m0, m12
2498 phaddd m3, m4
2499
2500 pmaddwd m4, m0, m13
2501 pmaddwd m5, m0, m14
2502 phaddd m4, m5
2503
2504 phaddd m3, m4
2505
2506 vperm2i128 m4, m2, m3, 0x31
2507 vperm2i128 m2, m2, m3, 0x20
2508 paddd m2, m4
2509
2510 pmaddwd m3, m0, [tab_idct32_4 + 256]
2511 pmaddwd m4, m0, [tab_idct32_4 + 288]
2512 phaddd m3, m4
2513
2514 pmaddwd m4, m0, [tab_idct32_4 + 320]
2515 pmaddwd m5, m0, [tab_idct32_4 + 352]
2516 phaddd m4, m5
2517
2518 phaddd m3, m4
2519
2520 pmaddwd m4, m0, [tab_idct32_4 + 384]
2521 pmaddwd m5, m0, [tab_idct32_4 + 416]
2522 phaddd m4, m5
2523
2524 pmaddwd m5, m0, [tab_idct32_4 + 448]
2525 pmaddwd m0, [tab_idct32_4 + 480]
2526 phaddd m5, m0
2527
2528 phaddd m4, m5
2529
2530 vperm2i128 m0, m3, m4, 0x31
2531 vperm2i128 m3, m3, m4, 0x20
2532 paddd m3, m0
2533
2534 pmaddwd m4, m1, [tab_idct32_1]
2535 pmaddwd m0, m1, [tab_idct32_1 + 32]
2536 phaddd m4, m0
2537
2538 pmaddwd m5, m1, [tab_idct32_1 + 64]
2539 pmaddwd m0, m1, [tab_idct32_1 + 96]
2540 phaddd m5, m0
2541
2542 phaddd m4, m5
2543
2544 pmaddwd m5, m1, [tab_idct32_1 + 128]
2545 pmaddwd m0, m1, [tab_idct32_1 + 160]
2546 phaddd m5, m0
2547
2548 pmaddwd m6, m1, [tab_idct32_1 + 192]
2549 pmaddwd m0, m1, [tab_idct32_1 + 224]
2550 phaddd m6, m0
2551
2552 phaddd m5, m6
2553
2554 vperm2i128 m0, m4, m5, 0x31
2555 vperm2i128 m4, m4, m5, 0x20
2556 paddd m4, m0
2557
2558 pmaddwd m5, m1, [tab_idct32_1 + 256]
2559 pmaddwd m0, m1, [tab_idct32_1 + 288]
2560 phaddd m5, m0
2561
2562 pmaddwd m6, m1, [tab_idct32_1 + 320]
2563 pmaddwd m0, m1, [tab_idct32_1 + 352]
2564 phaddd m6, m0
2565
2566 phaddd m5, m6
2567
2568 pmaddwd m6, m1, [tab_idct32_1 + 384]
2569 pmaddwd m0, m1, [tab_idct32_1 + 416]
2570 phaddd m6, m0
2571
2572 pmaddwd m0, m1, [tab_idct32_1 + 448]
2573 pmaddwd m1, [tab_idct32_1 + 480]
2574 phaddd m0, m1
2575
2576 phaddd m6, m0
2577
2578 vperm2i128 m0, m5, m6, 0x31
2579 vperm2i128 m5, m5, m6, 0x20
2580 paddd m5, m0
2581
2582 paddd m6, m2, m4
2583 paddd m6, m15
2584 psrad m6, IDCT_SHIFT2
2585
2586 psubd m2, m4
2587 paddd m2, m15
2588 psrad m2, IDCT_SHIFT2
2589
2590 paddd m4, m3, m5
2591 paddd m4, m15
2592 psrad m4, IDCT_SHIFT2
2593
2594 psubd m3, m5
2595 paddd m3, m15
2596 psrad m3, IDCT_SHIFT2
2597
2598 packssdw m6, m4
2599 packssdw m2, m3
2600
2601 vpermq m6, m6, 0xD8
2602 vpermq m2, m2, 0x8D
2603 pshufb m2, [dct16_shuf1]
2604
2605 mova [r1], m6
2606 mova [r1 + 32], m2
2607
2608 add r1, r2
2609 add r3, 64
2610 dec r4d
2611 jnz .pass2
2612 RET
2613
2614;-------------------------------------------------------
2615; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
2616;-------------------------------------------------------
2617INIT_YMM avx2
2618cglobal idct4, 3, 4, 6
2619
2620%define IDCT_SHIFT1 7
2621%if BIT_DEPTH == 10
2622 %define IDCT_SHIFT2 10
2623 vpbroadcastd m5, [pd_512]
2624%elif BIT_DEPTH == 8
2625 %define IDCT_SHIFT2 12
2626 vpbroadcastd m5, [pd_2048]
2627%else
2628 %error Unsupported BIT_DEPTH!
2629%endif
2630 vbroadcasti128 m4, [pd_64]
2631
2632 add r2d, r2d
2633 lea r3, [r2 * 3]
2634
2635 movu m0, [r0] ;[00 01 02 03 10 11 12 13]
2636 movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33]
2637
2638 packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
2639 pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
2640 vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
2641 vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
2642
2643 mova m1, [avx2_idct4_1]
2644 mova m3, [avx2_idct4_1 + 32]
2645 pmaddwd m1, m2
2646 pmaddwd m3, m0
2647
2648 paddd m0, m1, m3
2649 paddd m0, m4
2650 psrad m0, IDCT_SHIFT1 ;[00 20 10 30 01 21 11 31]
2651
2652 psubd m1, m3
2653 paddd m1, m4
2654 psrad m1, IDCT_SHIFT1 ;[03 23 13 33 02 22 12 32]
2655
2656 packssdw m0, m1 ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
2657 vmovshdup m1, m0 ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
2658 vmovsldup m0, m0 ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]
2659
2660 vpbroadcastq m2, [avx2_idct4_2]
2661 vpbroadcastq m3, [avx2_idct4_2 + 8]
2662 pmaddwd m0, m2
2663 pmaddwd m1, m3
2664
2665 paddd m2, m0, m1
2666 paddd m2, m5
2667 psrad m2, IDCT_SHIFT2 ;[00 01 10 11 30 31 20 21]
2668
2669 psubd m0, m1
2670 paddd m0, m5
2671 psrad m0, IDCT_SHIFT2 ;[03 02 13 12 33 32 23 22]
2672
2673 pshufb m0, [idct4_shuf2] ;[02 03 12 13 32 33 22 23]
2674 punpcklqdq m1, m2, m0 ;[00 01 02 03 10 11 12 13]
2675 punpckhqdq m2, m0 ;[30 31 32 33 20 21 22 23]
2676 packssdw m1, m2 ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
2677 vextracti128 xm0, m1, 1
2678
2679 movq [r1], xm1
2680 movq [r1 + r2], xm0
2681 movhps [r1 + 2 * r2], xm0
2682 movhps [r1 + r3], xm1
2683 RET
2684%endif