Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / x86 / dct8.asm
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5 ;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
6 ;* Li Cao <li@multicorewareinc.com>
7 ;* Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
8 ;*
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
13 ;*
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
18 ;*
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*
23 ;* This program is also available under a commercial proprietary license.
24 ;* For more information, contact us at license @ x265.com.
25 ;*****************************************************************************/
26
27 ;TO-DO : Further optimize the routines.
28
29 %include "x86inc.asm"
30 %include "x86util.asm"
31 SECTION_RODATA 32
; Forward 8-point DCT coefficient matrix: one basis row per `dw` line
; (HEVC integer DCT-II, 16-bit coefficients).
32 tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
33 dw 89, 75, 50, 18, -18, -50, -75, -89
34 dw 83, 36, -36, -83, -83, -36, 36, 83
35 dw 75, -18, -89, -50, 50, 89, 18, -75
36 dw 64, -64, -64, 64, 64, -64, -64, 64
37 dw 50, -89, 18, 75, -75, -18, 89, -50
38 dw 36, -83, 83, -36, -36, 83, -83, 36
39 dw 18, -50, 75, -89, 89, -75, 50, -18
40
; Byte shuffle: reverses the four 16-bit words inside each 64-bit half
; of a lane (`times 2` extends the mask to a full YMM register).
41 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
42
; Forward 16-point DCT rows, columns 0-7 (left half of the 16x16 matrix).
43 tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
44 dw 90, 87, 80, 70, 57, 43, 25, 9
45 dw 89, 75, 50, 18, -18, -50, -75, -89
46 dw 87, 57, 9, -43, -80, -90, -70, -25
47 dw 83, 36, -36, -83, -83, -36, 36, 83
48 dw 80, 9, -70, -87, -25, 57, 90, 43
49 dw 75, -18, -89, -50, 50, 89, 18, -75
50 dw 70, -43, -87, 9, 90, 25, -80, -57
51 dw 64, -64, -64, 64, 64, -64, -64, 64
52 dw 57, -80, -25, 90, -9, -87, 43, 70
53 dw 50, -89, 18, 75, -75, -18, 89, -50
54 dw 43, -90, 57, 25, -87, 70, 9, -80
55 dw 36, -83, 83, -36, -36, 83, -83, 36
56 dw 25, -70, 90, -80, 43, 9, -57, 87
57 dw 18, -50, 75, -89, 89, -75, 50, -18
58 dw 9, -25, 43, -57, 70, -80, 87, -90
59
60
; Forward 16-point DCT rows, columns 8-15 (right half; mirrors tab_dct16_1
; with alternating signs).
61 tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
62 dw -9, -25, -43, -57, -70, -80, -87, -90
63 dw -89, -75, -50, -18, 18, 50, 75, 89
64 dw 25, 70, 90, 80, 43, -9, -57, -87
65 dw 83, 36, -36, -83, -83, -36, 36, 83
66 dw -43, -90, -57, 25, 87, 70, -9, -80
67 dw -75, 18, 89, 50, -50, -89, -18, 75
68 dw 57, 80, -25, -90, -9, 87, 43, -70
69 dw 64, -64, -64, 64, 64, -64, -64, 64
70 dw -70, -43, 87, 9, -90, 25, 80, -57
71 dw -50, 89, -18, -75, 75, 18, -89, 50
72 dw 80, -9, -70, 87, -25, -57, 90, -43
73 dw 36, -83, 83, -36, -36, 83, -83, 36
74 dw -87, 57, -9, -43, 80, -90, 70, -25
75 dw -18, 50, -75, 89, -89, 75, -50, 18
76 dw 90, -87, 80, -70, 57, -43, 25, -9
77
; Word-reversal shuffle for a whole 128-bit lane (words 7..0).
78 dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
79
; Interleaving shuffle: pairs word k with word 7-k within each lane.
80 dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
81
; Forward 32-point DCT rows, columns 0-15 (left half of the 32x32 matrix).
82 tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
83 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
84 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
85 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
86 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
87 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
88 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
89 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
90 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
91 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
92 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
93 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
94 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
95 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
96 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
97 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
98 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
99 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
100 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
101 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
102 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
103 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
104 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
105 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
106 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
107 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
108 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
109 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
110 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
111 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
112 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
113 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
114
; Forward 32-point DCT rows, columns 16-31 (right half).
115 tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
116 dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
117 dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
118 dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
119 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
120 dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
121 dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
122 dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
123 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
124 dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
125 dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
126 dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
127 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
128 dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
129 dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
130 dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
131 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
132 dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
133 dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
134 dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
135 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
136 dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
137 dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
138 dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
139 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
140 dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
141 dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
142 dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
143 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
144 dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
145 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
146 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
147
; Inverse 8-point DCT, AVX2 layout: even-part basis (64/83/36 terms),
; one coefficient quartet broadcast 4x per row.
148 avx2_idct8_1: times 4 dw 64, 83, 64, 36
149 times 4 dw 64, 36, -64, -83
150 times 4 dw 64, -36, -64, 83
151 times 4 dw 64, -83, 64, -36
152
; Inverse 8-point DCT, AVX2 layout: odd-part basis (89/75/50/18 terms).
153 avx2_idct8_2: times 4 dw 89, 75, 50, 18
154 times 4 dw 75, -18, -89, -50
155 times 4 dw 50, -89, 18, 75
156 times 4 dw 18, -50, 75, -89
157
; Dword permute: gathers even dwords then odd dwords.
158 idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7
159
; Byte shuffles used by the idct8 kernels (per-lane dword reordering).
160 idct8_shuf2: times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
161
162 idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
163
; Inverse 16-point DCT: odd-part coefficient rows (odd basis frequencies).
164 tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9
165 dw 87, 57, 9, -43, -80, -90, -70, -25
166 dw 80, 9, -70, -87, -25, 57, 90, 43
167 dw 70, -43, -87, 9, 90, 25, -80, -57
168 dw 57, -80, -25, 90, -9, -87, 43, 70
169 dw 43, -90, 57, 25, -87, 70, 9, -80
170 dw 25, -70, 90, -80, 43, 9, -57, 87
171 dw 9, -25, 43, -57, 70, -80, 87, -90
172
; Inverse 16-point DCT: even part (the 8-point inverse DCT rows).
173 tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18
174 dw 64, 75, 36, -18, -64, -89, -83, -50
175 dw 64, 50, -36, -89, -64, 18, 83, 75
176 dw 64, 18, -83, -50, 64, 75, -36, -89
177 dw 64, -18, -83, 50, 64, -75, -36, 89
178 dw 64, -50, -36, 89, -64, -18, 83, -75
179 dw 64, -75, 36, 18, -64, 89, -83, 50
180 dw 64, -89, 83, -75, 64, -50, 36, -18
181
; Dword permutes used when reassembling idct16 outputs.
182 idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7
183
184 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
185
; Inverse 32-point DCT: odd-part coefficient rows (16 odd basis frequencies).
186 tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
187 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
188 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
189 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
190 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
191 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
192 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
193 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
194 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
195 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
196 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
197 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
198 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
199 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
200 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
201 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
202
203
; Inverse 32-point DCT: even-part helper tables (8-point inverse DCT rows...
204 tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18
205 dw 64, 75, 36, -18, -64, -89, -83, -50
206 dw 64, 50, -36, -89, -64, 18, 83, 75
207 dw 64, 18, -83, -50, 64, 75, -36, -89
208 dw 64, -18, -83, 50, 64, -75, -36, 89
209 dw 64, -50, -36, 89, -64, -18, 83, -75
210 dw 64, -75, 36, 18, -64, 89, -83, 50
211 dw 64, -89, 83, -75, 64, -50, 36, -18
212
213
; ...and the 16-point odd-part rows used inside the 32-point even half.
214 tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9
215 dw 87, 57, 9, -43, -80, -90, -70, -25
216 dw 80, 9, -70, -87, -25, 57, 90, 43
217 dw 70, -43, -87, 9, 90, 25, -80, -57
218 dw 57, -80, -25, 90, -9, -87, 43, 70
219 dw 43, -90, 57, 25, -87, 70, 9, -80
220 dw 25, -70, 90, -80, 43, 9, -57, 87
221 dw 9, -25, 43, -57, 70, -80, 87, -90
222
; Inverse 32-point DCT: full 16-point inverse DCT rows (even part, 16 wide).
223 tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
224 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
225 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
226 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
227 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
228 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
229 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
230 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
231 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
232 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
233 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
234 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
235 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
236 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
237 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
238 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
239
; 4-point forward DCT coefficients packed for AVX2: low lane carries the
; add (even) pair, high lane the subtract (odd) pair.
240 avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
241 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
242
; Same coefficient layout, used by the AVX2 inverse 4-point DCT.
243 avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
244 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83
245
246 avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
247
; 'const' is the x86inc macro that makes this symbol visible to other
; translation units, unlike the file-local labels in this section.
248 const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
249
250 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
251
; 4-point DCT coefficient pairs, one row per 16-byte vector (word pairs
; repeated 4x for pmaddwd).
252 tab_dct4: times 4 dw 64, 64
253 times 4 dw 83, 36
254 times 4 dw 64, -64
255 times 4 dw 36, -83
256
257 dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
258
; Forward 4-point DST-VII basis rows (29/55/74/84 family).
259 tab_dst4: times 2 dw 29, 55, 74, 84
260 times 2 dw 74, 74, 0, -74
261 times 2 dw 84, -29, -74, 55
262 times 2 dw 55, -84, 74, -29
263
; Inverse 4-point DST-VII coefficient pairs (column-major pairs for pmaddwd).
264 tab_idst4: times 4 dw 29, +84
265 times 4 dw +74, +55
266 times 4 dw 55, -29
267 times 4 dw +74, -84
268 times 4 dw 74, -74
269 times 4 dw 0, +74
270 times 4 dw 84, +55
271 times 4 dw -74, -29
272
; dct8 pass-1 odd-part coefficients (word pairs for pmaddwd+phaddd).
273 tab_dct8_1: times 2 dw 89, 50, 75, 18
274 times 2 dw 75, -89, -18, -50
275 times 2 dw 50, 18, -89, 75
276 times 2 dw 18, 75, -50, -89
277
; dct8 pass-2 coefficients as dwords (consumed by pmulld).
278 tab_dct8_2: times 2 dd 83, 36
279 times 2 dd 36, 83
280 times 1 dd 89, 75, 50, 18
281 times 1 dd 75, -18, -89, -50
282 times 1 dd 50, -89, 18, 75
283 times 1 dd 18, -50, 75, -89
284
; idct8 odd-part coefficient pairs.
285 tab_idct8_3: times 4 dw 89, 75
286 times 4 dw 50, 18
287 times 4 dw 75, -18
288 times 4 dw -89, -50
289 times 4 dw 50, -89
290 times 4 dw 18, 75
291 times 4 dw 18, -50
292 times 4 dw 75, -89
293
; Interleaves the low and high qword words: 0,8,1,9,2,10,...
294 pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
295
; Gathers the even-indexed words of each qword, duplicated to both halves.
296 pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13
297
298 tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36
299
300 tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
301 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
302
; Gathers the odd-indexed words of each qword, duplicated to both halves.
303 pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
304
305 SECTION .text
306 cextern pd_1
307 cextern pd_2
308 cextern pd_4
309 cextern pd_8
310 cextern pd_16
311 cextern pd_32
312 cextern pd_64
313 cextern pd_128
314 cextern pd_256
315 cextern pd_512
316 cextern pd_1024
317 cextern pd_2048
318 cextern pw_ppppmmmm
319
320 ;------------------------------------------------------
321 ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
322 ;------------------------------------------------------
; 4x4 forward DCT, SSE2.
; In:  r0 = src (int16_t, srcStride elements per row), r2 = srcStride
; Out: r1 = dst, 16 int16_t coefficients in two 16-byte stores
; Pass 1 (rows):    round with m7 (pd_1 for 8-bit / pd_4 for 10-bit),
;                   arithmetic shift by DCT_SHIFT (bit-depth dependent).
; Pass 2 (columns): round with pd_128, shift 8.
; Clobbers m0-m7 and r0/r3; no stack usage.
323 INIT_XMM sse2
324 cglobal dct4, 3, 4, 8
325 %if BIT_DEPTH == 10
326 %define DCT_SHIFT 3
327 mova m7, [pd_4]
328 %elif BIT_DEPTH == 8
329 %define DCT_SHIFT 1
330 mova m7, [pd_1]
331 %else
332 %error Unsupported BIT_DEPTH!
333 %endif
; Convert the element stride to a byte stride (int16_t -> *2).
334 add r2d, r2d
335 lea r3, [tab_dct4]
336
; m4/m5/m6 = first three coefficient rows of tab_dct4; the fourth row is
; referenced from memory where needed.
337 mova m4, [r3 + 0 * 16]
338 mova m5, [r3 + 1 * 16]
339 mova m6, [r3 + 2 * 16]
; Load rows 0/1 and pre-shuffle so butterfly partners sit in matching lanes.
340 movh m0, [r0 + 0 * r2]
341 movh m1, [r0 + 1 * r2]
342 punpcklqdq m0, m1
343 pshufd m0, m0, 0xD8
344 pshufhw m0, m0, 0xB1
345
; Load rows 2/3 with the same shuffle.
346 lea r0, [r0 + 2 * r2]
347 movh m1, [r0]
348 movh m2, [r0 + r2]
349 punpcklqdq m1, m2
350 pshufd m1, m1, 0xD8
351 pshufhw m1, m1, 0xB1
352
353 punpcklqdq m2, m0, m1
354 punpckhqdq m0, m1
355
; Pass 1: butterfly (sum m1, difference m2), multiply-accumulate against
; the coefficient rows, round with m7 and shift by DCT_SHIFT.
356 paddw m1, m2, m0
357 psubw m2, m0
358 pmaddwd m0, m1, m4
359 paddd m0, m7
360 psrad m0, DCT_SHIFT
361 pmaddwd m3, m2, m5
362 paddd m3, m7
363 psrad m3, DCT_SHIFT
364 packssdw m0, m3
365 pshufd m0, m0, 0xD8
366 pshufhw m0, m0, 0xB1
367 pmaddwd m1, m6
368 paddd m1, m7
369 psrad m1, DCT_SHIFT
370 pmaddwd m2, [r3 + 3 * 16]
371 paddd m2, m7
372 psrad m2, DCT_SHIFT
373 packssdw m1, m2
374 pshufd m1, m1, 0xD8
375 pshufhw m1, m1, 0xB1
376
377 punpcklqdq m2, m0, m1
378 punpckhqdq m0, m1
379
; Pass 2 rounding constant: 128, paired with the shift-by-8 below.
380 mova m7, [pd_128]
381
; Pass 2: same multiply-accumulate structure over the pass-1 output.
382 pmaddwd m1, m2, m4
383 pmaddwd m3, m0, m4
384 paddd m1, m3
385 paddd m1, m7
386 psrad m1, 8
387
388 pmaddwd m4, m2, m5
389 pmaddwd m3, m0, m5
390 psubd m4, m3
391 paddd m4, m7
392 psrad m4, 8
393 packssdw m1, m4
394 movu [r1 + 0 * 16], m1
395
396 pmaddwd m1, m2, m6
397 pmaddwd m3, m0, m6
398 paddd m1, m3
399 paddd m1, m7
400 psrad m1, 8
401
402 pmaddwd m2, [r3 + 3 * 16]
403 pmaddwd m0, [r3 + 3 * 16]
404 psubd m2, m0
405 paddd m2, m7
406 psrad m2, 8
407 packssdw m1, m2
408 movu [r1 + 1 * 16], m1
409 RET
410
411 ; DCT 4x4
412 ;
413 ; Input parameters:
414 ; - r0: source
415 ; - r1: destination
416 ; - r2: source stride
; AVX2 variant of dct4: all 16 input samples are gathered into a single YMM
; register and both transform passes run on packed lanes, using the
; avx2_dct4 coefficient table. Rounding/shift behavior is identical to the
; SSE2 version (pass 1: m7 = pd_1/pd_4 + DCT_SHIFT; pass 2: pd_128 + 8).
417 INIT_YMM avx2
418 cglobal dct4, 3, 4, 8, src, dst, srcStride
419 %if BIT_DEPTH == 10
420 %define DCT_SHIFT 3
421 vbroadcasti128 m7, [pd_4]
422 %elif BIT_DEPTH == 8
423 %define DCT_SHIFT 1
424 vbroadcasti128 m7, [pd_1]
425 %else
426 %error Unsupported BIT_DEPTH!
427 %endif
; Element stride -> byte stride.
428 add r2d, r2d
429 lea r3, [avx2_dct4]
430
431 vbroadcasti128 m4, [dct4_shuf]
432 mova m5, [r3]
433 mova m6, [r3 + 32]
; Rows 0/1 into xm0 low/high qwords, rows 2/3 into xm1.
434 movq xm0, [r0]
435 movhps xm0, [r0 + r2]
436 lea r0, [r0 + 2 * r2]
437 movq xm1, [r0]
438 movhps xm1, [r0 + r2]
439
; Merge all 4 rows into one YMM; vpermq splits butterfly partners so
; paddw/psubw below form the sum/difference halves in one shot.
440 vinserti128 m0, m0, xm1, 1
441 pshufb m0, m4
442 vpermq m1, m0, 11011101b
443 vpermq m0, m0, 10001000b
444 paddw m2, m0, m1
445 psubw m0, m1
446
; Pass 1: multiply-accumulate, round (m7), shift DCT_SHIFT.
447 pmaddwd m2, m5
448 paddd m2, m7
449 psrad m2, DCT_SHIFT
450
451 pmaddwd m0, m6
452 paddd m0, m7
453 psrad m0, DCT_SHIFT
454
455 packssdw m2, m0
456 pshufb m2, m4
457 vpermq m1, m2, 11011101b
458 vpermq m2, m2, 10001000b
; Pass 2 rounding constant (128) paired with the shift-by-8 below.
459 vbroadcasti128 m7, [pd_128]
460
461 pmaddwd m0, m2, m5
462 pmaddwd m3, m1, m5
463 paddd m3, m0
464 paddd m3, m7
465 psrad m3, 8
466
467 pmaddwd m2, m6
468 pmaddwd m1, m6
469 psubd m2, m1
470 paddd m2, m7
471 psrad m2, 8
472
; All 16 output coefficients land in one 32-byte store.
473 packssdw m3, m2
474 movu [r1], m3
475 RET
476
477 ;-------------------------------------------------------
478 ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
479 ;-------------------------------------------------------
; 4x4 inverse DCT, SSE2.
; In:  r0 = src (16 int16_t coefficients), r2 = dstStride (elements)
; Out: r1 = dst, one 8-byte row written per stride step
; Pass 1: round pd_64, shift 7.
; Pass 2: round IDCT4_OFFSET, shift IDCT4_SHIFT
;         (pd_2048 / 12 for 8-bit, pd_512 / 10 for 10-bit).
; Reuses the forward table tab_dct4; even/odd parts are selected by the
; punpcklwd/punpckhwd interleaves.
480 INIT_XMM sse2
481 cglobal idct4, 3, 4, 7
482 %if BIT_DEPTH == 8
483 %define IDCT4_OFFSET [pd_2048]
484 %define IDCT4_SHIFT 12
485 %elif BIT_DEPTH == 10
486 %define IDCT4_OFFSET [pd_512]
487 %define IDCT4_SHIFT 10
488 %else
489 %error Unsupported BIT_DEPTH!
490 %endif
; Element stride -> byte stride.
491 add r2d, r2d
492 lea r3, [tab_dct4]
493
494 mova m6, [pd_64]
495
496 movu m0, [r0 + 0 * 16]
497 movu m1, [r0 + 1 * 16]
498
; Pass 1: even part (E1/E2) from the low interleave, odd part (O1/O2)
; from the high interleave; outputs shifted by 7.
499 punpcklwd m2, m0, m1
500 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
501 paddd m3, m6
502
503 pmaddwd m2, [r3 + 2 * 16] ; m2 = E2
504 paddd m2, m6
505
506 punpckhwd m0, m1
507 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
508 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
509
510 paddd m4, m3, m1
511 psrad m4, 7 ; m4 = m128iA
512 paddd m5, m2, m0
513 psrad m5, 7
514 packssdw m4, m5 ; m4 = m128iA
515
516 psubd m2, m0
517 psrad m2, 7
518 psubd m3, m1
519 psrad m3, 7
520 packssdw m2, m3 ; m2 = m128iD
521
; Transpose the 4x4 block of words before the second (column) pass.
522 punpcklwd m1, m4, m2 ; m1 = S0
523 punpckhwd m4, m2 ; m4 = S8
524
525 punpcklwd m0, m1, m4 ; m0 = m128iA
526 punpckhwd m1, m4 ; m1 = m128iD
527
; Pass 2 rounding constant (bit-depth dependent).
528 mova m6, IDCT4_OFFSET
529
530 punpcklwd m2, m0, m1
531 pmaddwd m3, m2, [r3 + 0 * 16]
532 paddd m3, m6 ; m3 = E1
533
534 pmaddwd m2, [r3 + 2 * 16]
535 paddd m2, m6 ; m2 = E2
536
537 punpckhwd m0, m1
538 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
539 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
540
541 paddd m4, m3, m1
542 psrad m4, IDCT4_SHIFT ; m4 = m128iA
543 paddd m5, m2, m0
544 psrad m5, IDCT4_SHIFT
545 packssdw m4, m5 ; m4 = m128iA
546
547 psubd m2, m0
548 psrad m2, IDCT4_SHIFT
549 psubd m3, m1
550 psrad m3, IDCT4_SHIFT
551 packssdw m2, m3 ; m2 = m128iD
552
; Final transpose, then scatter the four 8-byte rows to dst with stride r2.
553 punpcklwd m1, m4, m2
554 punpckhwd m4, m2
555
556 punpcklwd m0, m1, m4
557 movlps [r1 + 0 * r2], m0
558 movhps [r1 + 1 * r2], m0
559
560 punpckhwd m1, m4
561 movlps [r1 + 2 * r2], m1
562 lea r1, [r1 + 2 * r2]
563 movhps [r1 + r2], m1
564
565 RET
566
567 ;------------------------------------------------------
568 ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
569 ;------------------------------------------------------
; 4x4 forward DST-VII, SSSE3.
; In:  r0 = src (int16_t), r2 = srcStride (elements)
; Out: r1 = dst, 16 int16_t coefficients in two 16-byte stores
; Pass 1: round m5 (pd_1 for 8-bit / pd_4 for 10-bit), shift DST_SHIFT.
; Pass 2: round pd_128, shift 8.
; On x86-64 all four tab_dst4 rows are cached in registers (coef2/coef3 in
; m8/m9); on x86-32 coef2/coef3 are re-read from memory at each use.
570 INIT_XMM ssse3
571 %if ARCH_X86_64
572 cglobal dst4, 3, 4, 8+2
573 %define coef2 m8
574 %define coef3 m9
575 %else ; ARCH_X86_64 = 0
576 cglobal dst4, 3, 4, 8
577 %define coef2 [r3 + 2 * 16]
578 %define coef3 [r3 + 3 * 16]
579 %endif ; ARCH_X86_64
580 %define coef0 m6
581 %define coef1 m7
582
583 %if BIT_DEPTH == 8
584 %define DST_SHIFT 1
585 mova m5, [pd_1]
586 %elif BIT_DEPTH == 10
587 %define DST_SHIFT 3
588 mova m5, [pd_4]
; Fail loudly on unsupported depths, matching dct4/idct4/dct8/idst4 in this
; file; previously an unknown BIT_DEPTH fell through with DST_SHIFT
; undefined and m5 uninitialized.
%else
%error Unsupported BIT_DEPTH!
589 %endif
; Element stride -> byte stride.
590 add r2d, r2d
591 lea r3, [tab_dst4]
592 mova coef0, [r3 + 0 * 16]
593 mova coef1, [r3 + 1 * 16]
594 %if ARCH_X86_64
595 mova coef2, [r3 + 2 * 16]
596 mova coef3, [r3 + 3 * 16]
597 %endif
598 movh m0, [r0 + 0 * r2] ; load
599 movh m1, [r0 + 1 * r2]
600 punpcklqdq m0, m1
601 lea r0, [r0 + 2 * r2]
602 movh m1, [r0]
603 movh m2, [r0 + r2]
604 punpcklqdq m1, m2
; Pass 1: rows x coefficient rows, horizontal adds, round, shift DST_SHIFT.
605 pmaddwd m2, m0, coef0 ; DST1
606 pmaddwd m3, m1, coef0
607 phaddd m2, m3
608 paddd m2, m5
609 psrad m2, DST_SHIFT
610 pmaddwd m3, m0, coef1
611 pmaddwd m4, m1, coef1
612 phaddd m3, m4
613 paddd m3, m5
614 psrad m3, DST_SHIFT
615 packssdw m2, m3 ; m2 = T70
616 pmaddwd m3, m0, coef2
617 pmaddwd m4, m1, coef2
618 phaddd m3, m4
619 paddd m3, m5
620 psrad m3, DST_SHIFT
621 pmaddwd m0, coef3
622 pmaddwd m1, coef3
623 phaddd m0, m1
624 paddd m0, m5
625 psrad m0, DST_SHIFT
626 packssdw m3, m0 ; m3 = T71
; Pass 2 rounding constant (128), paired with the shift-by-8 below.
627 mova m5, [pd_128]
628
629 pmaddwd m0, m2, coef0 ; DST2
630 pmaddwd m1, m3, coef0
631 phaddd m0, m1
632 paddd m0, m5
633 psrad m0, 8
634
635 pmaddwd m4, m2, coef1
636 pmaddwd m1, m3, coef1
637 phaddd m4, m1
638 paddd m4, m5
639 psrad m4, 8
640 packssdw m0, m4
641 movu [r1 + 0 * 16], m0
642
643 pmaddwd m0, m2, coef2
644 pmaddwd m1, m3, coef2
645 phaddd m0, m1
646 paddd m0, m5
647 psrad m0, 8
648
649 pmaddwd m2, coef3
650 pmaddwd m3, coef3
651 phaddd m2, m3
652 paddd m2, m5
653 psrad m2, 8
654 packssdw m0, m2
655 movu [r1 + 1 * 16], m0
656
657 RET
658
659 ;-------------------------------------------------------
660 ;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
661 ;-------------------------------------------------------
; 4x4 inverse DST-VII, SSE2.
; In:  r0 = src (16 int16_t coefficients), r2 = dstStride (elements)
; Out: r1 = dst, one 8-byte row per stride step
; Pass 1: round pd_64, shift 7.
; Pass 2: round m6 (pd_2048 for 8-bit / pd_512 for 10-bit),
;         shift IDCT4_SHIFT (12 / 10).
; Coefficient pairs come from tab_idst4 (8 rows of word pairs for pmaddwd).
662 INIT_XMM sse2
663 cglobal idst4, 3, 4, 7
664 %if BIT_DEPTH == 8
665 mova m6, [pd_2048]
666 %define IDCT4_SHIFT 12
667 %elif BIT_DEPTH == 10
668 mova m6, [pd_512]
669 %define IDCT4_SHIFT 10
670 %else
671 %error Unsupported BIT_DEPTH!
672 %endif
; Element stride -> byte stride.
673 add r2d, r2d
674 lea r3, [tab_idst4]
675 mova m5, [pd_64]
676
677 movu m0, [r0 + 0 * 16]
678 movu m1, [r0 + 1 * 16]
679
; Pass 1: interleave coefficient rows, multiply-accumulate against the
; 8 tab_idst4 rows, round pd_64, shift 7.
680 punpcklwd m2, m0, m1 ; m2 = m128iAC
681 punpckhwd m0, m1 ; m0 = m128iBD
682
683 pmaddwd m1, m2, [r3 + 0 * 16]
684 pmaddwd m3, m0, [r3 + 1 * 16]
685 paddd m1, m3
686 paddd m1, m5
687 psrad m1, 7 ; m1 = S0
688
689 pmaddwd m3, m2, [r3 + 2 * 16]
690 pmaddwd m4, m0, [r3 + 3 * 16]
691 paddd m3, m4
692 paddd m3, m5
693 psrad m3, 7 ; m3 = S8
694 packssdw m1, m3 ; m1 = m128iA
695
696 pmaddwd m3, m2, [r3 + 4 * 16]
697 pmaddwd m4, m0, [r3 + 5 * 16]
698 paddd m3, m4
699 paddd m3, m5
700 psrad m3, 7 ; m3 = S0
701
702 pmaddwd m2, [r3 + 6 * 16]
703 pmaddwd m0, [r3 + 7 * 16]
704 paddd m2, m0
705 paddd m2, m5
706 psrad m2, 7 ; m2 = S8
707 packssdw m3, m2 ; m3 = m128iD
708
; Transpose the intermediate 4x4 block of words.
709 punpcklwd m0, m1, m3
710 punpckhwd m1, m3
711
712 punpcklwd m2, m0, m1
713 punpckhwd m0, m1
714 punpcklwd m1, m2, m0
715 punpckhwd m2, m0
; Pass 2: same structure, rounded by m6 and shifted by IDCT4_SHIFT.
716 pmaddwd m0, m1, [r3 + 0 * 16]
717 pmaddwd m3, m2, [r3 + 1 * 16]
718 paddd m0, m3
719 paddd m0, m6
720 psrad m0, IDCT4_SHIFT ; m0 = S0
721 pmaddwd m3, m1, [r3 + 2 * 16]
722 pmaddwd m4, m2, [r3 + 3 * 16]
723 paddd m3, m4
724 paddd m3, m6
725 psrad m3, IDCT4_SHIFT ; m3 = S8
726 packssdw m0, m3 ; m0 = m128iA
727 pmaddwd m3, m1, [r3 + 4 * 16]
728 pmaddwd m4, m2, [r3 + 5 * 16]
729 paddd m3, m4
730 paddd m3, m6
731 psrad m3, IDCT4_SHIFT ; m3 = S0
732 pmaddwd m1, [r3 + 6 * 16]
733 pmaddwd m2, [r3 + 7 * 16]
734 paddd m1, m2
735 paddd m1, m6
736 psrad m1, IDCT4_SHIFT ; m1 = S8
737 packssdw m3, m1 ; m3 = m128iD
; Final transpose and scatter of the four 8-byte rows to dst (stride r2).
738 punpcklwd m1, m0, m3
739 punpckhwd m0, m3
740
741 punpcklwd m2, m1, m0
742 movlps [r1 + 0 * r2], m2
743 movhps [r1 + 1 * r2], m2
744
745 punpckhwd m1, m0
746 movlps [r1 + 2 * r2], m1
747 lea r1, [r1 + 2 * r2]
748 movhps [r1 + r2], m1
749 RET
750
751
752 ;-------------------------------------------------------
753 ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
754 ;-------------------------------------------------------
; 8x8 forward DCT, SSE4.1 (uses pmulld/phaddd/psignw).
; In:  r0 = src (int16_t), r2 = srcStride (elements); Out: r1 = dst.
; Pass 1 handles 4 source rows per %rep iteration: an 8-point butterfly
; produces sum (s) and difference (d) terms, which are multiply-accumulated
; against tab_dct8_1 (odd rows) and tab_dct4 (even rows), rounded by m6
; (pd_2 for 8-bit / pd_8 for 10-bit), shifted by DCT_SHIFT, and stored as
; 32-bit intermediates in the 16*mmsize stack area laid out per the map
; below. The second iteration (x == 1) stores with a reversed dword order
; (pshufd 0x1B) so pass 2 can add/subtract the two halves directly.
; Pass 2 (.pass2) reads the intermediates back, applies the second butterfly
; with tab_dct8_2 dword coefficients, rounds by pd_256, shifts 9, and packs
; the final 16-bit coefficients out to r1.
755 INIT_XMM sse4
756 cglobal dct8, 3,6,7,0-16*mmsize
757 ;------------------------
758 ; Stack Mapping(dword)
759 ;------------------------
760 ; Row0[0-3] Row1[0-3]
761 ; ...
762 ; Row6[0-3] Row7[0-3]
763 ; Row0[0-3] Row7[0-3]
764 ; ...
765 ; Row6[4-7] Row7[4-7]
766 ;------------------------
767 %if BIT_DEPTH == 10
768 %define DCT_SHIFT 4
769 mova m6, [pd_8]
770 %elif BIT_DEPTH == 8
771 %define DCT_SHIFT 2
772 mova m6, [pd_2]
773 %else
774 %error Unsupported BIT_DEPTH!
775 %endif
776
; r2 = byte stride, r3 = 3 * stride, r5 = stack write cursor.
777 add r2, r2
778 lea r3, [r2 * 3]
779 mov r5, rsp
780 %assign x 0
781 %rep 2
; Load 4 rows and transpose words so column elements share lanes.
782 movu m0, [r0]
783 movu m1, [r0 + r2]
784 movu m2, [r0 + r2 * 2]
785 movu m3, [r0 + r3]
786
787 punpcklwd m4, m0, m1
788 punpckhwd m0, m1
789 punpcklwd m5, m2, m3
790 punpckhwd m2, m3
791 punpckldq m1, m4, m5 ; m1 = [1 0]
792 punpckhdq m4, m5 ; m4 = [3 2]
793 punpckldq m3, m0, m2
794 punpckhdq m0, m2
795 pshufd m2, m3, 0x4E ; m2 = [4 5]
796 pshufd m0, m0, 0x4E ; m0 = [6 7]
797
; 8-point butterfly: s = x[i] + x[7-i], d = x[i] - x[7-i].
798 paddw m3, m1, m0
799 psubw m1, m0 ; m1 = [d1 d0]
800 paddw m0, m4, m2
801 psubw m4, m2 ; m4 = [d3 d2]
802 punpcklqdq m2, m3, m0 ; m2 = [s2 s0]
803 punpckhqdq m3, m0
804 pshufd m3, m3, 0x4E ; m3 = [s1 s3]
805
806 punpcklwd m0, m1, m4 ; m0 = [d2/d0]
807 punpckhwd m1, m4 ; m1 = [d3/d1]
808 punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0]
809 punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0]
810
; Odd output rows (1,3,5,7) from the difference terms.
811 ; odd
812 lea r4, [tab_dct8_1]
813 pmaddwd m1, m4, [r4 + 0*16]
814 pmaddwd m5, m0, [r4 + 0*16]
815 phaddd m1, m5
816 paddd m1, m6
817 psrad m1, DCT_SHIFT
818 %if x == 1
819 pshufd m1, m1, 0x1B
820 %endif
821 mova [r5 + 1*2*mmsize], m1 ; Row 1
822
823 pmaddwd m1, m4, [r4 + 1*16]
824 pmaddwd m5, m0, [r4 + 1*16]
825 phaddd m1, m5
826 paddd m1, m6
827 psrad m1, DCT_SHIFT
828 %if x == 1
829 pshufd m1, m1, 0x1B
830 %endif
831 mova [r5 + 3*2*mmsize], m1 ; Row 3
832
833 pmaddwd m1, m4, [r4 + 2*16]
834 pmaddwd m5, m0, [r4 + 2*16]
835 phaddd m1, m5
836 paddd m1, m6
837 psrad m1, DCT_SHIFT
838 %if x == 1
839 pshufd m1, m1, 0x1B
840 %endif
841 mova [r5 + 5*2*mmsize], m1 ; Row 5
842
843 pmaddwd m4, [r4 + 3*16]
844 pmaddwd m0, [r4 + 3*16]
845 phaddd m4, m0
846 paddd m4, m6
847 psrad m4, DCT_SHIFT
848 %if x == 1
849 pshufd m4, m4, 0x1B
850 %endif
851 mova [r5 + 7*2*mmsize], m4; Row 7
852
; Even output rows (0,2,4,6) from the sum terms via the 4-point DCT table;
; psignw applies the +/+/+/+,-/-/-/- sign pattern to the EO terms.
853 ; even
854 lea r4, [tab_dct4]
855 paddw m0, m2, m3 ; m0 = [EE1 EE0]
856 pshufb m0, [pb_unpackhlw1]
857 psubw m2, m3 ; m2 = [EO1 EO0]
858 psignw m2, [pw_ppppmmmm]
859 pshufb m2, [pb_unpackhlw1]
860 pmaddwd m3, m0, [r4 + 0*16]
861 paddd m3, m6
862 psrad m3, DCT_SHIFT
863 %if x == 1
864 pshufd m3, m3, 0x1B
865 %endif
866 mova [r5 + 0*2*mmsize], m3 ; Row 0
867 pmaddwd m0, [r4 + 2*16]
868 paddd m0, m6
869 psrad m0, DCT_SHIFT
870 %if x == 1
871 pshufd m0, m0, 0x1B
872 %endif
873 mova [r5 + 4*2*mmsize], m0 ; Row 4
874 pmaddwd m3, m2, [r4 + 1*16]
875 paddd m3, m6
876 psrad m3, DCT_SHIFT
877 %if x == 1
878 pshufd m3, m3, 0x1B
879 %endif
880 mova [r5 + 2*2*mmsize], m3 ; Row 2
881 pmaddwd m2, [r4 + 3*16]
882 paddd m2, m6
883 psrad m2, DCT_SHIFT
884 %if x == 1
885 pshufd m2, m2, 0x1B
886 %endif
887 mova [r5 + 6*2*mmsize], m2 ; Row 6
888
; First iteration only: advance to rows 4-7 and to the upper stack half.
889 %if x != 1
890 lea r0, [r0 + r2 * 4]
891 add r5, mmsize
892 %endif
893 %assign x x+1
894 %endrep
895
; Pass 2: r2 repurposed as the loop counter (2 iterations of .pass2, each
; %rep 2 inside -> 4 column groups total); round pd_256, shift 9.
896 mov r2, 2
897 mov r0, rsp ; r0 = pointer to Low Part
898 lea r4, [tab_dct8_2]
899 mova m6, [pd_256]
900
901 .pass2:
902 %rep 2
903 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0]
904 mova m1, [r0 + 1*2*mmsize]
905 paddd m2, m0, [r0 + (0*2+1)*mmsize]
906 pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0]
907 paddd m3, m1, [r0 + (1*2+1)*mmsize]
908 pshufd m3, m3, 0x9C ; m3 = ^^
909 psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0]
910 psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^
911
912 ; even
913 phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0]
914 phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0]
915
916 pslld m4, 6 ; m4 = [64*EE1 64*EE0]
917 pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0]
918 pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0]
919
920 phaddd m3, m4, m5 ; m3 = [Row2 Row0]
921 paddd m3, m6
922 psrad m3, 9
923 phsubd m4, m2 ; m4 = [Row6 Row4]
924 paddd m4, m6
925 psrad m4, 9
926
; Scatter 4 coefficients at a time into the row-strided output.
927 packssdw m3, m3
928 movd [r1 + 0*mmsize], m3
929 pshufd m3, m3, 1
930 movd [r1 + 2*mmsize], m3
931
932 packssdw m4, m4
933 movd [r1 + 4*mmsize], m4
934 pshufd m4, m4, 1
935 movd [r1 + 6*mmsize], m4
936
937 ; odd
938 pmulld m2, m0, [r4 + 2*16]
939 pmulld m3, m1, [r4 + 2*16]
940 pmulld m4, m0, [r4 + 3*16]
941 pmulld m5, m1, [r4 + 3*16]
942 phaddd m2, m3
943 phaddd m4, m5
944 phaddd m2, m4 ; m2 = [Row3 Row1]
945 paddd m2, m6
946 psrad m2, 9
947
948 packssdw m2, m2
949 movd [r1 + 1*mmsize], m2
950 pshufd m2, m2, 1
951 movd [r1 + 3*mmsize], m2
952
953 pmulld m2, m0, [r4 + 4*16]
954 pmulld m3, m1, [r4 + 4*16]
955 pmulld m4, m0, [r4 + 5*16]
956 pmulld m5, m1, [r4 + 5*16]
957 phaddd m2, m3
958 phaddd m4, m5
959 phaddd m2, m4 ; m2 = [Row7 Row5]
960 paddd m2, m6
961 psrad m2, 9
962
963 packssdw m2, m2
964 movd [r1 + 5*mmsize], m2
965 pshufd m2, m2, 1
966 movd [r1 + 7*mmsize], m2
967
; Advance output by 4 coefficients and input by one stacked column group.
968 add r1, mmsize/4
969 add r0, 2*2*mmsize
970 %endrep
971
972 dec r2
973 jnz .pass2
974 RET
975
976 ;-------------------------------------------------------
977 ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
978 ;-------------------------------------------------------
979 %if ARCH_X86_64
980 INIT_XMM sse2
981 %if BIT_DEPTH == 10
982 %define IDCT_SHIFT 10
983 %define IDCT_ADD pd_512
984 %elif BIT_DEPTH == 8
985 %define IDCT_SHIFT 12
986 %define IDCT_ADD pd_2048
987 %else
988 %error Unsupported BIT_DEPTH!
989 %endif
990
; 8x8 inverse DCT, SSE2, x86-64 only (uses xmm8-xmm15).
; In:  r0 = src (int16_t, 8x8 coefficients), r1 = dst (int16_t), r2 = dstStride (in pixels)
; Pass 1 (columns): odd rows 1/3/5/7 via tab_idct8_3, even rows 0/2/4/6 via
; tab_dct4 butterflies; round with pd_64 and shift by 7.
; Pass 2 (rows, after an in-register transpose): same math, round with
; IDCT_ADD and shift by IDCT_SHIFT (bit-depth dependent).
; Five stack slots spill intermediates that do not fit in 16 registers.
991 cglobal idct8, 3, 6, 16, 0-5*mmsize
; odd input rows: O[] terms accumulated from tab_idct8_3
992 mova m9, [r0 + 1 * mmsize]
993 mova m1, [r0 + 3 * mmsize]
994 mova m7, m9
995 punpcklwd m7, m1
996 punpckhwd m9, m1
997 mova m14, [tab_idct8_3]
998 mova m3, m14
999 pmaddwd m14, m7
1000 pmaddwd m3, m9
1001 mova m0, [r0 + 5 * mmsize]
1002 mova m10, [r0 + 7 * mmsize]
1003 mova m2, m0
1004 punpcklwd m2, m10
1005 punpckhwd m0, m10
1006 mova m15, [tab_idct8_3 + 1 * mmsize]
1007 mova m11, [tab_idct8_3 + 1 * mmsize]
1008 pmaddwd m15, m2
1009 mova m4, [tab_idct8_3 + 2 * mmsize]
1010 pmaddwd m11, m0
1011 mova m1, [tab_idct8_3 + 2 * mmsize]
1012 paddd m15, m14
1013 mova m5, [tab_idct8_3 + 4 * mmsize]
1014 mova m12, [tab_idct8_3 + 4 * mmsize]
1015 paddd m11, m3
1016 mova [rsp + 0 * mmsize], m11
1017 mova [rsp + 1 * mmsize], m15
1018 pmaddwd m4, m7
1019 pmaddwd m1, m9
1020 mova m14, [tab_idct8_3 + 3 * mmsize]
1021 mova m3, [tab_idct8_3 + 3 * mmsize]
1022 pmaddwd m14, m2
1023 pmaddwd m3, m0
1024 paddd m14, m4
1025 paddd m3, m1
1026 mova [rsp + 2 * mmsize], m3
1027 pmaddwd m5, m9
1028 pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
1029 mova m6, [tab_idct8_3 + 5 * mmsize]
1030 pmaddwd m12, m7
1031 pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
1032 mova m4, [tab_idct8_3 + 5 * mmsize]
1033 pmaddwd m6, m2
1034 paddd m6, m12
1035 pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
1036 paddd m7, m2
1037 mova [rsp + 3 * mmsize], m6
1038 pmaddwd m4, m0
1039 pmaddwd m0, [tab_idct8_3 + 7 * mmsize]
1040 paddd m9, m0
1041 paddd m5, m4
; even input rows: E[] terms via tab_dct4 (EE/EO butterfly)
1042 mova m6, [r0 + 0 * mmsize]
1043 mova m0, [r0 + 4 * mmsize]
1044 mova m4, m6
1045 punpcklwd m4, m0
1046 punpckhwd m6, m0
1047 mova m12, [r0 + 2 * mmsize]
1048 mova m0, [r0 + 6 * mmsize]
1049 mova m13, m12
1050 mova m8, [tab_dct4]
1051 punpcklwd m13, m0
1052 mova m10, [tab_dct4]
1053 punpckhwd m12, m0
1054 pmaddwd m8, m4
1055 mova m3, m8
1056 pmaddwd m4, [tab_dct4 + 2 * mmsize]
1057 pmaddwd m10, m6
1058 mova m2, [tab_dct4 + 1 * mmsize]
1059 mova m1, m10
1060 pmaddwd m6, [tab_dct4 + 2 * mmsize]
1061 mova m0, [tab_dct4 + 1 * mmsize]
1062 pmaddwd m2, m13
1063 paddd m3, m2
1064 psubd m8, m2
1065 mova m2, m6
1066 pmaddwd m13, [tab_dct4 + 3 * mmsize]
1067 pmaddwd m0, m12
1068 paddd m1, m0
1069 psubd m10, m0
1070 mova m0, m4
1071 pmaddwd m12, [tab_dct4 + 3 * mmsize]
; pass-1 rounding: add 64, later shift right by 7
1072 paddd m3, [pd_64]
1073 paddd m1, [pd_64]
1074 paddd m8, [pd_64]
1075 paddd m10, [pd_64]
1076 paddd m0, m13
1077 paddd m2, m12
1078 paddd m0, [pd_64]
1079 paddd m2, [pd_64]
1080 psubd m4, m13
1081 psubd m6, m12
1082 paddd m4, [pd_64]
1083 paddd m6, [pd_64]
; combine E +/- O, shift by 7 and pack to 16-bit (pass-1 output)
1084 mova m12, m8
1085 psubd m8, m7
1086 psrad m8, 7
1087 paddd m15, m3
1088 psubd m3, [rsp + 1 * mmsize]
1089 psrad m15, 7
1090 paddd m12, m7
1091 psrad m12, 7
1092 paddd m11, m1
1093 mova m13, m14
1094 psrad m11, 7
1095 packssdw m15, m11
1096 psubd m1, [rsp + 0 * mmsize]
1097 psrad m1, 7
1098 mova m11, [rsp + 2 * mmsize]
1099 paddd m14, m0
1100 psrad m14, 7
1101 psubd m0, m13
1102 psrad m0, 7
1103 paddd m11, m2
1104 mova m13, [rsp + 3 * mmsize]
1105 psrad m11, 7
1106 packssdw m14, m11
1107 mova m11, m6
1108 psubd m6, m5
1109 paddd m13, m4
1110 psrad m13, 7
1111 psrad m6, 7
1112 paddd m11, m5
1113 psrad m11, 7
1114 packssdw m13, m11
1115 mova m11, m10
1116 psubd m4, [rsp + 3 * mmsize]
1117 psubd m10, m9
1118 psrad m4, 7
1119 psrad m10, 7
1120 packssdw m4, m6
1121 packssdw m8, m10
1122 paddd m11, m9
1123 psrad m11, 7
1124 packssdw m12, m11
1125 psubd m2, [rsp + 2 * mmsize]
1126 mova m5, m15
1127 psrad m2, 7
1128 packssdw m0, m2
1129 mova m2, m14
1130 psrad m3, 7
1131 packssdw m3, m1
1132 mova m6, m13
; transpose the 8x8 16-bit intermediate with punpck interleaves
1133 punpcklwd m5, m8
1134 punpcklwd m2, m4
1135 mova m1, m12
1136 punpcklwd m6, m0
1137 punpcklwd m1, m3
1138 mova m9, m5
1139 punpckhwd m13, m0
1140 mova m0, m2
1141 punpcklwd m9, m6
1142 punpckhwd m5, m6
1143 punpcklwd m0, m1
1144 punpckhwd m2, m1
1145 punpckhwd m15, m8
1146 mova m1, m5
1147 punpckhwd m14, m4
1148 punpckhwd m12, m3
1149 mova m6, m9
1150 punpckhwd m9, m0
1151 punpcklwd m1, m2
; pass 2 on the transposed data: same odd-row accumulation via tab_idct8_3
1152 mova m4, [tab_idct8_3 + 0 * mmsize]
1153 punpckhwd m5, m2
1154 punpcklwd m6, m0
1155 mova m2, m15
1156 mova m0, m14
1157 mova m7, m9
1158 punpcklwd m2, m13
1159 punpcklwd m0, m12
1160 punpcklwd m7, m5
1161 punpckhwd m14, m12
1162 mova m10, m2
1163 punpckhwd m15, m13
1164 punpckhwd m9, m5
1165 pmaddwd m4, m7
1166 mova m13, m1
1167 punpckhwd m2, m0
1168 punpcklwd m10, m0
1169 mova m0, m15
1170 punpckhwd m15, m14
1171 mova m12, m1
1172 mova m3, [tab_idct8_3 + 0 * mmsize]
1173 punpcklwd m0, m14
1174 pmaddwd m3, m9
1175 mova m11, m2
1176 punpckhwd m2, m15
1177 punpcklwd m11, m15
1178 mova m8, [tab_idct8_3 + 1 * mmsize]
1179 punpcklwd m13, m0
1180 punpckhwd m12, m0
1181 pmaddwd m8, m11
1182 paddd m8, m4
1183 mova [rsp + 4 * mmsize], m8
1184 mova m4, [tab_idct8_3 + 2 * mmsize]
1185 pmaddwd m4, m7
1186 mova m15, [tab_idct8_3 + 2 * mmsize]
1187 mova m5, [tab_idct8_3 + 1 * mmsize]
1188 pmaddwd m15, m9
1189 pmaddwd m5, m2
1190 paddd m5, m3
1191 mova [rsp + 3 * mmsize], m5
1192 mova m14, [tab_idct8_3 + 3 * mmsize]
1193 mova m5, [tab_idct8_3 + 3 * mmsize]
1194 pmaddwd m14, m11
1195 paddd m14, m4
1196 mova [rsp + 2 * mmsize], m14
1197 pmaddwd m5, m2
1198 paddd m5, m15
1199 mova [rsp + 1 * mmsize], m5
1200 mova m15, [tab_idct8_3 + 4 * mmsize]
1201 mova m5, [tab_idct8_3 + 4 * mmsize]
1202 pmaddwd m15, m7
1203 pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
1204 pmaddwd m5, m9
1205 pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
1206 mova m4, [tab_idct8_3 + 5 * mmsize]
1207 pmaddwd m4, m2
1208 paddd m5, m4
1209 mova m4, m6
1210 mova m8, [tab_idct8_3 + 5 * mmsize]
1211 punpckhwd m6, m10
1212 pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
1213 punpcklwd m4, m10
1214 paddd m9, m2
1215 pmaddwd m8, m11
1216 mova m10, [tab_dct4]
1217 paddd m8, m15
1218 pmaddwd m11, [tab_idct8_3 + 7 * mmsize]
1219 paddd m7, m11
1220 mova [rsp + 0 * mmsize], m8
1221 pmaddwd m10, m6
1222 pmaddwd m6, [tab_dct4 + 2 * mmsize]
1223 mova m1, m10
1224 mova m8, [tab_dct4]
1225 mova m3, [tab_dct4 + 1 * mmsize]
1226 pmaddwd m8, m4
1227 pmaddwd m4, [tab_dct4 + 2 * mmsize]
1228 mova m0, m8
1229 mova m2, [tab_dct4 + 1 * mmsize]
1230 pmaddwd m3, m13
1231 psubd m8, m3
1232 paddd m0, m3
1233 mova m3, m6
1234 pmaddwd m13, [tab_dct4 + 3 * mmsize]
1235 pmaddwd m2, m12
1236 paddd m1, m2
1237 psubd m10, m2
1238 mova m2, m4
1239 pmaddwd m12, [tab_dct4 + 3 * mmsize]
; pass-2 rounding: add IDCT_ADD, later shift by IDCT_SHIFT
1240 paddd m0, [IDCT_ADD]
1241 paddd m1, [IDCT_ADD]
1242 paddd m8, [IDCT_ADD]
1243 paddd m10, [IDCT_ADD]
1244 paddd m2, m13
1245 paddd m3, m12
1246 paddd m2, [IDCT_ADD]
1247 paddd m3, [IDCT_ADD]
1248 psubd m4, m13
1249 psubd m6, m12
1250 paddd m4, [IDCT_ADD]
1251 paddd m6, [IDCT_ADD]
1252 mova m15, [rsp + 4 * mmsize]
1253 mova m12, m8
1254 psubd m8, m7
1255 psrad m8, IDCT_SHIFT
1256 mova m11, [rsp + 3 * mmsize]
1257 paddd m15, m0
1258 psrad m15, IDCT_SHIFT
1259 psubd m0, [rsp + 4 * mmsize]
1260 psrad m0, IDCT_SHIFT
1261 paddd m12, m7
1262 paddd m11, m1
1263 mova m14, [rsp + 2 * mmsize]
1264 psrad m11, IDCT_SHIFT
1265 packssdw m15, m11
1266 psubd m1, [rsp + 3 * mmsize]
1267 psrad m1, IDCT_SHIFT
1268 mova m11, [rsp + 1 * mmsize]
1269 paddd m14, m2
1270 psrad m14, IDCT_SHIFT
1271 packssdw m0, m1
1272 psrad m12, IDCT_SHIFT
1273 psubd m2, [rsp + 2 * mmsize]
1274 paddd m11, m3
1275 mova m13, [rsp + 0 * mmsize]
1276 psrad m11, IDCT_SHIFT
1277 packssdw m14, m11
1278 mova m11, m6
1279 psubd m6, m5
1280 paddd m13, m4
1281 psrad m13, IDCT_SHIFT
1282 mova m1, m15
1283 paddd m11, m5
1284 psrad m11, IDCT_SHIFT
1285 packssdw m13, m11
1286 mova m11, m10
1287 psubd m10, m9
1288 psrad m10, IDCT_SHIFT
1289 packssdw m8, m10
1290 psrad m6, IDCT_SHIFT
1291 psubd m4, [rsp + 0 * mmsize]
1292 paddd m11, m9
1293 psrad m11, IDCT_SHIFT
1294 packssdw m12, m11
1295 punpcklwd m1, m14
1296 mova m5, m13
1297 psrad m4, IDCT_SHIFT
1298 packssdw m4, m6
1299 psubd m3, [rsp + 1 * mmsize]
1300 psrad m2, IDCT_SHIFT
1301 mova m6, m8
1302 psrad m3, IDCT_SHIFT
1303 punpcklwd m5, m12
1304 packssdw m2, m3
1305 punpcklwd m6, m4
1306 punpckhwd m8, m4
1307 mova m4, m1
1308 mova m3, m2
1309 punpckhdq m1, m5
1310 punpckldq m4, m5
1311 punpcklwd m3, m0
1312 punpckhwd m2, m0
1313 mova m0, m6
; final transpose + store; r2 becomes the byte stride, r3/r4/r0 hold
; 3x/5x/7x stride for the eight output rows
1314 lea r2, [r2 + r2]
1315 lea r4, [r2 + r2]
1316 lea r3, [r4 + r2]
1317 lea r4, [r4 + r3]
1318 lea r0, [r4 + r2 * 2]
1319 movq [r1], m4
1320 punpckhwd m15, m14
1321 movhps [r1 + r2], m4
1322 punpckhdq m0, m3
1323 movq [r1 + r2 * 2], m1
1324 punpckhwd m13, m12
1325 movhps [r1 + r3], m1
1326 mova m1, m6
1327 punpckldq m1, m3
1328 movq [r1 + 8], m1
1329 movhps [r1 + r2 + 8], m1
1330 movq [r1 + r2 * 2 + 8], m0
1331 movhps [r1 + r3 + 8], m0
1332 mova m0, m15
1333 punpckhdq m15, m13
1334 punpckldq m0, m13
1335 movq [r1 + r2 * 4], m0
1336 movhps [r1 + r4], m0
1337 mova m0, m8
1338 punpckhdq m8, m2
1339 movq [r1 + r3 * 2], m15
1340 punpckldq m0, m2
1341 movhps [r1 + r0], m15
1342 movq [r1 + r2 * 4 + 8], m0
1343 movhps [r1 + r4 + 8], m0
1344 movq [r1 + r3 * 2 + 8], m8
1345 movhps [r1 + r0 + 8], m8
1346 RET
1347
1348 %undef IDCT_SHIFT
1349 %undef IDCT_ADD
1350 %endif
1351
1352 ;-------------------------------------------------------
1353 ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
1354 ;-------------------------------------------------------
1355 INIT_XMM ssse3
1356
; Internal helper for the SSSE3 idct8 (note: "patial" is a historical typo
; for "partial"; the label is referenced by the idct8 entry point below).
; Processes 4 columns of the first IDCT pass.
; In:  r0 = src coefficients, r4 = tab_idct8_3, r6 = tab_dct4, r5 = temp buffer
; Out: rows 0..7 of the pass-1 intermediate stored at [r5 + n*16]
; Rounding: +64, shift 7. Clobbers m0-m7; r0/r5 are advanced by the caller.
1357 cglobal patial_butterfly_inverse_internal_pass1
1358 movh m0, [r0]
1359 movhps m0, [r0 + 2 * 16]
1360 movh m1, [r0 + 4 * 16]
1361 movhps m1, [r0 + 6 * 16]
1362
1363 punpckhwd m2, m0, m1 ; [2 6]
1364 punpcklwd m0, m1 ; [0 4]
1365 pmaddwd m1, m0, [r6] ; EE[0]
1366 pmaddwd m0, [r6 + 32] ; EE[1]
1367 pmaddwd m3, m2, [r6 + 16] ; EO[0]
1368 pmaddwd m2, [r6 + 48] ; EO[1]
1369
1370 paddd m4, m1, m3 ; E[0]
1371 psubd m1, m3 ; E[3]
1372 paddd m3, m0, m2 ; E[1]
1373 psubd m0, m2 ; E[2]
1374
1375 ;E[K] = E[k] + add
1376 mova m5, [pd_64]
1377 paddd m0, m5
1378 paddd m1, m5
1379 paddd m3, m5
1380 paddd m4, m5
1381
; odd rows 1/3/5/7 -> O[0..3] via tab_idct8_3 (r4)
1382 movh m2, [r0 + 16]
1383 movhps m2, [r0 + 5 * 16]
1384 movh m5, [r0 + 3 * 16]
1385 movhps m5, [r0 + 7 * 16]
1386 punpcklwd m6, m2, m5 ;[1 3]
1387 punpckhwd m2, m5 ;[5 7]
1388
1389 pmaddwd m5, m6, [r4]
1390 pmaddwd m7, m2, [r4 + 16]
1391 paddd m5, m7 ; O[0]
1392
; dst[k] = (E[k] + O[k]) >> 7, dst[7-k] = (E[k] - O[k]) >> 7
1393 paddd m7, m4, m5
1394 psrad m7, 7
1395
1396 psubd m4, m5
1397 psrad m4, 7
1398
1399 packssdw m7, m4
1400 movh [r5 + 0 * 16], m7
1401 movhps [r5 + 7 * 16], m7
1402
1403 pmaddwd m5, m6, [r4 + 32]
1404 pmaddwd m4, m2, [r4 + 48]
1405 paddd m5, m4 ; O[1]
1406
1407 paddd m4, m3, m5
1408 psrad m4, 7
1409
1410 psubd m3, m5
1411 psrad m3, 7
1412
1413 packssdw m4, m3
1414 movh [r5 + 1 * 16], m4
1415 movhps [r5 + 6 * 16], m4
1416
1417 pmaddwd m5, m6, [r4 + 64]
1418 pmaddwd m4, m2, [r4 + 80]
1419 paddd m5, m4 ; O[2]
1420
1421 paddd m4, m0, m5
1422 psrad m4, 7
1423
1424 psubd m0, m5
1425 psrad m0, 7
1426
1427 packssdw m4, m0
1428 movh [r5 + 2 * 16], m4
1429 movhps [r5 + 5 * 16], m4
1430
1431 pmaddwd m5, m6, [r4 + 96]
1432 pmaddwd m4, m2, [r4 + 112]
1433 paddd m5, m4 ; O[3]
1434
1435 paddd m4, m1, m5
1436 psrad m4, 7
1437
1438 psubd m1, m5
1439 psrad m1, 7
1440
1441 packssdw m4, m1
1442 movh [r5 + 3 * 16], m4
1443 movhps [r5 + 4 * 16], m4
1444
1445 ret
1446
; Second-pass row transform for the SSSE3 idct8.
; %1 = register holding one 8-coeff row; rewritten in place with the
; final clipped 16-bit results. Expects m6 = rounding constant, r4 =
; tab_idct8_2, r6 = pb_idct8odd shuffle. Clobbers m4, m5.
1447 %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
1448 %if BIT_DEPTH == 10
1449 %define IDCT_SHIFT 10
1450 %elif BIT_DEPTH == 8
1451 %define IDCT_SHIFT 12
1452 %else
1453 %error Unsupported BIT_DEPTH!
1454 %endif
; even part: gather even coefficients, E[] terms via tab_idct8_1
1455 pshufb m4, %1, [pb_idct8even]
1456 pmaddwd m4, [tab_idct8_1]
1457 phsubd m5, m4
1458 pshufd m4, m4, 0x4E
1459 phaddd m4, m4
1460 punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3]
1461 paddd m4, m6 ; add rounding constant
1462
; odd part: O[] terms via tab_idct8_2
1463 pshufb %1, %1, [r6]
1464 pmaddwd m5, %1, [r4]
1465 pmaddwd %1, [r4 + 16]
1466 phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3]
1467
; out[k] = (E[k]+O[k]) >> shift; out[7-k] = (E[k]-O[k]) >> shift
1468 paddd %1, m4, m5
1469 psrad %1, IDCT_SHIFT
1470
1471 psubd m4, m5
1472 psrad m4, IDCT_SHIFT
1473 pshufd m4, m4, 0x1B ; reverse to output order
1474
1475 packssdw %1, m4
1476 %undef IDCT_SHIFT
1477 %endmacro
1478
; Internal helper for the SSSE3 idct8: second pass over four rows of the
; pass-1 intermediate at r5, storing final rows to r1 with byte stride r2
; (r3 = 3*r2). Expects m6 = rounding constant; clobbers m0-m5.
1479 cglobal patial_butterfly_inverse_internal_pass2
1480
1481 mova m0, [r5]
1482 PARTIAL_BUTTERFLY_PROCESS_ROW m0
1483 movu [r1], m0
1484
1485 mova m2, [r5 + 16]
1486 PARTIAL_BUTTERFLY_PROCESS_ROW m2
1487 movu [r1 + r2], m2
1488
1489 mova m1, [r5 + 32]
1490 PARTIAL_BUTTERFLY_PROCESS_ROW m1
1491 movu [r1 + 2 * r2], m1
1492
1493 mova m3, [r5 + 48]
1494 PARTIAL_BUTTERFLY_PROCESS_ROW m3
1495 movu [r1 + r3], m3
1496
1497 ret
1498
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) -- SSSE3.
; Drives two pass-1 calls (left/right 4 columns) into a 64-byte-aligned
; scratch buffer, then two pass-2 calls (top/bottom 4 rows) to dst.
1499 cglobal idct8, 3,7,8 ;,0-16*mmsize
1500 ; alignment stack to 64-bytes
1501 mov r5, rsp
1502 sub rsp, 16*mmsize + gprsize
1503 and rsp, ~(64-1)
1504 mov [rsp + 16*mmsize], r5 ; stash original rsp for restore at exit
1505 mov r5, rsp
1506
1507 lea r4, [tab_idct8_3]
1508 lea r6, [tab_dct4]
1509
1510 call patial_butterfly_inverse_internal_pass1
1511
; advance to the right half (columns 4..7)
1512 add r0, 8
1513 add r5, 8
1514
1515 call patial_butterfly_inverse_internal_pass1
1516
; m6 = pass-2 rounding constant (1 << (IDCT_SHIFT - 1))
1517 %if BIT_DEPTH == 10
1518 mova m6, [pd_512]
1519 %elif BIT_DEPTH == 8
1520 mova m6, [pd_2048]
1521 %else
1522 %error Unsupported BIT_DEPTH!
1523 %endif
1524 add r2, r2 ; stride in bytes
1525 lea r3, [r2 * 3]
1526 lea r4, [tab_idct8_2]
1527 lea r6, [pb_idct8odd]
1528 sub r5, 8 ; back to start of scratch buffer
1529
1530 call patial_butterfly_inverse_internal_pass2
1531
1532 lea r1, [r1 + 4 * r2]
1533 add r5, 64
1534
1535 call patial_butterfly_inverse_internal_pass2
1536
1537 ; restore origin stack pointer
1538 mov rsp, [rsp + 16*mmsize]
1539 RET
1540
1541
1542 ;-----------------------------------------------------------------------------
1543 ; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
1544 ;-----------------------------------------------------------------------------
; Per coefficient: sum[i] += abs(dct[i]); then
; dct[i] = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0).
; Processes 8 coefficients per iteration; size assumed a multiple of 8.
; Fix: use unsigned saturating subtract (psubusw) for the offset, which
; implements the max(.,0) clamp exactly over the full uint16 offset range;
; the previous signed-saturating psubsw misbehaves once offset > 32767
; (identical results for offsets <= 32767).
1545 INIT_XMM sse4
1546 cglobal denoise_dct, 4, 4, 6
1547 pxor m5, m5
1548 shr r3d, 3 ; size / 8 iterations
1549 .loop:
1550 mova m0, [r0]
1551 pabsw m1, m0 ; m1 = abs(coeff)
1552
; accumulate abs values into the uint32 sum[] array (two dwords blocks)
1553 mova m2, [r1]
1554 pmovsxwd m3, m1
1555 paddd m2, m3
1556 mova [r1], m2
1557 mova m2, [r1 + 16]
1558 psrldq m3, m1, 8
1559 pmovsxwd m4, m3
1560 paddd m2, m4
1561 mova [r1 + 16], m2
1562
1563 movu m3, [r2]
1564 psubusw m1, m3 ; max(abs - offset, 0), unsigned saturating
1565 pcmpgtw m4, m1, m5
1566 pand m1, m4 ; clear any non-positive lanes
1567 psignw m1, m0 ; restore the original sign
1568 mova [r0], m1
1569 add r0, 16
1570 add r1, 32
1571 add r2, 16
1572 dec r3d
1573 jnz .loop
1574 RET
1575
; AVX2 denoise_dct: same contract as the SSE4 version above, 16
; coefficients per iteration (size assumed a multiple of 16).
; Fix: the offset subtraction must saturate -- plain psubw wraps modulo
; 2^16 and disagrees with both the C reference clamp and the SSE4 path;
; psubusw computes max(abs - offset, 0) exactly for all uint16 offsets.
1576 INIT_YMM avx2
1577 cglobal denoise_dct, 4, 4, 6
1578 pxor m5, m5
1579 shr r3d, 4 ; size / 16 iterations
1580 .loop:
1581 movu m0, [r0]
1582 pabsw m1, m0 ; m1 = abs(coeff)
; accumulate abs values into the uint32 sum[] array
1583 movu m2, [r1]
1584 pmovsxwd m4, xm1
1585 paddd m2, m4
1586 movu [r1], m2
1587 vextracti128 xm4, m1, 1
1588 movu m2, [r1 + 32]
1589 pmovsxwd m3, xm4
1590 paddd m2, m3
1591 movu [r1 + 32], m2
1592 movu m3, [r2]
1593 psubusw m1, m3 ; max(abs - offset, 0); was psubw (wrapping) -- wrong
1594 pcmpgtw m4, m1, m5
1595 pand m1, m4 ; clear any non-positive lanes
1596 psignw m1, m0 ; restore the original sign
1597 movu [r0], m1
1598 add r0, 32
1599 add r1, 64
1600 add r2, 32
1601 dec r3d
1602 jnz .loop
1603 RET
1604
1605 %if ARCH_X86_64 == 1
; One pass-1 output row for the AVX2 dct8.
; %1 = byte offset of the coefficient row in tab_dct8 (r6),
; %2 = byte offset of the destination row in the scratch buffer (r5),
; %3/%4 = source registers (sum/diff butterflies). Expects m5 = rounding.
; Clobbers m0, m2.
1606 %macro DCT8_PASS_1 4
1607 vpbroadcastq m0, [r6 + %1]
1608 pmaddwd m2, m%3, m0
1609 pmaddwd m0, m%4
1610 phaddd m2, m0
1611 paddd m2, m5
1612 psrad m2, DCT_SHIFT
1613 packssdw m2, m2
1614 vpermq m2, m2, 0x08 ; gather the 8 words into the low xmm half
1615 mova [r5 + %2], xm2
1616 %endmacro
1617
; Two pass-2 output rows for the AVX2 dct8.
; %1/%2 = byte offsets of two coefficient rows in tab_dct8 (r6).
; Expects m0-m3 = pass-1 data, m5 = rounding; result left in m10.
; Clobbers m4, m6-m9.
1618 %macro DCT8_PASS_2 2
1619 vbroadcasti128 m4, [r6 + %1]
1620 pmaddwd m6, m0, m4
1621 pmaddwd m7, m1, m4
1622 pmaddwd m8, m2, m4
1623 pmaddwd m9, m3, m4
1624 phaddd m6, m7
1625 phaddd m8, m9
1626 phaddd m6, m8
1627 paddd m6, m5
1628 psrad m6, DCT_SHIFT2
1629
1630 vbroadcasti128 m4, [r6 + %2]
1631 pmaddwd m10, m0, m4
1632 pmaddwd m7, m1, m4
1633 pmaddwd m8, m2, m4
1634 pmaddwd m9, m3, m4
1635 phaddd m10, m7
1636 phaddd m8, m9
1637 phaddd m10, m8
1638 paddd m10, m5
1639 psrad m10, DCT_SHIFT2
1640
1641 packssdw m6, m10
1642 vpermq m10, m6, 0xD8 ; interleave the two rows into output order
1643
1644 %endmacro
1645
; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) -- AVX2.
; Pass 1: butterflies on 8 rows (shift DCT_SHIFT, bit-depth dependent) into
; a 128-byte scratch buffer; pass 2: tab_dct8 matrix multiply (shift 9).
1646 INIT_YMM avx2
1647 cglobal dct8, 3, 7, 11, 0-8*16
1648 %if BIT_DEPTH == 10
1649 %define DCT_SHIFT 4
1650 vbroadcasti128 m5, [pd_8]
1651 %elif BIT_DEPTH == 8
1652 %define DCT_SHIFT 2
1653 vbroadcasti128 m5, [pd_2]
1654 %else
1655 %error Unsupported BIT_DEPTH!
1656 %endif
1657 %define DCT_SHIFT2 9
1658
1659 add r2d, r2d ; stride in bytes
1660 lea r3, [r2 * 3]
1661 lea r4, [r0 + r2 * 4] ; rows 4..7
1662 mov r5, rsp
1663 lea r6, [tab_dct8]
1664 mova m6, [dct8_shuf]
1665
1666 ;pass1
; load rows pairwise: low lane = rows 0..3, high lane = rows 4..7
1667 mova xm0, [r0]
1668 vinserti128 m0, m0, [r4], 1
1669 mova xm1, [r0 + r2]
1670 vinserti128 m1, m1, [r4 + r2], 1
1671 mova xm2, [r0 + r2 * 2]
1672 vinserti128 m2, m2, [r4 + r2 * 2], 1
1673 mova xm3, [r0 + r3]
1674 vinserti128 m3, m3, [r4 + r3], 1
1675
1676 punpcklqdq m4, m0, m1
1677 punpckhqdq m0, m1
1678 punpcklqdq m1, m2, m3
1679 punpckhqdq m2, m3
1680
; reverse the second half of each row so sum/diff form E[]/O[]
1681 pshufb m0, m6
1682 pshufb m2, m6
1683
1684 paddw m3, m4, m0
1685 paddw m7, m1, m2
1686
1687 psubw m4, m0
1688 psubw m1, m2
1689
; even table rows use the sums (m3/m7), odd rows the differences (m4/m1)
1690 DCT8_PASS_1 0 * 16, 0 * 16, 3, 7
1691 DCT8_PASS_1 1 * 16, 2 * 16, 4, 1
1692 DCT8_PASS_1 2 * 16, 4 * 16, 3, 7
1693 DCT8_PASS_1 3 * 16, 6 * 16, 4, 1
1694 DCT8_PASS_1 4 * 16, 1 * 16, 3, 7
1695 DCT8_PASS_1 5 * 16, 3 * 16, 4, 1
1696 DCT8_PASS_1 6 * 16, 5 * 16, 3, 7
1697 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
1698
1699 ;pass2
1700 vbroadcasti128 m5, [pd_256] ; pass-2 rounding (1 << (DCT_SHIFT2-1))
1701
1702 mova m0, [r5]
1703 mova m1, [r5 + 32]
1704 mova m2, [r5 + 64]
1705 mova m3, [r5 + 96]
1706
1707 DCT8_PASS_2 0 * 16, 1 * 16
1708 movu [r1], m10
1709 DCT8_PASS_2 2 * 16, 3 * 16
1710 movu [r1 + 32], m10
1711 DCT8_PASS_2 4 * 16, 5 * 16
1712 movu [r1 + 64], m10
1713 DCT8_PASS_2 6 * 16, 7 * 16
1714 movu [r1 + 96], m10
1715 RET
1716
; One even pass-1 output row for the AVX2 dct16.
; %1 = byte offset (relative to r7 = tab_dct16_1 + 8*16, may be negative),
; %2 = destination offset in the scratch buffer (r5).
; Expects m0/m2 = phadd/phsub-combined even terms, m9 = rounding.
; Clobbers m4, m6, m7.
1717 %macro DCT16_PASS_1_E 2
1718 vpbroadcastq m7, [r7 + %1]
1719
1720 pmaddwd m4, m0, m7
1721 pmaddwd m6, m2, m7
1722 phaddd m4, m6
1723
1724 paddd m4, m9
1725 psrad m4, DCT_SHIFT
1726
1727 packssdw m4, m4
1728 vpermq m4, m4, 0x08 ; gather 8 words into the low xmm half
1729
1730 mova [r5 + %2], xm4
1731 %endmacro
1732
; One odd pass-1 output row for the AVX2 dct16.
; %1 = byte offset (relative to r7, may be negative), %2 = destination
; offset in the scratch buffer (r5). Expects m0/m2/m4/m6 = O[] terms,
; m9 = rounding. Clobbers m7, m10, m11, m12.
1733 %macro DCT16_PASS_1_O 2
1734 vbroadcasti128 m7, [r7 + %1]
1735
1736 pmaddwd m10, m0, m7
1737 pmaddwd m11, m2, m7
1738 phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5]
1739
1740 pmaddwd m11, m4, m7
1741 pmaddwd m12, m6, m7
1742 phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7]
1743
1744 phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7]
1745
1746 paddd m10, m9
1747 psrad m10, DCT_SHIFT
1748
1749 packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
1750 vpermq m10, m10, 0x08
1751
1752 mova [r5 + %2], xm10
1753 %endmacro
1754
; Two pass-2 output rows for the AVX2 dct16.
; %1/%2 = byte offsets of coefficient rows in r7 (tab_dct16_1) and r8
; (tab_dct16_2), applied to pass-1 data held in m0-m7.
; Expects m9 = rounding; results left in xm15 (row %1) and xm14 (row %2).
; Clobbers m8, m10-m14.
1755 %macro DCT16_PASS_2 2
1756 vbroadcasti128 m8, [r7 + %1]
1757 vbroadcasti128 m13, [r8 + %1]
1758
1759 pmaddwd m10, m0, m8
1760 pmaddwd m11, m1, m13
1761 paddd m10, m11
1762
1763 pmaddwd m11, m2, m8
1764 pmaddwd m12, m3, m13
1765 paddd m11, m12
1766 phaddd m10, m11
1767
1768 pmaddwd m11, m4, m8
1769 pmaddwd m12, m5, m13
1770 paddd m11, m12
1771
1772 pmaddwd m12, m6, m8
1773 pmaddwd m13, m7, m13
1774 paddd m12, m13
1775 phaddd m11, m12
1776
1777 phaddd m10, m11
1778 paddd m10, m9
1779 psrad m10, DCT_SHIFT2
1780
1781
1782 vbroadcasti128 m8, [r7 + %2]
1783 vbroadcasti128 m13, [r8 + %2]
1784
1785 pmaddwd m14, m0, m8
1786 pmaddwd m11, m1, m13
1787 paddd m14, m11
1788
1789 pmaddwd m11, m2, m8
1790 pmaddwd m12, m3, m13
1791 paddd m11, m12
1792 phaddd m14, m11
1793
1794 pmaddwd m11, m4, m8
1795 pmaddwd m12, m5, m13
1796 paddd m11, m12
1797
1798 pmaddwd m12, m6, m8
1799 pmaddwd m13, m7, m13
1800 paddd m12, m13
1801 phaddd m11, m12
1802
1803 phaddd m14, m11
1804 paddd m14, m9
1805 psrad m14, DCT_SHIFT2
1806
; pack the two dword rows to words and split into two xmm outputs
1807 packssdw m10, m14
1808 vextracti128 xm14, m10, 1
1809 movlhps xm15, xm10, xm14
1810 movhlps xm14, xm10
1811 %endmacro
; void dct16(const int16_t* src, int16_t* dst, intptr_t srcStride) -- AVX2.
; Two pass-1 iterations of 8 rows each into a 16*32-byte scratch buffer,
; then two pass-2 iterations producing 8 output columns each.
1812 INIT_YMM avx2
1813 cglobal dct16, 3, 9, 16, 0-16*mmsize
1814 %if BIT_DEPTH == 10
1815 %define DCT_SHIFT 5
1816 vbroadcasti128 m9, [pd_16]
1817 %elif BIT_DEPTH == 8
1818 %define DCT_SHIFT 3
1819 vbroadcasti128 m9, [pd_4]
1820 %else
1821 %error Unsupported BIT_DEPTH!
1822 %endif
1823 %define DCT_SHIFT2 10
1824
1825 add r2d, r2d ; stride in bytes
1826
1827 mova m13, [dct16_shuf1]
1828 mova m14, [dct16_shuf2]
; r7/r8 point to the middle of the tables so row offsets can be signed
1829 lea r7, [tab_dct16_1 + 8 * 16]
1830 lea r8, [tab_dct16_2 + 8 * 16]
1831 lea r3, [r2 * 3]
1832 mov r5, rsp
1833 mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations
1834
1835 .pass1:
1836 lea r6, [r0 + r2 * 4]
1837
; pair row N with row N+4 in one ymm (low/high lanes)
1838 movu m2, [r0]
1839 movu m1, [r6]
1840 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]
1841 vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi]
1842
1843 movu m4, [r0 + r2]
1844 movu m3, [r6 + r2]
1845 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]
1846 vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi]
1847
1848 movu m6, [r0 + r2 * 2]
1849 movu m5, [r6 + r2 * 2]
1850 vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]
1851 vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi]
1852
1853 movu m8, [r0 + r3]
1854 movu m7, [r6 + r3]
1855 vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]
1856 vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi]
1857
; reverse the upper halves so paddw/psubw form E[]/O[] butterflies
1858 pshufb m1, m13
1859 pshufb m3, m13
1860 pshufb m5, m13
1861 pshufb m7, m13
1862
1863 paddw m8, m0, m1 ;E
1864 psubw m0, m1 ;O
1865
1866 paddw m1, m2, m3 ;E
1867 psubw m2, m3 ;O
1868
1869 paddw m3, m4, m5 ;E
1870 psubw m4, m5 ;O
1871
1872 paddw m5, m6, m7 ;E
1873 psubw m6, m7 ;O
1874
1875 DCT16_PASS_1_O -7 * 16, 1 * 32
1876 DCT16_PASS_1_O -5 * 16, 3 * 32
1877 DCT16_PASS_1_O -3 * 16, 1 * 32 + 16
1878 DCT16_PASS_1_O -1 * 16, 3 * 32 + 16
1879 DCT16_PASS_1_O 1 * 16, 5 * 32
1880 DCT16_PASS_1_O 3 * 16, 7 * 32
1881 DCT16_PASS_1_O 5 * 16, 5 * 32 + 16
1882 DCT16_PASS_1_O 7 * 16, 7 * 32 + 16
1883
; even part: fold E[] into EE/EO sums and differences
1884 pshufb m8, m14
1885 pshufb m1, m14
1886 phaddw m0, m8, m1
1887
1888 pshufb m3, m14
1889 pshufb m5, m14
1890 phaddw m2, m3, m5
1891
1892 DCT16_PASS_1_E -8 * 16, 0 * 32
1893 DCT16_PASS_1_E -4 * 16, 0 * 32 + 16
1894 DCT16_PASS_1_E 0 * 16, 4 * 32
1895 DCT16_PASS_1_E 4 * 16, 4 * 32 + 16
1896
1897 phsubw m0, m8, m1
1898 phsubw m2, m3, m5
1899
1900 DCT16_PASS_1_E -6 * 16, 2 * 32
1901 DCT16_PASS_1_E -2 * 16, 2 * 32 + 16
1902 DCT16_PASS_1_E 2 * 16, 6 * 32
1903 DCT16_PASS_1_E 6 * 16, 6 * 32 + 16
1904
1905 lea r0, [r0 + 8 * r2]
1906 add r5, 256
1907
1908 dec r4d
1909 jnz .pass1
1910
1911 mov r5, rsp
1912 mov r4d, 2
1913 mov r2d, 32 ; output row pitch in bytes (16 coeffs * 2)
1914 lea r3, [r2 * 3]
1915 vbroadcasti128 m9, [pd_512] ; pass-2 rounding (1 << (DCT_SHIFT2-1))
1916
1917 .pass2:
1918 mova m0, [r5 + 0 * 32] ; [row0lo row4lo]
1919 mova m1, [r5 + 8 * 32] ; [row0hi row4hi]
1920
1921 mova m2, [r5 + 1 * 32] ; [row1lo row5lo]
1922 mova m3, [r5 + 9 * 32] ; [row1hi row5hi]
1923
1924 mova m4, [r5 + 2 * 32] ; [row2lo row6lo]
1925 mova m5, [r5 + 10 * 32] ; [row2hi row6hi]
1926
1927 mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
1928 mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
1929
1930 DCT16_PASS_2 -8 * 16, -7 * 16
1931 movu [r1], xm15
1932 movu [r1 + r2], xm14
1933
1934 DCT16_PASS_2 -6 * 16, -5 * 16
1935 movu [r1 + r2 * 2], xm15
1936 movu [r1 + r3], xm14
1937
1938 lea r6, [r1 + r2 * 4]
1939 DCT16_PASS_2 -4 * 16, -3 * 16
1940 movu [r6], xm15
1941 movu [r6 + r2], xm14
1942
1943 DCT16_PASS_2 -2 * 16, -1 * 16
1944 movu [r6 + r2 * 2], xm15
1945 movu [r6 + r3], xm14
1946
1947 lea r6, [r6 + r2 * 4]
1948 DCT16_PASS_2 0 * 16, 1 * 16
1949 movu [r6], xm15
1950 movu [r6 + r2], xm14
1951
1952 DCT16_PASS_2 2 * 16, 3 * 16
1953 movu [r6 + r2 * 2], xm15
1954 movu [r6 + r3], xm14
1955
1956 lea r6, [r6 + r2 * 4]
1957 DCT16_PASS_2 4 * 16, 5 * 16
1958 movu [r6], xm15
1959 movu [r6 + r2], xm14
1960
1961 DCT16_PASS_2 6 * 16, 7 * 16
1962 movu [r6 + r2 * 2], xm15
1963 movu [r6 + r3], xm14
1964
1965 add r1, 16
1966 add r5, 128
1967
1968 dec r4d
1969 jnz .pass2
1970 RET
1971
; One pass-1 output row for the AVX2 dct32.
; %1 = byte offset of the coefficient row in tab_dct32_1 (r7),
; %2 = destination offset in the scratch buffer (r5),
; %3/%4 = register numbers holding the even terms; odd terms come from
; m4-m7. Expects m9 = rounding. Clobbers m8, m10-m14.
1972 %macro DCT32_PASS_1 4
1973 vbroadcasti128 m8, [r7 + %1]
1974
1975 pmaddwd m11, m%3, m8
1976 pmaddwd m12, m%4, m8
1977 phaddd m11, m12
1978
1979 vbroadcasti128 m8, [r7 + %1 + 32]
1980 vbroadcasti128 m10, [r7 + %1 + 48]
1981 pmaddwd m12, m5, m8
1982 pmaddwd m13, m6, m10
1983 phaddd m12, m13
1984
1985 pmaddwd m13, m4, m8
1986 pmaddwd m14, m7, m10
1987 phaddd m13, m14
1988
1989 phaddd m12, m13
1990
1991 phaddd m11, m12
1992 paddd m11, m9
1993 psrad m11, DCT_SHIFT
1994
; write 4+4 words: first half at %2, second half one 64-byte row later
1995 vpermq m11, m11, 0xD8
1996 packssdw m11, m11
1997 movq [r5 + %2], xm11
1998 vextracti128 xm10, m11, 1
1999 movq [r5 + %2 + 64], xm10
2000 %endmacro
2001
; One pass-2 output row (4 coefficients) for the AVX2 dct32.
; %1 = byte offset of the coefficient row in tab_dct32_1 (r7) and
; tab_dct32_2 (r8), applied to pass-1 data in m0-m7.
; Expects xm9 = rounding; result left packed in xm11. Clobbers m8, m10-m14.
2002 %macro DCT32_PASS_2 1
2003 mova m8, [r7 + %1]
2004 mova m10, [r8 + %1]
2005 pmaddwd m11, m0, m8
2006 pmaddwd m12, m1, m10
2007 paddd m11, m12
2008
2009 pmaddwd m12, m2, m8
2010 pmaddwd m13, m3, m10
2011 paddd m12, m13
2012
2013 phaddd m11, m12
2014
2015 pmaddwd m12, m4, m8
2016 pmaddwd m13, m5, m10
2017 paddd m12, m13
2018
2019 pmaddwd m13, m6, m8
2020 pmaddwd m14, m7, m10
2021 paddd m13, m14
2022
2023 phaddd m12, m13
2024
2025 phaddd m11, m12
; fold the two lanes together before rounding/shifting
2026 vextracti128 xm10, m11, 1
2027 paddd xm11, xm10
2028
2029 paddd xm11, xm9
2030 psrad xm11, DCT_SHIFT2
2031 packssdw xm11, xm11
2032
2033 %endmacro
2034
; void dct32(const int16_t* src, int16_t* dst, intptr_t srcStride) -- AVX2.
; Pass 1: 8 iterations of 4 input rows each (butterflied into E/O halves)
; into a 64*32-byte scratch buffer; pass 2: 8 iterations of 4 output
; coefficients per row via tab_dct32_1/tab_dct32_2.
2035 INIT_YMM avx2
2036 cglobal dct32, 3, 9, 16, 0-64*mmsize
2037 %if BIT_DEPTH == 10
2038 %define DCT_SHIFT 6
2039 vpbroadcastq m9, [pd_32]
2040 %elif BIT_DEPTH == 8
2041 %define DCT_SHIFT 4
2042 vpbroadcastq m9, [pd_8]
2043 %else
2044 %error Unsupported BIT_DEPTH!
2045 %endif
2046 %define DCT_SHIFT2 11
2047
2048 add r2d, r2d ; stride in bytes
2049
2050 lea r7, [tab_dct32_1]
2051 lea r8, [tab_dct32_2]
2052 lea r3, [r2 * 3]
2053 mov r5, rsp
2054 mov r4d, 8
2055 mova m15, [dct16_shuf1]
2056
2057 .pass1:
; for each row: reverse the upper 16 coeffs, form sum (E) and diff (O)
2058 movu m2, [r0]
2059 movu m1, [r0 + 32]
2060 pshufb m1, m15
2061 vpermq m1, m1, 0x4E
2062 psubw m7, m2, m1
2063 paddw m2, m1
2064
2065 movu m1, [r0 + r2 * 2]
2066 movu m0, [r0 + r2 * 2 + 32]
2067 pshufb m0, m15
2068 vpermq m0, m0, 0x4E
2069 psubw m8, m1, m0
2070 paddw m1, m0
2071 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E
2072 vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E
2073 pshufb m3, m15
2074 psubw m1, m0, m3 ; EO
2075 paddw m0, m3 ; EE
2076
2077 vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O
2078 vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O
2079
2080
2081 movu m4, [r0 + r2]
2082 movu m2, [r0 + r2 + 32]
2083 pshufb m2, m15
2084 vpermq m2, m2, 0x4E
2085 psubw m10, m4, m2
2086 paddw m4, m2
2087
2088 movu m3, [r0 + r3]
2089 movu m2, [r0 + r3 + 32]
2090 pshufb m2, m15
2091 vpermq m2, m2, 0x4E
2092 psubw m11, m3, m2
2093 paddw m3, m2
2094 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E
2095 vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E
2096 pshufb m8, m15
2097 psubw m3, m2, m8 ; EO
2098 paddw m2, m8 ; EE
2099
2100 vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O
2101 vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O
2102
2103
; even table rows use EE (regs 0,2), odd use EO (regs 1,3); O terms m4-m7
2104 DCT32_PASS_1 0 * 32, 0 * 64, 0, 2
2105 DCT32_PASS_1 2 * 32, 2 * 64, 1, 3
2106 DCT32_PASS_1 4 * 32, 4 * 64, 0, 2
2107 DCT32_PASS_1 6 * 32, 6 * 64, 1, 3
2108 DCT32_PASS_1 8 * 32, 8 * 64, 0, 2
2109 DCT32_PASS_1 10 * 32, 10 * 64, 1, 3
2110 DCT32_PASS_1 12 * 32, 12 * 64, 0, 2
2111 DCT32_PASS_1 14 * 32, 14 * 64, 1, 3
2112 DCT32_PASS_1 16 * 32, 16 * 64, 0, 2
2113 DCT32_PASS_1 18 * 32, 18 * 64, 1, 3
2114 DCT32_PASS_1 20 * 32, 20 * 64, 0, 2
2115 DCT32_PASS_1 22 * 32, 22 * 64, 1, 3
2116 DCT32_PASS_1 24 * 32, 24 * 64, 0, 2
2117 DCT32_PASS_1 26 * 32, 26 * 64, 1, 3
2118 DCT32_PASS_1 28 * 32, 28 * 64, 0, 2
2119 DCT32_PASS_1 30 * 32, 30 * 64, 1, 3
2120
2121 add r5, 8
2122 lea r0, [r0 + r2 * 4]
2123
2124 dec r4d
2125 jnz .pass1
2126
2127 mov r2d, 64 ; output row pitch in bytes (32 coeffs * 2)
2128 lea r3, [r2 * 3]
2129 mov r5, rsp
2130 mov r4d, 8
2131 vpbroadcastq m9, [pd_1024] ; pass-2 rounding (1 << (DCT_SHIFT2-1))
2132
2133 .pass2:
2134 mova m0, [r5 + 0 * 64]
2135 mova m1, [r5 + 0 * 64 + 32]
2136
2137 mova m2, [r5 + 1 * 64]
2138 mova m3, [r5 + 1 * 64 + 32]
2139
2140 mova m4, [r5 + 2 * 64]
2141 mova m5, [r5 + 2 * 64 + 32]
2142
2143 mova m6, [r5 + 3 * 64]
2144 mova m7, [r5 + 3 * 64 + 32]
2145
2146 DCT32_PASS_2 0 * 32
2147 movq [r1], xm11
2148 DCT32_PASS_2 1 * 32
2149 movq [r1 + r2], xm11
2150 DCT32_PASS_2 2 * 32
2151 movq [r1 + r2 * 2], xm11
2152 DCT32_PASS_2 3 * 32
2153 movq [r1 + r3], xm11
2154
2155 lea r6, [r1 + r2 * 4]
2156 DCT32_PASS_2 4 * 32
2157 movq [r6], xm11
2158 DCT32_PASS_2 5 * 32
2159 movq [r6 + r2], xm11
2160 DCT32_PASS_2 6 * 32
2161 movq [r6 + r2 * 2], xm11
2162 DCT32_PASS_2 7 * 32
2163 movq [r6 + r3], xm11
2164
2165 lea r6, [r6 + r2 * 4]
2166 DCT32_PASS_2 8 * 32
2167 movq [r6], xm11
2168 DCT32_PASS_2 9 * 32
2169 movq [r6 + r2], xm11
2170 DCT32_PASS_2 10 * 32
2171 movq [r6 + r2 * 2], xm11
2172 DCT32_PASS_2 11 * 32
2173 movq [r6 + r3], xm11
2174
2175 lea r6, [r6 + r2 * 4]
2176 DCT32_PASS_2 12 * 32
2177 movq [r6], xm11
2178 DCT32_PASS_2 13 * 32
2179 movq [r6 + r2], xm11
2180 DCT32_PASS_2 14 * 32
2181 movq [r6 + r2 * 2], xm11
2182 DCT32_PASS_2 15 * 32
2183 movq [r6 + r3], xm11
2184
2185 lea r6, [r6 + r2 * 4]
2186 DCT32_PASS_2 16 * 32
2187 movq [r6], xm11
2188 DCT32_PASS_2 17 * 32
2189 movq [r6 + r2], xm11
2190 DCT32_PASS_2 18 * 32
2191 movq [r6 + r2 * 2], xm11
2192 DCT32_PASS_2 19 * 32
2193 movq [r6 + r3], xm11
2194
2195 lea r6, [r6 + r2 * 4]
2196 DCT32_PASS_2 20 * 32
2197 movq [r6], xm11
2198 DCT32_PASS_2 21 * 32
2199 movq [r6 + r2], xm11
2200 DCT32_PASS_2 22 * 32
2201 movq [r6 + r2 * 2], xm11
2202 DCT32_PASS_2 23 * 32
2203 movq [r6 + r3], xm11
2204
2205 lea r6, [r6 + r2 * 4]
2206 DCT32_PASS_2 24 * 32
2207 movq [r6], xm11
2208 DCT32_PASS_2 25 * 32
2209 movq [r6 + r2], xm11
2210 DCT32_PASS_2 26 * 32
2211 movq [r6 + r2 * 2], xm11
2212 DCT32_PASS_2 27 * 32
2213 movq [r6 + r3], xm11
2214
2215 lea r6, [r6 + r2 * 4]
2216 DCT32_PASS_2 28 * 32
2217 movq [r6], xm11
2218 DCT32_PASS_2 29 * 32
2219 movq [r6 + r2], xm11
2220 DCT32_PASS_2 30 * 32
2221 movq [r6 + r2 * 2], xm11
2222 DCT32_PASS_2 31 * 32
2223 movq [r6 + r3], xm11
2224
2225 add r5, 256
2226 add r1, 8
2227
2228 dec r4d
2229 jnz .pass2
2230 RET
2231
; Pass 1 for the AVX2 idct8: produces two packed output rows in m3 and m6.
; %1 = byte offset into the avx2_idct8_1 (r5) / avx2_idct8_2 (r6) tables.
; Expects m0/m4 = even-source pairs, m1/m2 = odd-source pairs,
; m11 = rounding. Clobbers m5-m10.
2232 %macro IDCT8_PASS_1 1
2233 vpbroadcastd m7, [r5 + %1]
2234 vpbroadcastd m10, [r5 + %1 + 4]
2235 pmaddwd m5, m4, m7
2236 pmaddwd m6, m0, m10
2237 paddd m5, m6 ; E term
2238
2239 vpbroadcastd m7, [r6 + %1]
2240 vpbroadcastd m10, [r6 + %1 + 4]
2241 pmaddwd m6, m1, m7
2242 pmaddwd m3, m2, m10
2243 paddd m6, m3 ; O term
2244
; (E + O) and (E - O), rounded and shifted by IDCT_SHIFT1
2245 paddd m3, m5, m6
2246 paddd m3, m11
2247 psrad m3, IDCT_SHIFT1
2248
2249 psubd m5, m6
2250 paddd m5, m11
2251 psrad m5, IDCT_SHIFT1
2252
; same again for the next table column pair
2253 vpbroadcastd m7, [r5 + %1 + 32]
2254 vpbroadcastd m10, [r5 + %1 + 36]
2255 pmaddwd m6, m4, m7
2256 pmaddwd m8, m0, m10
2257 paddd m6, m8
2258
2259 vpbroadcastd m7, [r6 + %1 + 32]
2260 vpbroadcastd m10, [r6 + %1 + 36]
2261 pmaddwd m8, m1, m7
2262 pmaddwd m9, m2, m10
2263 paddd m8, m9
2264
2265 paddd m9, m6, m8
2266 paddd m9, m11
2267 psrad m9, IDCT_SHIFT1
2268
2269 psubd m6, m8
2270 paddd m6, m11
2271 psrad m6, IDCT_SHIFT1
2272
2273 packssdw m3, m9
2274 vpermq m3, m3, 0xD8
2275
2276 packssdw m6, m5
2277 vpermq m6, m6, 0xD8
2278 %endmacro
2279
; Pass 2 for the AVX2 idct8: consumes two pass-1 rows in m0/m1 and leaves
; two packed final rows in m8 and m9. Uses avx2_idct8_1 (r5) and
; avx2_idct8_2 (r6); expects m12 = rounding. Clobbers m2, m3, m5-m7.
2280 %macro IDCT8_PASS_2 0
2281 punpcklqdq m2, m0, m1 ; even-index halves
2282 punpckhqdq m0, m1 ; odd-index halves
2283
; even part -> E terms (m7, m3 after the shuffles)
2284 pmaddwd m3, m2, [r5]
2285 pmaddwd m5, m2, [r5 + 32]
2286 pmaddwd m6, m2, [r5 + 64]
2287 pmaddwd m7, m2, [r5 + 96]
2288 phaddd m3, m5
2289 phaddd m6, m7
2290 pshufb m3, [idct8_shuf2]
2291 pshufb m6, [idct8_shuf2]
2292 punpcklqdq m7, m3, m6
2293 punpckhqdq m3, m6
2294
; odd part -> O terms (m6, m5)
2295 pmaddwd m5, m0, [r6]
2296 pmaddwd m6, m0, [r6 + 32]
2297 pmaddwd m8, m0, [r6 + 64]
2298 pmaddwd m9, m0, [r6 + 96]
2299 phaddd m5, m6
2300 phaddd m8, m9
2301 pshufb m5, [idct8_shuf2]
2302 pshufb m8, [idct8_shuf2]
2303 punpcklqdq m6, m5, m8
2304 punpckhqdq m5, m8
2305
; out = (E +/- O + round) >> IDCT_SHIFT2, packed to 16-bit
2306 paddd m8, m7, m6
2307 paddd m8, m12
2308 psrad m8, IDCT_SHIFT2
2309
2310 psubd m7, m6
2311 paddd m7, m12
2312 psrad m7, IDCT_SHIFT2
2313
2314 pshufb m7, [idct8_shuf3] ; reverse difference half into output order
2315 packssdw m8, m7
2316
2317 paddd m9, m3, m5
2318 paddd m9, m12
2319 psrad m9, IDCT_SHIFT2
2320
2321 psubd m3, m5
2322 paddd m3, m12
2323 psrad m3, IDCT_SHIFT2
2324
2325 pshufb m3, [idct8_shuf3]
2326 packssdw m9, m3
2327 %endmacro
2328
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) -- AVX2.
; Pass 1 (shift 7) into a 128-byte scratch buffer, pass 2 (bit-depth
; dependent shift) straight to dst.
2329 INIT_YMM avx2
2330 cglobal idct8, 3, 7, 13, 0-8*16
2331 %if BIT_DEPTH == 10
2332 %define IDCT_SHIFT2 10
2333 vpbroadcastd m12, [pd_512]
2334 %elif BIT_DEPTH == 8
2335 %define IDCT_SHIFT2 12
2336 vpbroadcastd m12, [pd_2048]
2337 %else
2338 %error Unsupported BIT_DEPTH!
2339 %endif
2340 %define IDCT_SHIFT1 7
2341
2342 vbroadcasti128 m11, [pd_64] ; pass-1 rounding (1 << (IDCT_SHIFT1-1))
2343
2344 mov r4, rsp
2345 lea r5, [avx2_idct8_1]
2346 lea r6, [avx2_idct8_2]
2347
2348 ;pass1
; interleave rows pairwise so pmaddwd sees [even,odd] word pairs
2349 mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
2350 mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
2351 vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
2352 vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
2353 vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
2354 vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3]
2355 vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
2356
2357 mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
2358 mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
2359 vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
2360 vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
2361 vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
2362 vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7]
2363 vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
2364
2365 mova m5, [idct8_shuf1]
2366 vpermd m4, m5, m4
2367 vpermd m0, m5, m0
2368 vpermd m1, m5, m1
2369 vpermd m2, m5, m2
2370
2371 IDCT8_PASS_1 0
2372 mova [r4], m3
2373 mova [r4 + 96], m6
2374
2375 IDCT8_PASS_1 64
2376 mova [r4 + 32], m3
2377 mova [r4 + 64], m6
2378
2379 ;pass2
2380 add r2d, r2d ; stride in bytes
2381 lea r3, [r2 * 3]
2382
2383 mova m0, [r4]
2384 mova m1, [r4 + 32]
2385 IDCT8_PASS_2
2386
; store rows 0..3
2387 vextracti128 xm3, m8, 1
2388 mova [r1], xm8
2389 mova [r1 + r2], xm3
2390 vextracti128 xm3, m9, 1
2391 mova [r1 + r2 * 2], xm9
2392 mova [r1 + r3], xm3
2393
; store rows 4..7
2394 lea r1, [r1 + r2 * 4]
2395 mova m0, [r4 + 64]
2396 mova m1, [r4 + 96]
2397 IDCT8_PASS_2
2398
2399 vextracti128 xm3, m8, 1
2400 mova [r1], xm8
2401 mova [r1 + r2], xm3
2402 vextracti128 xm3, m9, 1
2403 mova [r1 + r2 * 2], xm9
2404 mova [r1 + r3], xm3
2405 RET
2406
; One pass-1 step for the AVX2 idct16: computes output rows %1 and %2
; (symmetric pair) from the even terms (m0, m6, m7, m8 via tab_idct16_2)
; and odd terms (m1-m4 via tab_idct16_1), storing them to the scratch
; buffer at r3. Expects m14 = rounding. Clobbers m5, m9-m13.
2407 %macro IDCT_PASS1 2
2408 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16]
2409
2410 pmaddwd m9, m0, m5
2411 pmaddwd m10, m7, m5
2412 phaddd m9, m10
2413
2414 pmaddwd m10, m6, m5
2415 pmaddwd m11, m8, m5
2416 phaddd m10, m11
2417
2418 phaddd m9, m10 ; E terms
2419 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16]
2420
2421 pmaddwd m10, m1, m5
2422 pmaddwd m11, m3, m5
2423 phaddd m10, m11
2424
2425 pmaddwd m11, m4, m5
2426 pmaddwd m12, m2, m5
2427 phaddd m11, m12
2428
2429 phaddd m10, m11 ; O terms
2430
; row %1 = (E + O) >> shift, row %2 = (E - O) >> shift
2431 paddd m11, m9, m10
2432 paddd m11, m14
2433 psrad m11, IDCT_SHIFT1
2434
2435 psubd m9, m10
2436 paddd m9, m14
2437 psrad m9, IDCT_SHIFT1
2438
2439 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16 + 16]
2440
2441 pmaddwd m10, m0, m5
2442 pmaddwd m12, m7, m5
2443 phaddd m10, m12
2444
2445 pmaddwd m12, m6, m5
2446 pmaddwd m13, m8, m5
2447 phaddd m12, m13
2448
2449 phaddd m10, m12
2450 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16]
2451
2452 pmaddwd m12, m1, m5
2453 pmaddwd m13, m3, m5
2454 phaddd m12, m13
2455
2456 pmaddwd m13, m4, m5
2457 pmaddwd m5, m2
2458 phaddd m13, m5
2459
2460 phaddd m12, m13
2461
2462 paddd m5, m10, m12
2463 paddd m5, m14
2464 psrad m5, IDCT_SHIFT1
2465
2466 psubd m10, m12
2467 paddd m10, m14
2468 psrad m10, IDCT_SHIFT1
2469
2470 packssdw m11, m5
2471 packssdw m9, m10
2472
; scatter the packed rows into transposed order in the scratch buffer
2473 mova m10, [idct16_shuff]
2474 mova m5, [idct16_shuff1]
2475
2476 vpermd m12, m10, m11
2477 vpermd m13, m5, m9
2478 mova [r3 + %1 * 16 * 2], xm12
2479 mova [r3 + %2 * 16 * 2], xm13
2480 vextracti128 [r3 + %2 * 16 * 2 + 32], m13, 1
2481 vextracti128 [r3 + %1 * 16 * 2 + 32], m12, 1
2482 %endmacro
2483
2484 ;-------------------------------------------------------
2485 ; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
2486 ;-------------------------------------------------------
; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
; AVX2 16x16 inverse DCT, two passes:
;   pass1: transform columns into a 16x16 int16 scratch buffer on the stack
;          (two iterations, 8 columns each), shift IDCT_SHIFT1 = 7
;   pass2: transform rows of the scratch buffer into dst, shift IDCT_SHIFT2
;          (10 for BIT_DEPTH 10, 12 for BIT_DEPTH 8)
; r0 = src, r1 = dst, r2 = dstStride (doubled below: stride is in int16 units)
2487 INIT_YMM avx2
2488 cglobal idct16, 3, 7, 16, 0-16*mmsize
2489 %if BIT_DEPTH == 10
2490 %define IDCT_SHIFT2 10
2491 vpbroadcastd m15, [pd_512]              ; pass-2 rounding = 1 << (IDCT_SHIFT2 - 1)
2492 %elif BIT_DEPTH == 8
2493 %define IDCT_SHIFT2 12
2494 vpbroadcastd m15, [pd_2048]
2495 %else
2496 %error Unsupported BIT_DEPTH!
2497 %endif
2498 %define IDCT_SHIFT1 7
2499
2500 vbroadcasti128 m14, [pd_64]             ; pass-1 rounding = 1 << (IDCT_SHIFT1 - 1)
2501
2502 add r2d, r2d                            ; stride in bytes (int16 elements)
2503 mov r3, rsp                             ; r3 = scratch buffer
2504 mov r4d, 2                              ; two 8-column halves
2505
2506 .pass1:
; load 16 source rows (8 coefficients each) and pair row i with row i+8
2507 movu xm0, [r0 + 0 * 32]
2508 movu xm1, [r0 + 8 * 32]
2509 punpckhqdq xm2, xm0, xm1
2510 punpcklqdq xm0, xm1
2511 vinserti128 m0, m0, xm2, 1
2512
2513 movu xm1, [r0 + 1 * 32]
2514 movu xm2, [r0 + 9 * 32]
2515 punpckhqdq xm3, xm1, xm2
2516 punpcklqdq xm1, xm2
2517 vinserti128 m1, m1, xm3, 1
2518
2519 movu xm2, [r0 + 2 * 32]
2520 movu xm3, [r0 + 10 * 32]
2521 punpckhqdq xm4, xm2, xm3
2522 punpcklqdq xm2, xm3
2523 vinserti128 m2, m2, xm4, 1
2524
2525 movu xm3, [r0 + 3 * 32]
2526 movu xm4, [r0 + 11 * 32]
2527 punpckhqdq xm5, xm3, xm4
2528 punpcklqdq xm3, xm4
2529 vinserti128 m3, m3, xm5, 1
2530
2531 movu xm4, [r0 + 4 * 32]
2532 movu xm5, [r0 + 12 * 32]
2533 punpckhqdq xm6, xm4, xm5
2534 punpcklqdq xm4, xm5
2535 vinserti128 m4, m4, xm6, 1
2536
2537 movu xm5, [r0 + 5 * 32]
2538 movu xm6, [r0 + 13 * 32]
2539 punpckhqdq xm7, xm5, xm6
2540 punpcklqdq xm5, xm6
2541 vinserti128 m5, m5, xm7, 1
2542
2543 movu xm6, [r0 + 6 * 32]
2544 movu xm7, [r0 + 14 * 32]
2545 punpckhqdq xm8, xm6, xm7
2546 punpcklqdq xm6, xm7
2547 vinserti128 m6, m6, xm8, 1
2548
2549 movu xm7, [r0 + 7 * 32]
2550 movu xm8, [r0 + 15 * 32]
2551 punpckhqdq xm9, xm7, xm8
2552 punpcklqdq xm7, xm8
2553 vinserti128 m7, m7, xm9, 1
2554
; 16x8 word-level transpose: interleave words, dwords, then qwords
2555 punpckhwd m8, m0, m2 ;[8 10]
2556 punpcklwd m0, m2 ;[0 2]
2557
2558 punpckhwd m2, m1, m3 ;[9 11]
2559 punpcklwd m1, m3 ;[1 3]
2560
2561 punpckhwd m3, m4, m6 ;[12 14]
2562 punpcklwd m4, m6 ;[4 6]
2563
2564 punpckhwd m6, m5, m7 ;[13 15]
2565 punpcklwd m5, m7 ;[5 7]
2566
2567 punpckhdq m7, m0, m4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
2568 punpckldq m0, m4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
2569
2570 punpckhdq m4, m8, m3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
2571 punpckldq m8, m3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
2572
2573 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
2574 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
2575
2576 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
2577 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
2578
2579 punpckhqdq m6, m0, m8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
2580 punpcklqdq m0, m8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
2581
2582 punpckhqdq m8, m7, m4 ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
2583 punpcklqdq m7, m4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
2584
2585 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
2586 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
2587
2588 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
2589 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
2590
; butterfly each mirrored row pair (see IDCT_PASS1)
2591 IDCT_PASS1 0, 14
2592 IDCT_PASS1 2, 12
2593 IDCT_PASS1 4, 10
2594 IDCT_PASS1 6, 8
2595
2596 add r0, 16                              ; advance to the next 8 source columns
2597 add r3, 16
2598 dec r4d
2599 jnz .pass1
2600
; pass 2: 8 iterations, two output rows (16 pixels each) per iteration
2601 mov r3, rsp
2602 mov r4d, 8
2603 lea r5, [tab_idct16_2]
2604 lea r6, [tab_idct16_1]
2605
; keep the first 7 even-table coefficient rows resident in registers
2606 vbroadcasti128 m7, [r5]
2607 vbroadcasti128 m8, [r5 + 16]
2608 vbroadcasti128 m9, [r5 + 32]
2609 vbroadcasti128 m10, [r5 + 48]
2610 vbroadcasti128 m11, [r5 + 64]
2611 vbroadcasti128 m12, [r5 + 80]
2612 vbroadcasti128 m13, [r5 + 96]
2613
2614 .pass2:
2615 movu m1, [r3]
2616 vpermq m0, m1, 0xD8                     ; interleave the two scratch rows per lane
2617
; even half: accumulate dot products against tab_idct16_2
2618 pmaddwd m1, m0, m7
2619 pmaddwd m2, m0, m8
2620 phaddd m1, m2
2621
2622 pmaddwd m2, m0, m9
2623 pmaddwd m3, m0, m10
2624 phaddd m2, m3
2625
2626 phaddd m1, m2
2627
2628 pmaddwd m2, m0, m11
2629 pmaddwd m3, m0, m12
2630 phaddd m2, m3
2631
2632 vbroadcasti128 m14, [r5 + 112]          ; 8th even row loaded on demand (m14 is free here)
2633 pmaddwd m3, m0, m13
2634 pmaddwd m4, m0, m14
2635 phaddd m3, m4
2636
2637 phaddd m2, m3
2638
; odd half: second 32 bytes of the scratch row against tab_idct16_1
2639 movu m3, [r3 + 32]
2640 vpermq m0, m3, 0xD8
2641
2642 vbroadcasti128 m14, [r6]
2643 pmaddwd m3, m0, m14
2644 vbroadcasti128 m14, [r6 + 16]
2645 pmaddwd m4, m0, m14
2646 phaddd m3, m4
2647
2648 vbroadcasti128 m14, [r6 + 32]
2649 pmaddwd m4, m0, m14
2650 vbroadcasti128 m14, [r6 + 48]
2651 pmaddwd m5, m0, m14
2652 phaddd m4, m5
2653
2654 phaddd m3, m4
2655
2656 vbroadcasti128 m14, [r6 + 64]
2657 pmaddwd m4, m0, m14
2658 vbroadcasti128 m14, [r6 + 80]
2659 pmaddwd m5, m0, m14
2660 phaddd m4, m5
2661
2662 vbroadcasti128 m14, [r6 + 96]
2663 pmaddwd m6, m0, m14
2664 vbroadcasti128 m14, [r6 + 112]
2665 pmaddwd m0, m14
2666 phaddd m6, m0
2667
2668 phaddd m4, m6
2669
; butterfly, round by m15, shift by IDCT_SHIFT2
2670 paddd m5, m1, m3
2671 paddd m5, m15
2672 psrad m5, IDCT_SHIFT2
2673
2674 psubd m1, m3
2675 paddd m1, m15
2676 psrad m1, IDCT_SHIFT2
2677
2678 paddd m6, m2, m4
2679 paddd m6, m15
2680 psrad m6, IDCT_SHIFT2
2681
2682 psubd m2, m4
2683 paddd m2, m15
2684 psrad m2, IDCT_SHIFT2
2685
2686 packssdw m5, m6
2687 packssdw m1, m2
2688 pshufb m2, m1, [dct16_shuf1]            ; reverse word order of the difference half
2689
2690 mova [r1], xm5
2691 mova [r1 + 16], xm2
2692 vextracti128 [r1 + r2], m5, 1
2693 vextracti128 [r1 + r2 + 16], m2, 1
2694
2695 lea r1, [r1 + 2 * r2]
2696 add r3, 64
2697 dec r4d
2698 jnz .pass2
2699 RET
2700
;-----------------------------------------------------------------------------
; IDCT32_PASS1 %1
; First (vertical) pass for idct32, producing output rows %1 and (31 - %1)
; for four columns at a time (%1 in 0..7, driven by the .pass1 loop).
; Inputs (set up by idct32 .pass1 transpose):
;   m4, m8, m2, m1 = odd source rows against tab_idct32_1
;   m0, m7         = rows against tab_idct32_2 (even part)
;   m5, m6         = rows against tab_idct32_3 (even part)
;   m15 = rounding constant (pd_64), shift = IDCT_SHIFT1 (7)
; Stores four 16-bit results each to [r3 + ...] (rows 0..15) and
; mirrored rows to [r4 - ...] (r4 points at row 15 of the buffer).
; Clobbers: m3, m9-m13.
;-----------------------------------------------------------------------------
2701 %macro IDCT32_PASS1 1
2702 vbroadcasti128 m3, [tab_idct32_1 + %1 * 32]
2703 vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16]
2704 pmaddwd m9, m4, m3
2705 pmaddwd m10, m8, m13
2706 phaddd m9, m10
2707
2708 pmaddwd m10, m2, m3
2709 pmaddwd m11, m1, m13
2710 phaddd m10, m11
2711
2712 phaddd m9, m10
2713
; odd part for the mirrored output row (15 - %1)
2714 vbroadcasti128 m3, [tab_idct32_1 + (15 - %1) * 32]
2715 vbroadcasti128 m13, [tab_idct32_1 + (15- %1) * 32 + 16]
2716 pmaddwd m10, m4, m3
2717 pmaddwd m11, m8, m13
2718 phaddd m10, m11
2719
2720 pmaddwd m11, m2, m3
2721 pmaddwd m12, m1, m13
2722 phaddd m11, m12
2723
2724 phaddd m10, m11
2725 phaddd m9, m10 ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]
2726
; even part: tab_idct32_2 and tab_idct32_3 halves combined by add/sub
2727 vbroadcasti128 m3, [tab_idct32_2 + %1 * 16]
2728 pmaddwd m10, m0, m3
2729 pmaddwd m11, m7, m3
2730 phaddd m10, m11
2731 phaddd m10, m10
2732
2733 vbroadcasti128 m3, [tab_idct32_3 + %1 * 16]
2734 pmaddwd m11, m5, m3
2735 pmaddwd m12, m6, m3
2736 phaddd m11, m12
2737 phaddd m11, m11
2738
2739 paddd m12, m10, m11 ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
2740 psubd m10, m11 ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]
2741
2742 punpcklqdq m12, m10 ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
; final butterfly: (even +/- odd) >> IDCT_SHIFT1 with rounding
2743 paddd m10, m9, m12
2744 paddd m10, m15
2745 psrad m10, IDCT_SHIFT1
2746
2747 psubd m12, m9
2748 paddd m12, m15
2749 psrad m12, IDCT_SHIFT1
2750
2751 packssdw m10, m12
2752 vextracti128 xm12, m10, 1
; scatter the 4 dwords to rows %1 / 15-%1 / 16+%1 / 31-%1 of the buffer
2753 movd [r3 + %1 * 64], xm10
2754 movd [r3 + 32 + %1 * 64], xm12
2755 pextrd [r4 - %1 * 64], xm10, 1
2756 pextrd [r4+ 32 - %1 * 64], xm12, 1
2757 pextrd [r3 + 16 * 64 + %1 *64], xm10, 3
2758 pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
2759 pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2
2760 pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
2761 %endmacro
2762
2763 ;-------------------------------------------------------
2764 ; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
2765 ;-------------------------------------------------------
2766
2767 ; TODO: Reduce PHADDD instruction by PADDD
2768
; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
; AVX2 32x32 inverse DCT, two passes through a 32x32 int16 stack buffer:
;   pass1: 8 iterations of 4 columns each via IDCT32_PASS1, shift IDCT_SHIFT1 = 7
;   pass2: 32 iterations, one 32-pixel output row each, shift IDCT_SHIFT2
;          (10 for BIT_DEPTH 10, 12 for BIT_DEPTH 8)
2769 INIT_YMM avx2
2770 cglobal idct32, 3, 6, 16, 0-32*64
2771
2772 %define IDCT_SHIFT1 7
2773
2774 vbroadcasti128 m15, [pd_64]             ; pass-1 rounding = 1 << (IDCT_SHIFT1 - 1)
2775
2776 mov r3, rsp                             ; r3 -> row 0 of scratch buffer
2777 lea r4, [r3 + 15 * 64]                  ; r4 -> row 15 (mirror stores count down)
2778 mov r5d, 8
2779
2780 .pass1:
; gather 32 source rows, 4 coefficients each, grouped to match the
; tab_idct32_* coefficient layout (see bracketed row lists)
2781 movq xm0, [r0 + 2 * 64]
2782 movq xm1, [r0 + 18 * 64]
2783 punpcklqdq xm0, xm0, xm1
2784 movq xm1, [r0 + 0 * 64]
2785 movq xm2, [r0 + 16 * 64]
2786 punpcklqdq xm1, xm1, xm2
2787 vinserti128 m0, m0, xm1, 1 ;[2 18 0 16]
2788
2789 movq xm1, [r0 + 1 * 64]
2790 movq xm2, [r0 + 9 * 64]
2791 punpcklqdq xm1, xm1, xm2
2792 movq xm2, [r0 + 17 * 64]
2793 movq xm3, [r0 + 25 * 64]
2794 punpcklqdq xm2, xm2, xm3
2795 vinserti128 m1, m1, xm2, 1 ;[1 9 17 25]
2796
2797 movq xm2, [r0 + 6 * 64]
2798 movq xm3, [r0 + 22 * 64]
2799 punpcklqdq xm2, xm2, xm3
2800 movq xm3, [r0 + 4 * 64]
2801 movq xm4, [r0 + 20 * 64]
2802 punpcklqdq xm3, xm3, xm4
2803 vinserti128 m2, m2, xm3, 1 ;[6 22 4 20]
2804
2805 movq xm3, [r0 + 3 * 64]
2806 movq xm4, [r0 + 11 * 64]
2807 punpcklqdq xm3, xm3, xm4
2808 movq xm4, [r0 + 19 * 64]
2809 movq xm5, [r0 + 27 * 64]
2810 punpcklqdq xm4, xm4, xm5
2811 vinserti128 m3, m3, xm4, 1 ;[3 11 19 27]
2812
2813 movq xm4, [r0 + 10 * 64]
2814 movq xm5, [r0 + 26 * 64]
2815 punpcklqdq xm4, xm4, xm5
2816 movq xm5, [r0 + 8 * 64]
2817 movq xm6, [r0 + 24 * 64]
2818 punpcklqdq xm5, xm5, xm6
2819 vinserti128 m4, m4, xm5, 1 ;[10 26 8 24]
2820
2821 movq xm5, [r0 + 5 * 64]
2822 movq xm6, [r0 + 13 * 64]
2823 punpcklqdq xm5, xm5, xm6
2824 movq xm6, [r0 + 21 * 64]
2825 movq xm7, [r0 + 29 * 64]
2826 punpcklqdq xm6, xm6, xm7
2827 vinserti128 m5, m5, xm6, 1 ;[5 13 21 29]
2828
2829 movq xm6, [r0 + 14 * 64]
2830 movq xm7, [r0 + 30 * 64]
2831 punpcklqdq xm6, xm6, xm7
2832 movq xm7, [r0 + 12 * 64]
2833 movq xm8, [r0 + 28 * 64]
2834 punpcklqdq xm7, xm7, xm8
2835 vinserti128 m6, m6, xm7, 1 ;[14 30 12 28]
2836
2837 movq xm7, [r0 + 7 * 64]
2838 movq xm8, [r0 + 15 * 64]
2839 punpcklqdq xm7, xm7, xm8
2840 movq xm8, [r0 + 23 * 64]
2841 movq xm9, [r0 + 31 * 64]
2842 punpcklqdq xm8, xm8, xm9
2843 vinserti128 m7, m7, xm8, 1 ;[7 15 23 31]
2844
; transpose the gathered 4-wide columns into per-row vectors
2845 punpckhwd m8, m0, m2 ;[18 22 16 20]
2846 punpcklwd m0, m2 ;[2 6 0 4]
2847
2848 punpckhwd m2, m1, m3 ;[9 11 25 27]
2849 punpcklwd m1, m3 ;[1 3 17 19]
2850
2851 punpckhwd m3, m4, m6 ;[26 30 24 28]
2852 punpcklwd m4, m6 ;[10 14 8 12]
2853
2854 punpckhwd m6, m5, m7 ;[13 15 29 31]
2855 punpcklwd m5, m7 ;[5 7 21 23]
2856
2857 punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
2858 punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
2859
2860 punpckhdq m4, m8, m3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
2861 punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
2862
2863 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
2864 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
2865
2866 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
2867 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
2868
2869 punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
2870 punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
2871
2872 punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
2873 punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
2874
2875 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
2876 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
2877
2878 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
2879 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
2880
2881 vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
2882 vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]
2883
2884 vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
2885 vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]
2886
2887 vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
2888 vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]
2889
2890 vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
2891 vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]
2892
; emit output rows 0..7 and their mirrors 31..24 / 15..8 / 16..23
2893 IDCT32_PASS1 0
2894 IDCT32_PASS1 1
2895 IDCT32_PASS1 2
2896 IDCT32_PASS1 3
2897 IDCT32_PASS1 4
2898 IDCT32_PASS1 5
2899 IDCT32_PASS1 6
2900 IDCT32_PASS1 7
2901
2902 add r0, 8                               ; next 4 source columns
2903 add r3, 4
2904 add r4, 4
2905 dec r5d
2906 jnz .pass1
2907
; pass-2 rounding constant depends on output bit depth
2908 %if BIT_DEPTH == 10
2909 %define IDCT_SHIFT2 10
2910 vpbroadcastd m15, [pd_512]
2911 %elif BIT_DEPTH == 8
2912 %define IDCT_SHIFT2 12
2913 vpbroadcastd m15, [pd_2048]
2914 %else
2915 %error Unsupported BIT_DEPTH!
2916 %endif
2917
2918 mov r3, rsp
2919 add r2d, r2d                            ; stride in bytes (int16 elements)
2920 mov r4d, 32                             ; one output row per iteration
2921
; first 8 even-table rows stay resident; the rest are memory operands below
2922 mova m7, [tab_idct32_4]
2923 mova m8, [tab_idct32_4 + 32]
2924 mova m9, [tab_idct32_4 + 64]
2925 mova m10, [tab_idct32_4 + 96]
2926 mova m11, [tab_idct32_4 + 128]
2927 mova m12, [tab_idct32_4 + 160]
2928 mova m13, [tab_idct32_4 + 192]
2929 mova m14, [tab_idct32_4 + 224]
2930 .pass2:
2931 movu m0, [r3]                           ; even half of the scratch row
2932 movu m1, [r3 + 32]                      ; odd half
2933
2934 pmaddwd m2, m0, m7
2935 pmaddwd m3, m0, m8
2936 phaddd m2, m3
2937
2938 pmaddwd m3, m0, m9
2939 pmaddwd m4, m0, m10
2940 phaddd m3, m4
2941
2942 phaddd m2, m3
2943
2944 pmaddwd m3, m0, m11
2945 pmaddwd m4, m0, m12
2946 phaddd m3, m4
2947
2948 pmaddwd m4, m0, m13
2949 pmaddwd m5, m0, m14
2950 phaddd m4, m5
2951
2952 phaddd m3, m4
2953
2954 vperm2i128 m4, m2, m3, 0x31
2955 vperm2i128 m2, m2, m3, 0x20
2956 paddd m2, m4                            ; m2 = even sums, columns 0-7
2957
2958 pmaddwd m3, m0, [tab_idct32_4 + 256]
2959 pmaddwd m4, m0, [tab_idct32_4 + 288]
2960 phaddd m3, m4
2961
2962 pmaddwd m4, m0, [tab_idct32_4 + 320]
2963 pmaddwd m5, m0, [tab_idct32_4 + 352]
2964 phaddd m4, m5
2965
2966 phaddd m3, m4
2967
2968 pmaddwd m4, m0, [tab_idct32_4 + 384]
2969 pmaddwd m5, m0, [tab_idct32_4 + 416]
2970 phaddd m4, m5
2971
2972 pmaddwd m5, m0, [tab_idct32_4 + 448]
2973 pmaddwd m0, [tab_idct32_4 + 480]
2974 phaddd m5, m0
2975
2976 phaddd m4, m5
2977
2978 vperm2i128 m0, m3, m4, 0x31
2979 vperm2i128 m3, m3, m4, 0x20
2980 paddd m3, m0                            ; m3 = even sums, columns 8-15
2981
2982 pmaddwd m4, m1, [tab_idct32_1]
2983 pmaddwd m0, m1, [tab_idct32_1 + 32]
2984 phaddd m4, m0
2985
2986 pmaddwd m5, m1, [tab_idct32_1 + 64]
2987 pmaddwd m0, m1, [tab_idct32_1 + 96]
2988 phaddd m5, m0
2989
2990 phaddd m4, m5
2991
2992 pmaddwd m5, m1, [tab_idct32_1 + 128]
2993 pmaddwd m0, m1, [tab_idct32_1 + 160]
2994 phaddd m5, m0
2995
2996 pmaddwd m6, m1, [tab_idct32_1 + 192]
2997 pmaddwd m0, m1, [tab_idct32_1 + 224]
2998 phaddd m6, m0
2999
3000 phaddd m5, m6
3001
3002 vperm2i128 m0, m4, m5, 0x31
3003 vperm2i128 m4, m4, m5, 0x20
3004 paddd m4, m0                            ; m4 = odd sums, columns 0-7
3005
3006 pmaddwd m5, m1, [tab_idct32_1 + 256]
3007 pmaddwd m0, m1, [tab_idct32_1 + 288]
3008 phaddd m5, m0
3009
3010 pmaddwd m6, m1, [tab_idct32_1 + 320]
3011 pmaddwd m0, m1, [tab_idct32_1 + 352]
3012 phaddd m6, m0
3013
3014 phaddd m5, m6
3015
3016 pmaddwd m6, m1, [tab_idct32_1 + 384]
3017 pmaddwd m0, m1, [tab_idct32_1 + 416]
3018 phaddd m6, m0
3019
3020 pmaddwd m0, m1, [tab_idct32_1 + 448]
3021 pmaddwd m1, [tab_idct32_1 + 480]
3022 phaddd m0, m1
3023
3024 phaddd m6, m0
3025
3026 vperm2i128 m0, m5, m6, 0x31
3027 vperm2i128 m5, m5, m6, 0x20
3028 paddd m5, m0                            ; m5 = odd sums, columns 8-15
3029
; butterfly: left half = even + odd, right half (reversed) = even - odd
3030 paddd m6, m2, m4
3031 paddd m6, m15
3032 psrad m6, IDCT_SHIFT2
3033
3034 psubd m2, m4
3035 paddd m2, m15
3036 psrad m2, IDCT_SHIFT2
3037
3038 paddd m4, m3, m5
3039 paddd m4, m15
3040 psrad m4, IDCT_SHIFT2
3041
3042 psubd m3, m5
3043 paddd m3, m15
3044 psrad m3, IDCT_SHIFT2
3045
3046 packssdw m6, m4
3047 packssdw m2, m3
3048
3049 vpermq m6, m6, 0xD8
3050 vpermq m2, m2, 0x8D
3051 pshufb m2, [dct16_shuf1]                ; reverse word order of the difference half
3052
3053 mova [r1], m6
3054 mova [r1 + 32], m2
3055
3056 add r1, r2
3057 add r3, 64
3058 dec r4d
3059 jnz .pass2
3060 RET
3061
3062 ;-------------------------------------------------------
3063 ; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
3064 ;-------------------------------------------------------
; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
; AVX2 4x4 inverse DCT; both passes done fully in registers (no scratch
; buffer).  Pass 1 shifts by IDCT_SHIFT1 = 7, pass 2 by IDCT_SHIFT2
; (10 for BIT_DEPTH 10, 12 for BIT_DEPTH 8).
3065 INIT_YMM avx2
3066 cglobal idct4, 3, 4, 6
3067
3068 %define IDCT_SHIFT1 7
3069 %if BIT_DEPTH == 10
3070 %define IDCT_SHIFT2 10
3071 vpbroadcastd m5, [pd_512]               ; pass-2 rounding = 1 << (IDCT_SHIFT2 - 1)
3072 %elif BIT_DEPTH == 8
3073 %define IDCT_SHIFT2 12
3074 vpbroadcastd m5, [pd_2048]
3075 %else
3076 %error Unsupported BIT_DEPTH!
3077 %endif
3078 vbroadcasti128 m4, [pd_64]              ; pass-1 rounding = 1 << (IDCT_SHIFT1 - 1)
3079
3080 add r2d, r2d                            ; stride in bytes (int16 elements)
3081 lea r3, [r2 * 3]
3082
3083 movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
3084
; deinterleave even/odd coefficients, then duplicate across both lanes
3085 pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
3086 vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33]
3087 punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23]
3088 punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33]
3089 vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
3090 vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
3091
; pass 1: columns
3092 mova m1, [avx2_idct4_1]
3093 mova m3, [avx2_idct4_1 + 32]
3094 pmaddwd m1, m2
3095 pmaddwd m3, m0
3096
3097 paddd m0, m1, m3
3098 paddd m0, m4
3099 psrad m0, IDCT_SHIFT1 ;[00 20 10 30 01 21 11 31]
3100
3101 psubd m1, m3
3102 paddd m1, m4
3103 psrad m1, IDCT_SHIFT1 ;[03 23 13 33 02 22 12 32]
3104
; pass 2: rows, fed by duplicating even/odd words of the pass-1 result
3105 packssdw m0, m1 ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
3106 vmovshdup m1, m0 ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
3107 vmovsldup m0, m0 ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]
3108
3109 vpbroadcastq m2, [avx2_idct4_2]
3110 vpbroadcastq m3, [avx2_idct4_2 + 8]
3111 pmaddwd m0, m2
3112 pmaddwd m1, m3
3113
3114 paddd m2, m0, m1
3115 paddd m2, m5
3116 psrad m2, IDCT_SHIFT2 ;[00 01 10 11 30 31 20 21]
3117
3118 psubd m0, m1
3119 paddd m0, m5
3120 psrad m0, IDCT_SHIFT2 ;[03 02 13 12 33 32 23 22]
3121
; reassemble rows in natural order and store 4 quadwords
3122 pshufb m0, [idct4_shuf2] ;[02 03 12 13 32 33 22 23]
3123 punpcklqdq m1, m2, m0 ;[00 01 02 03 10 11 12 13]
3124 punpckhqdq m2, m0 ;[30 31 32 33 20 21 22 23]
3125 packssdw m1, m2 ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
3126 vextracti128 xm0, m1, 1
3127
3128 movq [r1], xm1
3129 movq [r1 + r2], xm0
3130 movhps [r1 + 2 * r2], xm0
3131 movhps [r1 + r3], xm1
3132 RET
3133 %endif