1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5 ;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
6 ;* Li Cao <li@multicorewareinc.com>
7 ;* Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;* This program is also available under a commercial proprietary license.
24 ;* For more information, contact us at license @ x265.com.
25 ;*****************************************************************************/
27 ; TODO: further optimize these routines.
30 %include "x86util.asm"
32 tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
33 dw 89, 75, 50, 18, -18, -50, -75, -89
34 dw 83, 36, -36, -83, -83, -36, 36, 83
35 dw 75, -18, -89, -50, 50, 89, 18, -75
36 dw 64, -64, -64, 64, 64, -64, -64, 64
37 dw 50, -89, 18, 75, -75, -18, 89, -50
38 dw 36, -83, 83, -36, -36, 83, -83, 36
39 dw 18, -50, 75, -89, 89, -75, 50, -18
41 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
43 tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
44 dw 90, 87, 80, 70, 57, 43, 25, 9
45 dw 89, 75, 50, 18, -18, -50, -75, -89
46 dw 87, 57, 9, -43, -80, -90, -70, -25
47 dw 83, 36, -36, -83, -83, -36, 36, 83
48 dw 80, 9, -70, -87, -25, 57, 90, 43
49 dw 75, -18, -89, -50, 50, 89, 18, -75
50 dw 70, -43, -87, 9, 90, 25, -80, -57
51 dw 64, -64, -64, 64, 64, -64, -64, 64
52 dw 57, -80, -25, 90, -9, -87, 43, 70
53 dw 50, -89, 18, 75, -75, -18, 89, -50
54 dw 43, -90, 57, 25, -87, 70, 9, -80
55 dw 36, -83, 83, -36, -36, 83, -83, 36
56 dw 25, -70, 90, -80, 43, 9, -57, 87
57 dw 18, -50, 75, -89, 89, -75, 50, -18
58 dw 9, -25, 43, -57, 70, -80, 87, -90
61 tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
62 dw -9, -25, -43, -57, -70, -80, -87, -90
63 dw -89, -75, -50, -18, 18, 50, 75, 89
64 dw 25, 70, 90, 80, 43, -9, -57, -87
65 dw 83, 36, -36, -83, -83, -36, 36, 83
66 dw -43, -90, -57, 25, 87, 70, -9, -80
67 dw -75, 18, 89, 50, -50, -89, -18, 75
68 dw 57, 80, -25, -90, -9, 87, 43, -70
69 dw 64, -64, -64, 64, 64, -64, -64, 64
70 dw -70, -43, 87, 9, -90, 25, 80, -57
71 dw -50, 89, -18, -75, 75, 18, -89, 50
72 dw 80, -9, -70, 87, -25, -57, 90, -43
73 dw 36, -83, 83, -36, -36, 83, -83, 36
74 dw -87, 57, -9, -43, 80, -90, 70, -25
75 dw -18, 50, -75, 89, -89, 75, -50, 18
76 dw 90, -87, 80, -70, 57, -43, 25, -9
78 dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
80 dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
82 tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
83 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
84 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
85 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
86 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
87 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
88 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
89 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
90 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
91 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
92 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
93 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
94 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
95 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
96 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
97 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
98 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
99 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
100 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
101 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
102 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
103 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
104 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
105 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
106 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
107 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
108 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
109 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
110 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
111 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
112 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
113 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
115 tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
116 dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
117 dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
118 dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
119 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
120 dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
121 dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
122 dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
123 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
124 dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
125 dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
126 dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
127 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
128 dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
129 dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
130 dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
131 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
132 dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
133 dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
134 dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
135 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
136 dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
137 dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
138 dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
139 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
140 dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
141 dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
142 dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
143 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
144 dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
145 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
146 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
148 avx2_idct8_1: times 4 dw 64, 83, 64, 36
149 times 4 dw 64, 36, -64, -83
150 times 4 dw 64, -36, -64, 83
151 times 4 dw 64, -83, 64, -36
153 avx2_idct8_2: times 4 dw 89, 75, 50, 18
154 times 4 dw 75, -18, -89, -50
155 times 4 dw 50, -89, 18, 75
156 times 4 dw 18, -50, 75, -89
158 idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7
160 idct8_shuf2: times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
162 idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
164 tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9
165 dw 87, 57, 9, -43, -80, -90, -70, -25
166 dw 80, 9, -70, -87, -25, 57, 90, 43
167 dw 70, -43, -87, 9, 90, 25, -80, -57
168 dw 57, -80, -25, 90, -9, -87, 43, 70
169 dw 43, -90, 57, 25, -87, 70, 9, -80
170 dw 25, -70, 90, -80, 43, 9, -57, 87
171 dw 9, -25, 43, -57, 70, -80, 87, -90
173 tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18
174 dw 64, 75, 36, -18, -64, -89, -83, -50
175 dw 64, 50, -36, -89, -64, 18, 83, 75
176 dw 64, 18, -83, -50, 64, 75, -36, -89
177 dw 64, -18, -83, 50, 64, -75, -36, 89
178 dw 64, -50, -36, 89, -64, -18, 83, -75
179 dw 64, -75, 36, 18, -64, 89, -83, 50
180 dw 64, -89, 83, -75, 64, -50, 36, -18
182 idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7
184 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
186 tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
187 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
188 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
189 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
190 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
191 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
192 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
193 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
194 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
195 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
196 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
197 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
198 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
199 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
200 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
201 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
204 tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18
205 dw 64, 75, 36, -18, -64, -89, -83, -50
206 dw 64, 50, -36, -89, -64, 18, 83, 75
207 dw 64, 18, -83, -50, 64, 75, -36, -89
208 dw 64, -18, -83, 50, 64, -75, -36, 89
209 dw 64, -50, -36, 89, -64, -18, 83, -75
210 dw 64, -75, 36, 18, -64, 89, -83, 50
211 dw 64, -89, 83, -75, 64, -50, 36, -18
214 tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9
215 dw 87, 57, 9, -43, -80, -90, -70, -25
216 dw 80, 9, -70, -87, -25, 57, 90, 43
217 dw 70, -43, -87, 9, 90, 25, -80, -57
218 dw 57, -80, -25, 90, -9, -87, 43, 70
219 dw 43, -90, 57, 25, -87, 70, 9, -80
220 dw 25, -70, 90, -80, 43, 9, -57, 87
221 dw 9, -25, 43, -57, 70, -80, 87, -90
223 tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
224 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
225 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
226 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
227 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
228 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
229 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
230 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
231 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
232 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
233 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
234 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
235 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
236 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
237 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
238 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
240 avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
241 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
243 avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
244 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83
246 avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
248 const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
250 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
252 tab_dct4: times 4 dw 64, 64
257 dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
259 tab_dst4: times 2 dw 29, 55, 74, 84
260 times 2 dw 74, 74, 0, -74
261 times 2 dw 84, -29, -74, 55
262 times 2 dw 55, -84, 74, -29
264 tab_idst4: times 4 dw 29, +84
273 tab_dct8_1: times 2 dw 89, 50, 75, 18
274 times 2 dw 75, -89, -18, -50
275 times 2 dw 50, 18, -89, 75
276 times 2 dw 18, 75, -50, -89
278 tab_dct8_2: times 2 dd 83, 36
280 times 1 dd 89, 75, 50, 18
281 times 1 dd 75, -18, -89, -50
282 times 1 dd 50, -89, 18, 75
283 times 1 dd 18, -50, 75, -89
285 tab_idct8_3: times 4 dw 89, 75
294 pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
296 pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13
298 tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36
300 tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
301 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
303 pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
320 ;------------------------------------------------------
321 ;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
322 ;------------------------------------------------------
; Declared with 3 arguments, 4 general-purpose registers and 8 vector registers.
; Register roles visible below:
;   r0 - source row pointer, stepped with r2 as the row increment
;   r1 - destination, written in four 16-byte units
;   r3 - base of 16-byte multiplier rows consumed by pmaddwd
; NOTE(review): the instruction that sets r3 is not visible here; presumably a
; coefficient table - confirm.
324 cglobal dct4, 3, 4, 8
332 %error Unsupported BIT_DEPTH!
; Multiplier rows 0..2 are cached in m4..m6; row 3 is used directly from memory.
337 mova m4, [r3 + 0 * 16]
338 mova m5, [r3 + 1 * 16]
339 mova m6, [r3 + 2 * 16]
; Load source rows 0 and 1, then move r0 two rows further on.
340 movh m0, [r0 + 0 * r2]
341 movh m1, [r0 + 1 * r2]
346 lea r0, [r0 + 2 * r2]
; m2 = low qword of m0 combined with low qword of m1.
353 punpcklqdq m2, m0, m1
370 pmaddwd m2, [r3 + 3 * 16]
377 punpcklqdq m2, m0, m1
; Output: one 16-byte store per destination unit 0..3.
387 movu [r1 + 0 * 16], m1
394 movu [r1 + 1 * 16], m1
401 movu [r1 + 2 * 16], m1
403 pmaddwd m2, [r3 + 3 * 16]
404 pmaddwd m0, [r3 + 3 * 16]
408 movu [r1 + 3 * 16], m2
416 ; - r2: source stride
; Wide-register build of dct4: 3 named arguments (src, dst, srcStride),
; 4 general-purpose registers, 8 vector registers.
418 cglobal dct4, 3, 4, 8, src, dst, srcStride
; m7 receives a broadcast dword constant (pd_4 or pd_1).
; NOTE(review): the conditionals choosing between the two loads are not visible
; here; presumably they depend on BIT_DEPTH - confirm.
421 vbroadcasti128 m7, [pd_4]
424 vbroadcasti128 m7, [pd_1]
426 %error Unsupported BIT_DEPTH!
; m4 = dct4_shuf byte-shuffle pattern replicated in both 128-bit lanes.
431 vbroadcasti128 m4, [dct4_shuf]
; Source rows at r0 + r2 are placed in the high halves of xm0 / xm1;
; r0 advances by two rows in between.
435 movhps xm0, [r0 + r2]
436 lea r0, [r0 + 2 * r2]
438 movhps xm1, [r0 + r2]
; Merge xm1 into the upper lane of m0.
440 vinserti128 m0, m0, xm1, 1
; 11011101b selects qwords 1,3,1,3; 10001000b selects qwords 0,2,0,2.
442 vpermq m1, m0, 11011101b
443 vpermq m0, m0, 10001000b
457 vpermq m1, m2, 11011101b
458 vpermq m2, m2, 10001000b
; m7 is reloaded with pd_128 for the instructions that follow.
459 vbroadcasti128 m7, [pd_128]
; Stores to dst at mmsize/2, mmsize and mmsize + mmsize/2; the last two take
; the upper lanes of m3 and m2.
474 movu [r1 + mmsize/2], m2
475 vextracti128 [r1 + mmsize], m3, 1
476 vextracti128 [r1 + mmsize + mmsize/2], m2, 1
479 ;-------------------------------------------------------
480 ;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
481 ;-------------------------------------------------------
; Declared with 3 arguments, 4 general-purpose registers and 7 vector registers.
; Register roles visible below:
;   r0 - source, read as four 16-byte units
;   r1 - destination rows, r2 - destination row step
;   r3 - base of four 16-byte multiplier rows
; The first stage shifts by a fixed 7; the second stage adds IDCT4_OFFSET and
; shifts by IDCT4_SHIFT.
483 cglobal idct4, 3, 4, 7
; Second-stage rounding offset and shift: 2048 / 12, or 512 / 10 when
; BIT_DEPTH == 10 (each offset is half of 1 << shift).
485 %define IDCT4_OFFSET [pd_2048]
486 %define IDCT4_SHIFT 12
487 %elif BIT_DEPTH == 10
488 %define IDCT4_OFFSET [pd_512]
489 %define IDCT4_SHIFT 10
491 %error Unsupported BIT_DEPTH!
; Load the four 16-byte source units (m1 is reused for unit 2).
498 movu m0, [r0 + 0 * 16]
499 movu m1, [r0 + 1 * 16]
502 movu m1, [r0 + 2 * 16]
503 movu m2, [r0 + 3 * 16]
; ---- First stage: even (E) and odd (O) products ----
507 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
510 pmaddwd m2, [r3 + 2 * 16] ; m2 = E2
514 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
515 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
518 psrad m4, 7 ; m4 = m128iA
521 packssdw m4, m5 ; m4 = m128iA
527 packssdw m2, m3 ; m2 = m128iD
; Interleave the packed 16-bit results twice before the second stage.
529 punpcklwd m1, m4, m2 ; m1 = S0
530 punpckhwd m4, m2 ; m4 = S8
532 punpcklwd m0, m1, m4 ; m0 = m128iA
533 punpckhwd m1, m4 ; m1 = m128iD
; ---- Second stage: m6 carries the rounding offset added to the even terms ----
535 mova m6, IDCT4_OFFSET
538 pmaddwd m3, m2, [r3 + 0 * 16]
539 paddd m3, m6 ; m3 = E1
541 pmaddwd m2, [r3 + 2 * 16]
542 paddd m2, m6 ; m2 = E2
545 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
546 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
549 psrad m4, IDCT4_SHIFT ; m4 = m128iA
551 psrad m5, IDCT4_SHIFT
552 packssdw m4, m5 ; m4 = m128iA
555 psrad m2, IDCT4_SHIFT
557 psrad m3, IDCT4_SHIFT
558 packssdw m2, m3 ; m2 = m128iD
; Each destination row receives one 8-byte half of a result register.
564 movlps [r1 + 0 * r2], m0
565 movhps [r1 + 1 * r2], m0
568 movlps [r1 + 2 * r2], m1
569 lea r1, [r1 + 2 * r2]
574 ;------------------------------------------------------
575 ;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
576 ;------------------------------------------------------
; Two declarations exist. The first requests 8+2 vector registers; the second
; (marked ARCH_X86_64 = 0) requests 8 and defines coef2 / coef3 as memory
; operands at r3 instead of registers.
; Register roles visible below:
;   r0 - source row pointer, r2 - row step
;   r1 - destination, written in four 16-byte units
;   r3 - base of four 16-byte multiplier rows (coef0..coef3)
579 cglobal dst4, 3, 4, 8+2
582 %else ; ARCH_X86_64 = 0
583 cglobal dst4, 3, 4, 8
584 %define coef2 [r3 + 2 * 16]
585 %define coef3 [r3 + 3 * 16]
593 %elif BIT_DEPTH == 10
; Load the multiplier rows into coef0..coef3.
599 mova coef0, [r3 + 0 * 16]
600 mova coef1, [r3 + 1 * 16]
602 mova coef2, [r3 + 2 * 16]
603 mova coef3, [r3 + 3 * 16]
605 movh m0, [r0 + 0 * r2] ; load
606 movh m1, [r0 + 1 * r2]
608 lea r0, [r0 + 2 * r2]
; ---- First pass: multiply source rows by each coefficient row, pack to words ----
612 pmaddwd m2, m0, coef0 ; DST1
613 pmaddwd m3, m1, coef0
617 pmaddwd m3, m0, coef1
618 pmaddwd m4, m1, coef1
622 packssdw m2, m3 ; m2 = T70
623 pmaddwd m3, m0, coef2
624 pmaddwd m4, m1, coef2
633 packssdw m3, m0 ; m3 = T71
; ---- Second pass: same coefficient rows applied to the packed intermediates
; m2 / m3; one 16-byte store per coefficient row ----
636 pmaddwd m0, m2, coef0 ; DST2
637 pmaddwd m1, m3, coef0
641 movu [r1 + 0 * 16], m0
643 pmaddwd m0, m2, coef1
644 pmaddwd m1, m3, coef1
648 movu [r1 + 1 * 16], m0
650 pmaddwd m0, m2, coef2
651 pmaddwd m1, m3, coef2
655 movu [r1 + 2 * 16], m0
662 movu [r1 + 3 * 16], m2
666 ;-------------------------------------------------------
667 ;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
668 ;-------------------------------------------------------
; Declared with 3 arguments, 4 general-purpose registers and 7 vector registers.
; Register roles visible below:
;   r0 - source, read as four 16-byte units
;   r1 - destination rows, r2 - destination row step
;   r3 - base of eight 16-byte multiplier rows (offsets 0..7 * 16)
; The first pass shifts by a fixed 7, the second by IDCT4_SHIFT.
670 cglobal idst4, 3, 4, 7
; Second-pass shift: 12, or 10 when BIT_DEPTH == 10.
673 %define IDCT4_SHIFT 12
674 %elif BIT_DEPTH == 10
676 %define IDCT4_SHIFT 10
678 %error Unsupported BIT_DEPTH!
684 movu m0, [r0 + 0 * 16]
685 movu m1, [r0 + 1 * 16]
688 movu m1, [r0 + 2 * 16]
689 movu m2, [r0 + 3 * 16]
; Interleave words so each pmaddwd below sees the pairs named in the comments.
692 punpcklwd m2, m0, m1 ; m2 = m128iAC
693 punpckhwd m0, m1 ; m0 = m128iBD
; ---- First pass: rows 0/1, 2/3, 4/5 and 6/7 of the multiplier table are used
; in pairs (one row for m2, the next for m0) ----
695 pmaddwd m1, m2, [r3 + 0 * 16]
696 pmaddwd m3, m0, [r3 + 1 * 16]
699 psrad m1, 7 ; m1 = S0
701 pmaddwd m3, m2, [r3 + 2 * 16]
702 pmaddwd m4, m0, [r3 + 3 * 16]
705 psrad m3, 7 ; m3 = S8
706 packssdw m1, m3 ; m1 = m128iA
708 pmaddwd m3, m2, [r3 + 4 * 16]
709 pmaddwd m4, m0, [r3 + 5 * 16]
712 psrad m3, 7 ; m3 = S0
714 pmaddwd m2, [r3 + 6 * 16]
715 pmaddwd m0, [r3 + 7 * 16]
718 psrad m2, 7 ; m2 = S8
719 packssdw m3, m2 ; m3 = m128iD
; ---- Second pass: same row pairing on m1 / m2, shifted by IDCT4_SHIFT ----
728 pmaddwd m0, m1, [r3 + 0 * 16]
729 pmaddwd m3, m2, [r3 + 1 * 16]
732 psrad m0, IDCT4_SHIFT ; m0 = S0
733 pmaddwd m3, m1, [r3 + 2 * 16]
734 pmaddwd m4, m2, [r3 + 3 * 16]
737 psrad m3, IDCT4_SHIFT ; m3 = S8
738 packssdw m0, m3 ; m0 = m128iA
739 pmaddwd m3, m1, [r3 + 4 * 16]
740 pmaddwd m4, m2, [r3 + 5 * 16]
743 psrad m3, IDCT4_SHIFT ; m3 = S0
744 pmaddwd m1, [r3 + 6 * 16]
745 pmaddwd m2, [r3 + 7 * 16]
748 psrad m1, IDCT4_SHIFT ; m1 = S8
749 packssdw m3, m1 ; m3 = m128iD
; Each destination row receives one 8-byte half of a result register.
754 movlps [r1 + 0 * r2], m2
755 movhps [r1 + 1 * r2], m2
758 movlps [r1 + 2 * r2], m1
759 lea r1, [r1 + 2 * r2]
764 ;-------------------------------------------------------
765 ; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
766 ;-------------------------------------------------------
; Declared with 3 arguments, 6 general-purpose registers, 7 vector registers and
; 16*mmsize bytes of stack scratch.
; Pass 1 writes its rows through r5 at a spacing of 2*mmsize; pass 2 reads the
; scratch back through r0 (set from rsp) and stores to r1.
; NOTE(review): the instructions that set r4 and r5 are not visible here;
; r5 presumably points into the stack scratch and r4 at a coefficient table.
768 cglobal dct8, 3,6,7,0-16*mmsize
769 ;------------------------
770 ; Stack Mapping(dword)
771 ;------------------------
772 ; Row0[0-3] Row1[0-3]
774 ; Row6[0-3] Row7[0-3]
775 ; Row0[0-3] Row7[0-3]
777 ; Row6[4-7] Row7[4-7]
778 ;------------------------
786 %error Unsupported BIT_DEPTH!
796 movu m2, [r0 + r2 * 2]
803 punpckldq m1, m4, m5 ; m1 = [1 0]
804 punpckhdq m4, m5 ; m4 = [3 2]
; 0x4E swaps the two 64-bit halves of the register.
807 pshufd m2, m3, 0x4E ; m2 = [4 5]
808 pshufd m0, m0, 0x4E ; m0 = [6 7]
811 psubw m1, m0 ; m1 = [d1 d0]
813 psubw m4, m2 ; m4 = [d3 d2]
814 punpcklqdq m2, m3, m0 ; m2 = [s2 s0]
816 pshufd m3, m3, 0x4E ; m3 = [s1 s3]
818 punpcklwd m0, m1, m4 ; m0 = [d2/d0]
819 punpckhwd m1, m4 ; m1 = [d3/d1]
820 punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0]
821 punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0]
; ---- Odd rows 1, 3, 5, 7: differences (m4 / m0) times multiplier rows 0..3 at r4 ----
825 pmaddwd m1, m4, [r4 + 0*16]
826 pmaddwd m5, m0, [r4 + 0*16]
833 mova [r5 + 1*2*mmsize], m1 ; Row 1
835 pmaddwd m1, m4, [r4 + 1*16]
836 pmaddwd m5, m0, [r4 + 1*16]
843 mova [r5 + 3*2*mmsize], m1 ; Row 3
845 pmaddwd m1, m4, [r4 + 2*16]
846 pmaddwd m5, m0, [r4 + 2*16]
853 mova [r5 + 5*2*mmsize], m1 ; Row 5
855 pmaddwd m4, [r4 + 3*16]
856 pmaddwd m0, [r4 + 3*16]
863 mova [r5 + 7*2*mmsize], m4; Row 7
; ---- Even rows 0, 4, 2, 6: built from the sums (m2 / m3) ----
867 paddw m0, m2, m3 ; m0 = [EE1 EE0]
868 pshufb m0, [pb_unpackhlw1]
869 psubw m2, m3 ; m2 = [EO1 EO0]
870 psignw m2, [pw_ppppmmmm]
871 pshufb m2, [pb_unpackhlw1]
872 pmaddwd m3, m0, [r4 + 0*16]
878 mova [r5 + 0*2*mmsize], m3 ; Row 0
879 pmaddwd m0, [r4 + 2*16]
885 mova [r5 + 4*2*mmsize], m0 ; Row 4
886 pmaddwd m3, m2, [r4 + 1*16]
892 mova [r5 + 2*2*mmsize], m3 ; Row 2
893 pmaddwd m2, [r4 + 3*16]
899 mova [r5 + 6*2*mmsize], m2 ; Row 6
; Advance the source pointer by four rows.
902 lea r0, [r0 + r2 * 4]
; ---- Pass 2: read the stack scratch back, 32-bit arithmetic ----
909 mov r0, rsp ; r0 = pointer to Low Part
915 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0]
916 mova m1, [r0 + 1*2*mmsize]
; Sums and differences of each row's first half and the half stored mmsize later.
917 paddd m2, m0, [r0 + (0*2+1)*mmsize]
918 pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0]
919 paddd m3, m1, [r0 + (1*2+1)*mmsize]
920 pshufd m3, m3, 0x9C ; m3 = ^^
921 psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0]
922 psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^
925 phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0]
926 phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0]
; Left shift by 6 multiplies by 64.
928 pslld m4, 6 ; m4 = [64*EE1 64*EE0]
929 pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0]
930 pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0]
932 phaddd m3, m4, m5 ; m3 = [Row2 Row0]
935 phsubd m4, m2 ; m4 = [Row6 Row4]
; Even output rows: one 8-byte half per destination row slot.
938 movh [r1 + 0*2*mmsize], m3
939 movhps [r1 + 2*2*mmsize], m3
940 movh [r1 + 4*2*mmsize], m4
941 movhps [r1 + 6*2*mmsize], m4
; Odd output rows 1 and 3: multiplier rows 2 and 3 at r4.
944 pmulld m2, m0, [r4 + 2*16]
945 pmulld m3, m1, [r4 + 2*16]
946 pmulld m4, m0, [r4 + 3*16]
947 pmulld m5, m1, [r4 + 3*16]
950 phaddd m2, m4 ; m2 = [Row3 Row1]
953 movh [r1 + 1*2*mmsize], m2
954 movhps [r1 + 3*2*mmsize], m2
; Odd output rows 5 and 7: multiplier rows 4 and 5 at r4.
956 pmulld m2, m0, [r4 + 4*16]
957 pmulld m3, m1, [r4 + 4*16]
958 pmulld m4, m0, [r4 + 5*16]
959 pmulld m5, m1, [r4 + 5*16]
962 phaddd m2, m4 ; m2 = [Row7 Row5]
965 movh [r1 + 5*2*mmsize], m2
966 movhps [r1 + 7*2*mmsize], m2
976 ;-------------------------------------------------------
977 ; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
978 ;-------------------------------------------------------
; Internal first-pass helper, invoked with `call` from idct8. It is declared
; without argument or register counts and works on whatever the caller has
; placed in these registers:
;   r0 - source; rows are read at a 32-byte pitch
;   r6 - base of the even-part multiplier rows (offsets 0, 16, 32, 48)
;   r4 - base of the odd-part multiplier rows (offsets 0..112, step 16)
;   r5 - output; 8-byte pieces written at a 16-byte pitch
; The symbol is spelled "patial" (sic); call sites use the same spelling.
981 cglobal patial_butterfly_inverse_internal_pass1
; ---- Even part: source rows 4, 2, 6 (the row-0 load is not visible here) ----
983 movu m1, [r0 + 4 * 32]
984 movu m2, [r0 + 2 * 32]
985 movu m3, [r0 + 6 * 32]
988 punpckhwd m2, m0, m1 ; [2 6]
989 punpcklwd m0, m1 ; [0 4]
990 pmaddwd m1, m0, [r6] ; EE[0]
991 pmaddwd m0, [r6 + 32] ; EE[1]
992 pmaddwd m3, m2, [r6 + 16] ; EO[0]
993 pmaddwd m2, [r6 + 48] ; EO[1]
995 paddd m4, m1, m3 ; E[0]
997 paddd m3, m0, m2 ; E[1]
; ---- Odd part: source rows 5, 3, 7 ----
1008 movu m5, [r0 + 5 * 32]
1010 movu m5, [r0 + 3 * 32]
1011 movu m6, [r0 + 7 * 32]
1013 punpcklwd m6, m2, m5 ;[1 3]
1014 punpckhwd m2, m5 ;[5 7]
; Each multiplier-row pair yields one register whose low half goes to output
; slot k and whose high half goes to the mirrored slot 7-k.
1016 pmaddwd m5, m6, [r4]
1017 pmaddwd m7, m2, [r4 + 16]
1027 movh [r5 + 0 * 16], m7
1028 movhps [r5 + 7 * 16], m7
1030 pmaddwd m5, m6, [r4 + 32]
1031 pmaddwd m4, m2, [r4 + 48]
1041 movh [r5 + 1 * 16], m4
1042 movhps [r5 + 6 * 16], m4
1044 pmaddwd m5, m6, [r4 + 64]
1045 pmaddwd m4, m2, [r4 + 80]
1055 movh [r5 + 2 * 16], m4
1056 movhps [r5 + 5 * 16], m4
1058 pmaddwd m5, m6, [r4 + 96]
1059 pmaddwd m4, m2, [r4 + 112]
1069 movh [r5 + 3 * 16], m4
1070 movhps [r5 + 4 * 16], m4
; PARTIAL_BUTTERFLY_PROCESS_ROW reg
;   %1 - vector register holding the row to transform; it is overwritten.
; The even part is gathered with the pb_idct8even shuffle and multiplied by
; tab_idct8_1; the odd part multiplies %1 by the two 16-byte rows at r4 and
; reduces them with phaddd. Both results are shifted right by IDCT_SHIFT.
; Writes m4 and m5 in addition to %1.
1074 %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
; IDCT_SHIFT is 10, or 12 when BIT_DEPTH == 8.
1076 %define IDCT_SHIFT 10
1077 %elif BIT_DEPTH == 8
1078 %define IDCT_SHIFT 12
1080 %error Unsupported BIT_DEPTH!
1082 pshufb m4, %1, [pb_idct8even]
1083 pmaddwd m4, [tab_idct8_1]
1087 punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3]
1091 pmaddwd m5, %1, [r4]
1092 pmaddwd %1, [r4 + 16]
1093 phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3]
1096 psrad %1, IDCT_SHIFT
1099 psrad m4, IDCT_SHIFT
; Internal second-pass helper, invoked with `call` from idct8.
; Expands PARTIAL_BUTTERFLY_PROCESS_ROW on m0, m2, m1 and m3 in that order;
; the visible store writes m1 to the destination row at r1 + 2*r2.
; The symbol is spelled "patial" (sic); call sites use the same spelling.
1106 cglobal patial_butterfly_inverse_internal_pass2
1109 PARTIAL_BUTTERFLY_PROCESS_ROW m0
1113 PARTIAL_BUTTERFLY_PROCESS_ROW m2
1117 PARTIAL_BUTTERFLY_PROCESS_ROW m1
1118 movu [r1 + 2 * r2], m1
1121 PARTIAL_BUTTERFLY_PROCESS_ROW m3
; idct8 entry point: 3 arguments, 7 general-purpose registers, 8 vector
; registers. The stack scratch is managed by hand here (the stack-size field of
; the declaration is commented out): 16*mmsize bytes of scratch plus one
; gprsize slot, into which r5 is saved.
; Each pass helper is called twice; between the two pass-2 calls r1 advances by
; four destination rows.
; NOTE(review): r5 presumably holds the caller's rsp when it is saved, since
; rsp is reloaded from that slot at the end - confirm.
1126 cglobal idct8, 3,7,8 ;,0-16*mmsize
1127 ; align the stack to 64 bytes
1129 sub rsp, 16*mmsize + gprsize
1131 mov [rsp + 16*mmsize], r5
; Pass 1: r4 = odd-part multiplier table used by the pass-1 helper.
1134 lea r4, [tab_idct8_3]
1137 call patial_butterfly_inverse_internal_pass1
1142 call patial_butterfly_inverse_internal_pass1
1146 %elif BIT_DEPTH == 8
1149 %error Unsupported BIT_DEPTH!
; Pass 2: r4 = odd-part multiplier rows, r6 = odd-element shuffle pattern.
1153 lea r4, [tab_idct8_2]
1154 lea r6, [pb_idct8odd]
1157 call patial_butterfly_inverse_internal_pass2
1159 lea r1, [r1 + 4 * r2]
1162 call patial_butterfly_inverse_internal_pass2
1164 ; restore the original stack pointer
1165 mov rsp, [rsp + 16*mmsize]
1169 ;-----------------------------------------------------------------------------
1170 ; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
1171 ;-----------------------------------------------------------------------------
; Declared twice with identical counts: 4 arguments, 4 general-purpose
; registers, 6 vector registers.
; NOTE(review): presumably each declaration follows a different instruction-set
; selection; neither the selection nor the bodies are visible here - confirm.
1173 cglobal denoise_dct, 4, 4, 6
1196 cglobal denoise_dct, 4, 4, 6
; The code from here on is assembled only when ARCH_X86_64 == 1.
1217 %if ARCH_X86_64 == 1
; DCT8_PASS_1 takes 4 parameters.
;   %1 - byte offset from r6 of the 8-byte group broadcast into m0
; dct8 invokes it as (k*16, j*16, 3|4, 7|1); the roles of %2..%4 are not
; visible in this part of the macro.
1218 %macro DCT8_PASS_1 4
1219 vpbroadcastq m0, [r6 + %1]
; DCT8_PASS_2 takes 1 parameter.
;   %1 - byte offset from r6 of the 16-byte row broadcast to both lanes of m4
; The result in m6 is shifted right by DCT_SHIFT2.
1230 %macro DCT8_PASS_2 1
1231 vbroadcasti128 m4, [r6 + %1]
1240 psrad m6, DCT_SHIFT2
; Wide-register dct8: 3 arguments, 7 general-purpose registers, 10 vector
; registers, 8*16 bytes of stack scratch.
; Register roles visible below:
;   r0 - source; r2 - row step; r4 = r0 + 4*r2 addresses rows 4..7
;   r1 - destination (r4 is reused as r1 + 4*r2 for the second pass)
;   m5 - broadcast rounding constant; m6 - dct8_shuf byte-shuffle pattern
; NOTE(review): r3 is used as a row offset but its setup is not visible here;
; presumably 3*r2 - confirm.
1244 cglobal dct8, 3, 7, 10, 0-8*16
; First-pass rounding constant: pd_8, or pd_2 when BIT_DEPTH == 8.
1247 vbroadcasti128 m5, [pd_8]
1248 %elif BIT_DEPTH == 8
1250 vbroadcasti128 m5, [pd_2]
1252 %error Unsupported BIT_DEPTH!
1254 %define DCT_SHIFT2 9
1258 lea r4, [r0 + r2 * 4]
1261 mova m6, [dct8_shuf]
; Source rows k and k+4 share one register: row k+4 goes into the upper lane.
1265 vinserti128 m0, m0, [r4], 1
1267 vinserti128 m1, m1, [r4 + r2], 1
1268 mova xm2, [r0 + r2 * 2]
1269 vinserti128 m2, m2, [r4 + r2 * 2], 1
1271 vinserti128 m3, m3, [r4 + r3], 1
1273 punpcklqdq m4, m0, m1
1275 punpcklqdq m1, m2, m3
; ---- Pass 1: one macro expansion per coefficient row ----
1287 DCT8_PASS_1 0 * 16, 0 * 16, 3, 7
1288 DCT8_PASS_1 1 * 16, 2 * 16, 4, 1
1289 DCT8_PASS_1 2 * 16, 4 * 16, 3, 7
1290 DCT8_PASS_1 3 * 16, 6 * 16, 4, 1
1291 DCT8_PASS_1 4 * 16, 1 * 16, 3, 7
1292 DCT8_PASS_1 5 * 16, 3 * 16, 4, 1
1293 DCT8_PASS_1 6 * 16, 5 * 16, 3, 7
1294 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
; ---- Pass 2: m5 = 256 (half of 1 << DCT_SHIFT2); r4 addresses output rows 4..7 ----
1299 lea r4, [r1 + r2 * 4]
1300 vbroadcasti128 m5, [pd_256]
1312 movu [r1 + r2 * 2], m6
1320 movu [r4 + r2 * 2], m6
; DCT16_PASS_1_E takes 2 parameters.
;   %1 - signed byte offset from r7 of the 8-byte group broadcast into m7
; dct16 invokes it as (table offset, destination offset); the use of %2 is not
; visible in this part of the macro.
1325 %macro DCT16_PASS_1_E 2
1326 vpbroadcastq m7, [r7 + %1]
; DCT16_PASS_1_O takes 2 parameters.
;   %1 - signed byte offset from r7 of the 16-byte row broadcast into m7
;   %2 - byte offset from r5 at which the 16-byte result (xm10) is stored
; Products are reduced with three phaddd steps, shifted right by DCT_SHIFT and
; packed to words. Writes m7, m10, m11 and reads m12.
1341 %macro DCT16_PASS_1_O 2
1342 vbroadcasti128 m7, [r7 + %1]
1346 phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5]
1350 phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7]
1352 phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7]
1355 psrad m10, DCT_SHIFT
1357 packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
; 0x08 puts qwords 0 and 2 into the low 128 bits, so xm10 holds w0..w7.
1358 vpermq m10, m10, 0x08
1360 mova [r5 + %2], xm10
; DCT16_PASS_2 takes 1 parameter.
;   %1 - signed byte offset applied to both r7 and r8
; m8 receives the 16-byte row at r7 + %1 and m13 the row at r8 + %1, each
; broadcast to both lanes. m13 multiplies m1, m3, m5 and m7; the last multiply
; overwrites m13 itself. The result in m10 is shifted right by DCT_SHIFT2.
1363 %macro DCT16_PASS_2 1
1364 vbroadcasti128 m8, [r7 + %1]
1365 vbroadcasti128 m13, [r8 + %1]
1368 pmaddwd m11, m1, m13
1372 pmaddwd m12, m3, m13
1377 pmaddwd m12, m5, m13
1381 pmaddwd m13, m7, m13
1387 psrad m10, DCT_SHIFT2
; dct16: 3 arguments, 9 general-purpose registers, 15 vector registers,
; 16*mmsize bytes of stack scratch.
; Register roles visible below:
;   r0 - source, r2 - row step, r6 - pointer four rows beyond r0 / output cursor
;   r1 - destination
;   r5 - intermediate buffer written by pass 1 and read by pass 2
;   r7 / r8 - point 8*16 bytes into tab_dct16_1 / tab_dct16_2, so the macros
;             address table rows with signed offsets from -8*16 to 7*16
;   m9 - broadcast rounding constant; m13 / m14 - dct16_shuf1 / dct16_shuf2
; NOTE(review): the instruction that sets r5 is not visible here; presumably it
; points at the stack scratch - confirm.
1390 cglobal dct16, 3, 9, 15, 0-16*mmsize
; First-pass rounding constant: pd_16, or pd_4 when BIT_DEPTH == 8.
1393 vbroadcasti128 m9, [pd_16]
1394 %elif BIT_DEPTH == 8
1396 vbroadcasti128 m9, [pd_4]
1398 %error Unsupported BIT_DEPTH!
1400 %define DCT_SHIFT2 10
1404 mova m13, [dct16_shuf1]
1405 mova m14, [dct16_shuf2]
1406 lea r7, [tab_dct16_1 + 8 * 16]
1407 lea r8, [tab_dct16_2 + 8 * 16]
1410 mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations
1413 lea r6, [r0 + r2 * 4]
; 0x20 joins the two low lanes, 0x31 the two high lanes of the source pair.
1417 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]
1418 vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi]
1422 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]
1423 vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi]
1425 movu m6, [r0 + r2 * 2]
1426 movu m5, [r6 + r2 * 2]
1427 vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]
1428 vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi]
1432 vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]
1433 vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi]
; ---- Pass 1, odd table rows (offsets -7..7, step 2, in 16-byte units) ----
1452 DCT16_PASS_1_O -7 * 16, 1 * 32
1453 DCT16_PASS_1_O -5 * 16, 3 * 32
1454 DCT16_PASS_1_O -3 * 16, 1 * 32 + 16
1455 DCT16_PASS_1_O -1 * 16, 3 * 32 + 16
1456 DCT16_PASS_1_O 1 * 16, 5 * 32
1457 DCT16_PASS_1_O 3 * 16, 7 * 32
1458 DCT16_PASS_1_O 5 * 16, 5 * 32 + 16
1459 DCT16_PASS_1_O 7 * 16, 7 * 32 + 16
; ---- Pass 1, even table rows at multiples of four ----
1469 DCT16_PASS_1_E -8 * 16, 0 * 32
1470 DCT16_PASS_1_E -4 * 16, 0 * 32 + 16
1471 DCT16_PASS_1_E 0 * 16, 4 * 32
1472 DCT16_PASS_1_E 4 * 16, 4 * 32 + 16
; ---- Pass 1, remaining even table rows ----
1477 DCT16_PASS_1_E -6 * 16, 2 * 32
1478 DCT16_PASS_1_E -2 * 16, 2 * 32 + 16
1479 DCT16_PASS_1_E 2 * 16, 6 * 32
1480 DCT16_PASS_1_E 6 * 16, 6 * 32 + 16
; Advance the source pointer by eight rows.
1482 lea r0, [r0 + 8 * r2]
; ---- Pass 2: m9 = 512 (half of 1 << DCT_SHIFT2) ----
1492 vbroadcasti128 m9, [pd_512]
; Reload the intermediates; the "hi" halves live 8*32 bytes after the "lo" halves.
1495 mova m0, [r5 + 0 * 32] ; [row0lo row4lo]
1496 mova m1, [r5 + 8 * 32] ; [row0hi row4hi]
1498 mova m2, [r5 + 1 * 32] ; [row1lo row5lo]
1499 mova m3, [r5 + 9 * 32] ; [row1hi row5hi]
1501 mova m4, [r5 + 2 * 32] ; [row2lo row6lo]
1502 mova m5, [r5 + 10 * 32] ; [row2hi row6hi]
1504 mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
1505 mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
; One DCT16_PASS_2 expansion per output row; each leaves its result in m10.
1507 DCT16_PASS_2 -8 * 16
1509 DCT16_PASS_2 -7 * 16
1511 DCT16_PASS_2 -6 * 16
1512 movu [r1 + r2 * 2], m10
1513 DCT16_PASS_2 -5 * 16
; r6 walks the destination four rows at a time.
1516 lea r6, [r1 + r2 * 4]
1517 DCT16_PASS_2 -4 * 16
1519 DCT16_PASS_2 -3 * 16
1521 DCT16_PASS_2 -2 * 16
1522 movu [r6 + r2 * 2], m10
1523 DCT16_PASS_2 -1 * 16
1526 lea r6, [r6 + r2 * 4]
1532 movu [r6 + r2 * 2], m10
1536 lea r6, [r6 + r2 * 4]
1542 movu [r6 + r2 * 2], m10
; DCT32_PASS_1 takes 4 parameters.
;   %1      - byte offset from r7 of the coefficient row; 16-byte pieces at %1,
;             %1 + 32 and %1 + 48 are each broadcast to both lanes
;   %2      - byte offset from r5 of the output; 8 bytes are written at %2 and
;             8 more at %2 + 64
;   %3, %4  - numbers of the vector registers multiplied by the first piece
; m6 and m7 are multiplied by the piece at %1 + 48. The sum in m11 is shifted
; right by DCT_SHIFT. Writes m8 and m10..m14.
1553 %macro DCT32_PASS_1 4
1554 vbroadcasti128 m8, [r7 + %1]
1556 pmaddwd m11, m%3, m8
1557 pmaddwd m12, m%4, m8
1560 vbroadcasti128 m8, [r7 + %1 + 32]
1561 vbroadcasti128 m10, [r7 + %1 + 48]
1563 pmaddwd m13, m6, m10
1567 pmaddwd m14, m7, m10
1574 psrad m11, DCT_SHIFT
; 0xD8 orders the qwords 0, 2, 1, 3 (swaps the two middle ones).
1576 vpermq m11, m11, 0xD8
1578 movq [r5 + %2], xm11
1579 vextracti128 xm10, m11, 1
1580 movq [r5 + %2 + 64], xm10
; DCT32_PASS_2 takes 1 parameter; dct32 passes k * 32 for output row k.
; m10 multiplies m1, m3, m5 and m7; the upper lane of m11 is then extracted
; into xm10 and the result in xm11 is shifted right by DCT_SHIFT2.
; NOTE(review): the instructions that consume %1 are not visible here;
; presumably it is a table offset, as in the pass-1 macro - confirm.
1583 %macro DCT32_PASS_2 1
1587 pmaddwd m12, m1, m10
1591 pmaddwd m13, m3, m10
1597 pmaddwd m13, m5, m10
1601 pmaddwd m14, m7, m10
1607 vextracti128 xm10, m11, 1
1611 psrad xm11, DCT_SHIFT2
; dct32: 3 arguments, 9 general-purpose registers, 16 vector registers,
; 64*mmsize bytes of stack scratch.
; Register roles visible below:
;   r0 - source (two 32-byte loads per row), r2 - row step
;   r1 - destination, r6 - output cursor advanced four rows at a time
;   r5 - intermediate buffer written by pass 1 and read by pass 2
;   r7 / r8 - tab_dct32_1 / tab_dct32_2
;   m9 - broadcast rounding constant; m15 - dct16_shuf1 byte-shuffle pattern
; NOTE(review): r3 is used as a row offset but its setup is not visible here
; (presumably 3*r2), and neither is the setup of r5 - confirm.
1616 cglobal dct32, 3, 9, 16, 0-64*mmsize
; First-pass rounding constant: pd_32, or pd_8 when BIT_DEPTH == 8.
1619 vpbroadcastq m9, [pd_32]
1620 %elif BIT_DEPTH == 8
1622 vpbroadcastq m9, [pd_8]
1624 %error Unsupported BIT_DEPTH!
1626 %define DCT_SHIFT2 11
1630 lea r7, [tab_dct32_1]
1631 lea r8, [tab_dct32_2]
1635 mova m15, [dct16_shuf1]
; Each source row is read as two 32-byte halves (offsets 0 and 32).
1645 movu m1, [r0 + r2 * 2]
1646 movu m0, [r0 + r2 * 2 + 32]
; 0x20 joins the two low lanes, 0x31 the two high lanes of the source pair.
1651 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E
1652 vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E
1657 vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O
1658 vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O
1662 movu m2, [r0 + r2 + 32]
1669 movu m2, [r0 + r3 + 32]
1674 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E
1675 vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E
1680 vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O
1681 vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O
; ---- Pass 1: even-numbered table rows, alternating the register pair
; (m0, m2) with (m1, m3); table offset and output offset advance together ----
1684 DCT32_PASS_1 0 * 32, 0 * 64, 0, 2
1685 DCT32_PASS_1 2 * 32, 2 * 64, 1, 3
1686 DCT32_PASS_1 4 * 32, 4 * 64, 0, 2
1687 DCT32_PASS_1 6 * 32, 6 * 64, 1, 3
1688 DCT32_PASS_1 8 * 32, 8 * 64, 0, 2
1689 DCT32_PASS_1 10 * 32, 10 * 64, 1, 3
1690 DCT32_PASS_1 12 * 32, 12 * 64, 0, 2
1691 DCT32_PASS_1 14 * 32, 14 * 64, 1, 3
1692 DCT32_PASS_1 16 * 32, 16 * 64, 0, 2
1693 DCT32_PASS_1 18 * 32, 18 * 64, 1, 3
1694 DCT32_PASS_1 20 * 32, 20 * 64, 0, 2
1695 DCT32_PASS_1 22 * 32, 22 * 64, 1, 3
1696 DCT32_PASS_1 24 * 32, 24 * 64, 0, 2
1697 DCT32_PASS_1 26 * 32, 26 * 64, 1, 3
1698 DCT32_PASS_1 28 * 32, 28 * 64, 0, 2
1699 DCT32_PASS_1 30 * 32, 30 * 64, 1, 3
; Advance the source pointer by four rows.
1702 lea r0, [r0 + r2 * 4]
; ---- Pass 2: m9 = 1024 (half of 1 << DCT_SHIFT2) ----
1711 vpbroadcastq m9, [pd_1024]
; Reload four 64-byte intermediate rows as eight 32-byte registers.
1714 mova m0, [r5 + 0 * 64]
1715 mova m1, [r5 + 0 * 64 + 32]
1717 mova m2, [r5 + 1 * 64]
1718 mova m3, [r5 + 1 * 64 + 32]
1720 mova m4, [r5 + 2 * 64]
1721 mova m5, [r5 + 2 * 64 + 32]
1723 mova m6, [r5 + 3 * 64]
1724 mova m7, [r5 + 3 * 64 + 32]
; Every output row is produced by one DCT32_PASS_2 expansion and stored as the
; 16 bytes of xm11. Rows come in groups of four: r6 + {0, r2, 2*r2, r3}.
1729 movu [r1 + r2], xm11
1731 movu [r1 + r2 * 2], xm11
1733 movu [r1 + r3], xm11
1735 lea r6, [r1 + r2 * 4]
1739 movu [r6 + r2], xm11
1741 movu [r6 + r2 * 2], xm11
1743 movu [r6 + r3], xm11
1745 lea r6, [r6 + r2 * 4]
1749 movu [r6 + r2], xm11
1750 DCT32_PASS_2 10 * 32
1751 movu [r6 + r2 * 2], xm11
1752 DCT32_PASS_2 11 * 32
1753 movu [r6 + r3], xm11
1755 lea r6, [r6 + r2 * 4]
1756 DCT32_PASS_2 12 * 32
1758 DCT32_PASS_2 13 * 32
1759 movu [r6 + r2], xm11
1760 DCT32_PASS_2 14 * 32
1761 movu [r6 + r2 * 2], xm11
1762 DCT32_PASS_2 15 * 32
1763 movu [r6 + r3], xm11
1765 lea r6, [r6 + r2 * 4]
1766 DCT32_PASS_2 16 * 32
1768 DCT32_PASS_2 17 * 32
1769 movu [r6 + r2], xm11
1770 DCT32_PASS_2 18 * 32
1771 movu [r6 + r2 * 2], xm11
1772 DCT32_PASS_2 19 * 32
1773 movu [r6 + r3], xm11
1775 lea r6, [r6 + r2 * 4]
1776 DCT32_PASS_2 20 * 32
1778 DCT32_PASS_2 21 * 32
1779 movu [r6 + r2], xm11
1780 DCT32_PASS_2 22 * 32
1781 movu [r6 + r2 * 2], xm11
1782 DCT32_PASS_2 23 * 32
1783 movu [r6 + r3], xm11
1785 lea r6, [r6 + r2 * 4]
1786 DCT32_PASS_2 24 * 32
1788 DCT32_PASS_2 25 * 32
1789 movu [r6 + r2], xm11
1790 DCT32_PASS_2 26 * 32
1791 movu [r6 + r2 * 2], xm11
1792 DCT32_PASS_2 27 * 32
1793 movu [r6 + r3], xm11
1795 lea r6, [r6 + r2 * 4]
1796 DCT32_PASS_2 28 * 32
1798 DCT32_PASS_2 29 * 32
1799 movu [r6 + r2], xm11
1800 DCT32_PASS_2 30 * 32
1801 movu [r6 + r2 * 2], xm11
1802 DCT32_PASS_2 31 * 32
1803 movu [r6 + r3], xm11
; IDCT8_PASS_1 <offset>
;   %1 = byte offset added to the two coefficient-table pointers r5 and r6.
;   Broadcasts 32-bit table entries from [r5 + %1 ...] and [r6 + %1 ...] into
;   m7/m10 (both registers are reloaded for every entry) and scales the
;   values held in m3, m5, m9 and m6 with an arithmetic right shift by
;   IDCT_SHIFT1.
;   NOTE(review): presumably expanded from idct8, which points r5/r6 at
;   avx2_idct8_1/avx2_idct8_2 and defines IDCT_SHIFT1 - confirm at the call site.
1812 %macro IDCT8_PASS_1 1
1813 vpbroadcastd m7, [r5 + %1] ; dword at offset %1 of the r5 table, replicated to every dword lane
1814 vpbroadcastd m10, [r5 + %1 + 4] ; the dword that follows it in the same table
1819 vpbroadcastd m7, [r6 + %1] ; same two offsets, read from the r6 table
1820 vpbroadcastd m10, [r6 + %1 + 4]
1827 psrad m3, IDCT_SHIFT1 ; signed (arithmetic) shift, keeps the sign of each dword
1831 psrad m5, IDCT_SHIFT1
1833 vpbroadcastd m7, [r5 + %1 + 32] ; second group: entries 32 bytes further into the r5 table
1834 vpbroadcastd m10, [r5 + %1 + 36]
1839 vpbroadcastd m7, [r6 + %1 + 32] ; and 32 bytes further into the r6 table
1840 vpbroadcastd m10, [r6 + %1 + 36]
1847 psrad m9, IDCT_SHIFT1 ; same scaling for the second group of results
1851 psrad m6, IDCT_SHIFT1
; IDCT8_PASS_2 (no parameters)
;   Multiplies the packed 16-bit inputs in m0/m1 against four 32-byte table
;   rows at r5 and four at r6 with pmaddwd, reorders partial results with the
;   idct8_shuf2 byte shuffle, scales with an arithmetic right shift by
;   IDCT_SHIFT2 and applies the idct8_shuf3 byte shuffle to m7 and m3.
;   Reads r5, r6, m0, m1; writes m2, m3 and m5-m9 in the lines shown.
1860 %macro IDCT8_PASS_2 0
1861 punpcklqdq m2, m0, m1 ; m2 = low qword of m0 followed by low qword of m1, per 128-bit lane
1864 pmaddwd m3, m2, [r5] ; multiply word pairs and add neighbours -> dwords; table rows at r5 + 0/32/64/96
1865 pmaddwd m5, m2, [r5 + 32]
1866 pmaddwd m6, m2, [r5 + 64]
1867 pmaddwd m7, m2, [r5 + 96]
1870 pshufb m3, [idct8_shuf2] ; byte-level reorder using the idct8_shuf2 mask
1871 pshufb m6, [idct8_shuf2]
1872 punpcklqdq m7, m3, m6 ; gather the low qwords of the two shuffled registers
1875 pmaddwd m5, m0, [r6] ; same four-row pattern, now m0 against the r6 table
1876 pmaddwd m6, m0, [r6 + 32]
1877 pmaddwd m8, m0, [r6 + 64]
1878 pmaddwd m9, m0, [r6 + 96]
1881 pshufb m5, [idct8_shuf2]
1882 pshufb m8, [idct8_shuf2]
1883 punpcklqdq m6, m5, m8
1888 psrad m8, IDCT_SHIFT2 ; second-stage scaling (arithmetic shift)
1892 psrad m7, IDCT_SHIFT2
1894 pshufb m7, [idct8_shuf3] ; byte-level reorder using the idct8_shuf3 mask
1899 psrad m9, IDCT_SHIFT2
1903 psrad m3, IDCT_SHIFT2
1905 pshufb m3, [idct8_shuf3]
; idct8: AVX2 inverse transform entry point for 8x8 blocks.
;   r0 = source coefficients, read as eight 32-byte rows (r0 + k * 32).
;   r1 = destination, written with 16-byte stores and advanced by r2 * 4.
;   r2 = destination row step used in the store addressing.
;   The cglobal operands follow the x86inc convention (argument count, GPR
;   count, vector-register count, stack size) - see x86inc for the meaning of
;   the negative stack-size form.
1910 cglobal idct8, 3, 7, 13, 0-8*16
; Second-stage shift is selected by BIT_DEPTH; m12 receives the matching
; pd_* constant broadcast to every dword.
; NOTE(review): pd_512 / pd_2048 look like the rounding terms
; 1 << (IDCT_SHIFT2 - 1) - confirm against the constant definitions.
1912 %define IDCT_SHIFT2 10
1913 vpbroadcastd m12, [pd_512]
1914 %elif BIT_DEPTH == 8
; 8-bit build: shift by 12 and use pd_2048 in m12.
1915 %define IDCT_SHIFT2 12
1916 vpbroadcastd m12, [pd_2048]
1918 %error Unsupported BIT_DEPTH!
; First-stage shift is 7 for every bit depth.
1920 %define IDCT_SHIFT1 7
1922 vbroadcasti128 m11, [pd_64] ; 16 bytes of pd_64 copied to both lanes; presumably the stage-1 rounding term (64 = 1 << 6) - confirm
1925 lea r5, [avx2_idct8_1] ; r5/r6 = bases of the two coefficient tables used by the pass macros
1926 lea r6, [avx2_idct8_2]
; Load the eight rows and narrow them from 32-bit to 16-bit with signed
; saturation, pairing row k with row k + 4 in each register.
1929 mova m0, [r0 + 0 * 32]
1930 mova m1, [r0 + 4 * 32]
1931 packssdw m0, m1 ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
1932 mova m1, [r0 + 2 * 32]
1933 mova m2, [r0 + 6 * 32]
1934 packssdw m1, m2 ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
1935 mova m2, [r0 + 1 * 32]
1936 mova m3, [r0 + 5 * 32]
1937 packssdw m2, m3 ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
1938 mova m3, [r0 + 3 * 32]
1939 mova m4, [r0 + 7 * 32]
1940 packssdw m3, m4 ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
1942 mova m5, [idct8_shuf1] ; shuffle mask kept in m5
; Interleave words so even rows (0/2, 4/6) and odd rows (1/3, 5/7) sit side
; by side as word pairs.
1944 punpcklwd m4, m0, m1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
1945 punpckhwd m0, m1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
1949 punpcklwd m1, m2, m3 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
1950 punpckhwd m2, m3 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
; Output: the upper 128-bit lane of m8/m9 is split off into xm3 and the
; results are stored 16 bytes at a time.
1970 vextracti128 xm3, m8, 1 ; xm3 = upper lane of m8
1973 vextracti128 xm3, m9, 1 ; xm3 = upper lane of m9
1974 mova [r1 + r2 * 2], xm9 ; lower lane of m9 -> r1 + 2 * r2
1977 lea r1, [r1 + r2 * 4] ; advance the destination by four row steps
1982 vextracti128 xm3, m8, 1 ; same store sequence for the next group of rows
1985 vextracti128 xm3, m9, 1
1986 mova [r1 + r2 * 2], xm9
; Body of a two-parameter macro (%1, %2) for the first idct16 stage.
;   %1 indexes 16-byte rows of tab_idct16_2 / tab_idct16_1 and, together with
;   %2, selects the 32-byte slots written through r3.
;   NOTE(review): r3 presumably points at the intermediate buffer set up by
;   idct16 - confirm at the call site.
1991 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16] ; 16-byte row %1 of tab_idct16_2 copied to both lanes
2002 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16] ; 16-byte row %1 of tab_idct16_1 copied to both lanes
2016 psrad m11, IDCT_SHIFT1 ; first-stage scaling (arithmetic shift)
2020 psrad m9, IDCT_SHIFT1
2022 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16 + 16] ; next row (%1 + 1) of each table
2033 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16]
2047 psrad m5, IDCT_SHIFT1
2051 psrad m10, IDCT_SHIFT1
2056 mova m10, [idct16_shuff] ; m10 = dword index vector for the vpermd below
2057 mova m5, [idct16_shuff1]
2059 vpermd m12, m10, m11 ; m12 = dwords of m11 reordered by the indices in m10
; Each result register is stored as two 16-byte halves that sit 32 bytes
; apart in the buffer addressed by r3.
2061 mova [r3 + %1 * 16 * 2], xm12 ; lower lane of m12 -> slot %1
2062 mova [r3 + %2 * 16 * 2], xm13 ; lower lane of m13 -> slot %2
2063 vextracti128 [r3 + %2 * 16 * 2 + 32], m13, 1 ; upper lane of m13, 32 bytes after slot %2
2064 vextracti128 [r3 + %1 * 16 * 2 + 32], m12, 1 ; upper lane of m12, 32 bytes after slot %1
2067 ;-------------------------------------------------------
2068 ; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
2069 ;-------------------------------------------------------
; AVX2 inverse transform entry point for 16x16 blocks.
;   r0 = src, read in 64-byte row steps (r0 + k * 64).
;   r1 = dst, r2 = row step used in the store addressing.
;   Reserves 16 * mmsize bytes of stack through the cglobal stack operand
;   (x86inc convention - see x86inc for the negative stack-size form).
2071 cglobal idct16, 3, 7, 16, 0-16*mmsize
; Second-stage shift is selected by BIT_DEPTH; m15 receives the matching
; pd_* constant broadcast to every dword.
; NOTE(review): pd_512 / pd_2048 look like the rounding terms
; 1 << (IDCT_SHIFT2 - 1) - confirm against the constant definitions.
2073 %define IDCT_SHIFT2 10
2074 vpbroadcastd m15, [pd_512]
2075 %elif BIT_DEPTH == 8
; 8-bit build: shift by 12 and use pd_2048 in m15.
2076 %define IDCT_SHIFT2 12
2077 vpbroadcastd m15, [pd_2048]
2079 %error Unsupported BIT_DEPTH!
; First-stage shift is 7 for every bit depth.
2081 %define IDCT_SHIFT1 7
2083 vbroadcasti128 m14, [pd_64] ; pd_64 in both lanes; presumably the stage-1 rounding term (64 = 1 << 6) - confirm
; Load 32 bytes from each of the 16 rows and narrow to 16-bit with signed
; saturation, pairing row k with row k + 8 in each register.
2090 movu m0, [r0 + 0 * 64]
2091 movu m1, [r0 + 8 * 64]
2092 packssdw m0, m1 ;[0L 8L 0H 8H]
2094 movu m1, [r0 + 1 * 64]
2095 movu m2, [r0 + 9 * 64]
2096 packssdw m1, m2 ;[1L 9L 1H 9H]
2098 movu m2, [r0 + 2 * 64]
2099 movu m3, [r0 + 10 * 64]
2100 packssdw m2, m3 ;[2L 10L 2H 10H]
2102 movu m3, [r0 + 3 * 64]
2103 movu m4, [r0 + 11 * 64]
2104 packssdw m3, m4 ;[3L 11L 3H 11H]
2106 movu m4, [r0 + 4 * 64]
2107 movu m5, [r0 + 12 * 64]
2108 packssdw m4, m5 ;[4L 12L 4H 12H]
2110 movu m5, [r0 + 5 * 64]
2111 movu m6, [r0 + 13 * 64]
2112 packssdw m5, m6 ;[5L 13L 5H 13H]
2114 movu m6, [r0 + 6 * 64]
2115 movu m7, [r0 + 14 * 64]
2116 packssdw m6, m7 ;[6L 14L 6H 14H]
2118 movu m7, [r0 + 7 * 64]
2119 movu m8, [r0 + 15 * 64]
2120 packssdw m7, m8 ;[7L 15L 7H 15H]
; Transpose step 1: interleave 16-bit words of row pairs (k, k + 2).
2122 punpckhwd m8, m0, m2 ;[8 10]
2123 punpcklwd m0, m2 ;[0 2]
2125 punpckhwd m2, m1, m3 ;[9 11]
2126 punpcklwd m1, m3 ;[1 3]
2128 punpckhwd m3, m4, m6 ;[12 14]
2129 punpcklwd m4, m6 ;[4 6]
2131 punpckhwd m6, m5, m7 ;[13 15]
2132 punpcklwd m5, m7 ;[5 7]
; Transpose step 2: interleave dwords. Labels in the comments below are
; <row><column>, e.g. 102 = row 10, column 2.
2134 punpckhdq m7, m0, m4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
2135 punpckldq m0, m4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
2137 punpckhdq m4, m8, m3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
2138 punpckldq m8, m3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
2140 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
2141 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
2143 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
2144 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
; Transpose step 3: interleave qwords so each 128-bit lane holds one column
; of the eight even rows or the eight odd rows.
2146 punpckhqdq m6, m0, m8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
2147 punpcklqdq m0, m8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
2149 punpckhqdq m8, m7, m4 ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
2150 punpcklqdq m7, m4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
2152 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
2153 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
2155 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
2156 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
; Second stage: keep seven 16-byte rows of tab_idct16_2 resident in m7-m13;
; the remaining rows are streamed through m14.
2170 lea r5, [tab_idct16_2]
2171 lea r6, [tab_idct16_1]
2173 vbroadcasti128 m7, [r5]
2174 vbroadcasti128 m8, [r5 + 16]
2175 vbroadcasti128 m9, [r5 + 32]
2176 vbroadcasti128 m10, [r5 + 48]
2177 vbroadcasti128 m11, [r5 + 64]
2178 vbroadcasti128 m12, [r5 + 80]
2179 vbroadcasti128 m13, [r5 + 96]
2199 vbroadcasti128 m14, [r5 + 112] ; overwrites the pd_64 broadcast loaded at function entry
2209 vbroadcasti128 m14, [r6] ; m14 then cycles through the eight 16-byte rows of tab_idct16_1
2211 vbroadcasti128 m14, [r6 + 16]
2215 vbroadcasti128 m14, [r6 + 32]
2217 vbroadcasti128 m14, [r6 + 48]
2223 vbroadcasti128 m14, [r6 + 64]
2225 vbroadcasti128 m14, [r6 + 80]
2229 vbroadcasti128 m14, [r6 + 96]
2231 vbroadcasti128 m14, [r6 + 112]
2239 psrad m5, IDCT_SHIFT2 ; second-stage scaling (arithmetic shift)
2243 psrad m1, IDCT_SHIFT2
2247 psrad m6, IDCT_SHIFT2
2251 psrad m2, IDCT_SHIFT2
2255 pshufb m2, m1, [dct16_shuf1] ; m2 = bytes of m1 reordered by the dct16_shuf1 mask
2259 vextracti128 [r1 + r2], m5, 1 ; upper lane of m5 -> first 16 bytes at r1 + r2
2260 vextracti128 [r1 + r2 + 16], m2, 1 ; upper lane of m2 -> following 16 bytes
2262 lea r1, [r1 + 2 * r2] ; advance the destination by two row steps
; IDCT32_PASS1 <n>
;   %1 = index n used in three ways:
;     - selects the 32-byte rows n and (15 - n) of tab_idct32_1,
;     - selects the 16-byte row n of tab_idct32_2 and of tab_idct32_3,
;     - selects the 64-byte output slots written through r3 and r4.
;   Results are scaled with an arithmetic right shift by IDCT_SHIFT1 and
;   scattered one dword at a time. idct32 sets r4 = r3 + 15 * 64, so
;   [r4 - n * 64] addresses slot (15 - n) relative to r3.
2268 %macro IDCT32_PASS1 1
2269 vbroadcasti128 m3, [tab_idct32_1 + %1 * 32] ; first 16 bytes of row n, copied to both lanes
2270 vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16] ; second 16 bytes of row n
2272 pmaddwd m10, m8, m13 ; multiply word pairs and add neighbours -> dwords
2276 pmaddwd m11, m1, m13
2281 vbroadcasti128 m3, [tab_idct32_1 + (15 - %1) * 32] ; mirrored row (15 - n), same two halves
2282 vbroadcasti128 m13, [tab_idct32_1 + (15- %1) * 32 + 16]
2284 pmaddwd m11, m8, m13
2288 pmaddwd m12, m1, m13
2292 phaddd m9, m10 ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]
2294 vbroadcasti128 m3, [tab_idct32_2 + %1 * 16] ; 16-byte row n of tab_idct32_2
2300 vbroadcasti128 m3, [tab_idct32_3 + %1 * 16] ; 16-byte row n of tab_idct32_3
; Sum and difference of the two partial results give outputs 0 and 15.
2306 paddd m12, m10, m11 ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
2307 psubd m10, m11 ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]
2309 punpcklqdq m12, m10 ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
2312 psrad m10, IDCT_SHIFT1 ; first-stage scaling (arithmetic shift)
2316 psrad m12, IDCT_SHIFT1
2319 vextracti128 xm12, m10, 1 ; xm12 = upper lane of m10; xm10 remains the lower lane
; Scatter: dword 0 and dword 3 go to slots addressed from r3 (ascending with
; n), dword 1 and dword 2 go to slots addressed from r4 (descending with n).
; Values from xm12 are written 32 bytes after the matching xm10 value.
2320 movd [r3 + %1 * 64], xm10
2321 movd [r3 + 32 + %1 * 64], xm12
2322 pextrd [r4 - %1 * 64], xm10, 1
2323 pextrd [r4+ 32 - %1 * 64], xm12, 1
2324 pextrd [r3 + 16 * 64 + %1 *64], xm10, 3
2325 pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
2326 pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2
2327 pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
2330 ;-------------------------------------------------------
2331 ; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
2332 ;-------------------------------------------------------
; AVX2 inverse transform entry point for 32x32 blocks.
;   r0 = src, read in 128-byte row steps (r0 + k * 128), 16 bytes per load.
;   r3/r4 = pointers into the intermediate buffer (r4 = r3 + 15 * 64).
;   Reserves 32 * 64 bytes of stack through the cglobal stack operand
;   (x86inc convention - see x86inc for the negative stack-size form).
2334 ; TODO: Reduce PHADDD instruction by PADDD
2337 cglobal idct32, 3, 6, 16, 0-32*64
; First-stage shift is 7 for every bit depth.
2339 %define IDCT_SHIFT1 7
2341 vbroadcasti128 m15, [pd_64] ; pd_64 in both lanes; presumably the stage-1 rounding term (64 = 1 << 6) - confirm
2344 lea r4, [r3 + 15 * 64] ; r4 = slot 15 of the buffer at r3, used for the mirrored stores
; Gather 16 bytes from each of the 32 rows. Every ymm gets one row in its
; lower lane (movu xm) and another in its upper lane (vinserti128 ..., 1);
; packssdw then narrows two such registers to 16-bit with signed saturation.
; Even rows: lower lane holds rows k/k+16 with k = 2, 6, 10, 14, upper lane
; holds k - 2. Odd rows: lower lane holds rows k/k+8, upper lane k+16/k+24.
2348 movu xm0, [r0 + 2 * 128]
2349 movu xm1, [r0 + 18 * 128]
2350 vinserti128 m0, m0, [r0 + 0 * 128], 1
2351 vinserti128 m1, m1, [r0 + 16 * 128], 1
2353 packssdw m0, m1 ;[2 18 0 16]
2355 movu xm1, [r0 + 1 * 128]
2356 movu xm2, [r0 + 9 * 128]
2357 vinserti128 m1, m1, [r0 + 17 * 128], 1
2358 vinserti128 m2, m2, [r0 + 25 * 128], 1
2359 packssdw m1, m2 ;[1 9 17 25]
2361 movu xm2, [r0 + 6 * 128]
2362 movu xm3, [r0 + 22 * 128]
2363 vinserti128 m2, m2, [r0 + 4 * 128], 1
2364 vinserti128 m3, m3, [r0 + 20 * 128], 1
2365 packssdw m2, m3 ;[6 22 4 20]
2367 movu xm3, [r0 + 3 * 128]
2368 movu xm4, [r0 + 11 * 128]
2369 vinserti128 m3, m3, [r0 + 19 * 128], 1
2370 vinserti128 m4, m4, [r0 + 27 * 128], 1
2371 packssdw m3, m4 ;[3 11 19 27]
2373 movu xm4, [r0 + 10 * 128]
2374 movu xm5, [r0 + 26 * 128]
2375 vinserti128 m4, m4, [r0 + 8 * 128], 1
2376 vinserti128 m5, m5, [r0 + 24 * 128], 1
2377 packssdw m4, m5 ;[10 26 8 24]
2379 movu xm5, [r0 + 5 * 128]
2380 movu xm6, [r0 + 13 * 128]
2381 vinserti128 m5, m5, [r0 + 21 * 128], 1
2382 vinserti128 m6, m6, [r0 + 29 * 128], 1
2383 packssdw m5, m6 ;[5 13 21 29]
2385 movu xm6, [r0 + 14 * 128]
2386 movu xm7, [r0 + 30 * 128]
2387 vinserti128 m6, m6, [r0 + 12 * 128], 1
2388 vinserti128 m7, m7, [r0 + 28 * 128], 1
2389 packssdw m6, m7 ;[14 30 12 28]
2391 movu xm7, [r0 + 7 * 128]
2392 movu xm8, [r0 + 15 * 128]
2393 vinserti128 m7, m7, [r0 + 23 * 128], 1
2394 vinserti128 m8, m8, [r0 + 31 * 128], 1
2395 packssdw m7, m8 ;[7 15 23 31]
; Transpose step 1: interleave 16-bit words of neighbouring row groups.
2397 punpckhwd m8, m0, m2 ;[18 22 16 20]
2398 punpcklwd m0, m2 ;[2 6 0 4]
2400 punpckhwd m2, m1, m3 ;[9 11 25 27]
2401 punpcklwd m1, m3 ;[1 3 17 19]
2403 punpckhwd m3, m4, m6 ;[26 30 24 28]
2404 punpcklwd m4, m6 ;[10 14 8 12]
2406 punpckhwd m6, m5, m7 ;[13 15 29 31]
2407 punpcklwd m5, m7 ;[5 7 21 23]
; Transpose step 2: interleave dwords. Labels in the comments below are
; <row><column>, e.g. 102 = row 10, column 2.
2409 punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
2410 punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
2412 punpckhdq m4, m8, m3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
2413 punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
2415 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
2416 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
2418 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
2419 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
; Transpose step 3: interleave qwords so each lane holds eight rows of one
; column.
2421 punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
2422 punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
2424 punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
2425 punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
2427 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
2428 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
2430 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
2431 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
; Transpose step 4: regroup 128-bit lanes. Immediate 0x20 joins the two
; lower lanes of the sources, 0x31 joins the two upper lanes.
2433 vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
2434 vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]
2436 vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
2437 vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]
2439 vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
2440 vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]
2442 vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
2443 vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]
; Second stage: the shift is selected by BIT_DEPTH and m15 is reloaded with
; the matching pd_* constant, replacing the pd_64 broadcast from above.
; NOTE(review): pd_512 / pd_2048 look like the rounding terms
; 1 << (IDCT_SHIFT2 - 1) - confirm against the constant definitions.
2461 %define IDCT_SHIFT2 10
2462 vpbroadcastd m15, [pd_512]
2463 %elif BIT_DEPTH == 8
; 8-bit build: shift by 12 and use pd_2048 in m15.
2464 %define IDCT_SHIFT2 12
2465 vpbroadcastd m15, [pd_2048]
2467 %error Unsupported BIT_DEPTH!
; Keep the first eight 32-byte rows of tab_idct32_4 resident in m7-m14.
2474 mova m7, [tab_idct32_4]
2475 mova m8, [tab_idct32_4 + 32]
2476 mova m9, [tab_idct32_4 + 64]
2477 mova m10, [tab_idct32_4 + 96]
2478 mova m11, [tab_idct32_4 + 128]
2479 mova m12, [tab_idct32_4 + 160]
2480 mova m13, [tab_idct32_4 + 192]
2481 mova m14, [tab_idct32_4 + 224]
2506 vperm2i128 m4, m2, m3, 0x31 ; upper lanes of m2 and m3
2507 vperm2i128 m2, m2, m3, 0x20 ; lower lanes of m2 and m3
; m0 against rows 8-15 of tab_idct32_4 (byte offsets 256-480), read straight
; from memory.
2510 pmaddwd m3, m0, [tab_idct32_4 + 256]
2511 pmaddwd m4, m0, [tab_idct32_4 + 288]
2514 pmaddwd m4, m0, [tab_idct32_4 + 320]
2515 pmaddwd m5, m0, [tab_idct32_4 + 352]
2520 pmaddwd m4, m0, [tab_idct32_4 + 384]
2521 pmaddwd m5, m0, [tab_idct32_4 + 416]
2524 pmaddwd m5, m0, [tab_idct32_4 + 448]
2525 pmaddwd m0, [tab_idct32_4 + 480] ; in-place form: m0 is overwritten by this last product
2530 vperm2i128 m0, m3, m4, 0x31 ; upper lanes of m3 and m4
2531 vperm2i128 m3, m3, m4, 0x20 ; lower lanes of m3 and m4
; m1 against rows 0-7 of tab_idct32_1 (byte offsets 0-224).
2534 pmaddwd m4, m1, [tab_idct32_1]
2535 pmaddwd m0, m1, [tab_idct32_1 + 32]
2538 pmaddwd m5, m1, [tab_idct32_1 + 64]
2539 pmaddwd m0, m1, [tab_idct32_1 + 96]
2544 pmaddwd m5, m1, [tab_idct32_1 + 128]
2545 pmaddwd m0, m1, [tab_idct32_1 + 160]
2548 pmaddwd m6, m1, [tab_idct32_1 + 192]
2549 pmaddwd m0, m1, [tab_idct32_1 + 224]
2554 vperm2i128 m0, m4, m5, 0x31 ; upper lanes of m4 and m5
2555 vperm2i128 m4, m4, m5, 0x20 ; lower lanes of m4 and m5
; m1 against rows 8-15 of tab_idct32_1 (byte offsets 256-480).
2558 pmaddwd m5, m1, [tab_idct32_1 + 256]
2559 pmaddwd m0, m1, [tab_idct32_1 + 288]
2562 pmaddwd m6, m1, [tab_idct32_1 + 320]
2563 pmaddwd m0, m1, [tab_idct32_1 + 352]
2568 pmaddwd m6, m1, [tab_idct32_1 + 384]
2569 pmaddwd m0, m1, [tab_idct32_1 + 416]
2572 pmaddwd m0, m1, [tab_idct32_1 + 448]
2573 pmaddwd m1, [tab_idct32_1 + 480] ; in-place form: m1 is overwritten by this last product
2578 vperm2i128 m0, m5, m6, 0x31 ; upper lanes of m5 and m6
2579 vperm2i128 m5, m5, m6, 0x20 ; lower lanes of m5 and m6
2584 psrad m6, IDCT_SHIFT2 ; second-stage scaling (arithmetic shift)
2588 psrad m2, IDCT_SHIFT2
2592 psrad m4, IDCT_SHIFT2
2596 psrad m3, IDCT_SHIFT2
2603 pshufb m2, [dct16_shuf1] ; byte-level reorder of m2 using the dct16_shuf1 mask
2614 ;-------------------------------------------------------
2615 ; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
2616 ;-------------------------------------------------------
; AVX2 inverse transform entry point for 4x4 blocks.
;   r0 = src: sixteen 32-bit coefficients, read as two 32-byte loads.
;   r1 = dst, r2 = row step; r3 is used as the address offset of the last row.
;   Element labels in the comments are <row><column>.
2618 cglobal idct4, 3, 4, 6
; First-stage shift is 7 for every bit depth.
2620 %define IDCT_SHIFT1 7
; Second-stage shift is selected by BIT_DEPTH; m5 receives the matching
; pd_* constant broadcast to every dword.
; NOTE(review): pd_512 / pd_2048 look like the rounding terms
; 1 << (IDCT_SHIFT2 - 1) - confirm against the constant definitions.
2622 %define IDCT_SHIFT2 10
2623 vpbroadcastd m5, [pd_512]
2624 %elif BIT_DEPTH == 8
; 8-bit build: shift by 12 and use pd_2048 in m5.
2625 %define IDCT_SHIFT2 12
2626 vpbroadcastd m5, [pd_2048]
2628 %error Unsupported BIT_DEPTH!
2630 vbroadcasti128 m4, [pd_64] ; pd_64 in both lanes; presumably the stage-1 rounding term (64 = 1 << 6) - confirm
; Load the whole block, narrow it to 16-bit with signed saturation and
; arrange it as word pairs taken from rows (0, 2) and rows (1, 3).
2635 movu m0, [r0] ;[00 01 02 03 10 11 12 13]
2636 movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33]
2638 packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
2639 pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
2640 vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
2641 vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
2643 mova m1, [avx2_idct4_1] ; two 32-byte coefficient rows for the first stage
2644 mova m3, [avx2_idct4_1 + 32]
2650 psrad m0, IDCT_SHIFT1 ;[00 20 10 30 01 21 11 31]
2654 psrad m1, IDCT_SHIFT1 ;[03 23 13 33 02 22 12 32]
; Re-pack the first-stage output to 16-bit and duplicate the odd / even
; dwords so every word pair is available twice for the second stage.
2656 packssdw m0, m1 ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
2657 vmovshdup m1, m0 ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
2658 vmovsldup m0, m0 ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]
2660 vpbroadcastq m2, [avx2_idct4_2] ; 8-byte coefficient groups replicated to every qword
2661 vpbroadcastq m3, [avx2_idct4_2 + 8]
2667 psrad m2, IDCT_SHIFT2 ;[00 01 10 11 30 31 20 21]
2671 psrad m0, IDCT_SHIFT2 ;[03 02 13 12 33 32 23 22]
; Restore column order, narrow to 16-bit and store one 8-byte row at a time.
2673 pshufb m0, [idct4_shuf2] ;[02 03 12 13 32 33 22 23]
2674 punpcklqdq m1, m2, m0 ;[00 01 02 03 10 11 12 13]
2675 punpckhqdq m2, m0 ;[30 31 32 33 20 21 22 23]
2676 packssdw m1, m2 ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
2677 vextracti128 xm0, m1, 1 ; xm0 = upper lane = rows 1 and 2; xm1 keeps rows 0 and 3
2681 movhps [r1 + 2 * r2], xm0 ; high qword of xm0 (row 2) -> r1 + 2 * r2
2682 movhps [r1 + r3], xm1 ; high qword of xm1 (row 3) -> r1 + r3