1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5 ;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
6 ;* Li Cao <li@multicorewareinc.com>
7 ;* Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;* This program is also available under a commercial proprietary license.
24 ;* For more information, contact us at license @ x265.com.
25 ;*****************************************************************************/
27 ;TO-DO : Further optimize the routines.
30 %include "x86util.asm"
; tab_dct8: 8 rows x 8 signed 16-bit coefficients (16 bytes per row).
; Row k is the k-th basis row of the 8-point integer transform: row 0 is the
; constant 64 row, even rows are symmetric and odd rows antisymmetric about
; the row centre.
32 tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
33 dw 89, 75, 50, 18, -18, -50, -75, -89
34 dw 83, 36, -36, -83, -83, -36, 36, 83
35 dw 75, -18, -89, -50, 50, 89, 18, -75
36 dw 64, -64, -64, 64, 64, -64, -64, 64
37 dw 50, -89, 18, 75, -75, -18, 89, -50
38 dw 36, -83, 83, -36, -36, 83, -83, 36
39 dw 18, -50, 75, -89, 89, -75, 50, -18
; dct8_shuf: byte-index mask, emitted twice (2 x 16 bytes) so both 128-bit
; lanes carry the same pattern. It reverses the four 16-bit words inside each
; 64-bit half (words 3,2,1,0 then 7,6,5,4). The dct8 routine that builds its
; rows with vinserti128 loads it into m6.
41 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
; tab_dct16_1: 16 rows x 8 signed 16-bit coefficients (16 bytes per row).
; Holds columns 0-7 of each row of the 16-point transform matrix; columns 8-15
; are in tab_dct16_2. dct16 addresses it through a pointer biased by 8 * 16
; bytes, so rows are reached with offsets -8 * 16 .. 7 * 16.
43 tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
44 dw 90, 87, 80, 70, 57, 43, 25, 9
45 dw 89, 75, 50, 18, -18, -50, -75, -89
46 dw 87, 57, 9, -43, -80, -90, -70, -25
47 dw 83, 36, -36, -83, -83, -36, 36, 83
48 dw 80, 9, -70, -87, -25, 57, 90, 43
49 dw 75, -18, -89, -50, 50, 89, 18, -75
50 dw 70, -43, -87, 9, 90, 25, -80, -57
51 dw 64, -64, -64, 64, 64, -64, -64, 64
52 dw 57, -80, -25, 90, -9, -87, 43, 70
53 dw 50, -89, 18, 75, -75, -18, 89, -50
54 dw 43, -90, 57, 25, -87, 70, 9, -80
55 dw 36, -83, 83, -36, -36, 83, -83, 36
56 dw 25, -70, 90, -80, 43, 9, -57, 87
57 dw 18, -50, 75, -89, 89, -75, 50, -18
58 dw 9, -25, 43, -57, 70, -80, 87, -90
; tab_dct16_2: 16 rows x 8 signed 16-bit coefficients (16 bytes per row).
; Holds columns 8-15 of the 16-point transform matrix. Row k equals row k of
; tab_dct16_1 read in reverse order, with the sign flipped for odd k.
; dct16 addresses it through a pointer biased by 8 * 16 bytes.
61 tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
62 dw -9, -25, -43, -57, -70, -80, -87, -90
63 dw -89, -75, -50, -18, 18, 50, 75, 89
64 dw 25, 70, 90, 80, 43, -9, -57, -87
65 dw 83, 36, -36, -83, -83, -36, 36, 83
66 dw -43, -90, -57, 25, 87, 70, -9, -80
67 dw -75, 18, 89, 50, -50, -89, -18, 75
68 dw 57, 80, -25, -90, -9, 87, 43, -70
69 dw 64, -64, -64, 64, 64, -64, -64, 64
70 dw -70, -43, 87, 9, -90, 25, 80, -57
71 dw -50, 89, -18, -75, 75, 18, -89, 50
72 dw 80, -9, -70, 87, -25, -57, 90, -43
73 dw 36, -83, 83, -36, -36, 83, -83, 36
74 dw -87, 57, -9, -43, 80, -90, 70, -25
75 dw -18, 50, -75, 89, -89, 75, -50, 18
76 dw 90, -87, 80, -70, 57, -43, 25, -9
; Byte-index masks used by dct16 (loaded into m13 / m14); each is emitted
; twice so both 128-bit lanes carry the same pattern.
; dct16_shuf1: reverses all eight 16-bit words of a lane (7,6,5,4,3,2,1,0).
78 dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
; dct16_shuf2: pairs words from opposite ends of a lane (0,7,1,6,2,5,3,4).
80 dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
; tab_dct32_1: 32 rows x 16 signed 16-bit coefficients (32 bytes per row).
; Holds columns 0-15 of each row of the 32-point transform matrix; columns
; 16-31 are in tab_dct32_2.
82 tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
83 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
84 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
85 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
86 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
87 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
88 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
89 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
90 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
91 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
92 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
93 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
94 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
95 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
96 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
97 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
98 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
99 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
100 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
101 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
102 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
103 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
104 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
105 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
106 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
107 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
108 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
109 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
110 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
111 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
112 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
113 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
; tab_dct32_2: 32 rows x 16 signed 16-bit coefficients (32 bytes per row).
; Holds columns 16-31 of the 32-point transform matrix. Row k equals row k of
; tab_dct32_1 read in reverse order, with the sign flipped for odd k.
115 tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
116 dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
117 dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
118 dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
119 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
120 dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
121 dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
122 dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
123 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
124 dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
125 dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
126 dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
127 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
128 dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
129 dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
130 dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
131 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
132 dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
133 dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
134 dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
135 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
136 dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
137 dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
138 dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
139 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
140 dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
141 dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
142 dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
143 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
144 dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
145 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
146 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
; avx2_idct8_1 / avx2_idct8_2: four rows each; every row is one group of four
; 16-bit coefficients repeated four times (4 x 8 bytes = 32 bytes per row).
; avx2_idct8_1 combines the 64 and 83/36 factors (even-part coefficients).
148 avx2_idct8_1: times 4 dw 64, 83, 64, 36
149 times 4 dw 64, 36, -64, -83
150 times 4 dw 64, -36, -64, 83
151 times 4 dw 64, -83, 64, -36
; avx2_idct8_2 rows repeat the first four entries of rows 1, 3, 5 and 7 of
; tab_dct8 (odd-part coefficients).
153 avx2_idct8_2: times 4 dw 89, 75, 50, 18
154 times 4 dw 75, -18, -89, -50
155 times 4 dw 50, -89, 18, 75
156 times 4 dw 18, -50, 75, -89
; idct8_shuf1: eight 32-bit indices, even positions first then odd ones
; (0,2,4,6,1,3,5,7). NOTE(review): presumably a cross-lane dword permute
; control (e.g. vpermd) - confirm at the use site.
158 idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7
; idct8_shuf2: byte mask, same pattern in both 128-bit lanes; swaps the two
; middle dwords of a lane (dword order 0,2,1,3).
160 idct8_shuf2: times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
; idct8_shuf3: byte mask, same pattern in both 128-bit lanes; reverses the
; four dwords of a lane (dword order 3,2,1,0).
162 idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
; tab_idct16_1: 8 rows x 8 signed 16-bit coefficients (16 bytes per row).
; The rows coincide with the odd-numbered rows (1, 3, ..., 15) of tab_dct16_1,
; i.e. the odd-part coefficients of the 16-point transform.
164 tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9
165 dw 87, 57, 9, -43, -80, -90, -70, -25
166 dw 80, 9, -70, -87, -25, 57, 90, 43
167 dw 70, -43, -87, 9, 90, 25, -80, -57
168 dw 57, -80, -25, 90, -9, -87, 43, 70
169 dw 43, -90, 57, 25, -87, 70, 9, -80
170 dw 25, -70, 90, -80, 43, 9, -57, 87
171 dw 9, -25, 43, -57, 70, -80, 87, -90
; tab_idct16_2: 8 rows x 8 signed 16-bit coefficients (16 bytes per row).
; This is the transpose of tab_dct8: row j here lists column j of tab_dct8.
173 tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18
174 dw 64, 75, 36, -18, -64, -89, -83, -50
175 dw 64, 50, -36, -89, -64, 18, 83, 75
176 dw 64, 18, -83, -50, 64, 75, -36, -89
177 dw 64, -18, -83, 50, 64, -75, -36, 89
178 dw 64, -50, -36, 89, -64, -18, 83, -75
179 dw 64, -75, 36, 18, -64, 89, -83, 50
180 dw 64, -89, 83, -75, 64, -50, 36, -18
; idct16_shuff / idct16_shuff1: vectors of eight 32-bit indices.
; NOTE(review): presumably cross-lane dword permute controls (e.g. vpermd) -
; confirm at the use sites. idct16_shuff1 is idct16_shuff with the index
; pairs swapped within each group of four (0,4,2,6 -> 2,6,0,4).
182 idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7
184 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
; tab_idct32_1: 16 rows x 16 signed 16-bit coefficients (32 bytes per row).
; The rows coincide with the odd-numbered rows (1, 3, ..., 31) of tab_dct32_1,
; i.e. the odd-part coefficients of the 32-point transform.
186 tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
187 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
188 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
189 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
190 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
191 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
192 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
193 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
194 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
195 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
196 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
197 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
198 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
199 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
200 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
201 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
; tab_idct32_2: 8 rows x 8 signed 16-bit coefficients (16 bytes per row).
; Same values as tab_idct16_2 (the transpose of tab_dct8), kept as a separate
; label.
204 tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18
205 dw 64, 75, 36, -18, -64, -89, -83, -50
206 dw 64, 50, -36, -89, -64, 18, 83, 75
207 dw 64, 18, -83, -50, 64, 75, -36, -89
208 dw 64, -18, -83, 50, 64, -75, -36, 89
209 dw 64, -50, -36, 89, -64, -18, 83, -75
210 dw 64, -75, 36, 18, -64, 89, -83, 50
211 dw 64, -89, 83, -75, 64, -50, 36, -18
; tab_idct32_3: 8 rows x 8 signed 16-bit coefficients (16 bytes per row).
; Same values as tab_idct16_1 (odd rows of the 16-point matrix, columns 0-7),
; kept as a separate label.
214 tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9
215 dw 87, 57, 9, -43, -80, -90, -70, -25
216 dw 80, 9, -70, -87, -25, 57, 90, 43
217 dw 70, -43, -87, 9, 90, 25, -80, -57
218 dw 57, -80, -25, 90, -9, -87, 43, 70
219 dw 43, -90, 57, 25, -87, 70, 9, -80
220 dw 25, -70, 90, -80, 43, 9, -57, 87
221 dw 9, -25, 43, -57, 70, -80, 87, -90
; tab_idct32_4: 16 rows x 16 signed 16-bit coefficients (32 bytes per row).
; Transposed 16-point matrix: row j lists column j of the matrix whose
; columns 0-7 are stored in tab_dct16_1 and columns 8-15 in tab_dct16_2.
223 tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
224 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
225 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
226 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
227 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
228 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
229 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
230 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
231 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
232 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
233 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
234 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
235 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
236 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
237 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
238 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
; avx2_dct4: two 32-byte rows of 16-bit coefficient pairs for the 4-point
; transform. Row 0: (64, 64) x4 in the low 16 bytes, (64, -64) x4 in the high
; 16 bytes. Row 1: (83, 36) x4 low, (36, -83) x4 high.
240 avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
241 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
; avx2_idct4_1: same coefficient values as avx2_dct4, kept under its own label.
243 avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
244 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83
; avx2_idct4_2: the same four coefficient pairs, one copy each (16 bytes).
246 avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
; idct4_shuf1: byte mask, same pattern in both 128-bit lanes; swaps the two
; middle words of each 64-bit half (word order 0,2,1,3 then 4,6,5,7).
; It is declared through the `const` macro rather than as a plain label.
; NOTE(review): presumably so the symbol is visible to other files - confirm.
248 const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
; idct4_shuf2: byte mask, same pattern in both 128-bit lanes; swaps adjacent
; dwords (dword order 1,0,3,2).
250 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
; tab_dct4: coefficient table for the 4-point transforms; this first row is
; the pair (64, 64) repeated four times (16 bytes). idct8 also reads rows at
; tab_dct4 + 1..3 * mmsize.
252 tab_dct4: times 4 dw 64, 64
; dct4_shuf: 16-byte byte-index mask producing word order 0,1,4,5,3,2,7,6:
; the first two words of each 64-bit half are gathered in order, followed by
; the remaining two words of each half with their order swapped. Broadcast
; to both lanes of m4 by the dct4 routine that uses vbroadcasti128.
257 dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
; tab_dst4: four rows; each row is one group of four 16-bit coefficients
; (built from the values 29, 55, 74, 84) repeated twice = 16 bytes per row.
; NOTE(review): presumably the table dst4 addresses through r3 as
; coef0..coef3 - confirm against the routine's setup code.
259 tab_dst4: times 2 dw 29, 55, 74, 84
260 times 2 dw 74, 74, 0, -74
261 times 2 dw 84, -29, -74, 55
262 times 2 dw 55, -84, 74, -29
; tab_idst4: coefficient table for the inverse 4-point DST; this first row is
; the pair (29, 84) repeated four times (16 bytes).
; NOTE(review): idst4 reads eight 16-byte rows at r3 + 0..7 * 16, presumably
; from this table - confirm.
264 tab_idst4: times 4 dw 29, +84
; tab_dct8_1: four rows; each row is one group of four 16-bit odd-part
; coefficients (values 89, 75, 50, 18 with varying order and sign) repeated
; twice = 16 bytes per row.
273 tab_dct8_1: times 2 dw 89, 50, 75, 18
274 times 2 dw 75, -89, -18, -50
275 times 2 dw 50, 18, -89, 75
276 times 2 dw 18, 75, -50, -89
; tab_dct8_2: 32-bit (dd) coefficients, unlike the 16-bit tables above.
; The first row is the pair (83, 36) repeated twice.
; NOTE(review): presumably the operand table for the pmulld steps of the
; dct8 second pass - confirm the row offsets at the use site.
278 tab_dct8_2: times 2 dd 83, 36
; Odd-part coefficients as dwords; the values match the first four entries of
; rows 1, 3, 5 and 7 of tab_dct8.
280 times 1 dd 89, 75, 50, 18
281 times 1 dd 75, -18, -89, -50
282 times 1 dd 50, -89, 18, 75
283 times 1 dd 18, -50, 75, -89
; tab_idct8_3: coefficient table for the inverse 8-point transform; this
; first row is the pair (89, 75) repeated four times (16 bytes). The idct8
; routines read rows at tab_idct8_3 + 0..7 * mmsize.
285 tab_idct8_3: times 4 dw 89, 75
; pb_unpackhlw1: byte-index mask interleaving the words of the low and high
; 64-bit halves (word order 0,4,1,5,2,6,3,7). Used with pshufb in dct8.
294 pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
; pb_idct8even: byte-index mask selecting words 0,4,2,6 and repeating them in
; both 64-bit halves. Used with pshufb in PARTIAL_BUTTERFLY_PROCESS_ROW.
296 pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13
; tab_idct8_1: eight 16-bit even-part coefficients, multiplied (pmaddwd)
; against the words picked by pb_idct8even.
298 tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36
; tab_idct8_2: two 16-byte rows of odd-part coefficients; each row holds two
; groups of four (rows 1/3 and rows 5/7 of tab_dct8, first four entries).
300 tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
301 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
; pb_idct8odd: byte-index mask selecting the odd words 1,3,5,7 and repeating
; them in both 64-bit halves.
303 pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
320 ;------------------------------------------------------
321 ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
322 ;------------------------------------------------------
; Forward 4x4 transform.
; cglobal operands (x86inc convention): 3 arguments (r0 = src, r1 = dst,
; r2 = srcStride), 4 general-purpose registers, 8 vector registers.
; Three 16-byte coefficient rows are preloaded from r3 into m4-m6 and a
; fourth row is used directly from memory; source rows are fetched 8 bytes at
; a time with movh; two 16-byte result vectors are written to dst.
; NOTE(review): r3 presumably points at the 4-point coefficient table and r2
; is presumably a byte stride by the time rows are loaded - confirm.
324 cglobal dct4, 3, 4, 8
; Build fails for bit depths this routine has no rounding constants for.
332 %error Unsupported BIT_DEPTH!
337 mova m4, [r3 + 0 * 16]
338 mova m5, [r3 + 1 * 16]
339 mova m6, [r3 + 2 * 16]
340 movh m0, [r0 + 0 * r2]
341 movh m1, [r0 + 1 * r2]
; advance src by two rows
346 lea r0, [r0 + 2 * r2]
353 punpcklqdq m2, m0, m1
370 pmaddwd m2, [r3 + 3 * 16]
377 punpcklqdq m2, m0, m1
; first 16 bytes of output
394 movu [r1 + 0 * 16], m1
402 pmaddwd m2, [r3 + 3 * 16]
403 pmaddwd m0, [r3 + 3 * 16]
; second 16 bytes of output
408 movu [r1 + 1 * 16], m1
416 ; - r2: source stride
; Forward 4x4 transform, 256-bit variant (uses vbroadcasti128 / vinserti128 /
; vpermq). Named arguments: src (r0), dst (r1), srcStride (r2).
418 cglobal dct4, 3, 4, 8, src, dst, srcStride
; m7 = first-pass rounding constant, broadcast to both lanes.
; NOTE(review): the pd_4 / pd_1 loads presumably sit in alternative BIT_DEPTH
; branches - confirm.
421 vbroadcasti128 m7, [pd_4]
424 vbroadcasti128 m7, [pd_1]
426 %error Unsupported BIT_DEPTH!
431 vbroadcasti128 m4, [dct4_shuf]
; rows are gathered with 8-byte movhps loads into xm0 / xm1
435 movhps xm0, [r0 + r2]
436 lea r0, [r0 + 2 * r2]
438 movhps xm1, [r0 + r2]
; merge xm1 into the upper lane of m0
440 vinserti128 m0, m0, xm1, 1
; 11011101b selects qwords 1,3,1,3; 10001000b selects qwords 0,2,0,2
442 vpermq m1, m0, 11011101b
443 vpermq m0, m0, 10001000b
457 vpermq m1, m2, 11011101b
458 vpermq m2, m2, 10001000b
; m7 is reloaded with pd_128 (presumably the second-pass rounding constant)
459 vbroadcasti128 m7, [pd_128]
477 ;-------------------------------------------------------
478 ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
479 ;-------------------------------------------------------
; Inverse 4x4 transform in two passes.
; cglobal operands (x86inc convention): 3 arguments (r0 = src, r1 = dst,
; r2 = dstStride), 4 general-purpose registers, 7 vector registers.
; The first pass shifts by a fixed 7; the second pass adds IDCT4_OFFSET and
; shifts by IDCT4_SHIFT, both selected by BIT_DEPTH. In each visible pair the
; offset equals 1 << (IDCT4_SHIFT - 1), i.e. round-to-nearest.
481 cglobal idct4, 3, 4, 7
483 %define IDCT4_OFFSET [pd_2048]
484 %define IDCT4_SHIFT 12
485 %elif BIT_DEPTH == 10
486 %define IDCT4_OFFSET [pd_512]
487 %define IDCT4_SHIFT 10
489 %error Unsupported BIT_DEPTH!
; load the 4x4 block of 16-bit coefficients (32 bytes)
496 movu m0, [r0 + 0 * 16]
497 movu m1, [r0 + 1 * 16]
; ---- pass 1: even (E) and odd (O) partial sums from four rows at r3 ----
500 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
503 pmaddwd m2, [r3 + 2 * 16] ; m2 = E2
507 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
508 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
511 psrad m4, 7 ; m4 = m128iA
514 packssdw m4, m5 ; m4 = m128iA
520 packssdw m2, m3 ; m2 = m128iD
; transpose the intermediate 4x4 block with two rounds of word unpacks
522 punpcklwd m1, m4, m2 ; m1 = S0
523 punpckhwd m4, m2 ; m4 = S8
525 punpcklwd m0, m1, m4 ; m0 = m128iA
526 punpckhwd m1, m4 ; m1 = m128iD
; ---- pass 2: same products, with the rounding offset kept in m6 ----
528 mova m6, IDCT4_OFFSET
531 pmaddwd m3, m2, [r3 + 0 * 16]
532 paddd m3, m6 ; m3 = E1
534 pmaddwd m2, [r3 + 2 * 16]
535 paddd m2, m6 ; m2 = E2
538 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
539 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
542 psrad m4, IDCT4_SHIFT ; m4 = m128iA
544 psrad m5, IDCT4_SHIFT
545 packssdw m4, m5 ; m4 = m128iA
548 psrad m2, IDCT4_SHIFT
550 psrad m3, IDCT4_SHIFT
551 packssdw m2, m3 ; m2 = m128iD
; write the output rows, 8 bytes (four int16) each, r2 bytes apart
557 movlps [r1 + 0 * r2], m0
558 movhps [r1 + 1 * r2], m0
561 movlps [r1 + 2 * r2], m1
562 lea r1, [r1 + 2 * r2]
567 ;------------------------------------------------------
568 ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
569 ;------------------------------------------------------
; Forward 4x4 DST in two passes (marked DST1 / DST2 below).
; r0 = src, r1 = dst, r2 = srcStride; r3 addresses four 16-byte coefficient
; rows referred to as coef0..coef3.
; The x86-64 build requests two extra vector registers (8+2); the 32-bit
; build instead defines coef2 / coef3 as memory operands into the table.
; NOTE(review): presumably coef0..coef3 are register aliases on x86-64 and
; the mova loads of coef2 / coef3 are assembled only there - confirm.
572 cglobal dst4, 3, 4, 8+2
575 %else ; ARCH_X86_64 = 0
576 cglobal dst4, 3, 4, 8
577 %define coef2 [r3 + 2 * 16]
578 %define coef3 [r3 + 3 * 16]
586 %elif BIT_DEPTH == 10
592 mova coef0, [r3 + 0 * 16]
593 mova coef1, [r3 + 1 * 16]
595 mova coef2, [r3 + 2 * 16]
596 mova coef3, [r3 + 3 * 16]
598 movh m0, [r0 + 0 * r2] ; load
599 movh m1, [r0 + 1 * r2]
; advance src by two rows
601 lea r0, [r0 + 2 * r2]
605 pmaddwd m2, m0, coef0 ; DST1
606 pmaddwd m3, m1, coef0
610 pmaddwd m3, m0, coef1
611 pmaddwd m4, m1, coef1
615 packssdw m2, m3 ; m2 = T70
616 pmaddwd m3, m0, coef2
617 pmaddwd m4, m1, coef2
626 packssdw m3, m0 ; m3 = T71
629 pmaddwd m0, m2, coef0 ; DST2
630 pmaddwd m1, m3, coef0
635 pmaddwd m4, m2, coef1
636 pmaddwd m1, m3, coef1
; first 16 bytes of output
641 movu [r1 + 0 * 16], m0
643 pmaddwd m0, m2, coef2
644 pmaddwd m1, m3, coef2
; second 16 bytes of output
655 movu [r1 + 1 * 16], m0
659 ;-------------------------------------------------------
660 ;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
661 ;-------------------------------------------------------
; Inverse 4x4 DST in two passes.
; r0 = src, r1 = dst, r2 = dstStride; r3 addresses eight 16-byte coefficient
; rows (offsets 0..7 * 16). Each output vector is the sum of one product with
; the interleaved A/C rows and one with the interleaved B/D rows.
; The first pass shifts by a fixed 7, the second by the BIT_DEPTH-dependent
; IDCT4_SHIFT.
663 cglobal idst4, 3, 4, 7
666 %define IDCT4_SHIFT 12
667 %elif BIT_DEPTH == 10
669 %define IDCT4_SHIFT 10
671 %error Unsupported BIT_DEPTH!
; load the 4x4 block of 16-bit coefficients (32 bytes)
677 movu m0, [r0 + 0 * 16]
678 movu m1, [r0 + 1 * 16]
680 punpcklwd m2, m0, m1 ; m2 = m128iAC
681 punpckhwd m0, m1 ; m0 = m128iBD
; ---- pass 1 ----
683 pmaddwd m1, m2, [r3 + 0 * 16]
684 pmaddwd m3, m0, [r3 + 1 * 16]
687 psrad m1, 7 ; m1 = S0
689 pmaddwd m3, m2, [r3 + 2 * 16]
690 pmaddwd m4, m0, [r3 + 3 * 16]
693 psrad m3, 7 ; m3 = S8
694 packssdw m1, m3 ; m1 = m128iA
696 pmaddwd m3, m2, [r3 + 4 * 16]
697 pmaddwd m4, m0, [r3 + 5 * 16]
700 psrad m3, 7 ; m3 = S0
702 pmaddwd m2, [r3 + 6 * 16]
703 pmaddwd m0, [r3 + 7 * 16]
706 psrad m2, 7 ; m2 = S8
707 packssdw m3, m2 ; m3 = m128iD
; ---- pass 2: same coefficient rows, final shift ----
716 pmaddwd m0, m1, [r3 + 0 * 16]
717 pmaddwd m3, m2, [r3 + 1 * 16]
720 psrad m0, IDCT4_SHIFT ; m0 = S0
721 pmaddwd m3, m1, [r3 + 2 * 16]
722 pmaddwd m4, m2, [r3 + 3 * 16]
725 psrad m3, IDCT4_SHIFT ; m3 = S8
726 packssdw m0, m3 ; m0 = m128iA
727 pmaddwd m3, m1, [r3 + 4 * 16]
728 pmaddwd m4, m2, [r3 + 5 * 16]
731 psrad m3, IDCT4_SHIFT ; m3 = S0
732 pmaddwd m1, [r3 + 6 * 16]
733 pmaddwd m2, [r3 + 7 * 16]
736 psrad m1, IDCT4_SHIFT ; m1 = S8
737 packssdw m3, m1 ; m3 = m128iD
; write the output rows, 8 bytes (four int16) each, r2 bytes apart
742 movlps [r1 + 0 * r2], m2
743 movhps [r1 + 1 * r2], m2
746 movlps [r1 + 2 * r2], m1
747 lea r1, [r1 + 2 * r2]
752 ;-------------------------------------------------------
753 ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
754 ;-------------------------------------------------------
; Forward 8x8 transform in two passes through a stack buffer.
; cglobal operands (x86inc convention): 3 arguments, 6 general-purpose
; registers, 7 vector registers, and 16 * mmsize bytes of stack scratch.
; Pass 1 writes 32-bit intermediate rows to r5 + k * 2 * mmsize; pass 2 reads
; them back from rsp and stores 4 bytes (movd) per coefficient row of dst.
756 cglobal dct8, 3,6,7,0-16*mmsize
757 ;------------------------
758 ; Stack Mapping(dword)
759 ;------------------------
760 ; Row0[0-3] Row1[0-3]
762 ; Row6[0-3] Row7[0-3]
763 ; Row0[0-3] Row7[0-3]
765 ; Row6[4-7] Row7[4-7]
766 ;------------------------
774 %error Unsupported BIT_DEPTH!
784 movu m2, [r0 + r2 * 2]
; ---- pass 1: butterflies (s = sums, d = differences of mirrored samples) ----
791 punpckldq m1, m4, m5 ; m1 = [1 0]
792 punpckhdq m4, m5 ; m4 = [3 2]
; 0x4E swaps the two 64-bit halves
795 pshufd m2, m3, 0x4E ; m2 = [4 5]
796 pshufd m0, m0, 0x4E ; m0 = [6 7]
799 psubw m1, m0 ; m1 = [d1 d0]
801 psubw m4, m2 ; m4 = [d3 d2]
802 punpcklqdq m2, m3, m0 ; m2 = [s2 s0]
804 pshufd m3, m3, 0x4E ; m3 = [s1 s3]
806 punpcklwd m0, m1, m4 ; m0 = [d2/d0]
807 punpckhwd m1, m4 ; m1 = [d3/d1]
808 punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0]
809 punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0]
; odd output rows 1,3,5,7: differences times coefficient rows r4 + 0..3 * 16
813 pmaddwd m1, m4, [r4 + 0*16]
814 pmaddwd m5, m0, [r4 + 0*16]
821 mova [r5 + 1*2*mmsize], m1 ; Row 1
823 pmaddwd m1, m4, [r4 + 1*16]
824 pmaddwd m5, m0, [r4 + 1*16]
831 mova [r5 + 3*2*mmsize], m1 ; Row 3
833 pmaddwd m1, m4, [r4 + 2*16]
834 pmaddwd m5, m0, [r4 + 2*16]
841 mova [r5 + 5*2*mmsize], m1 ; Row 5
843 pmaddwd m4, [r4 + 3*16]
844 pmaddwd m0, [r4 + 3*16]
851 mova [r5 + 7*2*mmsize], m4; Row 7
; even output rows 0,2,4,6 from the sums (EE / EO terms)
855 paddw m0, m2, m3 ; m0 = [EE1 EE0]
856 pshufb m0, [pb_unpackhlw1]
857 psubw m2, m3 ; m2 = [EO1 EO0]
858 psignw m2, [pw_ppppmmmm]
859 pshufb m2, [pb_unpackhlw1]
860 pmaddwd m3, m0, [r4 + 0*16]
866 mova [r5 + 0*2*mmsize], m3 ; Row 0
867 pmaddwd m0, [r4 + 2*16]
873 mova [r5 + 4*2*mmsize], m0 ; Row 4
874 pmaddwd m3, m2, [r4 + 1*16]
880 mova [r5 + 2*2*mmsize], m3 ; Row 2
881 pmaddwd m2, [r4 + 3*16]
887 mova [r5 + 6*2*mmsize], m2 ; Row 6
; advance src by four rows
890 lea r0, [r0 + r2 * 4]
; ---- pass 2: works on the 32-bit intermediates stored on the stack ----
897 mov r0, rsp ; r0 = pointer to Low Part
903 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0]
904 mova m1, [r0 + 1*2*mmsize]
905 paddd m2, m0, [r0 + (0*2+1)*mmsize]
906 pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0]
907 paddd m3, m1, [r0 + (1*2+1)*mmsize]
908 pshufd m3, m3, 0x9C ; m3 = ^^
909 psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0]
910 psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^
913 phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0]
914 phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0]
; a left shift by 6 is the multiplication by 64
916 pslld m4, 6 ; m4 = [64*EE1 64*EE0]
917 pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0]
918 pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0]
920 phaddd m3, m4, m5 ; m3 = [Row2 Row0]
923 phsubd m4, m2 ; m4 = [Row6 Row4]
; each movd stores 4 bytes into coefficient row k at dst + k * mmsize
928 movd [r1 + 0*mmsize], m3
930 movd [r1 + 2*mmsize], m3
933 movd [r1 + 4*mmsize], m4
935 movd [r1 + 6*mmsize], m4
; odd rows: 32-bit multiplies of the differences with rows r4 + 2..5 * 16
938 pmulld m2, m0, [r4 + 2*16]
939 pmulld m3, m1, [r4 + 2*16]
940 pmulld m4, m0, [r4 + 3*16]
941 pmulld m5, m1, [r4 + 3*16]
944 phaddd m2, m4 ; m2 = [Row3 Row1]
949 movd [r1 + 1*mmsize], m2
951 movd [r1 + 3*mmsize], m2
953 pmulld m2, m0, [r4 + 4*16]
954 pmulld m3, m1, [r4 + 4*16]
955 pmulld m4, m0, [r4 + 5*16]
956 pmulld m5, m1, [r4 + 5*16]
959 phaddd m2, m4 ; m2 = [Row7 Row5]
964 movd [r1 + 5*mmsize], m2
966 movd [r1 + 7*mmsize], m2
976 ;-------------------------------------------------------
977 ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
978 ;-------------------------------------------------------
; Inverse 8x8 transform, variant that needs 16 vector registers (m0-m15).
; IDCT_SHIFT / IDCT_ADD are the second-pass shift and rounding constant,
; selected by BIT_DEPTH; in each pair IDCT_ADD holds 1 << (IDCT_SHIFT - 1).
982 %define IDCT_SHIFT 10
983 %define IDCT_ADD pd_512
985 %define IDCT_SHIFT 12
986 %define IDCT_ADD pd_2048
988 %error Unsupported BIT_DEPTH!
; cglobal operands (x86inc convention): 3 arguments, 6 general-purpose
; registers, 16 vector registers, 5 * mmsize bytes of stack used as spill
; slots [rsp + 0..4 * mmsize].
991 cglobal idct8, 3, 6, 16, 0-5*mmsize
; ---- pass 1, odd part: source rows 1, 3, 5, 7 against tab_idct8_3 ----
992 mova m9, [r0 + 1 * mmsize]
993 mova m1, [r0 + 3 * mmsize]
997 mova m14, [tab_idct8_3]
1001 mova m0, [r0 + 5 * mmsize]
1002 mova m10, [r0 + 7 * mmsize]
1006 mova m15, [tab_idct8_3 + 1 * mmsize]
1007 mova m11, [tab_idct8_3 + 1 * mmsize]
1009 mova m4, [tab_idct8_3 + 2 * mmsize]
1011 mova m1, [tab_idct8_3 + 2 * mmsize]
1013 mova m5, [tab_idct8_3 + 4 * mmsize]
1014 mova m12, [tab_idct8_3 + 4 * mmsize]
; spill odd-part partial results
1016 mova [rsp + 0 * mmsize], m11
1017 mova [rsp + 1 * mmsize], m15
1020 mova m14, [tab_idct8_3 + 3 * mmsize]
1021 mova m3, [tab_idct8_3 + 3 * mmsize]
1026 mova [rsp + 2 * mmsize], m3
1028 pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
1029 mova m6, [tab_idct8_3 + 5 * mmsize]
1031 pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
1032 mova m4, [tab_idct8_3 + 5 * mmsize]
1035 pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
1037 mova [rsp + 3 * mmsize], m6
1039 pmaddwd m0, [tab_idct8_3 + 7 * mmsize]
; ---- pass 1, even part: source rows 0, 4, 2, 6 against tab_dct4 ----
1042 mova m6, [r0 + 0 * mmsize]
1043 mova m0, [r0 + 4 * mmsize]
1047 mova m12, [r0 + 2 * mmsize]
1048 mova m0, [r0 + 6 * mmsize]
1052 mova m10, [tab_dct4]
1056 pmaddwd m4, [tab_dct4 + 2 * mmsize]
1058 mova m2, [tab_dct4 + 1 * mmsize]
1060 pmaddwd m6, [tab_dct4 + 2 * mmsize]
1061 mova m0, [tab_dct4 + 1 * mmsize]
1066 pmaddwd m13, [tab_dct4 + 3 * mmsize]
1071 pmaddwd m12, [tab_dct4 + 3 * mmsize]
; combine the even results with the spilled odd results
1088 psubd m3, [rsp + 1 * mmsize]
1096 psubd m1, [rsp + 0 * mmsize]
1098 mova m11, [rsp + 2 * mmsize]
1104 mova m13, [rsp + 3 * mmsize]
1116 psubd m4, [rsp + 3 * mmsize]
1125 psubd m2, [rsp + 2 * mmsize]
; ---- pass 2, odd part: same tab_idct8_3 rows ----
1152 mova m4, [tab_idct8_3 + 0 * mmsize]
1172 mova m3, [tab_idct8_3 + 0 * mmsize]
1178 mova m8, [tab_idct8_3 + 1 * mmsize]
1183 mova [rsp + 4 * mmsize], m8
1184 mova m4, [tab_idct8_3 + 2 * mmsize]
1186 mova m15, [tab_idct8_3 + 2 * mmsize]
1187 mova m5, [tab_idct8_3 + 1 * mmsize]
1191 mova [rsp + 3 * mmsize], m5
1192 mova m14, [tab_idct8_3 + 3 * mmsize]
1193 mova m5, [tab_idct8_3 + 3 * mmsize]
1196 mova [rsp + 2 * mmsize], m14
1199 mova [rsp + 1 * mmsize], m5
1200 mova m15, [tab_idct8_3 + 4 * mmsize]
1201 mova m5, [tab_idct8_3 + 4 * mmsize]
1203 pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
1205 pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
1206 mova m4, [tab_idct8_3 + 5 * mmsize]
1210 mova m8, [tab_idct8_3 + 5 * mmsize]
1212 pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
; ---- pass 2, even part: tab_dct4 rows ----
1216 mova m10, [tab_dct4]
1218 pmaddwd m11, [tab_idct8_3 + 7 * mmsize]
1220 mova [rsp + 0 * mmsize], m8
1222 pmaddwd m6, [tab_dct4 + 2 * mmsize]
1225 mova m3, [tab_dct4 + 1 * mmsize]
1227 pmaddwd m4, [tab_dct4 + 2 * mmsize]
1229 mova m2, [tab_dct4 + 1 * mmsize]
1234 pmaddwd m13, [tab_dct4 + 3 * mmsize]
1239 pmaddwd m12, [tab_dct4 + 3 * mmsize]
; add the rounding constant to the even-part sums before the final shift
1240 paddd m0, [IDCT_ADD]
1241 paddd m1, [IDCT_ADD]
1242 paddd m8, [IDCT_ADD]
1243 paddd m10, [IDCT_ADD]
1246 paddd m2, [IDCT_ADD]
1247 paddd m3, [IDCT_ADD]
1250 paddd m4, [IDCT_ADD]
1251 paddd m6, [IDCT_ADD]
; combine with the spilled odd-part values and shift down by IDCT_SHIFT
1252 mova m15, [rsp + 4 * mmsize]
1255 psrad m8, IDCT_SHIFT
1256 mova m11, [rsp + 3 * mmsize]
1258 psrad m15, IDCT_SHIFT
1259 psubd m0, [rsp + 4 * mmsize]
1260 psrad m0, IDCT_SHIFT
1263 mova m14, [rsp + 2 * mmsize]
1264 psrad m11, IDCT_SHIFT
1266 psubd m1, [rsp + 3 * mmsize]
1267 psrad m1, IDCT_SHIFT
1268 mova m11, [rsp + 1 * mmsize]
1270 psrad m14, IDCT_SHIFT
1272 psrad m12, IDCT_SHIFT
1273 psubd m2, [rsp + 2 * mmsize]
1275 mova m13, [rsp + 0 * mmsize]
1276 psrad m11, IDCT_SHIFT
1281 psrad m13, IDCT_SHIFT
1284 psrad m11, IDCT_SHIFT
1288 psrad m10, IDCT_SHIFT
1290 psrad m6, IDCT_SHIFT
1291 psubd m4, [rsp + 0 * mmsize]
1293 psrad m11, IDCT_SHIFT
1297 psrad m4, IDCT_SHIFT
1299 psubd m3, [rsp + 1 * mmsize]
1300 psrad m2, IDCT_SHIFT
1302 psrad m3, IDCT_SHIFT
; ---- output: 8-byte halves of each row, at +0 and +8 bytes ----
; Row addresses are formed from r2 and multiples of it held in r3, r4, r0.
; NOTE(review): r3 / r4 are presumably 3 * r2 and 5 * r2 - confirm.
1318 lea r0, [r4 + r2 * 2]
1321 movhps [r1 + r2], m4
1323 movq [r1 + r2 * 2], m1
1325 movhps [r1 + r3], m1
1329 movhps [r1 + r2 + 8], m1
1330 movq [r1 + r2 * 2 + 8], m0
1331 movhps [r1 + r3 + 8], m0
1335 movq [r1 + r2 * 4], m0
1336 movhps [r1 + r4], m0
1339 movq [r1 + r3 * 2], m15
1341 movhps [r1 + r0], m15
1342 movq [r1 + r2 * 4 + 8], m0
1343 movhps [r1 + r4 + 8], m0
1344 movq [r1 + r3 * 2 + 8], m8
1345 movhps [r1 + r0 + 8], m8
1352 ;-------------------------------------------------------
1353 ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
1354 ;-------------------------------------------------------
; Internal helper: first pass of the inverse 8x8 transform for four columns.
; It is entered with `call` from idct8 and takes its inputs in registers:
;   r0 = source coefficients (rows are 16 bytes apart)
;   r6 = even-part coefficient rows (offsets 0, 16, 32, 48)
;   r4 = odd-part coefficient rows (offsets 0 .. 112)
;   r5 = intermediate buffer (rows are 16 bytes apart)
; Results are written in mirrored row pairs: 0/7, 1/6, 2/5, 3/4.
1357 cglobal patial_butterfly_inverse_internal_pass1
; even part: source rows 0, 2, 4, 6
1359 movhps m0, [r0 + 2 * 16]
1360 movh m1, [r0 + 4 * 16]
1361 movhps m1, [r0 + 6 * 16]
1363 punpckhwd m2, m0, m1 ; [2 6]
1364 punpcklwd m0, m1 ; [0 4]
1365 pmaddwd m1, m0, [r6] ; EE[0]
1366 pmaddwd m0, [r6 + 32] ; EE[1]
1367 pmaddwd m3, m2, [r6 + 16] ; EO[0]
1368 pmaddwd m2, [r6 + 48] ; EO[1]
1370 paddd m4, m1, m3 ; E[0]
1372 paddd m3, m0, m2 ; E[1]
; odd part: source rows 1, 3, 5, 7
1383 movhps m2, [r0 + 5 * 16]
1384 movh m5, [r0 + 3 * 16]
1385 movhps m5, [r0 + 7 * 16]
1386 punpcklwd m6, m2, m5 ;[1 3]
1387 punpckhwd m2, m5 ;[5 7]
1389 pmaddwd m5, m6, [r4]
1390 pmaddwd m7, m2, [r4 + 16]
; low half -> row 0, high half -> row 7
1400 movh [r5 + 0 * 16], m7
1401 movhps [r5 + 7 * 16], m7
1403 pmaddwd m5, m6, [r4 + 32]
1404 pmaddwd m4, m2, [r4 + 48]
; rows 1 and 6
1414 movh [r5 + 1 * 16], m4
1415 movhps [r5 + 6 * 16], m4
1417 pmaddwd m5, m6, [r4 + 64]
1418 pmaddwd m4, m2, [r4 + 80]
; rows 2 and 5
1428 movh [r5 + 2 * 16], m4
1429 movhps [r5 + 5 * 16], m4
1431 pmaddwd m5, m6, [r4 + 96]
1432 pmaddwd m4, m2, [r4 + 112]
; rows 3 and 4
1442 movh [r5 + 3 * 16], m4
1443 movhps [r5 + 4 * 16], m4
; PARTIAL_BUTTERFLY_PROCESS_ROW row
;   %1 = vector register holding one row of eight 16-bit intermediates;
;        it is overwritten.
; Second-pass processing of a single row: the even part comes from the words
; picked by pb_idct8even times tab_idct8_1, the odd part from two coefficient
; rows at r4 / r4 + 16; both results are shifted right by IDCT_SHIFT.
; Uses m4 and m5 as scratch.
1447 %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
; final shift depends on BIT_DEPTH (12 for 8-bit)
1449 %define IDCT_SHIFT 10
1450 %elif BIT_DEPTH == 8
1451 %define IDCT_SHIFT 12
1453 %error Unsupported BIT_DEPTH!
1455 pshufb m4, %1, [pb_idct8even]
1456 pmaddwd m4, [tab_idct8_1]
1460 punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3]
1464 pmaddwd m5, %1, [r4]
1465 pmaddwd %1, [r4 + 16]
1466 phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3]
1469 psrad %1, IDCT_SHIFT
1472 psrad m4, IDCT_SHIFT
; Internal helper: second pass of the inverse 8x8 transform for four rows.
; It is entered with `call` from idct8 and runs PARTIAL_BUTTERFLY_PROCESS_ROW
; on m0, m2, m1 and m3 in that order. r1 = destination, r2 = row stride
; (taken from the visible store), r4 = odd-part coefficient rows used by the
; macro.
1479 cglobal patial_butterfly_inverse_internal_pass2
1482 PARTIAL_BUTTERFLY_PROCESS_ROW m0
1486 PARTIAL_BUTTERFLY_PROCESS_ROW m2
1490 PARTIAL_BUTTERFLY_PROCESS_ROW m1
1491 movu [r1 + 2 * r2], m1
1494 PARTIAL_BUTTERFLY_PROCESS_ROW m3
; Inverse 8x8 transform built from the two internal pass helpers.
; cglobal operands (x86inc convention): 3 arguments, 7 general-purpose
; registers, 8 vector registers. The automatic stack allocation is commented
; out in the cglobal line; the routine carves out 16 * mmsize bytes plus one
; pointer-sized slot by hand instead.
1499 cglobal idct8, 3,7,8 ;,0-16*mmsize
1500 ; align stack to 64 bytes
1502 sub rsp, 16*mmsize + gprsize
; Save r5 in the slot above the scratch area; the epilogue reloads rsp from it.
; NOTE(review): r5 is expected to hold the entry value of rsp here - confirm.
1504 mov [rsp + 16*mmsize], r5
; pass 1: odd-part coefficients for the helper, which is called twice
1507 lea r4, [tab_idct8_3]
1510 call patial_butterfly_inverse_internal_pass1
1515 call patial_butterfly_inverse_internal_pass1
1519 %elif BIT_DEPTH == 8
1522 %error Unsupported BIT_DEPTH!
; pass 2: switch r4 to the second-pass odd coefficients
1526 lea r4, [tab_idct8_2]
1527 lea r6, [pb_idct8odd]
1530 call patial_butterfly_inverse_internal_pass2
; advance dst by four rows for the second half
1532 lea r1, [r1 + 4 * r2]
1535 call patial_butterfly_inverse_internal_pass2
1537 ; restore original stack pointer
1538 mov rsp, [rsp + 16*mmsize]
1542 ;-----------------------------------------------------------------------------
1543 ; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
1544 ;-----------------------------------------------------------------------------
; Two entry points with the same name and register budget (4 arguments,
; 4 general-purpose registers, 6 vector registers in the x86inc convention).
; NOTE(review): presumably assembled for different instruction sets - confirm.
1546 cglobal denoise_dct, 4, 4, 6
; This second variant works on 256-bit registers: vextracti128 pulls the
; upper 128-bit lane of m1 into xm4.
1577 cglobal denoise_dct, 4, 4, 6
1587 vextracti128 xm4, m1, 1
; The macros below are only assembled for x86-64 builds.
1605 %if ARCH_X86_64 == 1
; DCT8_PASS_1 takes 4 parameters; %1 is a byte offset from r6 to the 8-byte
; coefficient group that is broadcast to every qword of m0.
; NOTE(review): judging by the call sites, %2 looks like an output offset and
; %3 / %4 like register numbers - confirm.
1606 %macro DCT8_PASS_1 4
1607 vpbroadcastq m0, [r6 + %1]
; DCT8_PASS_2 takes 2 parameters: byte offsets from r6 of two 16-byte
; coefficient rows, handled one after the other. Each row is broadcast to both
; lanes of m4; results are shifted right by DCT_SHIFT2.
1618 %macro DCT8_PASS_2 2
1619 vbroadcasti128 m4, [r6 + %1]
1628 psrad m6, DCT_SHIFT2
1630 vbroadcasti128 m4, [r6 + %2]
1639 psrad m10, DCT_SHIFT2
; 0xD8 reorders the four qwords as 0,2,1,3
1642 vpermq m10, m6, 0xD8
; Forward 8x8 transform, 256-bit variant.
; cglobal operands (x86inc convention): 3 arguments (r0 = src, r1 = dst,
; r2 = srcStride), 7 general-purpose registers, 11 vector registers,
; 8 * 16 bytes of stack scratch.
; Source rows k and k + 4 share one 256-bit register: r4 points four rows
; below r0 and its rows are inserted into the upper lanes.
1647 cglobal dct8, 3, 7, 11, 0-8*16
; m5 = first-pass rounding constant, chosen by BIT_DEPTH (pd_2 for 8-bit)
1650 vbroadcasti128 m5, [pd_8]
1651 %elif BIT_DEPTH == 8
1653 vbroadcasti128 m5, [pd_2]
1655 %error Unsupported BIT_DEPTH!
; the second-pass shift does not depend on BIT_DEPTH
1657 %define DCT_SHIFT2 9
1661 lea r4, [r0 + r2 * 4]
1664 mova m6, [dct8_shuf]
1668 vinserti128 m0, m0, [r4], 1
1670 vinserti128 m1, m1, [r4 + r2], 1
1671 mova xm2, [r0 + r2 * 2]
1672 vinserti128 m2, m2, [r4 + r2 * 2], 1
1674 vinserti128 m3, m3, [r4 + r3], 1
1676 punpcklqdq m4, m0, m1
1678 punpcklqdq m1, m2, m3
; Pass 1, one invocation per coefficient row. The first argument walks the
; table linearly (0..7 * 16); the second is in even/odd order: 0,2,4,6,1,3,5,7.
1690 DCT8_PASS_1 0 * 16, 0 * 16, 3, 7
1691 DCT8_PASS_1 1 * 16, 2 * 16, 4, 1
1692 DCT8_PASS_1 2 * 16, 4 * 16, 3, 7
1693 DCT8_PASS_1 3 * 16, 6 * 16, 4, 1
1694 DCT8_PASS_1 4 * 16, 1 * 16, 3, 7
1695 DCT8_PASS_1 5 * 16, 3 * 16, 4, 1
1696 DCT8_PASS_1 6 * 16, 5 * 16, 3, 7
1697 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
; second-pass rounding constant: 256 = 1 << (DCT_SHIFT2 - 1)
1700 vbroadcasti128 m5, [pd_256]
; pass 2: two coefficient rows per invocation
1707 DCT8_PASS_2 0 * 16, 1 * 16
1709 DCT8_PASS_2 2 * 16, 3 * 16
1711 DCT8_PASS_2 4 * 16, 5 * 16
1713 DCT8_PASS_2 6 * 16, 7 * 16
; DCT16_PASS_1_E coef_offset, out_offset
;   %1 = byte offset from r7 of an 8-byte coefficient group, broadcast to
;        every qword of m7.
;   NOTE(review): %2 is presumably a byte offset into the buffer at r5, as in
;   DCT16_PASS_1_O - confirm.
1717 %macro DCT16_PASS_1_E 2
1718 vpbroadcastq m7, [r7 + %1]
; DCT16_PASS_1_O coef_offset, out_offset
;   %1 = byte offset from r7 of a 16-byte coefficient row, broadcast to both
;        lanes of m7.
;   %2 = byte offset from r5 where the 16-byte result is stored.
; Horizontal adds reduce the products to eight dwords, which are shifted by
; DCT_SHIFT, packed to words and gathered into the low 128 bits.
1733 %macro DCT16_PASS_1_O 2
1734 vbroadcasti128 m7, [r7 + %1]
1738 phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5]
1742 phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7]
1744 phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7]
1747 psrad m10, DCT_SHIFT
1749 packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
; 0x08 moves qwords 0 and 2 into the low 128 bits
1750 vpermq m10, m10, 0x08
1752 mova [r5 + %2], xm10
; DCT16_PASS_2 row_a_offset, row_b_offset
;   %1, %2 = byte offsets applied to both r7 and r8. Judging by the call
;   site, these are the two halves of each coefficient row (tab_dct16_1 /
;   tab_dct16_2).
; Inputs are m0-m7; even-numbered registers are multiplied by the r7 row (m8)
; in lines not annotated here, odd-numbered ones by the r8 row (m13).
; Results are shifted by DCT_SHIFT2 and left in xm15 / xm14 for the caller to
; store.
1755 %macro DCT16_PASS_2 2
1756 vbroadcasti128 m8, [r7 + %1]
1757 vbroadcasti128 m13, [r8 + %1]
1760 pmaddwd m11, m1, m13
1764 pmaddwd m12, m3, m13
1769 pmaddwd m12, m5, m13
; last use of the broadcast row, so m13 can be overwritten
1773 pmaddwd m13, m7, m13
1779 psrad m10, DCT_SHIFT2
; same sequence for the second coefficient row
1782 vbroadcasti128 m8, [r7 + %2]
1783 vbroadcasti128 m13, [r8 + %2]
1786 pmaddwd m11, m1, m13
1790 pmaddwd m12, m3, m13
1795 pmaddwd m12, m5, m13
1799 pmaddwd m13, m7, m13
1805 psrad m14, DCT_SHIFT2
; split the 256-bit result: xm15 = low halves, xm14 = upper lane
1808 vextracti128 xm14, m10, 1
1809 movlhps xm15, xm10, xm14
; Forward 16x16 transform, 256-bit variant, in two passes through a stack
; buffer.
; cglobal operands (x86inc convention): 3 arguments (r0 = src, r1 = dst,
; r2 = stride), 9 general-purpose registers, 16 vector registers,
; 16 * mmsize bytes of stack scratch.
1813 cglobal dct16, 3, 9, 16, 0-16*mmsize
; m9 = first-pass rounding constant, chosen by BIT_DEPTH (pd_4 for 8-bit)
1816 vbroadcasti128 m9, [pd_16]
1817 %elif BIT_DEPTH == 8
1819 vbroadcasti128 m9, [pd_4]
1821 %error Unsupported BIT_DEPTH!
; the second-pass shift does not depend on BIT_DEPTH
1823 %define DCT_SHIFT2 10
1827 mova m13, [dct16_shuf1]
1828 mova m14, [dct16_shuf2]
; r7 / r8 point at the middle of the two coefficient tables, so all 16 rows
; are reachable with offsets -8 * 16 .. 7 * 16
1829 lea r7, [tab_dct16_1 + 8 * 16]
1830 lea r8, [tab_dct16_2 + 8 * 16]
1833 mov r4d, 2 ; Each iteration processes 8 rows, so 16/8 iterations
; r6 = four rows below r0; rows k and k + 4 are paired in one register
1836 lea r6, [r0 + r2 * 4]
; 0x20 joins the two low lanes, 0x31 the two high lanes
1840 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]
1841 vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi]
1845 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]
1846 vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi]
1848 movu m6, [r0 + r2 * 2]
1849 movu m5, [r6 + r2 * 2]
1850 vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]
1851 vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi]
1855 vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]
1856 vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi]
; pass 1, odd coefficient rows (1, 3, ..., 15): one table row per invocation
1875 DCT16_PASS_1_O -7 * 16, 1 * 32
1876 DCT16_PASS_1_O -5 * 16, 3 * 32
1877 DCT16_PASS_1_O -3 * 16, 1 * 32 + 16
1878 DCT16_PASS_1_O -1 * 16, 3 * 32 + 16
1879 DCT16_PASS_1_O 1 * 16, 5 * 32
1880 DCT16_PASS_1_O 3 * 16, 7 * 32
1881 DCT16_PASS_1_O 5 * 16, 5 * 32 + 16
1882 DCT16_PASS_1_O 7 * 16, 7 * 32 + 16
; pass 1, even coefficient rows 0, 4, 8, 12
1892 DCT16_PASS_1_E -8 * 16, 0 * 32
1893 DCT16_PASS_1_E -4 * 16, 0 * 32 + 16
1894 DCT16_PASS_1_E 0 * 16, 4 * 32
1895 DCT16_PASS_1_E 4 * 16, 4 * 32 + 16
; pass 1, even coefficient rows 2, 6, 10, 14
1900 DCT16_PASS_1_E -6 * 16, 2 * 32
1901 DCT16_PASS_1_E -2 * 16, 2 * 32 + 16
1902 DCT16_PASS_1_E 2 * 16, 6 * 32
1903 DCT16_PASS_1_E 6 * 16, 6 * 32 + 16
; advance src by eight rows for the next iteration
1905 lea r0, [r0 + 8 * r2]
; second-pass rounding constant: 512 = 1 << (DCT_SHIFT2 - 1)
1915 vbroadcasti128 m9, [pd_512]
; pass 2: reload the intermediate rows from the buffer at r5
1918 mova m0, [r5 + 0 * 32] ; [row0lo row4lo]
1919 mova m1, [r5 + 8 * 32] ; [row0hi row4hi]
1921 mova m2, [r5 + 1 * 32] ; [row1lo row5lo]
1922 mova m3, [r5 + 9 * 32] ; [row1hi row5hi]
1924 mova m4, [r5 + 2 * 32] ; [row2lo row6lo]
1925 mova m5, [r5 + 10 * 32] ; [row2hi row6hi]
1927 mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
1928 mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
; Each DCT16_PASS_2 handles two coefficient rows and leaves its results in
; xm15 / xm14; r6 walks dst four rows at a time.
1930 DCT16_PASS_2 -8 * 16, -7 * 16
1932 movu [r1 + r2], xm14
1934 DCT16_PASS_2 -6 * 16, -5 * 16
1935 movu [r1 + r2 * 2], xm15
1936 movu [r1 + r3], xm14
1938 lea r6, [r1 + r2 * 4]
1939 DCT16_PASS_2 -4 * 16, -3 * 16
1941 movu [r6 + r2], xm14
1943 DCT16_PASS_2 -2 * 16, -1 * 16
1944 movu [r6 + r2 * 2], xm15
1945 movu [r6 + r3], xm14
1947 lea r6, [r6 + r2 * 4]
1948 DCT16_PASS_2 0 * 16, 1 * 16
1950 movu [r6 + r2], xm14
1952 DCT16_PASS_2 2 * 16, 3 * 16
1953 movu [r6 + r2 * 2], xm15
1954 movu [r6 + r3], xm14
1956 lea r6, [r6 + r2 * 4]
1957 DCT16_PASS_2 4 * 16, 5 * 16
1959 movu [r6 + r2], xm14
1961 DCT16_PASS_2 6 * 16, 7 * 16
1962 movu [r6 + r2 * 2], xm15
1963 movu [r6 + r3], xm14
; DCT32_PASS_1 <coef_off>, <dst_off>, <regA>, <regB>
;   %1 - byte offset from r7 of the coefficient block (+0, +32 and +48 are read)
;   %2 - byte offset from r5 of the first 8-byte result; a second 8-byte result
;        is written 64 bytes later
;   %3, %4 - numbers of the two vector registers multiplied by the first
;        coefficient row
; m6 and m7 are multiplied by the coefficients loaded from %1 + 48.
1972 %macro DCT32_PASS_1 4
1973 vbroadcasti128 m8, [r7 + %1]
1975 pmaddwd m11, m%3, m8
1976 pmaddwd m12, m%4, m8
1979 vbroadcasti128 m8, [r7 + %1 + 32]
1980 vbroadcasti128 m10, [r7 + %1 + 48]
1982 pmaddwd m13, m6, m10
1986 pmaddwd m14, m7, m10
1993 psrad m11, DCT_SHIFT
1995 vpermq m11, m11, 0xD8 ; swap the two middle qwords of m11
1997 movq [r5 + %2], xm11 ; low qword of the lower lane
1998 vextracti128 xm10, m11, 1
1999 movq [r5 + %2 + 64], xm10 ; low qword of the upper lane, 64 bytes further on
; DCT32_PASS_2 <coef_off>
;   %1 - NOTE(review): presumably the byte offset of the coefficient row that
;        ends up in m10; callers pass multiples of 32 - confirm
; Multiplies m1/m3/m5/m7 by m10 and leaves the shifted result in xm11, which
; the callers store with movq.
2002 %macro DCT32_PASS_2 1
2006 pmaddwd m12, m1, m10
2010 pmaddwd m13, m3, m10
2016 pmaddwd m13, m5, m10
2020 pmaddwd m14, m7, m10
2026 vextracti128 xm10, m11, 1 ; xm10 = upper lane of m11 (m10 no longer holds coefficients)
2030 psrad xm11, DCT_SHIFT2 ; final right shift on the 128-bit result
; dct32 (AVX2): declared with 3 arguments, 9 general-purpose registers,
; 16 vector registers and a stack size argument of 0-64*mmsize.
; Registers as used below:
;   r0    - source rows; both 32-byte halves of a row are read (+0 and +32);
;           advanced by 4 * r2
;   r1    - second-pass results are stored relative to this pointer
;   r2    - row step used in the r0 / r1 / r6 address arithmetic
;   r5    - intermediate buffer written by DCT32_PASS_1 and read before pass 2
;   r7/r8 - tab_dct32_1 / tab_dct32_2
; NOTE(review): r3 is used as a row offset next to r2 and r2 * 2 - presumably
; 3 * r2; confirm where it is initialised.
2036 cglobal dct32, 3, 9, 16, 0-64*mmsize
2039 vpbroadcastq m9, [pd_32] ; m9 constant for the branch preceding the 8-bit case
2040 %elif BIT_DEPTH == 8
2042 vpbroadcastq m9, [pd_8] ; m9 constant for 8-bit builds
2044 %error Unsupported BIT_DEPTH!
2046 %define DCT_SHIFT2 11 ; right shift applied by DCT32_PASS_2
2050 lea r7, [tab_dct32_1]
2051 lea r8, [tab_dct32_2]
2055 mova m15, [dct16_shuf1] ; the dct16 shuffle mask is reused here
2065 movu m1, [r0 + r2 * 2] ; first 32 bytes of the row at r0 + 2 * r2
2066 movu m0, [r0 + r2 * 2 + 32] ; second 32 bytes of the same row
2071 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E
2072 vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E
2077 vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O
2078 vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O
2082 movu m2, [r0 + r2 + 32] ; second 32 bytes of the row at r0 + r2
2089 movu m2, [r0 + r3 + 32] ; second 32 bytes of the row at r0 + r3
2094 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E
2095 vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E
2100 vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O
2101 vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O
; Pass 1: coefficient offsets step by 2 * 32, output offsets by 2 * 64;
; the register pair alternates between (m0, m2) and (m1, m3).
2104 DCT32_PASS_1 0 * 32, 0 * 64, 0, 2
2105 DCT32_PASS_1 2 * 32, 2 * 64, 1, 3
2106 DCT32_PASS_1 4 * 32, 4 * 64, 0, 2
2107 DCT32_PASS_1 6 * 32, 6 * 64, 1, 3
2108 DCT32_PASS_1 8 * 32, 8 * 64, 0, 2
2109 DCT32_PASS_1 10 * 32, 10 * 64, 1, 3
2110 DCT32_PASS_1 12 * 32, 12 * 64, 0, 2
2111 DCT32_PASS_1 14 * 32, 14 * 64, 1, 3
2112 DCT32_PASS_1 16 * 32, 16 * 64, 0, 2
2113 DCT32_PASS_1 18 * 32, 18 * 64, 1, 3
2114 DCT32_PASS_1 20 * 32, 20 * 64, 0, 2
2115 DCT32_PASS_1 22 * 32, 22 * 64, 1, 3
2116 DCT32_PASS_1 24 * 32, 24 * 64, 0, 2
2117 DCT32_PASS_1 26 * 32, 26 * 64, 1, 3
2118 DCT32_PASS_1 28 * 32, 28 * 64, 0, 2
2119 DCT32_PASS_1 30 * 32, 30 * 64, 1, 3
2122 lea r0, [r0 + r2 * 4] ; step the source pointer past the four rows just consumed
2131 vpbroadcastq m9, [pd_1024] ; m9 replaced for pass 2; presumably the rounding term for DCT_SHIFT2 - confirm
; Pass 2 input: four 64-byte records of the r5 buffer, each split into two 32-byte halves
2134 mova m0, [r5 + 0 * 64]
2135 mova m1, [r5 + 0 * 64 + 32]
2137 mova m2, [r5 + 1 * 64]
2138 mova m3, [r5 + 1 * 64 + 32]
2140 mova m4, [r5 + 2 * 64]
2141 mova m5, [r5 + 2 * 64 + 32]
2143 mova m6, [r5 + 3 * 64]
2144 mova m7, [r5 + 3 * 64 + 32]
; Each DCT32_PASS_2 result (xm11, 8 bytes) is stored to the next output row;
; r6 advances by four rows after every group of four.
2149 movq [r1 + r2], xm11
2151 movq [r1 + r2 * 2], xm11
2153 movq [r1 + r3], xm11
2155 lea r6, [r1 + r2 * 4]
2159 movq [r6 + r2], xm11
2161 movq [r6 + r2 * 2], xm11
2163 movq [r6 + r3], xm11
2165 lea r6, [r6 + r2 * 4]
2169 movq [r6 + r2], xm11
2170 DCT32_PASS_2 10 * 32
2171 movq [r6 + r2 * 2], xm11
2172 DCT32_PASS_2 11 * 32
2173 movq [r6 + r3], xm11
2175 lea r6, [r6 + r2 * 4]
2176 DCT32_PASS_2 12 * 32
2178 DCT32_PASS_2 13 * 32
2179 movq [r6 + r2], xm11
2180 DCT32_PASS_2 14 * 32
2181 movq [r6 + r2 * 2], xm11
2182 DCT32_PASS_2 15 * 32
2183 movq [r6 + r3], xm11
2185 lea r6, [r6 + r2 * 4]
2186 DCT32_PASS_2 16 * 32
2188 DCT32_PASS_2 17 * 32
2189 movq [r6 + r2], xm11
2190 DCT32_PASS_2 18 * 32
2191 movq [r6 + r2 * 2], xm11
2192 DCT32_PASS_2 19 * 32
2193 movq [r6 + r3], xm11
2195 lea r6, [r6 + r2 * 4]
2196 DCT32_PASS_2 20 * 32
2198 DCT32_PASS_2 21 * 32
2199 movq [r6 + r2], xm11
2200 DCT32_PASS_2 22 * 32
2201 movq [r6 + r2 * 2], xm11
2202 DCT32_PASS_2 23 * 32
2203 movq [r6 + r3], xm11
2205 lea r6, [r6 + r2 * 4]
2206 DCT32_PASS_2 24 * 32
2208 DCT32_PASS_2 25 * 32
2209 movq [r6 + r2], xm11
2210 DCT32_PASS_2 26 * 32
2211 movq [r6 + r2 * 2], xm11
2212 DCT32_PASS_2 27 * 32
2213 movq [r6 + r3], xm11
2215 lea r6, [r6 + r2 * 4]
2216 DCT32_PASS_2 28 * 32
2218 DCT32_PASS_2 29 * 32
2219 movq [r6 + r2], xm11
2220 DCT32_PASS_2 30 * 32
2221 movq [r6 + r2 * 2], xm11
2222 DCT32_PASS_2 31 * 32
2223 movq [r6 + r3], xm11
; IDCT8_PASS_1 <coef_off>
;   %1 - byte offset into the coefficient tables addressed by r5 and r6.
;        Dword pairs are read at %1 / %1 + 4 and again 32 bytes further on.
; Each dword (a pair of 16-bit coefficients) is broadcast to all lanes of
; m7 / m10. Results are shifted right by IDCT_SHIFT1 in m3, m5, m9 and m6.
2232 %macro IDCT8_PASS_1 1
2233 vpbroadcastd m7, [r5 + %1]
2234 vpbroadcastd m10, [r5 + %1 + 4]
2239 vpbroadcastd m7, [r6 + %1]
2240 vpbroadcastd m10, [r6 + %1 + 4]
2247 psrad m3, IDCT_SHIFT1
2251 psrad m5, IDCT_SHIFT1
2253 vpbroadcastd m7, [r5 + %1 + 32] ; second coefficient group, 32 bytes after the first
2254 vpbroadcastd m10, [r5 + %1 + 36]
2259 vpbroadcastd m7, [r6 + %1 + 32]
2260 vpbroadcastd m10, [r6 + %1 + 36]
2267 psrad m9, IDCT_SHIFT1
2271 psrad m6, IDCT_SHIFT1
; IDCT8_PASS_2 (no arguments)
; Multiplies the combined input (m2, then m0) by four 32-byte coefficient rows
; at r5 and four at r6, reorders partial results with idct8_shuf2, shifts by
; IDCT_SHIFT2 and applies idct8_shuf3 to m7 and m3.
2280 %macro IDCT8_PASS_2 0
2281 punpcklqdq m2, m0, m1 ; m2 = low qwords of m0 and m1, per 128-bit lane
2284 pmaddwd m3, m2, [r5]
2285 pmaddwd m5, m2, [r5 + 32]
2286 pmaddwd m6, m2, [r5 + 64]
2287 pmaddwd m7, m2, [r5 + 96]
2290 pshufb m3, [idct8_shuf2]
2291 pshufb m6, [idct8_shuf2]
2292 punpcklqdq m7, m3, m6
2295 pmaddwd m5, m0, [r6]
2296 pmaddwd m6, m0, [r6 + 32]
2297 pmaddwd m8, m0, [r6 + 64]
2298 pmaddwd m9, m0, [r6 + 96]
2301 pshufb m5, [idct8_shuf2]
2302 pshufb m8, [idct8_shuf2]
2303 punpcklqdq m6, m5, m8
2308 psrad m8, IDCT_SHIFT2
2312 psrad m7, IDCT_SHIFT2
2314 pshufb m7, [idct8_shuf3]
2319 psrad m9, IDCT_SHIFT2
2323 psrad m3, IDCT_SHIFT2
2325 pshufb m3, [idct8_shuf3]
; idct8 (AVX2): declared with 3 arguments, 7 general-purpose registers,
; 13 vector registers and a stack size argument of 0-8*16.
;   r0 - coefficient input, read as four aligned 32-byte loads (two rows each)
;   r1 - output pointer, advanced by 4 * r2 between the two store groups
;   r2 - output row step
;   r5/r6 - avx2_idct8_1 / avx2_idct8_2 coefficient tables
2330 cglobal idct8, 3, 7, 13, 0-8*16
2332 %define IDCT_SHIFT2 10
2333 vpbroadcastd m12, [pd_512] ; presumably the rounding term for IDCT_SHIFT2 = 10 - confirm
2334 %elif BIT_DEPTH == 8
2335 %define IDCT_SHIFT2 12
2336 vpbroadcastd m12, [pd_2048] ; presumably the rounding term for IDCT_SHIFT2 = 12 - confirm
2338 %error Unsupported BIT_DEPTH!
2340 %define IDCT_SHIFT1 7
2342 vbroadcasti128 m11, [pd_64] ; presumably the rounding term for IDCT_SHIFT1 = 7 - confirm
2345 lea r5, [avx2_idct8_1]
2346 lea r6, [avx2_idct8_2]
; Interleave rows pairwise (0 with 2, 1 with 3, 4 with 6, 5 with 7) and
; regroup so each register holds one row pair across both lanes.
2349 mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
2350 mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
2351 vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
2352 vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
2353 vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
2354 vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3]
2355 vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
2357 mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
2358 mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
2359 vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
2360 vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
2361 vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
2362 vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7]
2363 vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
2365 mova m5, [idct8_shuf1]
; Output: the upper lanes of m8 / m9 are extracted into xm3 and rows are
; written with aligned 16-byte stores.
2387 vextracti128 xm3, m8, 1
2390 vextracti128 xm3, m9, 1
2391 mova [r1 + r2 * 2], xm9
2394 lea r1, [r1 + r2 * 4] ; move on to the next four output rows
2399 vextracti128 xm3, m8, 1
2402 vextracti128 xm3, m9, 1
2403 mova [r1 + r2 * 2], xm9
; Macro body taking two arguments (%1, %2).
; NOTE(review): presumably the first pass of idct16, since it reads the
; tab_idct16_* tables and shifts by IDCT_SHIFT1 - confirm against the %macro line.
;   %1 - index of the 16-byte coefficient rows read from tab_idct16_2 and
;        tab_idct16_1 (rows %1 and %1 + 1), and of the first output slot
;   %2 - index of the second output slot
; Output slots are 32 bytes apart per index (index * 16 * 2) in the r3 buffer.
; The low lane goes to the slot itself and the upper lane 32 bytes after it.
2408 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16]
2419 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16]
2433 psrad m11, IDCT_SHIFT1
2437 psrad m9, IDCT_SHIFT1
2439 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16 + 16] ; next coefficient row of the same table
2450 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16]
2464 psrad m5, IDCT_SHIFT1
2468 psrad m10, IDCT_SHIFT1
2473 mova m10, [idct16_shuff] ; dword permutation indices for vpermd
2474 mova m5, [idct16_shuff1]
2476 vpermd m12, m10, m11 ; m12 = dwords of m11 reordered by the indices in m10
2478 mova [r3 + %1 * 16 * 2], xm12
2479 mova [r3 + %2 * 16 * 2], xm13
2480 vextracti128 [r3 + %2 * 16 * 2 + 32], m13, 1
2481 vextracti128 [r3 + %1 * 16 * 2 + 32], m12, 1
2484 ;-------------------------------------------------------
2485 ; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
2486 ;-------------------------------------------------------
; Declared with 3 arguments, 7 general-purpose registers, 16 vector registers
; and a stack size argument of 0-16*mmsize.
;   r0 - src: rows are 32 bytes apart; 16 bytes are loaded from each row here
;   r1 - dst: advanced by 2 * r2 after each pair of output rows
;   r2 - row step used for the dst addressing
2488 cglobal idct16, 3, 7, 16, 0-16*mmsize
2490 %define IDCT_SHIFT2 10
2491 vpbroadcastd m15, [pd_512] ; presumably the rounding term for IDCT_SHIFT2 = 10 - confirm
2492 %elif BIT_DEPTH == 8
2493 %define IDCT_SHIFT2 12
2494 vpbroadcastd m15, [pd_2048] ; presumably the rounding term for IDCT_SHIFT2 = 12 - confirm
2496 %error Unsupported BIT_DEPTH!
2498 %define IDCT_SHIFT1 7
2500 vbroadcasti128 m14, [pd_64] ; presumably the rounding term for IDCT_SHIFT1 = 7 - confirm
; Load 16 bytes from rows k and k + 8 (k = 0..7); the high qwords of both
; rows are combined and placed in the upper lane of m<k>.
2507 movu xm0, [r0 + 0 * 32]
2508 movu xm1, [r0 + 8 * 32]
2509 punpckhqdq xm2, xm0, xm1 ; [row 0 high qword : row 8 high qword]
2511 vinserti128 m0, m0, xm2, 1
2513 movu xm1, [r0 + 1 * 32]
2514 movu xm2, [r0 + 9 * 32]
2515 punpckhqdq xm3, xm1, xm2
2517 vinserti128 m1, m1, xm3, 1
2519 movu xm2, [r0 + 2 * 32]
2520 movu xm3, [r0 + 10 * 32]
2521 punpckhqdq xm4, xm2, xm3
2523 vinserti128 m2, m2, xm4, 1
2525 movu xm3, [r0 + 3 * 32]
2526 movu xm4, [r0 + 11 * 32]
2527 punpckhqdq xm5, xm3, xm4
2529 vinserti128 m3, m3, xm5, 1
2531 movu xm4, [r0 + 4 * 32]
2532 movu xm5, [r0 + 12 * 32]
2533 punpckhqdq xm6, xm4, xm5
2535 vinserti128 m4, m4, xm6, 1
2537 movu xm5, [r0 + 5 * 32]
2538 movu xm6, [r0 + 13 * 32]
2539 punpckhqdq xm7, xm5, xm6
2541 vinserti128 m5, m5, xm7, 1
2543 movu xm6, [r0 + 6 * 32]
2544 movu xm7, [r0 + 14 * 32]
2545 punpckhqdq xm8, xm6, xm7
2547 vinserti128 m6, m6, xm8, 1
2549 movu xm7, [r0 + 7 * 32]
2550 movu xm8, [r0 + 15 * 32]
2551 punpckhqdq xm9, xm7, xm8
2553 vinserti128 m7, m7, xm9, 1
; Transpose by successive word / dword / qword interleaves.
; In the lane comments below the last digit is the column and the leading
; digits are the row (e.g. 103 = row 10, column 3).
2555 punpckhwd m8, m0, m2 ;[8 10]
2556 punpcklwd m0, m2 ;[0 2]
2558 punpckhwd m2, m1, m3 ;[9 11]
2559 punpcklwd m1, m3 ;[1 3]
2561 punpckhwd m3, m4, m6 ;[12 14]
2562 punpcklwd m4, m6 ;[4 6]
2564 punpckhwd m6, m5, m7 ;[13 15]
2565 punpcklwd m5, m7 ;[5 7]
2567 punpckhdq m7, m0, m4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
2568 punpckldq m0, m4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
2570 punpckhdq m4, m8, m3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
2571 punpckldq m8, m3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
2573 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
2574 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
2576 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
2577 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
2579 punpckhqdq m6, m0, m8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
2580 punpcklqdq m0, m8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
2582 punpckhqdq m8, m7, m4 ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
2583 punpcklqdq m7, m4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
2585 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
2586 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
2588 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
2589 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
; Second stage: the eight 16-byte rows of tab_idct16_2 are broadcast into
; m7-m14; the rows of tab_idct16_1 are then loaded one at a time into m14.
2603 lea r5, [tab_idct16_2]
2604 lea r6, [tab_idct16_1]
2606 vbroadcasti128 m7, [r5]
2607 vbroadcasti128 m8, [r5 + 16]
2608 vbroadcasti128 m9, [r5 + 32]
2609 vbroadcasti128 m10, [r5 + 48]
2610 vbroadcasti128 m11, [r5 + 64]
2611 vbroadcasti128 m12, [r5 + 80]
2612 vbroadcasti128 m13, [r5 + 96]
2632 vbroadcasti128 m14, [r5 + 112] ; m14 no longer holds pd_64 from here on
2642 vbroadcasti128 m14, [r6]
2644 vbroadcasti128 m14, [r6 + 16]
2648 vbroadcasti128 m14, [r6 + 32]
2650 vbroadcasti128 m14, [r6 + 48]
2656 vbroadcasti128 m14, [r6 + 64]
2658 vbroadcasti128 m14, [r6 + 80]
2662 vbroadcasti128 m14, [r6 + 96]
2664 vbroadcasti128 m14, [r6 + 112]
2672 psrad m5, IDCT_SHIFT2
2676 psrad m1, IDCT_SHIFT2
2680 psrad m6, IDCT_SHIFT2
2684 psrad m2, IDCT_SHIFT2
2688 pshufb m2, m1, [dct16_shuf1] ; reorder m1 with the dct16 shuffle mask into m2
2692 vextracti128 [r1 + r2], m5, 1 ; upper lanes hold the row at r1 + r2
2693 vextracti128 [r1 + r2 + 16], m2, 1
2695 lea r1, [r1 + 2 * r2] ; two output rows written per step
; IDCT32_PASS1 <idx>
;   %1 - index of the coefficient set. Uses 32-byte rows %1 and 15 - %1 of
;        tab_idct32_1, and 16-byte rows %1 of tab_idct32_2 / tab_idct32_3.
; Results are shifted by IDCT_SHIFT1 and written as single dwords to the
; buffer addressed by r3 / r4. idct32 sets r4 = r3 + 15 * 64, so the r4-based
; stores mirror the r3-based ones:
;   [r3 + %1 * 64]            -> record %1
;   [r4 - %1 * 64]            -> record 15 - %1
;   [r3 + 16 * 64 + %1 * 64]  -> record 16 + %1
;   [r4 + 16 * 64 - %1 * 64]  -> record 31 - %1
; The lower lane (xm10) goes to offset +0 and the upper lane (xm12) to +32.
2701 %macro IDCT32_PASS1 1
2702 vbroadcasti128 m3, [tab_idct32_1 + %1 * 32]
2703 vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16]
2705 pmaddwd m10, m8, m13
2709 pmaddwd m11, m1, m13
2714 vbroadcasti128 m3, [tab_idct32_1 + (15 - %1) * 32]
2715 vbroadcasti128 m13, [tab_idct32_1 + (15- %1) * 32 + 16]
2717 pmaddwd m11, m8, m13
2721 pmaddwd m12, m1, m13
2725 phaddd m9, m10 ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]
2727 vbroadcasti128 m3, [tab_idct32_2 + %1 * 16]
2733 vbroadcasti128 m3, [tab_idct32_3 + %1 * 16]
2739 paddd m12, m10, m11 ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
2740 psubd m10, m11 ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]
2742 punpcklqdq m12, m10 ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
2745 psrad m10, IDCT_SHIFT1
2749 psrad m12, IDCT_SHIFT1
2752 vextracti128 xm12, m10, 1 ; xm12 = upper lane of m10
2753 movd [r3 + %1 * 64], xm10 ; dword 0
2754 movd [r3 + 32 + %1 * 64], xm12
2755 pextrd [r4 - %1 * 64], xm10, 1 ; dword 1 goes to the mirrored record
2756 pextrd [r4+ 32 - %1 * 64], xm12, 1
2757 pextrd [r3 + 16 * 64 + %1 *64], xm10, 3 ; dword 3
2758 pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
2759 pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2 ; dword 2 goes to the mirrored record
2760 pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
2763 ;-------------------------------------------------------
2764 ; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
2765 ;-------------------------------------------------------
; Declared with 3 arguments, 6 general-purpose registers, 16 vector registers
; and a stack size argument of 0-32*64.
;   r0 - src: rows are 64 bytes apart; 8 bytes are loaded from each row here
;   r3 - intermediate buffer base used by IDCT32_PASS1
;   r4 - r3 + 15 * 64, the mirrored base used by IDCT32_PASS1
2767 ; TODO: reduce the number of PHADDD instructions by using PADDD instead
2770 cglobal idct32, 3, 6, 16, 0-32*64
2772 %define IDCT_SHIFT1 7
2774 vbroadcasti128 m15, [pd_64] ; presumably the rounding term for IDCT_SHIFT1 = 7 - confirm
2777 lea r4, [r3 + 15 * 64]
; Gather 8 bytes from each of the 32 rows, four rows per register.
; The bracketed comment lists the row indices in qword order.
2781 movq xm0, [r0 + 2 * 64]
2782 movq xm1, [r0 + 18 * 64]
2783 punpcklqdq xm0, xm0, xm1
2784 movq xm1, [r0 + 0 * 64]
2785 movq xm2, [r0 + 16 * 64]
2786 punpcklqdq xm1, xm1, xm2
2787 vinserti128 m0, m0, xm1, 1 ;[2 18 0 16]
2789 movq xm1, [r0 + 1 * 64]
2790 movq xm2, [r0 + 9 * 64]
2791 punpcklqdq xm1, xm1, xm2
2792 movq xm2, [r0 + 17 * 64]
2793 movq xm3, [r0 + 25 * 64]
2794 punpcklqdq xm2, xm2, xm3
2795 vinserti128 m1, m1, xm2, 1 ;[1 9 17 25]
2797 movq xm2, [r0 + 6 * 64]
2798 movq xm3, [r0 + 22 * 64]
2799 punpcklqdq xm2, xm2, xm3
2800 movq xm3, [r0 + 4 * 64]
2801 movq xm4, [r0 + 20 * 64]
2802 punpcklqdq xm3, xm3, xm4
2803 vinserti128 m2, m2, xm3, 1 ;[6 22 4 20]
2805 movq xm3, [r0 + 3 * 64]
2806 movq xm4, [r0 + 11 * 64]
2807 punpcklqdq xm3, xm3, xm4
2808 movq xm4, [r0 + 19 * 64]
2809 movq xm5, [r0 + 27 * 64]
2810 punpcklqdq xm4, xm4, xm5
2811 vinserti128 m3, m3, xm4, 1 ;[3 11 19 27]
2813 movq xm4, [r0 + 10 * 64]
2814 movq xm5, [r0 + 26 * 64]
2815 punpcklqdq xm4, xm4, xm5
2816 movq xm5, [r0 + 8 * 64]
2817 movq xm6, [r0 + 24 * 64]
2818 punpcklqdq xm5, xm5, xm6
2819 vinserti128 m4, m4, xm5, 1 ;[10 26 8 24]
2821 movq xm5, [r0 + 5 * 64]
2822 movq xm6, [r0 + 13 * 64]
2823 punpcklqdq xm5, xm5, xm6
2824 movq xm6, [r0 + 21 * 64]
2825 movq xm7, [r0 + 29 * 64]
2826 punpcklqdq xm6, xm6, xm7
2827 vinserti128 m5, m5, xm6, 1 ;[5 13 21 29]
2829 movq xm6, [r0 + 14 * 64]
2830 movq xm7, [r0 + 30 * 64]
2831 punpcklqdq xm6, xm6, xm7
2832 movq xm7, [r0 + 12 * 64]
2833 movq xm8, [r0 + 28 * 64]
2834 punpcklqdq xm7, xm7, xm8
2835 vinserti128 m6, m6, xm7, 1 ;[14 30 12 28]
2837 movq xm7, [r0 + 7 * 64]
2838 movq xm8, [r0 + 15 * 64]
2839 punpcklqdq xm7, xm7, xm8
2840 movq xm8, [r0 + 23 * 64]
2841 movq xm9, [r0 + 31 * 64]
2842 punpcklqdq xm8, xm8, xm9
2843 vinserti128 m7, m7, xm8, 1 ;[7 15 23 31]
; Transpose by successive word / dword / qword interleaves.
; In the lane comments the last digit is the column and the leading digits
; are the row (e.g. 303 = row 30, column 3).
2845 punpckhwd m8, m0, m2 ;[18 22 16 20]
2846 punpcklwd m0, m2 ;[2 6 0 4]
2848 punpckhwd m2, m1, m3 ;[9 11 25 27]
2849 punpcklwd m1, m3 ;[1 3 17 19]
2851 punpckhwd m3, m4, m6 ;[26 30 24 28]
2852 punpcklwd m4, m6 ;[10 14 8 12]
2854 punpckhwd m6, m5, m7 ;[13 15 29 31]
2855 punpcklwd m5, m7 ;[5 7 21 23]
2857 punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
2858 punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
2860 punpckhdq m4, m8, m3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
2861 punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
2863 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
2864 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
2866 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
2867 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
2869 punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
2870 punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
2872 punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
2873 punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
2875 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
2876 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
2878 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
2879 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
2881 vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
2882 vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]
2884 vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
2885 vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]
2887 vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
2888 vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]
2890 vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
2891 vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]
; Second stage: m15 is replaced by the constant paired with IDCT_SHIFT2.
2909 %define IDCT_SHIFT2 10
2910 vpbroadcastd m15, [pd_512] ; presumably the rounding term for IDCT_SHIFT2 = 10 - confirm
2911 %elif BIT_DEPTH == 8
2912 %define IDCT_SHIFT2 12
2913 vpbroadcastd m15, [pd_2048] ; presumably the rounding term for IDCT_SHIFT2 = 12 - confirm
2915 %error Unsupported BIT_DEPTH!
; The first eight 32-byte rows of tab_idct32_4 are kept in m7-m14;
; rows 8-15 (offsets 256-480) are used directly as memory operands below.
2922 mova m7, [tab_idct32_4]
2923 mova m8, [tab_idct32_4 + 32]
2924 mova m9, [tab_idct32_4 + 64]
2925 mova m10, [tab_idct32_4 + 96]
2926 mova m11, [tab_idct32_4 + 128]
2927 mova m12, [tab_idct32_4 + 160]
2928 mova m13, [tab_idct32_4 + 192]
2929 mova m14, [tab_idct32_4 + 224]
2954 vperm2i128 m4, m2, m3, 0x31 ; upper lanes of m2 and m3
2955 vperm2i128 m2, m2, m3, 0x20 ; lower lanes of m2 and m3
2958 pmaddwd m3, m0, [tab_idct32_4 + 256]
2959 pmaddwd m4, m0, [tab_idct32_4 + 288]
2962 pmaddwd m4, m0, [tab_idct32_4 + 320]
2963 pmaddwd m5, m0, [tab_idct32_4 + 352]
2968 pmaddwd m4, m0, [tab_idct32_4 + 384]
2969 pmaddwd m5, m0, [tab_idct32_4 + 416]
2972 pmaddwd m5, m0, [tab_idct32_4 + 448]
2973 pmaddwd m0, [tab_idct32_4 + 480] ; last product overwrites the m0 input
2978 vperm2i128 m0, m3, m4, 0x31
2979 vperm2i128 m3, m3, m4, 0x20
; m1 is multiplied by all sixteen 32-byte rows of tab_idct32_1.
2982 pmaddwd m4, m1, [tab_idct32_1]
2983 pmaddwd m0, m1, [tab_idct32_1 + 32]
2986 pmaddwd m5, m1, [tab_idct32_1 + 64]
2987 pmaddwd m0, m1, [tab_idct32_1 + 96]
2992 pmaddwd m5, m1, [tab_idct32_1 + 128]
2993 pmaddwd m0, m1, [tab_idct32_1 + 160]
2996 pmaddwd m6, m1, [tab_idct32_1 + 192]
2997 pmaddwd m0, m1, [tab_idct32_1 + 224]
3002 vperm2i128 m0, m4, m5, 0x31
3003 vperm2i128 m4, m4, m5, 0x20
3006 pmaddwd m5, m1, [tab_idct32_1 + 256]
3007 pmaddwd m0, m1, [tab_idct32_1 + 288]
3010 pmaddwd m6, m1, [tab_idct32_1 + 320]
3011 pmaddwd m0, m1, [tab_idct32_1 + 352]
3016 pmaddwd m6, m1, [tab_idct32_1 + 384]
3017 pmaddwd m0, m1, [tab_idct32_1 + 416]
3020 pmaddwd m0, m1, [tab_idct32_1 + 448]
3021 pmaddwd m1, [tab_idct32_1 + 480] ; last product overwrites the m1 input
3026 vperm2i128 m0, m5, m6, 0x31
3027 vperm2i128 m5, m5, m6, 0x20
3032 psrad m6, IDCT_SHIFT2
3036 psrad m2, IDCT_SHIFT2
3040 psrad m4, IDCT_SHIFT2
3044 psrad m3, IDCT_SHIFT2
3051 pshufb m2, [dct16_shuf1] ; the dct16 shuffle mask is reused here
3062 ;-------------------------------------------------------
3063 ; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
3064 ;-------------------------------------------------------
; Declared with 3 arguments, 4 general-purpose registers and 6 vector
; registers; no stack size argument is given.
; The whole 4x4 block (32 bytes) is loaded from r0 with a single movu.
; In the lane comments "rc" means row r, column c.
; NOTE(review): r3 is used as a row offset next to 2 * r2 - presumably 3 * r2;
; confirm where it is initialised.
3066 cglobal idct4, 3, 4, 6
3068 %define IDCT_SHIFT1 7
3070 %define IDCT_SHIFT2 10
3071 vpbroadcastd m5, [pd_512] ; presumably the rounding term for IDCT_SHIFT2 = 10 - confirm
3072 %elif BIT_DEPTH == 8
3073 %define IDCT_SHIFT2 12
3074 vpbroadcastd m5, [pd_2048] ; presumably the rounding term for IDCT_SHIFT2 = 12 - confirm
3076 %error Unsupported BIT_DEPTH!
3078 vbroadcasti128 m4, [pd_64] ; presumably the rounding term for IDCT_SHIFT1 = 7 - confirm
3083 movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
3085 pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
3086 vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33]
3087 punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23]
3088 punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33]
3089 vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
3090 vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
3092 mova m1, [avx2_idct4_1] ; first 32-byte coefficient row
3093 mova m3, [avx2_idct4_1 + 32] ; second 32-byte coefficient row
3099 psrad m0, IDCT_SHIFT1 ;[00 20 10 30 01 21 11 31]
3103 psrad m1, IDCT_SHIFT1 ;[03 23 13 33 02 22 12 32]
3105 packssdw m0, m1 ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
3106 vmovshdup m1, m0 ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
3107 vmovsldup m0, m0 ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]
3109 vpbroadcastq m2, [avx2_idct4_2] ; 8 bytes of second-pass coefficients in every qword
3110 vpbroadcastq m3, [avx2_idct4_2 + 8]
3116 psrad m2, IDCT_SHIFT2 ;[00 01 10 11 30 31 20 21]
3120 psrad m0, IDCT_SHIFT2 ;[03 02 13 12 33 32 23 22]
3122 pshufb m0, [idct4_shuf2] ;[02 03 12 13 32 33 22 23]
3123 punpcklqdq m1, m2, m0 ;[00 01 02 03 10 11 12 13]
3124 punpckhqdq m2, m0 ;[30 31 32 33 20 21 22 23]
3125 packssdw m1, m2 ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
3126 vextracti128 xm0, m1, 1 ; xm0 = [10 11 12 13 20 21 22 23]
3130 movhps [r1 + 2 * r2], xm0 ; high qword of xm0 = row 2
3131 movhps [r1 + r3], xm1 ; high qword of xm1 = row 3