251d500d56aefb92e3388876c1a9d588e6faefcf
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Min Chen <min.chen@multicorewareinc.com>
10 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11 * Nabajit Deka <nabajit@multicorewareinc.com>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 * This program is also available under a commercial proprietary license.
28 * For more information, contact us at license @ x265.com.
29 *****************************************************************************/
32 #include "primitives.h"
33 #include <xmmintrin.h> // SSE
34 #include <pmmintrin.h> // SSE3
35 #include <tmmintrin.h> // SSSE3
41 ALIGN_VAR_32(static const int16_t, tab_dct_8
[][8]) =
43 { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },
45 { 64, 64, 64, 64, 64, 64, 64, 64 },
46 { 64, -64, 64, -64, 64, -64, 64, -64 },
47 { 83, 36, 83, 36, 83, 36, 83, 36 },
48 { 36, -83, 36, -83, 36, -83, 36, -83 },
49 { 89, 18, 75, 50, 89, 18, 75, 50 },
50 { 75, -50, -18, -89, 75, -50, -18, -89 },
51 { 50, 75, -89, 18, 50, 75, -89, 18 },
52 { 18, -89, -50, 75, 18, -89, -50, 75 },
54 { 83, 83, -83, -83, 36, 36, -36, -36 },
55 { 36, 36, -36, -36, -83, -83, 83, 83 },
56 { 89, -89, 18, -18, 75, -75, 50, -50 },
57 { 75, -75, -50, 50, -18, 18, -89, 89 },
58 { 50, -50, 75, -75, -89, 89, 18, -18 },
59 { 18, -18, -89, 89, -50, 50, 75, -75 },
62 ALIGN_VAR_32(static const int16_t, tab_dct_16_0
[][8]) =
64 { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 }, // 0
65 { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, // 1
66 { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A }, // 2
67 { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 }, // 3
70 ALIGN_VAR_32(static const int16_t, tab_dct_16_1
[][8]) =
72 { 90, 87, 80, 70, 57, 43, 25, 9 }, // 0
73 { 87, 57, 9, -43, -80, -90, -70, -25 }, // 1
74 { 80, 9, -70, -87, -25, 57, 90, 43 }, // 2
75 { 70, -43, -87, 9, 90, 25, -80, -57 }, // 3
76 { 57, -80, -25, 90, -9, -87, 43, 70 }, // 4
77 { 43, -90, 57, 25, -87, 70, 9, -80 }, // 5
78 { 25, -70, 90, -80, 43, 9, -57, 87 }, // 6
79 { 9, -25, 43, -57, 70, -80, 87, -90 }, // 7
80 { 83, 83, -83, -83, 36, 36, -36, -36 }, // 8
81 { 36, 36, -36, -36, -83, -83, 83, 83 }, // 9
82 { 89, 89, 18, 18, 75, 75, 50, 50 }, // 10
83 { 75, 75, -50, -50, -18, -18, -89, -89 }, // 11
84 { 50, 50, 75, 75, -89, -89, 18, 18 }, // 12
85 { 18, 18, -89, -89, -50, -50, 75, 75 }, // 13
87 #define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \
88 { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) \
90 { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) },
92 MAKE_COEF(90, 87, 80, 70, 57, 43, 25, 9)
93 MAKE_COEF(87, 57, 9, -43, -80, -90, -70, -25)
94 MAKE_COEF(80, 9, -70, -87, -25, 57, 90, 43)
95 MAKE_COEF(70, -43, -87, 9, 90, 25, -80, -57)
96 MAKE_COEF(57, -80, -25, 90, -9, -87, 43, 70)
97 MAKE_COEF(43, -90, 57, 25, -87, 70, 9, -80)
98 MAKE_COEF(25, -70, 90, -80, 43, 9, -57, 87)
99 MAKE_COEF(9, -25, 43, -57, 70, -80, 87, -90)
103 void dct16(const int16_t *src
, int16_t *dst
, intptr_t stride
)
106 __m128i c_4
= _mm_set1_epi32(4);
107 __m128i c_512
= _mm_set1_epi32(512);
111 ALIGN_VAR_32(int16_t, tmp
[16 * 16]);
113 __m128i T00A
, T01A
, T02A
, T03A
, T04A
, T05A
, T06A
, T07A
;
114 __m128i T00B
, T01B
, T02B
, T03B
, T04B
, T05B
, T06B
, T07B
;
115 __m128i T10
, T11
, T12
, T13
, T14
, T15
, T16
, T17
;
116 __m128i T20
, T21
, T22
, T23
, T24
, T25
, T26
, T27
;
117 __m128i T30
, T31
, T32
, T33
, T34
, T35
, T36
, T37
;
118 __m128i T40
, T41
, T42
, T43
, T44
, T45
, T46
, T47
;
119 __m128i T50
, T51
, T52
, T53
;
120 __m128i T60
, T61
, T62
, T63
, T64
, T65
, T66
, T67
;
124 for (i
= 0; i
< 16; i
+= 8)
126 T00A
= _mm_load_si128((__m128i
*)&src
[(i
+ 0) * stride
+ 0]); // [07 06 05 04 03 02 01 00]
127 T00B
= _mm_load_si128((__m128i
*)&src
[(i
+ 0) * stride
+ 8]); // [0F 0E 0D 0C 0B 0A 09 08]
128 T01A
= _mm_load_si128((__m128i
*)&src
[(i
+ 1) * stride
+ 0]); // [17 16 15 14 13 12 11 10]
129 T01B
= _mm_load_si128((__m128i
*)&src
[(i
+ 1) * stride
+ 8]); // [1F 1E 1D 1C 1B 1A 19 18]
130 T02A
= _mm_load_si128((__m128i
*)&src
[(i
+ 2) * stride
+ 0]); // [27 26 25 24 23 22 21 20]
131 T02B
= _mm_load_si128((__m128i
*)&src
[(i
+ 2) * stride
+ 8]); // [2F 2E 2D 2C 2B 2A 29 28]
132 T03A
= _mm_load_si128((__m128i
*)&src
[(i
+ 3) * stride
+ 0]); // [37 36 35 34 33 32 31 30]
133 T03B
= _mm_load_si128((__m128i
*)&src
[(i
+ 3) * stride
+ 8]); // [3F 3E 3D 3C 3B 3A 39 38]
134 T04A
= _mm_load_si128((__m128i
*)&src
[(i
+ 4) * stride
+ 0]); // [47 46 45 44 43 42 41 40]
135 T04B
= _mm_load_si128((__m128i
*)&src
[(i
+ 4) * stride
+ 8]); // [4F 4E 4D 4C 4B 4A 49 48]
136 T05A
= _mm_load_si128((__m128i
*)&src
[(i
+ 5) * stride
+ 0]); // [57 56 55 54 53 52 51 50]
137 T05B
= _mm_load_si128((__m128i
*)&src
[(i
+ 5) * stride
+ 8]); // [5F 5E 5D 5C 5B 5A 59 58]
138 T06A
= _mm_load_si128((__m128i
*)&src
[(i
+ 6) * stride
+ 0]); // [67 66 65 64 63 62 61 60]
139 T06B
= _mm_load_si128((__m128i
*)&src
[(i
+ 6) * stride
+ 8]); // [6F 6E 6D 6C 6B 6A 69 68]
140 T07A
= _mm_load_si128((__m128i
*)&src
[(i
+ 7) * stride
+ 0]); // [77 76 75 74 73 72 71 70]
141 T07B
= _mm_load_si128((__m128i
*)&src
[(i
+ 7) * stride
+ 8]); // [7F 7E 7D 7C 7B 7A 79 78]
143 T00B
= _mm_shuffle_epi8(T00B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
144 T01B
= _mm_shuffle_epi8(T01B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
145 T02B
= _mm_shuffle_epi8(T02B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
146 T03B
= _mm_shuffle_epi8(T03B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
147 T04B
= _mm_shuffle_epi8(T04B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
148 T05B
= _mm_shuffle_epi8(T05B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
149 T06B
= _mm_shuffle_epi8(T06B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
150 T07B
= _mm_shuffle_epi8(T07B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
152 T10
= _mm_add_epi16(T00A
, T00B
);
153 T11
= _mm_add_epi16(T01A
, T01B
);
154 T12
= _mm_add_epi16(T02A
, T02B
);
155 T13
= _mm_add_epi16(T03A
, T03B
);
156 T14
= _mm_add_epi16(T04A
, T04B
);
157 T15
= _mm_add_epi16(T05A
, T05B
);
158 T16
= _mm_add_epi16(T06A
, T06B
);
159 T17
= _mm_add_epi16(T07A
, T07B
);
161 T20
= _mm_sub_epi16(T00A
, T00B
);
162 T21
= _mm_sub_epi16(T01A
, T01B
);
163 T22
= _mm_sub_epi16(T02A
, T02B
);
164 T23
= _mm_sub_epi16(T03A
, T03B
);
165 T24
= _mm_sub_epi16(T04A
, T04B
);
166 T25
= _mm_sub_epi16(T05A
, T05B
);
167 T26
= _mm_sub_epi16(T06A
, T06B
);
168 T27
= _mm_sub_epi16(T07A
, T07B
);
170 T30
= _mm_shuffle_epi8(T10
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
171 T31
= _mm_shuffle_epi8(T11
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
172 T32
= _mm_shuffle_epi8(T12
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
173 T33
= _mm_shuffle_epi8(T13
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
174 T34
= _mm_shuffle_epi8(T14
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
175 T35
= _mm_shuffle_epi8(T15
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
176 T36
= _mm_shuffle_epi8(T16
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
177 T37
= _mm_shuffle_epi8(T17
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
179 T40
= _mm_hadd_epi16(T30
, T31
);
180 T41
= _mm_hadd_epi16(T32
, T33
);
181 T42
= _mm_hadd_epi16(T34
, T35
);
182 T43
= _mm_hadd_epi16(T36
, T37
);
183 T44
= _mm_hsub_epi16(T30
, T31
);
184 T45
= _mm_hsub_epi16(T32
, T33
);
185 T46
= _mm_hsub_epi16(T34
, T35
);
186 T47
= _mm_hsub_epi16(T36
, T37
);
188 T50
= _mm_hadd_epi16(T40
, T41
);
189 T51
= _mm_hadd_epi16(T42
, T43
);
190 T52
= _mm_hsub_epi16(T40
, T41
);
191 T53
= _mm_hsub_epi16(T42
, T43
);
193 T60
= _mm_madd_epi16(T50
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
194 T61
= _mm_madd_epi16(T51
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
195 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
196 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
197 T70
= _mm_packs_epi32(T60
, T61
);
198 _mm_store_si128((__m128i
*)&tmp
[0 * 16 + i
], T70
);
200 T60
= _mm_madd_epi16(T50
, _mm_load_si128((__m128i
*)tab_dct_8
[2]));
201 T61
= _mm_madd_epi16(T51
, _mm_load_si128((__m128i
*)tab_dct_8
[2]));
202 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
203 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
204 T70
= _mm_packs_epi32(T60
, T61
);
205 _mm_store_si128((__m128i
*)&tmp
[8 * 16 + i
], T70
);
207 T60
= _mm_madd_epi16(T52
, _mm_load_si128((__m128i
*)tab_dct_8
[3]));
208 T61
= _mm_madd_epi16(T53
, _mm_load_si128((__m128i
*)tab_dct_8
[3]));
209 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
210 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
211 T70
= _mm_packs_epi32(T60
, T61
);
212 _mm_store_si128((__m128i
*)&tmp
[4 * 16 + i
], T70
);
214 T60
= _mm_madd_epi16(T52
, _mm_load_si128((__m128i
*)tab_dct_8
[4]));
215 T61
= _mm_madd_epi16(T53
, _mm_load_si128((__m128i
*)tab_dct_8
[4]));
216 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
217 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
218 T70
= _mm_packs_epi32(T60
, T61
);
219 _mm_store_si128((__m128i
*)&tmp
[12 * 16 + i
], T70
);
221 T60
= _mm_madd_epi16(T44
, _mm_load_si128((__m128i
*)tab_dct_8
[5]));
222 T61
= _mm_madd_epi16(T45
, _mm_load_si128((__m128i
*)tab_dct_8
[5]));
223 T62
= _mm_madd_epi16(T46
, _mm_load_si128((__m128i
*)tab_dct_8
[5]));
224 T63
= _mm_madd_epi16(T47
, _mm_load_si128((__m128i
*)tab_dct_8
[5]));
225 T60
= _mm_hadd_epi32(T60
, T61
);
226 T61
= _mm_hadd_epi32(T62
, T63
);
227 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
228 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
229 T70
= _mm_packs_epi32(T60
, T61
);
230 _mm_store_si128((__m128i
*)&tmp
[2 * 16 + i
], T70
);
232 T60
= _mm_madd_epi16(T44
, _mm_load_si128((__m128i
*)tab_dct_8
[6]));
233 T61
= _mm_madd_epi16(T45
, _mm_load_si128((__m128i
*)tab_dct_8
[6]));
234 T62
= _mm_madd_epi16(T46
, _mm_load_si128((__m128i
*)tab_dct_8
[6]));
235 T63
= _mm_madd_epi16(T47
, _mm_load_si128((__m128i
*)tab_dct_8
[6]));
236 T60
= _mm_hadd_epi32(T60
, T61
);
237 T61
= _mm_hadd_epi32(T62
, T63
);
238 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
239 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
240 T70
= _mm_packs_epi32(T60
, T61
);
241 _mm_store_si128((__m128i
*)&tmp
[6 * 16 + i
], T70
);
243 T60
= _mm_madd_epi16(T44
, _mm_load_si128((__m128i
*)tab_dct_8
[7]));
244 T61
= _mm_madd_epi16(T45
, _mm_load_si128((__m128i
*)tab_dct_8
[7]));
245 T62
= _mm_madd_epi16(T46
, _mm_load_si128((__m128i
*)tab_dct_8
[7]));
246 T63
= _mm_madd_epi16(T47
, _mm_load_si128((__m128i
*)tab_dct_8
[7]));
247 T60
= _mm_hadd_epi32(T60
, T61
);
248 T61
= _mm_hadd_epi32(T62
, T63
);
249 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
250 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
251 T70
= _mm_packs_epi32(T60
, T61
);
252 _mm_store_si128((__m128i
*)&tmp
[10 * 16 + i
], T70
);
254 T60
= _mm_madd_epi16(T44
, _mm_load_si128((__m128i
*)tab_dct_8
[8]));
255 T61
= _mm_madd_epi16(T45
, _mm_load_si128((__m128i
*)tab_dct_8
[8]));
256 T62
= _mm_madd_epi16(T46
, _mm_load_si128((__m128i
*)tab_dct_8
[8]));
257 T63
= _mm_madd_epi16(T47
, _mm_load_si128((__m128i
*)tab_dct_8
[8]));
258 T60
= _mm_hadd_epi32(T60
, T61
);
259 T61
= _mm_hadd_epi32(T62
, T63
);
260 T60
= _mm_srai_epi32(_mm_add_epi32(T60
, c_4
), 3);
261 T61
= _mm_srai_epi32(_mm_add_epi32(T61
, c_4
), 3);
262 T70
= _mm_packs_epi32(T60
, T61
);
263 _mm_store_si128((__m128i
*)&tmp
[14 * 16 + i
], T70
);
265 #define MAKE_ODD(tab, dstPos) \
266 T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
267 T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
268 T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
269 T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
270 T64 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
271 T65 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
272 T66 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
273 T67 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
274 T60 = _mm_hadd_epi32(T60, T61); \
275 T61 = _mm_hadd_epi32(T62, T63); \
276 T62 = _mm_hadd_epi32(T64, T65); \
277 T63 = _mm_hadd_epi32(T66, T67); \
278 T60 = _mm_hadd_epi32(T60, T61); \
279 T61 = _mm_hadd_epi32(T62, T63); \
280 T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \
281 T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \
282 T70 = _mm_packs_epi32(T60, T61); \
283 _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
297 for (i
= 0; i
< 16; i
+= 4)
299 T00A
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 0) * 16 + 0]); // [07 06 05 04 03 02 01 00]
300 T00B
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 0) * 16 + 8]); // [0F 0E 0D 0C 0B 0A 09 08]
301 T01A
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 1) * 16 + 0]); // [17 16 15 14 13 12 11 10]
302 T01B
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 1) * 16 + 8]); // [1F 1E 1D 1C 1B 1A 19 18]
303 T02A
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 2) * 16 + 0]); // [27 26 25 24 23 22 21 20]
304 T02B
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 2) * 16 + 8]); // [2F 2E 2D 2C 2B 2A 29 28]
305 T03A
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 3) * 16 + 0]); // [37 36 35 34 33 32 31 30]
306 T03B
= _mm_load_si128((__m128i
*)&tmp
[(i
+ 3) * 16 + 8]); // [3F 3E 3D 3C 3B 3A 39 38]
308 T00A
= _mm_shuffle_epi8(T00A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[2]));
309 T00B
= _mm_shuffle_epi8(T00B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[3]));
310 T01A
= _mm_shuffle_epi8(T01A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[2]));
311 T01B
= _mm_shuffle_epi8(T01B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[3]));
312 T02A
= _mm_shuffle_epi8(T02A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[2]));
313 T02B
= _mm_shuffle_epi8(T02B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[3]));
314 T03A
= _mm_shuffle_epi8(T03A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[2]));
315 T03B
= _mm_shuffle_epi8(T03B
, _mm_load_si128((__m128i
*)tab_dct_16_0
[3]));
317 T10
= _mm_unpacklo_epi16(T00A
, T00B
);
318 T11
= _mm_unpackhi_epi16(T00A
, T00B
);
319 T12
= _mm_unpacklo_epi16(T01A
, T01B
);
320 T13
= _mm_unpackhi_epi16(T01A
, T01B
);
321 T14
= _mm_unpacklo_epi16(T02A
, T02B
);
322 T15
= _mm_unpackhi_epi16(T02A
, T02B
);
323 T16
= _mm_unpacklo_epi16(T03A
, T03B
);
324 T17
= _mm_unpackhi_epi16(T03A
, T03B
);
326 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
327 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
328 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
329 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
330 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
331 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
332 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
333 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
335 T30
= _mm_add_epi32(T20
, T21
);
336 T31
= _mm_add_epi32(T22
, T23
);
337 T32
= _mm_add_epi32(T24
, T25
);
338 T33
= _mm_add_epi32(T26
, T27
);
340 T30
= _mm_hadd_epi32(T30
, T31
);
341 T31
= _mm_hadd_epi32(T32
, T33
);
343 T40
= _mm_hadd_epi32(T30
, T31
);
344 T41
= _mm_hsub_epi32(T30
, T31
);
345 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
346 T41
= _mm_srai_epi32(_mm_add_epi32(T41
, c_512
), 10);
347 T40
= _mm_packs_epi32(T40
, T40
);
348 T41
= _mm_packs_epi32(T41
, T41
);
349 _mm_storel_epi64((__m128i
*)&dst
[0 * 16 + i
], T40
);
350 _mm_storel_epi64((__m128i
*)&dst
[8 * 16 + i
], T41
);
352 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
353 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
354 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
355 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
356 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
357 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
358 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
359 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
361 T30
= _mm_add_epi32(T20
, T21
);
362 T31
= _mm_add_epi32(T22
, T23
);
363 T32
= _mm_add_epi32(T24
, T25
);
364 T33
= _mm_add_epi32(T26
, T27
);
366 T30
= _mm_hadd_epi32(T30
, T31
);
367 T31
= _mm_hadd_epi32(T32
, T33
);
369 T40
= _mm_hadd_epi32(T30
, T31
);
370 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
371 T40
= _mm_packs_epi32(T40
, T40
);
372 _mm_storel_epi64((__m128i
*)&dst
[4 * 16 + i
], T40
);
374 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
375 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
376 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
377 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
378 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
379 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
380 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
381 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
383 T30
= _mm_add_epi32(T20
, T21
);
384 T31
= _mm_add_epi32(T22
, T23
);
385 T32
= _mm_add_epi32(T24
, T25
);
386 T33
= _mm_add_epi32(T26
, T27
);
388 T30
= _mm_hadd_epi32(T30
, T31
);
389 T31
= _mm_hadd_epi32(T32
, T33
);
391 T40
= _mm_hadd_epi32(T30
, T31
);
392 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
393 T40
= _mm_packs_epi32(T40
, T40
);
394 _mm_storel_epi64((__m128i
*)&dst
[12 * 16 + i
], T40
);
396 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
397 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
398 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
399 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
400 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
401 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
402 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
403 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_16_1
[10]));
405 T30
= _mm_sub_epi32(T20
, T21
);
406 T31
= _mm_sub_epi32(T22
, T23
);
407 T32
= _mm_sub_epi32(T24
, T25
);
408 T33
= _mm_sub_epi32(T26
, T27
);
410 T30
= _mm_hadd_epi32(T30
, T31
);
411 T31
= _mm_hadd_epi32(T32
, T33
);
413 T40
= _mm_hadd_epi32(T30
, T31
);
414 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
415 T40
= _mm_packs_epi32(T40
, T40
);
416 _mm_storel_epi64((__m128i
*)&dst
[2 * 16 + i
], T40
);
418 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
419 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
420 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
421 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
422 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
423 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
424 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
425 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_16_1
[11]));
427 T30
= _mm_sub_epi32(T20
, T21
);
428 T31
= _mm_sub_epi32(T22
, T23
);
429 T32
= _mm_sub_epi32(T24
, T25
);
430 T33
= _mm_sub_epi32(T26
, T27
);
432 T30
= _mm_hadd_epi32(T30
, T31
);
433 T31
= _mm_hadd_epi32(T32
, T33
);
435 T40
= _mm_hadd_epi32(T30
, T31
);
436 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
437 T40
= _mm_packs_epi32(T40
, T40
);
438 _mm_storel_epi64((__m128i
*)&dst
[6 * 16 + i
], T40
);
440 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
441 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
442 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
443 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
444 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
445 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
446 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
447 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_16_1
[12]));
449 T30
= _mm_sub_epi32(T20
, T21
);
450 T31
= _mm_sub_epi32(T22
, T23
);
451 T32
= _mm_sub_epi32(T24
, T25
);
452 T33
= _mm_sub_epi32(T26
, T27
);
454 T30
= _mm_hadd_epi32(T30
, T31
);
455 T31
= _mm_hadd_epi32(T32
, T33
);
457 T40
= _mm_hadd_epi32(T30
, T31
);
458 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
459 T40
= _mm_packs_epi32(T40
, T40
);
460 _mm_storel_epi64((__m128i
*)&dst
[10 * 16 + i
], T40
);
462 T20
= _mm_madd_epi16(T10
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
463 T21
= _mm_madd_epi16(T11
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
464 T22
= _mm_madd_epi16(T12
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
465 T23
= _mm_madd_epi16(T13
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
466 T24
= _mm_madd_epi16(T14
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
467 T25
= _mm_madd_epi16(T15
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
468 T26
= _mm_madd_epi16(T16
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
469 T27
= _mm_madd_epi16(T17
, _mm_load_si128((__m128i
*)tab_dct_16_1
[13]));
471 T30
= _mm_sub_epi32(T20
, T21
);
472 T31
= _mm_sub_epi32(T22
, T23
);
473 T32
= _mm_sub_epi32(T24
, T25
);
474 T33
= _mm_sub_epi32(T26
, T27
);
476 T30
= _mm_hadd_epi32(T30
, T31
);
477 T31
= _mm_hadd_epi32(T32
, T33
);
479 T40
= _mm_hadd_epi32(T30
, T31
);
480 T40
= _mm_srai_epi32(_mm_add_epi32(T40
, c_512
), 10);
481 T40
= _mm_packs_epi32(T40
, T40
);
482 _mm_storel_epi64((__m128i
*)&dst
[14 * 16 + i
], T40
);
484 #define MAKE_ODD(tab, dstPos) \
485 T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
486 T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \
487 T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
488 T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
489 T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
490 T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
491 T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
492 T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
494 T30 = _mm_add_epi32(T20, T21); \
495 T31 = _mm_add_epi32(T22, T23); \
496 T32 = _mm_add_epi32(T24, T25); \
497 T33 = _mm_add_epi32(T26, T27); \
499 T30 = _mm_hadd_epi32(T30, T31); \
500 T31 = _mm_hadd_epi32(T32, T33); \
502 T40 = _mm_hadd_epi32(T30, T31); \
503 T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
504 T40 = _mm_packs_epi32(T40, T40); \
505 _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
519 ALIGN_VAR_32(static const int16_t, tab_dct_32_0
[][8]) =
521 { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 }, // 0
524 ALIGN_VAR_32(static const int16_t, tab_dct_32_1
[][8]) =
526 { 89, -89, 18, -18, 75, -75, 50, -50 }, // 0
527 { 75, -75, -50, 50, -18, 18, -89, 89 }, // 1
528 { 50, -50, 75, -75, -89, 89, 18, -18 }, // 2
529 { 18, -18, -89, 89, -50, 50, 75, -75 }, // 3
531 #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \
532 { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) \
535 MAKE_COEF8(90, 87, 80, 70, 57, 43, 25, 9) // 4
536 MAKE_COEF8(87, 57, 9, -43, -80, -90, -70, -25) // 5
537 MAKE_COEF8(80, 9, -70, -87, -25, 57, 90, 43) // 6
538 MAKE_COEF8(70, -43, -87, 9, 90, 25, -80, -57) // 7
539 MAKE_COEF8(57, -80, -25, 90, -9, -87, 43, 70) // 8
540 MAKE_COEF8(43, -90, 57, 25, -87, 70, 9, -80) // 9
541 MAKE_COEF8(25, -70, 90, -80, 43, 9, -57, 87) // 10
542 MAKE_COEF8(9, -25, 43, -57, 70, -80, 87, -90) // 11
545 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
546 { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \
547 { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) },
549 MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 12
550 MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) // 14
551 MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) // 16
552 MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31) // 18
553 MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38) // 20
554 MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) // 22
555 MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) // 24
556 MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61) // 26
557 MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) // 28
558 MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) // 30
559 MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78) // 32
560 MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82) // 34
561 MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) // 36
562 MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) // 38
563 MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90) // 40
564 MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90) // 42
568 64, 64, 64, 64, 64, 64, 64, 64
571 { 64, 64, -64, -64, -64, -64, 64, 64 }, // 45
573 { 83, 83, 36, 36, -36, -36, -83, -83 }, // 46
574 { -83, -83, -36, -36, 36, 36, 83, 83 }, // 47
576 { 36, 36, -83, -83, 83, 83, -36, -36 }, // 48
577 { -36, -36, 83, 83, -83, -83, 36, 36 }, // 49
579 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
580 { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \
581 { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \
582 { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \
583 { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) },
585 MAKE_COEF16(89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89) // 50
586 MAKE_COEF16(75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75) // 54
588 // TODO: convert below table here
592 50, 50, -89, -89, 18, 18, 75, 75
594 { -75, -75, -18, -18, 89, 89, -50, -50 }, // 59
595 { -50, -50, 89, 89, -18, -18, -75, -75 }, // 60
596 { 75, 75, 18, 18, -89, -89, 50, 50 }, // 61
598 { 18, 18, -50, -50, 75, 75, -89, -89 }, // 62
599 { 89, 89, -75, -75, 50, 50, -18, -18 }, // 63
600 { -18, -18, 50, 50, -75, -75, 89, 89 }, // 64
601 { -89, -89, 75, 75, -50, -50, 18, 18 }, // 65
603 { 90, 90, 87, 87, 80, 80, 70, 70 }, // 66
604 { 57, 57, 43, 43, 25, 25, 9, 9 }, // 67
605 { -9, -9, -25, -25, -43, -43, -57, -57 }, // 68
606 { -70, -70, -80, -80, -87, -87, -90, -90 }, // 69
608 { 87, 87, 57, 57, 9, 9, -43, -43 }, // 70
609 { -80, -80, -90, -90, -70, -70, -25, -25 }, // 71
610 { 25, 25, 70, 70, 90, 90, 80, 80 }, // 72
611 { 43, 43, -9, -9, -57, -57, -87, -87 }, // 73
613 { 80, 80, 9, 9, -70, -70, -87, -87 }, // 74
614 { -25, -25, 57, 57, 90, 90, 43, 43 }, // 75
615 { -43, -43, -90, -90, -57, -57, 25, 25 }, // 76
616 { 87, 87, 70, 70, -9, -9, -80, -80 }, // 77
618 { 70, 70, -43, -43, -87, -87, 9, 9 }, // 78
619 { 90, 90, 25, 25, -80, -80, -57, -57 }, // 79
620 { 57, 57, 80, 80, -25, -25, -90, -90 }, // 80
621 { -9, -9, 87, 87, 43, 43, -70, -70 }, // 81
623 { 57, 57, -80, -80, -25, -25, 90, 90 }, // 82
624 { -9, -9, -87, -87, 43, 43, 70, 70 }, // 83
625 { -70, -70, -43, -43, 87, 87, 9, 9 }, // 84
626 { -90, -90, 25, 25, 80, 80, -57, -57 }, // 85
628 { 43, 43, -90, -90, 57, 57, 25, 25 }, // 86
629 { -87, -87, 70, 70, 9, 9, -80, -80 }, // 87
630 { 80, 80, -9, -9, -70, -70, 87, 87 }, // 88
631 { -25, -25, -57, -57, 90, 90, -43, -43 }, // 89
633 { 25, 25, -70, -70, 90, 90, -80, -80 }, // 90
634 { 43, 43, 9, 9, -57, -57, 87, 87 }, // 91
635 { -87, -87, 57, 57, -9, -9, -43, -43 }, // 92
636 { 80, 80, -90, -90, 70, 70, -25, -25 }, // 93
638 { 9, 9, -25, -25, 43, 43, -57, -57 }, // 94
639 { 70, 70, -80, -80, 87, 87, -90, -90 }, // 95
640 { 90, 90, -87, -87, 80, 80, -70, -70 }, // 96
641 { 57, 57, -43, -43, 25, 25, -9, -9 }, // 97
643 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
644 { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \
645 { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \
646 { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \
647 { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) },
649 MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 98
650 MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) //102
651 MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) //106
652 MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, +82, 88, 54, -4, -61, -90, -78, -31) //110
653 MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, +31, -46, -90, -67, 4, 73, 88, 38) //114
654 MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) //118
655 MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) //122
656 MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, +90, 13, -88, -31, 82, 46, -73, -61) //126
657 MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) //130
658 MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) //134
659 MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, +22, 67, -85, 13, 73, -82, 4, 78) //138
660 MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, +85, -78, 13, 61, -90, 54, 22, -82) //142
661 MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) //146
662 MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) //150
663 MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, +54, -31, 4, 22, -46, 67, -82, 90) //154
664 MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, +67, -73, 78, -82, 85, -88, 90, -90) //158
669 void dct32(const int16_t *src
, int16_t *dst
, intptr_t stride
)
672 __m128i c_8
= _mm_set1_epi32(8);
673 __m128i c_1024
= _mm_set1_epi32(1024);
677 __m128i T00A
, T01A
, T02A
, T03A
, T04A
, T05A
, T06A
, T07A
;
678 __m128i T00B
, T01B
, T02B
, T03B
, T04B
, T05B
, T06B
, T07B
;
679 __m128i T00C
, T01C
, T02C
, T03C
, T04C
, T05C
, T06C
, T07C
;
680 __m128i T00D
, T01D
, T02D
, T03D
, T04D
, T05D
, T06D
, T07D
;
681 __m128i T10A
, T11A
, T12A
, T13A
, T14A
, T15A
, T16A
, T17A
;
682 __m128i T10B
, T11B
, T12B
, T13B
, T14B
, T15B
, T16B
, T17B
;
683 __m128i T20
, T21
, T22
, T23
, T24
, T25
, T26
, T27
;
684 __m128i T30
, T31
, T32
, T33
, T34
, T35
, T36
, T37
;
685 __m128i T40
, T41
, T42
, T43
, T44
, T45
, T46
, T47
;
686 __m128i T50
, T51
, T52
, T53
;
687 __m128i T60
, T61
, T62
, T63
, T64
, T65
, T66
, T67
;
691 for (i
= 0; i
< 32 / 8; i
++)
693 T00A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 0) * stride
+ 0]); // [07 06 05 04 03 02 01 00]
694 T00B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 0) * stride
+ 8]); // [15 14 13 12 11 10 09 08]
695 T00C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 0) * stride
+ 16]); // [23 22 21 20 19 18 17 16]
696 T00D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 0) * stride
+ 24]); // [31 30 29 28 27 26 25 24]
697 T01A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 1) * stride
+ 0]);
698 T01B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 1) * stride
+ 8]);
699 T01C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 1) * stride
+ 16]);
700 T01D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 1) * stride
+ 24]);
701 T02A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 2) * stride
+ 0]);
702 T02B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 2) * stride
+ 8]);
703 T02C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 2) * stride
+ 16]);
704 T02D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 2) * stride
+ 24]);
705 T03A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 3) * stride
+ 0]);
706 T03B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 3) * stride
+ 8]);
707 T03C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 3) * stride
+ 16]);
708 T03D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 3) * stride
+ 24]);
709 T04A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 4) * stride
+ 0]);
710 T04B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 4) * stride
+ 8]);
711 T04C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 4) * stride
+ 16]);
712 T04D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 4) * stride
+ 24]);
713 T05A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 5) * stride
+ 0]);
714 T05B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 5) * stride
+ 8]);
715 T05C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 5) * stride
+ 16]);
716 T05D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 5) * stride
+ 24]);
717 T06A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 6) * stride
+ 0]);
718 T06B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 6) * stride
+ 8]);
719 T06C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 6) * stride
+ 16]);
720 T06D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 6) * stride
+ 24]);
721 T07A
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 7) * stride
+ 0]);
722 T07B
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 7) * stride
+ 8]);
723 T07C
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 7) * stride
+ 16]);
724 T07D
= _mm_load_si128((__m128i
*)&src
[(i
* 8 + 7) * stride
+ 24]);
726 T00A
= _mm_shuffle_epi8(T00A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1])); // [05 02 06 01 04 03 07 00]
727 T00B
= _mm_shuffle_epi8(T00B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0])); // [10 13 09 14 11 12 08 15]
728 T00C
= _mm_shuffle_epi8(T00C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1])); // [21 18 22 17 20 19 23 16]
729 T00D
= _mm_shuffle_epi8(T00D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0])); // [26 29 25 30 27 28 24 31]
730 T01A
= _mm_shuffle_epi8(T01A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
731 T01B
= _mm_shuffle_epi8(T01B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
732 T01C
= _mm_shuffle_epi8(T01C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
733 T01D
= _mm_shuffle_epi8(T01D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
734 T02A
= _mm_shuffle_epi8(T02A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
735 T02B
= _mm_shuffle_epi8(T02B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
736 T02C
= _mm_shuffle_epi8(T02C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
737 T02D
= _mm_shuffle_epi8(T02D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
738 T03A
= _mm_shuffle_epi8(T03A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
739 T03B
= _mm_shuffle_epi8(T03B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
740 T03C
= _mm_shuffle_epi8(T03C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
741 T03D
= _mm_shuffle_epi8(T03D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
742 T04A
= _mm_shuffle_epi8(T04A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
743 T04B
= _mm_shuffle_epi8(T04B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
744 T04C
= _mm_shuffle_epi8(T04C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
745 T04D
= _mm_shuffle_epi8(T04D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
746 T05A
= _mm_shuffle_epi8(T05A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
747 T05B
= _mm_shuffle_epi8(T05B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
748 T05C
= _mm_shuffle_epi8(T05C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
749 T05D
= _mm_shuffle_epi8(T05D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
750 T06A
= _mm_shuffle_epi8(T06A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
751 T06B
= _mm_shuffle_epi8(T06B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
752 T06C
= _mm_shuffle_epi8(T06C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
753 T06D
= _mm_shuffle_epi8(T06D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
754 T07A
= _mm_shuffle_epi8(T07A
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
755 T07B
= _mm_shuffle_epi8(T07B
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
756 T07C
= _mm_shuffle_epi8(T07C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[1]));
757 T07D
= _mm_shuffle_epi8(T07D
, _mm_load_si128((__m128i
*)tab_dct_32_0
[0]));
759 T10A
= _mm_add_epi16(T00A
, T00D
); // [E05 E02 E06 E01 E04 E03 E07 E00]
760 T10B
= _mm_add_epi16(T00B
, T00C
); // [E10 E13 E09 E14 E11 E12 E08 E15]
761 T11A
= _mm_add_epi16(T01A
, T01D
);
762 T11B
= _mm_add_epi16(T01B
, T01C
);
763 T12A
= _mm_add_epi16(T02A
, T02D
);
764 T12B
= _mm_add_epi16(T02B
, T02C
);
765 T13A
= _mm_add_epi16(T03A
, T03D
);
766 T13B
= _mm_add_epi16(T03B
, T03C
);
767 T14A
= _mm_add_epi16(T04A
, T04D
);
768 T14B
= _mm_add_epi16(T04B
, T04C
);
769 T15A
= _mm_add_epi16(T05A
, T05D
);
770 T15B
= _mm_add_epi16(T05B
, T05C
);
771 T16A
= _mm_add_epi16(T06A
, T06D
);
772 T16B
= _mm_add_epi16(T06B
, T06C
);
773 T17A
= _mm_add_epi16(T07A
, T07D
);
774 T17B
= _mm_add_epi16(T07B
, T07C
);
776 T00A
= _mm_sub_epi16(T00A
, T00D
); // [O05 O02 O06 O01 O04 O03 O07 O00]
777 T00B
= _mm_sub_epi16(T00B
, T00C
); // [O10 O13 O09 O14 O11 O12 O08 O15]
778 T01A
= _mm_sub_epi16(T01A
, T01D
);
779 T01B
= _mm_sub_epi16(T01B
, T01C
);
780 T02A
= _mm_sub_epi16(T02A
, T02D
);
781 T02B
= _mm_sub_epi16(T02B
, T02C
);
782 T03A
= _mm_sub_epi16(T03A
, T03D
);
783 T03B
= _mm_sub_epi16(T03B
, T03C
);
784 T04A
= _mm_sub_epi16(T04A
, T04D
);
785 T04B
= _mm_sub_epi16(T04B
, T04C
);
786 T05A
= _mm_sub_epi16(T05A
, T05D
);
787 T05B
= _mm_sub_epi16(T05B
, T05C
);
788 T06A
= _mm_sub_epi16(T06A
, T06D
);
789 T06B
= _mm_sub_epi16(T06B
, T06C
);
790 T07A
= _mm_sub_epi16(T07A
, T07D
);
791 T07B
= _mm_sub_epi16(T07B
, T07C
);
793 T20
= _mm_add_epi16(T10A
, T10B
); // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0]
794 T21
= _mm_add_epi16(T11A
, T11B
);
795 T22
= _mm_add_epi16(T12A
, T12B
);
796 T23
= _mm_add_epi16(T13A
, T13B
);
797 T24
= _mm_add_epi16(T14A
, T14B
);
798 T25
= _mm_add_epi16(T15A
, T15B
);
799 T26
= _mm_add_epi16(T16A
, T16B
);
800 T27
= _mm_add_epi16(T17A
, T17B
);
802 T30
= _mm_madd_epi16(T20
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
803 T31
= _mm_madd_epi16(T21
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
804 T32
= _mm_madd_epi16(T22
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
805 T33
= _mm_madd_epi16(T23
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
806 T34
= _mm_madd_epi16(T24
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
807 T35
= _mm_madd_epi16(T25
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
808 T36
= _mm_madd_epi16(T26
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
809 T37
= _mm_madd_epi16(T27
, _mm_load_si128((__m128i
*)tab_dct_8
[1]));
811 T40
= _mm_hadd_epi32(T30
, T31
);
812 T41
= _mm_hadd_epi32(T32
, T33
);
813 T42
= _mm_hadd_epi32(T34
, T35
);
814 T43
= _mm_hadd_epi32(T36
, T37
);
816 T50
= _mm_hadd_epi32(T40
, T41
);
817 T51
= _mm_hadd_epi32(T42
, T43
);
818 T50
= _mm_srai_epi32(_mm_add_epi32(T50
, c_8
), 4);
819 T51
= _mm_srai_epi32(_mm_add_epi32(T51
, c_8
), 4);
820 T60
= _mm_packs_epi32(T50
, T51
);
823 T50
= _mm_hsub_epi32(T40
, T41
);
824 T51
= _mm_hsub_epi32(T42
, T43
);
825 T50
= _mm_srai_epi32(_mm_add_epi32(T50
, c_8
), 4);
826 T51
= _mm_srai_epi32(_mm_add_epi32(T51
, c_8
), 4);
827 T60
= _mm_packs_epi32(T50
, T51
);
830 T30
= _mm_madd_epi16(T20
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
831 T31
= _mm_madd_epi16(T21
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
832 T32
= _mm_madd_epi16(T22
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
833 T33
= _mm_madd_epi16(T23
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
834 T34
= _mm_madd_epi16(T24
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
835 T35
= _mm_madd_epi16(T25
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
836 T36
= _mm_madd_epi16(T26
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
837 T37
= _mm_madd_epi16(T27
, _mm_load_si128((__m128i
*)tab_dct_16_1
[8]));
839 T40
= _mm_hadd_epi32(T30
, T31
);
840 T41
= _mm_hadd_epi32(T32
, T33
);
841 T42
= _mm_hadd_epi32(T34
, T35
);
842 T43
= _mm_hadd_epi32(T36
, T37
);
844 T50
= _mm_hadd_epi32(T40
, T41
);
845 T51
= _mm_hadd_epi32(T42
, T43
);
846 T50
= _mm_srai_epi32(_mm_add_epi32(T50
, c_8
), 4);
847 T51
= _mm_srai_epi32(_mm_add_epi32(T51
, c_8
), 4);
848 T60
= _mm_packs_epi32(T50
, T51
);
851 T30
= _mm_madd_epi16(T20
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
852 T31
= _mm_madd_epi16(T21
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
853 T32
= _mm_madd_epi16(T22
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
854 T33
= _mm_madd_epi16(T23
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
855 T34
= _mm_madd_epi16(T24
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
856 T35
= _mm_madd_epi16(T25
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
857 T36
= _mm_madd_epi16(T26
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
858 T37
= _mm_madd_epi16(T27
, _mm_load_si128((__m128i
*)tab_dct_16_1
[9]));
860 T40
= _mm_hadd_epi32(T30
, T31
);
861 T41
= _mm_hadd_epi32(T32
, T33
);
862 T42
= _mm_hadd_epi32(T34
, T35
);
863 T43
= _mm_hadd_epi32(T36
, T37
);
865 T50
= _mm_hadd_epi32(T40
, T41
);
866 T51
= _mm_hadd_epi32(T42
, T43
);
867 T50
= _mm_srai_epi32(_mm_add_epi32(T50
, c_8
), 4);
868 T51
= _mm_srai_epi32(_mm_add_epi32(T51
, c_8
), 4);
869 T60
= _mm_packs_epi32(T50
, T51
);
872 #define MAKE_ODD(tab, dstPos) \
873 T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
874 T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
875 T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
876 T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
877 T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
878 T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
879 T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
880 T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
882 T40 = _mm_hadd_epi32(T30, T31); \
883 T41 = _mm_hadd_epi32(T32, T33); \
884 T42 = _mm_hadd_epi32(T34, T35); \
885 T43 = _mm_hadd_epi32(T36, T37); \
887 T50 = _mm_hadd_epi32(T40, T41); \
888 T51 = _mm_hadd_epi32(T42, T43); \
889 T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
890 T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
891 T60 = _mm_packs_epi32(T50, T51); \
892 im[(dstPos)][i] = T60;
899 T20
= _mm_sub_epi16(T10A
, T10B
); // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0]
900 T21
= _mm_sub_epi16(T11A
, T11B
);
901 T22
= _mm_sub_epi16(T12A
, T12B
);
902 T23
= _mm_sub_epi16(T13A
, T13B
);
903 T24
= _mm_sub_epi16(T14A
, T14B
);
904 T25
= _mm_sub_epi16(T15A
, T15B
);
905 T26
= _mm_sub_epi16(T16A
, T16B
);
906 T27
= _mm_sub_epi16(T17A
, T17B
);
918 #define MAKE_ODD(tab, dstPos) \
919 T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
920 T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
921 T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
922 T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
923 T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
924 T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
925 T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
926 T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
927 T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
928 T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
929 T32 = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
930 T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
931 T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
932 T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
933 T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
934 T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
936 T40 = _mm_hadd_epi32(T20, T21); \
937 T41 = _mm_hadd_epi32(T22, T23); \
938 T42 = _mm_hadd_epi32(T24, T25); \
939 T43 = _mm_hadd_epi32(T26, T27); \
940 T44 = _mm_hadd_epi32(T30, T31); \
941 T45 = _mm_hadd_epi32(T32, T33); \
942 T46 = _mm_hadd_epi32(T34, T35); \
943 T47 = _mm_hadd_epi32(T36, T37); \
945 T50 = _mm_hadd_epi32(T40, T41); \
946 T51 = _mm_hadd_epi32(T42, T43); \
947 T52 = _mm_hadd_epi32(T44, T45); \
948 T53 = _mm_hadd_epi32(T46, T47); \
950 T50 = _mm_hadd_epi32(T50, T51); \
951 T51 = _mm_hadd_epi32(T52, T53); \
952 T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
953 T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
954 T60 = _mm_packs_epi32(T50, T51); \
955 im[(dstPos)][i] = T60;
978 for (i
= 0; i
< 32 / 4; i
++)
980 // OPT_ME: to avoid register spill, I use matrix multiply, have other way?
981 T00A
= im
[i
* 4 + 0][0]; // [07 06 05 04 03 02 01 00]
982 T00B
= im
[i
* 4 + 0][1]; // [15 14 13 12 11 10 09 08]
983 T00C
= im
[i
* 4 + 0][2]; // [23 22 21 20 19 18 17 16]
984 T00D
= im
[i
* 4 + 0][3]; // [31 30 29 28 27 26 25 24]
985 T01A
= im
[i
* 4 + 1][0];
986 T01B
= im
[i
* 4 + 1][1];
987 T01C
= im
[i
* 4 + 1][2];
988 T01D
= im
[i
* 4 + 1][3];
989 T02A
= im
[i
* 4 + 2][0];
990 T02B
= im
[i
* 4 + 2][1];
991 T02C
= im
[i
* 4 + 2][2];
992 T02D
= im
[i
* 4 + 2][3];
993 T03A
= im
[i
* 4 + 3][0];
994 T03B
= im
[i
* 4 + 3][1];
995 T03C
= im
[i
* 4 + 3][2];
996 T03D
= im
[i
* 4 + 3][3];
998 T00C
= _mm_shuffle_epi8(T00C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0])); // [16 17 18 19 20 21 22 23]
999 T00D
= _mm_shuffle_epi8(T00D
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0])); // [24 25 26 27 28 29 30 31]
1000 T01C
= _mm_shuffle_epi8(T01C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
1001 T01D
= _mm_shuffle_epi8(T01D
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
1002 T02C
= _mm_shuffle_epi8(T02C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
1003 T02D
= _mm_shuffle_epi8(T02D
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
1004 T03C
= _mm_shuffle_epi8(T03C
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
1005 T03D
= _mm_shuffle_epi8(T03D
, _mm_load_si128((__m128i
*)tab_dct_16_0
[0]));
1007 T10A
= _mm_unpacklo_epi16(T00A
, T00D
); // [28 03 29 02 30 01 31 00]
1008 T10B
= _mm_unpackhi_epi16(T00A
, T00D
); // [24 07 25 06 26 05 27 04]
1009 T00A
= _mm_unpacklo_epi16(T00B
, T00C
); // [20 11 21 10 22 09 23 08]
1010 T00B
= _mm_unpackhi_epi16(T00B
, T00C
); // [16 15 17 14 18 13 19 12]
1011 T11A
= _mm_unpacklo_epi16(T01A
, T01D
);
1012 T11B
= _mm_unpackhi_epi16(T01A
, T01D
);
1013 T01A
= _mm_unpacklo_epi16(T01B
, T01C
);
1014 T01B
= _mm_unpackhi_epi16(T01B
, T01C
);
1015 T12A
= _mm_unpacklo_epi16(T02A
, T02D
);
1016 T12B
= _mm_unpackhi_epi16(T02A
, T02D
);
1017 T02A
= _mm_unpacklo_epi16(T02B
, T02C
);
1018 T02B
= _mm_unpackhi_epi16(T02B
, T02C
);
1019 T13A
= _mm_unpacklo_epi16(T03A
, T03D
);
1020 T13B
= _mm_unpackhi_epi16(T03A
, T03D
);
1021 T03A
= _mm_unpacklo_epi16(T03B
, T03C
);
1022 T03B
= _mm_unpackhi_epi16(T03B
, T03C
);
1024 #define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
1025 T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1026 T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1027 T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1028 T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1029 T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1030 T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1031 T26 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1032 T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1033 T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1034 T31 = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1035 T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1036 T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1037 T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1038 T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1039 T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1040 T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1042 T60 = _mm_hadd_epi32(T20, T21); \
1043 T61 = _mm_hadd_epi32(T22, T23); \
1044 T62 = _mm_hadd_epi32(T24, T25); \
1045 T63 = _mm_hadd_epi32(T26, T27); \
1046 T64 = _mm_hadd_epi32(T30, T31); \
1047 T65 = _mm_hadd_epi32(T32, T33); \
1048 T66 = _mm_hadd_epi32(T34, T35); \
1049 T67 = _mm_hadd_epi32(T36, T37); \
1051 T60 = _mm_hadd_epi32(T60, T61); \
1052 T61 = _mm_hadd_epi32(T62, T63); \
1053 T62 = _mm_hadd_epi32(T64, T65); \
1054 T63 = _mm_hadd_epi32(T66, T67); \
1056 T60 = _mm_hadd_epi32(T60, T61); \
1057 T61 = _mm_hadd_epi32(T62, T63); \
1059 T60 = _mm_hadd_epi32(T60, T61); \
1061 T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
1062 T60 = _mm_packs_epi32(T60, T60); \
1063 _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
1065 MAKE_ODD(44, 44, 44, 44, 0);
1066 MAKE_ODD(45, 45, 45, 45, 16);
1067 MAKE_ODD(46, 47, 46, 47, 8);
1068 MAKE_ODD(48, 49, 48, 49, 24);
1070 MAKE_ODD(50, 51, 52, 53, 4);
1071 MAKE_ODD(54, 55, 56, 57, 12);
1072 MAKE_ODD(58, 59, 60, 61, 20);
1073 MAKE_ODD(62, 63, 64, 65, 28);
1075 MAKE_ODD(66, 67, 68, 69, 2);
1076 MAKE_ODD(70, 71, 72, 73, 6);
1077 MAKE_ODD(74, 75, 76, 77, 10);
1078 MAKE_ODD(78, 79, 80, 81, 14);
1080 MAKE_ODD(82, 83, 84, 85, 18);
1081 MAKE_ODD(86, 87, 88, 89, 22);
1082 MAKE_ODD(90, 91, 92, 93, 26);
1083 MAKE_ODD(94, 95, 96, 97, 30);
1085 MAKE_ODD(98, 99, 100, 101, 1);
1086 MAKE_ODD(102, 103, 104, 105, 3);
1087 MAKE_ODD(106, 107, 108, 109, 5);
1088 MAKE_ODD(110, 111, 112, 113, 7);
1089 MAKE_ODD(114, 115, 116, 117, 9);
1090 MAKE_ODD(118, 119, 120, 121, 11);
1091 MAKE_ODD(122, 123, 124, 125, 13);
1092 MAKE_ODD(126, 127, 128, 129, 15);
1093 MAKE_ODD(130, 131, 132, 133, 17);
1094 MAKE_ODD(134, 135, 136, 137, 19);
1095 MAKE_ODD(138, 139, 140, 141, 21);
1096 MAKE_ODD(142, 143, 144, 145, 23);
1097 MAKE_ODD(146, 147, 148, 149, 25);
1098 MAKE_ODD(150, 151, 152, 153, 27);
1099 MAKE_ODD(154, 155, 156, 157, 29);
1100 MAKE_ODD(158, 159, 160, 161, 31);
1105 #endif // if !HIGH_BIT_DEPTH
1108 void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives
&p
)
1110 /* Note: We have AVX2 assembly for these two functions, but since AVX2 is
1111 * still somewhat rare on end-user PCs we still compile and link these SSSE3
1112 * intrinsic SIMD functions */
1114 p
.dct
[DCT_16x16
] = dct16
;
1115 p
.dct
[DCT_32x32
] = dct32
;