// Extracted from deb_x265.git, commit bbb7858bdb51869f3b0725953a0a79390463a4f6
// Path: source/common/vec/dct-ssse3.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Min Chen <min.chen@multicorewareinc.com>
10 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11 * Nabajit Deka <nabajit@multicorewareinc.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 *
27 * This program is also available under a commercial proprietary license.
28 * For more information, contact us at license @ x265.com.
29 *****************************************************************************/
30
31 #include "common.h"
32 #include "primitives.h"
33 #include <xmmintrin.h> // SSE
34 #include <pmmintrin.h> // SSE3
35 #include <tmmintrin.h> // SSSE3
36
37 using namespace x265;
38
39 #if !HIGH_BIT_DEPTH
40 namespace {
// Shuffle mask and 8-point butterfly coefficient rows used by dct16 below.
//
// Row 0 is a byte-index mask for _mm_shuffle_epi8: each int16 entry packs two
// byte indices (little-endian), selecting lane order 0,7,3,4,1,6,2,5.
// Rows 1-4 hold the even-part coefficient pairs (64/83/36) and rows 5-8 the
// odd-part pairs (89/75/50/18), each laid out so one _mm_madd_epi16 computes
// two coefficient products per 32-bit lane.
// Rows 9-14 are sign-patterned variants of the same coefficients; they are
// not referenced by the code visible in this excerpt (presumably used by an
// 8-point DCT elsewhere in this file — confirm against the full source).
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
{
    { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },

    { 64, 64, 64, 64, 64, 64, 64, 64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83, 36, 83, 36, 83, 36, 83, 36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 },
    { 89, 18, 75, 50, 89, 18, 75, 50 },
    { 75, -50, -18, -89, 75, -50, -18, -89 },
    { 50, 75, -89, 18, 50, 75, -89, 18 },
    { 18, -89, -50, 75, 18, -89, -50, 75 },

    { 83, 83, -83, -83, 36, 36, -36, -36 },
    { 36, 36, -36, -36, -83, -83, 83, 83 },
    { 89, -89, 18, -18, 75, -75, 50, -50 },
    { 75, -75, -50, 50, -18, 18, -89, 89 },
    { 50, -50, 75, -75, -89, 89, 18, -18 },
    { 18, -18, -89, 89, -50, 50, 75, -75 },
};
61
// _mm_shuffle_epi8 lane-reorder masks for dct16. Each int16 entry packs two
// byte indices (little-endian); decoding them gives the 16-bit lane orders:
//   row 0: 7,6,5,4,3,2,1,0  — reverses the high half-row so lane j of the
//          "B" register holds column 15-j for the first add/sub butterfly
//   row 1: 0,7,3,4,1,6,2,5  — pairs lanes for the pass-1 hadd/hsub stages
//   row 2: 0,3,1,2,7,4,6,5  — pass-2 reorder of the low half-row
//   row 3: 7,4,6,5,0,3,1,2  — pass-2 reorder of the high half-row
ALIGN_VAR_32(static const int16_t, tab_dct_16_0[][8]) =
{
    { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 },  // 0
    { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },  // 1
    { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A },  // 2
    { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 },  // 3
};
69
// Coefficient rows for the 16-point DCT (dct16 below).
//   rows 0-7  : pass-1 odd-part rows; row k produces output frequency 2k+1
//               (see MAKE_ODD(0,1) ... MAKE_ODD(7,15) in dct16's first loop)
//   rows 8-13 : pass-2 rows for even output frequencies 4/12 (rows 8-9,
//               with the 83/36 pairs) and 2/6/10/14 (rows 10-13)
//   rows 14-29: pass-2 odd-part rows, generated in pairs by MAKE_COEF with
//               alternating signs so a single _mm_madd_epi16 performs the
//               +/- butterfly; consumed as tab/tab+1 by the second MAKE_ODD
ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) =
{
    { 90, 87, 80, 70, 57, 43, 25, 9 },               //  0
    { 87, 57, 9, -43, -80, -90, -70, -25 },          //  1
    { 80, 9, -70, -87, -25, 57, 90, 43 },            //  2
    { 70, -43, -87, 9, 90, 25, -80, -57 },           //  3
    { 57, -80, -25, 90, -9, -87, 43, 70 },           //  4
    { 43, -90, 57, 25, -87, 70, 9, -80 },            //  5
    { 25, -70, 90, -80, 43, 9, -57, 87 },            //  6
    { 9, -25, 43, -57, 70, -80, 87, -90 },           //  7
    { 83, 83, -83, -83, 36, 36, -36, -36 },          //  8
    { 36, 36, -36, -36, -83, -83, 83, 83 },          //  9
    { 89, 89, 18, 18, 75, 75, 50, 50 },              // 10
    { 75, 75, -50, -50, -18, -18, -89, -89 },        // 11
    { 50, 50, 75, 75, -89, -89, 18, 18 },            // 12
    { 18, 18, -89, -89, -50, -50, 75, 75 },          // 13

// Expands one set of 8 odd coefficients into two sign-alternated rows
// (lane order matches the pass-2 unpack/shuffle layout).
#define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \
    { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) \
    }, \
    { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) },

    MAKE_COEF(90, 87, 80, 70, 57, 43, 25, 9)
    MAKE_COEF(87, 57, 9, -43, -80, -90, -70, -25)
    MAKE_COEF(80, 9, -70, -87, -25, 57, 90, 43)
    MAKE_COEF(70, -43, -87, 9, 90, 25, -80, -57)
    MAKE_COEF(57, -80, -25, 90, -9, -87, 43, 70)
    MAKE_COEF(43, -90, 57, 25, -87, 70, 9, -80)
    MAKE_COEF(25, -70, 90, -80, 43, 9, -57, 87)
    MAKE_COEF(9, -25, 43, -57, 70, -80, 87, -90)
#undef MAKE_COEF
};
102
// 16x16 forward DCT, SSSE3 implementation (8-bit path: this translation
// unit is compiled only when !HIGH_BIT_DEPTH).
//
//   src    : input residual block, int16_t, 'stride' elements per row.
//            Each 8-element half-row must be 16-byte aligned
//            (_mm_load_si128 is used on &src[row * stride + 0/8]).
//   dst    : output coefficients, int32_t, dense 16x16 layout (row stride
//            16); written with unaligned stores.
//   stride : element stride between input rows.
//
// Two-pass partial-butterfly transform:
//   pass 1 (DCT1) transforms the rows and stores a TRANSPOSED int16
//   intermediate: tmp[k * 16 + i] = output frequency k of input row i,
//   rounded with (+4) >> 3;
//   pass 2 (DCT2) transforms the intermediate along the other dimension
//   into dst, rounded with (+512) >> 10.
void dct16(int16_t *src, int32_t *dst, intptr_t stride)
{
    // Rounding constants for the two pass shifts: (x + 4) >> 3 in pass 1,
    // (x + 512) >> 10 in pass 2.
    __m128i c_4 = _mm_set1_epi32(4);
    __m128i c_512 = _mm_set1_epi32(512);

    int i;

    // Transposed intermediate result of pass 1 (see function comment).
    ALIGN_VAR_32(int16_t, tmp[16 * 16]);

    __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
    __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
    __m128i T10, T11, T12, T13, T14, T15, T16, T17;
    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
    __m128i T30, T31, T32, T33, T34, T35, T36, T37;
    __m128i T40, T41, T42, T43, T44, T45, T46, T47;
    __m128i T50, T51, T52, T53;
    __m128i T60, T61, T62, T63, T64, T65, T66, T67;
    __m128i T70;

    // DCT1: row transform, 8 input rows per iteration.
    for (i = 0; i < 16; i += 8)
    {
        // Load each row in two halves: A = columns 0-7, B = columns 8-15.
        T00A = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
        T01A = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 0]);    // [17 16 15 14 13 12 11 10]
        T01B = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
        T02A = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 0]);    // [27 26 25 24 23 22 21 20]
        T02B = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
        T03A = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 0]);    // [37 36 35 34 33 32 31 30]
        T03B = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
        T04A = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 0]);    // [47 46 45 44 43 42 41 40]
        T04B = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 8]);    // [4F 4E 4D 4C 4B 4A 49 48]
        T05A = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 0]);    // [57 56 55 54 53 52 51 50]
        T05B = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 8]);    // [5F 5E 5D 5C 5B 5A 59 58]
        T06A = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 0]);    // [67 66 65 64 63 62 61 60]
        T06B = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 8]);    // [6F 6E 6D 6C 6B 6A 69 68]
        T07A = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 0]);    // [77 76 75 74 73 72 71 70]
        T07B = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 8]);    // [7F 7E 7D 7C 7B 7A 79 78]

        // Reverse the high half of each row (mask row 0) so lane j of B
        // holds column 15-j, lining up x[j] with x[15-j].
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        // First butterfly: even part E[j] = x[j] + x[15-j] ...
        T10 = _mm_add_epi16(T00A, T00B);
        T11 = _mm_add_epi16(T01A, T01B);
        T12 = _mm_add_epi16(T02A, T02B);
        T13 = _mm_add_epi16(T03A, T03B);
        T14 = _mm_add_epi16(T04A, T04B);
        T15 = _mm_add_epi16(T05A, T05B);
        T16 = _mm_add_epi16(T06A, T06B);
        T17 = _mm_add_epi16(T07A, T07B);

        // ... and odd part O[j] = x[j] - x[15-j].
        T20 = _mm_sub_epi16(T00A, T00B);
        T21 = _mm_sub_epi16(T01A, T01B);
        T22 = _mm_sub_epi16(T02A, T02B);
        T23 = _mm_sub_epi16(T03A, T03B);
        T24 = _mm_sub_epi16(T04A, T04B);
        T25 = _mm_sub_epi16(T05A, T05B);
        T26 = _mm_sub_epi16(T06A, T06B);
        T27 = _mm_sub_epi16(T07A, T07B);

        // Pair up even-part lanes (mask row 1: order 0,7,3,4,1,6,2,5) so the
        // hadd/hsub stages below form EE[] and EO[].
        T30 = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T31 = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T32 = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T33 = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T34 = _mm_shuffle_epi8(T14, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T35 = _mm_shuffle_epi8(T15, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T36 = _mm_shuffle_epi8(T16, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T37 = _mm_shuffle_epi8(T17, _mm_load_si128((__m128i*)tab_dct_16_0[1]));

        // Second butterfly: T40-T43 = EE terms, T44-T47 = EO terms.
        T40 = _mm_hadd_epi16(T30, T31);
        T41 = _mm_hadd_epi16(T32, T33);
        T42 = _mm_hadd_epi16(T34, T35);
        T43 = _mm_hadd_epi16(T36, T37);
        T44 = _mm_hsub_epi16(T30, T31);
        T45 = _mm_hsub_epi16(T32, T33);
        T46 = _mm_hsub_epi16(T34, T35);
        T47 = _mm_hsub_epi16(T36, T37);

        // Third butterfly on the EE terms: EEE (T50/T51) and EEO (T52/T53).
        T50 = _mm_hadd_epi16(T40, T41);
        T51 = _mm_hadd_epi16(T42, T43);
        T52 = _mm_hsub_epi16(T40, T41);
        T53 = _mm_hsub_epi16(T42, T43);

        // Output frequency 0: EEE * {64,64}, rounded (+4)>>3, packed to
        // int16 and stored transposed into tmp.
        T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);

        // Output frequency 8: EEE * {64,-64}.
        T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
        T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);

        // Output frequency 4: EEO * {83,36}.
        T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
        T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);

        // Output frequency 12: EEO * {36,-83}.
        T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
        T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);

        // Output frequencies 2/6/10/14: dot products of the EO terms with
        // tab_dct_8 rows 5-8, reduced per row via hadd_epi32.
        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);

// Odd output frequency 'dstPos': 8-term dot product of the odd part
// (T20-T27) with tab_dct_16_1 row 'tab', reduced by two hadd_epi32 levels,
// rounded (+4)>>3 and stored transposed into tmp.
#define MAKE_ODD(tab, dstPos) \
    T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T64 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T65 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T66 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T67 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T62 = _mm_hadd_epi32(T64, T65); \
    T63 = _mm_hadd_epi32(T66, T67); \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \
    T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \
    T70 = _mm_packs_epi32(T60, T61); \
    _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);

        MAKE_ODD(0, 1);
        MAKE_ODD(1, 3);
        MAKE_ODD(2, 5);
        MAKE_ODD(3, 7);
        MAKE_ODD(4, 9);
        MAKE_ODD(5, 11);
        MAKE_ODD(6, 13);
        MAKE_ODD(7, 15);
#undef MAKE_ODD
    }

    // DCT2: second-dimension transform of the transposed intermediate,
    // producing 4 output columns (of each dst row) per iteration.
    for (i = 0; i < 16; i += 4)
    {
        T00A = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
        T01A = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 0]);    // [17 16 15 14 13 12 11 10]
        T01B = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
        T02A = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 0]);    // [27 26 25 24 23 22 21 20]
        T02B = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
        T03A = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 0]);    // [37 36 35 34 33 32 31 30]
        T03B = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]

        // Reorder both halves (mask rows 2/3) so that after the 16-bit
        // unpack below each 32-bit lane holds a butterfly partner pair.
        T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));

        // Interleave the shuffled halves into madd-ready pair layout.
        T10 = _mm_unpacklo_epi16(T00A, T00B);
        T11 = _mm_unpackhi_epi16(T00A, T00B);
        T12 = _mm_unpacklo_epi16(T01A, T01B);
        T13 = _mm_unpackhi_epi16(T01A, T01B);
        T14 = _mm_unpacklo_epi16(T02A, T02B);
        T15 = _mm_unpackhi_epi16(T02A, T02B);
        T16 = _mm_unpacklo_epi16(T03A, T03B);
        T17 = _mm_unpackhi_epi16(T03A, T03B);

        // Output rows 0 and 8: {64,64} products, reduced to one value per
        // input column; the final hadd gives row 0, the hsub row 8.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_8[1]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hsub_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
        _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);

        // Output row 4: tab_dct_16_1 row 8 (83/36 pattern).
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[8]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);

        // Output row 12: tab_dct_16_1 row 9.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[9]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);

        // Output rows 2/6/10/14: tab_dct_16_1 rows 10-13; note the
        // sub_epi32 reduction (sign pattern lives partly in the reduction).
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[10]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);

        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[11]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);

        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[12]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);

        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[13]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);

// Odd output row 'dstPos': 16-term dot product using the sign-alternated
// coefficient pair tab_dct_16_1[tab] / [tab + 1], rounded (+512)>>10.
#define MAKE_ODD(tab, dstPos) \
    T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
    T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \
    T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
    T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
    T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
    \
    T30 = _mm_add_epi32(T20, T21); \
    T31 = _mm_add_epi32(T22, T23); \
    T32 = _mm_add_epi32(T24, T25); \
    T33 = _mm_add_epi32(T26, T27); \
    \
    T30 = _mm_hadd_epi32(T30, T31); \
    T31 = _mm_hadd_epi32(T32, T33); \
    \
    T40 = _mm_hadd_epi32(T30, T31); \
    T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);

        MAKE_ODD(14, 1);
        MAKE_ODD(16, 3);
        MAKE_ODD(18, 5);
        MAKE_ODD(20, 7);
        MAKE_ODD(22, 9);
        MAKE_ODD(24, 11);
        MAKE_ODD(26, 13);
        MAKE_ODD(28, 15);
#undef MAKE_ODD
    }
}
509
// _mm_shuffle_epi8 mask for the 32-point DCT (dct32, below). Each int16
// packs two byte indices; this row selects 16-bit lane order 7,0,4,3,6,1,5,2.
ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
{
    { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 },  // 0
};
514
// Coefficient rows for the 32-point DCT (dct32, below). The trailing
// "// N" comments give each row's logical index, which is what dct32's
// table lookups use. The MAKE_COEF8/MAKE_COEF16 macros emit rows with the
// lane orders the dct32 shuffle/unpack stages expect:
//   MAKE_COEF8  : one row, lane order a0,a7,a3,a4,a1,a6,a2,a5
//   first MAKE_COEF16 : two such rows covering 16 coefficients
//   second MAKE_COEF16: four rows of duplicated pairs (aN, aN)
//   third  MAKE_COEF16: four rows of sign-alternated pairs (aN, -aN)
ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) =
{
    { 89, -89, 18, -18, 75, -75, 50, -50 },          //  0
    { 75, -75, -50, 50, -18, 18, -89, 89 },          //  1
    { 50, -50, 75, -75, -89, 89, 18, -18 },          //  2
    { 18, -18, -89, 89, -50, 50, 75, -75 },          //  3

#define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \
    { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) \
    }, \

    MAKE_COEF8(90, 87, 80, 70, 57, 43, 25, 9)             //  4
    MAKE_COEF8(87, 57, 9, -43, -80, -90, -70, -25)        //  5
    MAKE_COEF8(80, 9, -70, -87, -25, 57, 90, 43)          //  6
    MAKE_COEF8(70, -43, -87, 9, 90, 25, -80, -57)         //  7
    MAKE_COEF8(57, -80, -25, 90, -9, -87, 43, 70)         //  8
    MAKE_COEF8(43, -90, 57, 25, -87, 70, 9, -80)          //  9
    MAKE_COEF8(25, -70, 90, -80, 43, 9, -57, 87)          // 10
    MAKE_COEF8(9, -25, 43, -57, 70, -80, 87, -90)         // 11
#undef MAKE_COEF8

#define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
    { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \
    { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) },

    MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 12
    MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) // 14
    MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) // 16
    MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31) // 18
    MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38) // 20
    MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) // 22
    MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) // 24
    MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61) // 26
    MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) // 28
    MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) // 30
    MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78) // 32
    MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82) // 34
    MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) // 36
    MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) // 38
    MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90) // 40
    MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90) // 42
#undef MAKE_COEF16

    {
        64, 64, 64, 64, 64, 64, 64, 64
    },  // 44

    { 64, 64, -64, -64, -64, -64, 64, 64 },          // 45

    { 83, 83, 36, 36, -36, -36, -83, -83 },          // 46
    { -83, -83, -36, -36, 36, 36, 83, 83 },          // 47

    { 36, 36, -83, -83, 83, 83, -36, -36 },          // 48
    { -36, -36, 83, 83, -83, -83, 36, 36 },          // 49

#define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
    { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \
    { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \
    { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \
    { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) },

    MAKE_COEF16(89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89)   // 50
    MAKE_COEF16(75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75)   // 54

    // TODO: convert the remaining explicit rows below (58-97) to the
    // MAKE_COEF16 duplicated-pair form above.
#undef MAKE_COEF16

    {
        50, 50, -89, -89, 18, 18, 75, 75
    },  // 58
    { -75, -75, -18, -18, 89, 89, -50, -50 },        // 59
    { -50, -50, 89, 89, -18, -18, -75, -75 },        // 60
    { 75, 75, 18, 18, -89, -89, 50, 50 },            // 61

    { 18, 18, -50, -50, 75, 75, -89, -89 },          // 62
    { 89, 89, -75, -75, 50, 50, -18, -18 },          // 63
    { -18, -18, 50, 50, -75, -75, 89, 89 },          // 64
    { -89, -89, 75, 75, -50, -50, 18, 18 },          // 65

    { 90, 90, 87, 87, 80, 80, 70, 70 },              // 66
    { 57, 57, 43, 43, 25, 25, 9, 9 },                // 67
    { -9, -9, -25, -25, -43, -43, -57, -57 },        // 68
    { -70, -70, -80, -80, -87, -87, -90, -90 },      // 69

    { 87, 87, 57, 57, 9, 9, -43, -43 },              // 70
    { -80, -80, -90, -90, -70, -70, -25, -25 },      // 71
    { 25, 25, 70, 70, 90, 90, 80, 80 },              // 72
    { 43, 43, -9, -9, -57, -57, -87, -87 },          // 73

    { 80, 80, 9, 9, -70, -70, -87, -87 },            // 74
    { -25, -25, 57, 57, 90, 90, 43, 43 },            // 75
    { -43, -43, -90, -90, -57, -57, 25, 25 },        // 76
    { 87, 87, 70, 70, -9, -9, -80, -80 },            // 77

    { 70, 70, -43, -43, -87, -87, 9, 9 },            // 78
    { 90, 90, 25, 25, -80, -80, -57, -57 },          // 79
    { 57, 57, 80, 80, -25, -25, -90, -90 },          // 80
    { -9, -9, 87, 87, 43, 43, -70, -70 },            // 81

    { 57, 57, -80, -80, -25, -25, 90, 90 },          // 82
    { -9, -9, -87, -87, 43, 43, 70, 70 },            // 83
    { -70, -70, -43, -43, 87, 87, 9, 9 },            // 84
    { -90, -90, 25, 25, 80, 80, -57, -57 },          // 85

    { 43, 43, -90, -90, 57, 57, 25, 25 },            // 86
    { -87, -87, 70, 70, 9, 9, -80, -80 },            // 87
    { 80, 80, -9, -9, -70, -70, 87, 87 },            // 88
    { -25, -25, -57, -57, 90, 90, -43, -43 },        // 89

    { 25, 25, -70, -70, 90, 90, -80, -80 },          // 90
    { 43, 43, 9, 9, -57, -57, 87, 87 },              // 91
    { -87, -87, 57, 57, -9, -9, -43, -43 },          // 92
    { 80, 80, -90, -90, 70, 70, -25, -25 },          // 93

    { 9, 9, -25, -25, 43, 43, -57, -57 },            // 94
    { 70, 70, -80, -80, 87, 87, -90, -90 },          // 95
    { 90, 90, -87, -87, 80, 80, -70, -70 },          // 96
    { 57, 57, -43, -43, 25, 25, -9, -9 },            // 97

#define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
    { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \
    { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \
    { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \
    { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) },

    MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 98
    MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) //102
    MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) //106
    MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, +82, 88, 54, -4, -61, -90, -78, -31) //110
    MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, +31, -46, -90, -67, 4, 73, 88, 38) //114
    MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) //118
    MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) //122
    MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, +90, 13, -88, -31, 82, 46, -73, -61) //126
    MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) //130
    MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) //134
    MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, +22, 67, -85, 13, 73, -82, 4, 78) //138
    MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, +85, -78, 13, 61, -90, 54, 22, -82) //142
    MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) //146
    MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) //150
    MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, +54, -31, 4, 22, -46, 67, -82, 90) //154
    MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, +67, -73, 78, -82, 85, -88, 90, -90) //158

#undef MAKE_COEF16
};
659
// 32x32 forward DCT, 8-bit input path, SSSE3 partial-butterfly implementation.
// Pass 1 (DCT1) transforms the rows of the 32x32 residual block (int16_t
// samples, 'stride' elements between rows) into the intermediate buffer
// im[][] using a (x + 8) >> 4 rounding shift; pass 2 (DCT2) transforms the
// columns and writes the final int32_t coefficients to dst with a
// (x + 1024) >> 11 rounding shift.
// NOTE(review): the 4/11 shift pair matches the HEVC 32-point forward
// transform for 8-bit depth — confirm if this file is ever built for other
// bit depths. 'src' rows are read with aligned loads (_mm_load_si128), so
// the caller must supply a 16-byte-aligned buffer and stride.
void dct32(int16_t *src, int32_t *dst, intptr_t stride)
{
    // Const: rounding terms, 8 = 1 << (4-1) for pass 1, 1024 = 1 << (11-1) for pass 2
    __m128i c_8 = _mm_set1_epi32(8);
    __m128i c_1024 = _mm_set1_epi32(1024);

    int i;

    // Temporaries: Tnn{A,B,C,D} hold one 32-sample row as four 8x16-bit
    // vectors; T1x/T2x hold butterfly sums/differences; T3x-T6x hold the
    // 32-bit multiply-accumulate pipeline.
    __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
    __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
    __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C;
    __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D;
    __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A;
    __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B;
    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
    __m128i T30, T31, T32, T33, T34, T35, T36, T37;
    __m128i T40, T41, T42, T43, T44, T45, T46, T47;
    __m128i T50, T51, T52, T53;
    __m128i T60, T61, T62, T63, T64, T65, T66, T67;
    // im[row][quarter]: pass-1 output, 32 rows of 32 int16 coefficients,
    // each row split into four __m128i lanes.
    __m128i im[32][4];

    // DCT1: row transform, processed 8 source rows per iteration
    for (i = 0; i < 32 / 8; i++)
    {
        // Load 8 full rows (4 vectors of 8 int16 each per row)
        T00A = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 8]);    // [15 14 13 12 11 10 09 08]
        T00C = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 16]);   // [23 22 21 20 19 18 17 16]
        T00D = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 24]);   // [31 30 29 28 27 26 25 24]
        T01A = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 0]);
        T01B = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 8]);
        T01C = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 16]);
        T01D = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 24]);
        T02A = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 0]);
        T02B = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 8]);
        T02C = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 16]);
        T02D = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 24]);
        T03A = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 0]);
        T03B = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 8]);
        T03C = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 16]);
        T03D = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 24]);
        T04A = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 0]);
        T04B = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 8]);
        T04C = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 16]);
        T04D = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 24]);
        T05A = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 0]);
        T05B = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 8]);
        T05C = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 16]);
        T05D = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 24]);
        T06A = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 0]);
        T06B = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 8]);
        T06C = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 16]);
        T06D = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 24]);
        T07A = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 0]);
        T07B = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 8]);
        T07C = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 16]);
        T07D = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 24]);

        // Byte-shuffle each row quarter so that the samples that are paired
        // by the butterfly (x[k] with x[31-k]) land in matching lanes
        T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [05 02 06 01 04 03 07 00]
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [10 13 09 14 11 12 08 15]
        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [21 18 22 17 20 19 23 16]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [26 29 25 30 27 28 24 31]
        T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));

        // Butterfly stage 1: E[k] = x[k] + x[31-k] (feeds the even outputs)
        T10A = _mm_add_epi16(T00A, T00D);   // [E05 E02 E06 E01 E04 E03 E07 E00]
        T10B = _mm_add_epi16(T00B, T00C);   // [E10 E13 E09 E14 E11 E12 E08 E15]
        T11A = _mm_add_epi16(T01A, T01D);
        T11B = _mm_add_epi16(T01B, T01C);
        T12A = _mm_add_epi16(T02A, T02D);
        T12B = _mm_add_epi16(T02B, T02C);
        T13A = _mm_add_epi16(T03A, T03D);
        T13B = _mm_add_epi16(T03B, T03C);
        T14A = _mm_add_epi16(T04A, T04D);
        T14B = _mm_add_epi16(T04B, T04C);
        T15A = _mm_add_epi16(T05A, T05D);
        T15B = _mm_add_epi16(T05B, T05C);
        T16A = _mm_add_epi16(T06A, T06D);
        T16B = _mm_add_epi16(T06B, T06C);
        T17A = _mm_add_epi16(T07A, T07D);
        T17B = _mm_add_epi16(T07B, T07C);

        // Butterfly stage 1: O[k] = x[k] - x[31-k] (feeds the odd outputs)
        T00A = _mm_sub_epi16(T00A, T00D);   // [O05 O02 O06 O01 O04 O03 O07 O00]
        T00B = _mm_sub_epi16(T00B, T00C);   // [O10 O13 O09 O14 O11 O12 O08 O15]
        T01A = _mm_sub_epi16(T01A, T01D);
        T01B = _mm_sub_epi16(T01B, T01C);
        T02A = _mm_sub_epi16(T02A, T02D);
        T02B = _mm_sub_epi16(T02B, T02C);
        T03A = _mm_sub_epi16(T03A, T03D);
        T03B = _mm_sub_epi16(T03B, T03C);
        T04A = _mm_sub_epi16(T04A, T04D);
        T04B = _mm_sub_epi16(T04B, T04C);
        T05A = _mm_sub_epi16(T05A, T05D);
        T05B = _mm_sub_epi16(T05B, T05C);
        T06A = _mm_sub_epi16(T06A, T06D);
        T06B = _mm_sub_epi16(T06B, T06C);
        T07A = _mm_sub_epi16(T07A, T07D);
        T07B = _mm_sub_epi16(T07B, T07C);

        // Butterfly stage 2: EE[k] = E[k] + E[15-k]
        T20 = _mm_add_epi16(T10A, T10B);    // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0]
        T21 = _mm_add_epi16(T11A, T11B);
        T22 = _mm_add_epi16(T12A, T12B);
        T23 = _mm_add_epi16(T13A, T13B);
        T24 = _mm_add_epi16(T14A, T14B);
        T25 = _mm_add_epi16(T15A, T15B);
        T26 = _mm_add_epi16(T16A, T16B);
        T27 = _mm_add_epi16(T17A, T17B);

        // Rows 0 and 16: all-64 coefficient row (tab_dct_8[1] = {64,...})
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        // Horizontal add -> output row 0; horizontal sub -> output row 16
        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[0][i] = T60;

        T50 = _mm_hsub_epi32(T40, T41);
        T51 = _mm_hsub_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[16][i] = T60;

        // Output row 8 (EE path, coefficient row tab_dct_16_1[8])
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[8][i] = T60;

        // Output row 24 (EE path, coefficient row tab_dct_16_1[9])
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[9]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[24][i] = T60;

        // Generic 8-wide madd/reduce for one pass-1 output row: multiplies the
        // current T2x vectors by coefficient row tab_dct_32_1[tab], reduces,
        // rounds ((x + 8) >> 4) and stores into im[dstPos].
#define MAKE_ODD(tab, dstPos) \
    T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    \
    T40 = _mm_hadd_epi32(T30, T31); \
    T41 = _mm_hadd_epi32(T32, T33); \
    T42 = _mm_hadd_epi32(T34, T35); \
    T43 = _mm_hadd_epi32(T36, T37); \
    \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        // EE path: rows 4, 12, 20, 28
        MAKE_ODD(0, 4);
        MAKE_ODD(1, 12);
        MAKE_ODD(2, 20);
        MAKE_ODD(3, 28);

        // Butterfly stage 2: EO[k] = E[k] - E[15-k]
        T20 = _mm_sub_epi16(T10A, T10B);    // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0]
        T21 = _mm_sub_epi16(T11A, T11B);
        T22 = _mm_sub_epi16(T12A, T12B);
        T23 = _mm_sub_epi16(T13A, T13B);
        T24 = _mm_sub_epi16(T14A, T14B);
        T25 = _mm_sub_epi16(T15A, T15B);
        T26 = _mm_sub_epi16(T16A, T16B);
        T27 = _mm_sub_epi16(T17A, T17B);

        // EO path: rows 2, 6, 10, 14, 18, 22, 26, 30
        MAKE_ODD(4, 2);
        MAKE_ODD(5, 6);
        MAKE_ODD(6, 10);
        MAKE_ODD(7, 14);
        MAKE_ODD(8, 18);
        MAKE_ODD(9, 22);
        MAKE_ODD(10, 26);
        MAKE_ODD(11, 30);
#undef MAKE_ODD

        // Odd output rows use the full 16-element O[] vector, so each row
        // needs two coefficient vectors (tab and tab+1) and a deeper reduce.
#define MAKE_ODD(tab, dstPos) \
    T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T32 = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    \
    T40 = _mm_hadd_epi32(T20, T21); \
    T41 = _mm_hadd_epi32(T22, T23); \
    T42 = _mm_hadd_epi32(T24, T25); \
    T43 = _mm_hadd_epi32(T26, T27); \
    T44 = _mm_hadd_epi32(T30, T31); \
    T45 = _mm_hadd_epi32(T32, T33); \
    T46 = _mm_hadd_epi32(T34, T35); \
    T47 = _mm_hadd_epi32(T36, T37); \
    \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T52 = _mm_hadd_epi32(T44, T45); \
    T53 = _mm_hadd_epi32(T46, T47); \
    \
    T50 = _mm_hadd_epi32(T50, T51); \
    T51 = _mm_hadd_epi32(T52, T53); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        // Odd rows 1, 3, 5, ..., 31
        MAKE_ODD(12, 1);
        MAKE_ODD(14, 3);
        MAKE_ODD(16, 5);
        MAKE_ODD(18, 7);
        MAKE_ODD(20, 9);
        MAKE_ODD(22, 11);
        MAKE_ODD(24, 13);
        MAKE_ODD(26, 15);
        MAKE_ODD(28, 17);
        MAKE_ODD(30, 19);
        MAKE_ODD(32, 21);
        MAKE_ODD(34, 23);
        MAKE_ODD(36, 25);
        MAKE_ODD(38, 27);
        MAKE_ODD(40, 29);
        MAKE_ODD(42, 31);

#undef MAKE_ODD
    }

    // DCT2: column transform on im[][], producing 4 coefficients per
    // destination row per iteration (full matrix multiply rather than a
    // butterfly, to keep register pressure down)
    for (i = 0; i < 32 / 4; i++)
    {
        // OPT_ME: to avoid register spill, I use matrix multiply, have other way?
        T00A = im[i * 4 + 0][0];    // [07 06 05 04 03 02 01 00]
        T00B = im[i * 4 + 0][1];    // [15 14 13 12 11 10 09 08]
        T00C = im[i * 4 + 0][2];    // [23 22 21 20 19 18 17 16]
        T00D = im[i * 4 + 0][3];    // [31 30 29 28 27 26 25 24]
        T01A = im[i * 4 + 1][0];
        T01B = im[i * 4 + 1][1];
        T01C = im[i * 4 + 1][2];
        T01D = im[i * 4 + 1][3];
        T02A = im[i * 4 + 2][0];
        T02B = im[i * 4 + 2][1];
        T02C = im[i * 4 + 2][2];
        T02D = im[i * 4 + 2][3];
        T03A = im[i * 4 + 3][0];
        T03B = im[i * 4 + 3][1];
        T03C = im[i * 4 + 3][2];
        T03D = im[i * 4 + 3][3];

        // Reverse the upper halves so unpack pairs element k with element 31-k
        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [16 17 18 19 20 21 22 23]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [24 25 26 27 28 29 30 31]
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        // Interleave so each vector holds symmetric (k, 31-k) pairs
        T10A = _mm_unpacklo_epi16(T00A, T00D);    // [28 03 29 02 30 01 31 00]
        T10B = _mm_unpackhi_epi16(T00A, T00D);    // [24 07 25 06 26 05 27 04]
        T00A = _mm_unpacklo_epi16(T00B, T00C);    // [20 11 21 10 22 09 23 08]
        T00B = _mm_unpackhi_epi16(T00B, T00C);    // [16 15 17 14 18 13 19 12]
        T11A = _mm_unpacklo_epi16(T01A, T01D);
        T11B = _mm_unpackhi_epi16(T01A, T01D);
        T01A = _mm_unpacklo_epi16(T01B, T01C);
        T01B = _mm_unpackhi_epi16(T01B, T01C);
        T12A = _mm_unpacklo_epi16(T02A, T02D);
        T12B = _mm_unpackhi_epi16(T02A, T02D);
        T02A = _mm_unpacklo_epi16(T02B, T02C);
        T02B = _mm_unpackhi_epi16(T02B, T02C);
        T13A = _mm_unpacklo_epi16(T03A, T03D);
        T13B = _mm_unpackhi_epi16(T03A, T03D);
        T03A = _mm_unpacklo_epi16(T03B, T03C);
        T03B = _mm_unpackhi_epi16(T03B, T03C);

        // One full 32-tap dot product per destination row: four coefficient
        // vectors (tab0..tab3) cover the 32 columns; results for the four
        // current rows are reduced, rounded ((x + 1024) >> 11) and stored.
#define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
    T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
    T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
    T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
    T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
    T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
    T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
    T26 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
    T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
    T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
    T31 = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
    T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
    T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
    T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
    T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
    T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
    T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
    \
    T60 = _mm_hadd_epi32(T20, T21); \
    T61 = _mm_hadd_epi32(T22, T23); \
    T62 = _mm_hadd_epi32(T24, T25); \
    T63 = _mm_hadd_epi32(T26, T27); \
    T64 = _mm_hadd_epi32(T30, T31); \
    T65 = _mm_hadd_epi32(T32, T33); \
    T66 = _mm_hadd_epi32(T34, T35); \
    T67 = _mm_hadd_epi32(T36, T37); \
    \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T62 = _mm_hadd_epi32(T64, T65); \
    T63 = _mm_hadd_epi32(T66, T67); \
    \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    \
    T60 = _mm_hadd_epi32(T60, T61); \
    \
    T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \

        MAKE_ODD(44, 44, 44, 44, 0);
        MAKE_ODD(45, 45, 45, 45, 16);
        MAKE_ODD(46, 47, 46, 47, 8);
        MAKE_ODD(48, 49, 48, 49, 24);

        MAKE_ODD(50, 51, 52, 53, 4);
        MAKE_ODD(54, 55, 56, 57, 12);
        MAKE_ODD(58, 59, 60, 61, 20);
        MAKE_ODD(62, 63, 64, 65, 28);

        MAKE_ODD(66, 67, 68, 69, 2);
        MAKE_ODD(70, 71, 72, 73, 6);
        MAKE_ODD(74, 75, 76, 77, 10);
        MAKE_ODD(78, 79, 80, 81, 14);

        MAKE_ODD(82, 83, 84, 85, 18);
        MAKE_ODD(86, 87, 88, 89, 22);
        MAKE_ODD(90, 91, 92, 93, 26);
        MAKE_ODD(94, 95, 96, 97, 30);

        MAKE_ODD(98, 99, 100, 101, 1);
        MAKE_ODD(102, 103, 104, 105, 3);
        MAKE_ODD(106, 107, 108, 109, 5);
        MAKE_ODD(110, 111, 112, 113, 7);
        MAKE_ODD(114, 115, 116, 117, 9);
        MAKE_ODD(118, 119, 120, 121, 11);
        MAKE_ODD(122, 123, 124, 125, 13);
        MAKE_ODD(126, 127, 128, 129, 15);
        MAKE_ODD(130, 131, 132, 133, 17);
        MAKE_ODD(134, 135, 136, 137, 19);
        MAKE_ODD(138, 139, 140, 141, 21);
        MAKE_ODD(142, 143, 144, 145, 23);
        MAKE_ODD(146, 147, 148, 149, 25);
        MAKE_ODD(150, 151, 152, 153, 27);
        MAKE_ODD(154, 155, 156, 157, 29);
        MAKE_ODD(158, 159, 160, 161, 31);
#undef MAKE_ODD
    }
}
1094 }
1095 #endif // if !HIGH_BIT_DEPTH
1096
1097 namespace x265 {
1098 void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives &p)
1099 {
1100 /* Note: We have AVX2 assembly for these two functions, but since AVX2 is
1101 * still somewhat rare on end-user PCs we still compile and link these SSSE3
1102 * intrinsic SIMD functions */
1103 #if !HIGH_BIT_DEPTH
1104 p.dct[DCT_16x16] = dct16;
1105 p.dct[DCT_32x32] = dct32;
1106 #endif
1107 }
1108 }