Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * Mandar Gurav <mandar@multicorewareinc.com> | |
6 | * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com> | |
7 | * Mahesh Pittala <mahesh@multicorewareinc.com> | |
8 | * Rajesh Paulraj <rajesh@multicorewareinc.com> | |
9 | * Min Chen <min.chen@multicorewareinc.com> | |
10 | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
11 | * Nabajit Deka <nabajit@multicorewareinc.com> | |
12 | * | |
13 | * This program is free software; you can redistribute it and/or modify | |
14 | * it under the terms of the GNU General Public License as published by | |
15 | * the Free Software Foundation; either version 2 of the License, or | |
16 | * (at your option) any later version. | |
17 | * | |
18 | * This program is distributed in the hope that it will be useful, | |
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | * GNU General Public License for more details. | |
22 | * | |
23 | * You should have received a copy of the GNU General Public License | |
24 | * along with this program; if not, write to the Free Software | |
25 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
26 | * | |
27 | * This program is also available under a commercial proprietary license. | |
28 | * For more information, contact us at license @ x265.com. | |
29 | *****************************************************************************/ | |
30 | ||
31 | #include "common.h" | |
32 | #include "primitives.h" | |
33 | #include <xmmintrin.h> // SSE | |
34 | #include <pmmintrin.h> // SSE3 | |
35 | #include <tmmintrin.h> // SSSE3 | |
36 | ||
37 | using namespace x265; | |
38 | ||
39 | #if !HIGH_BIT_DEPTH | |
40 | namespace { | |
// Lookup tables shared by the forward-DCT kernels below.
// Row 0 is a byte-shuffle control pattern for _mm_shuffle_epi8 (each int16_t
// encodes two source-byte indices); the remaining rows are signed transform
// coefficients laid out in pairs for _mm_madd_epi16.  Row order and element
// order are load-bearing: dct16() indexes this table by row number, so do
// not reorder entries.
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
{
    { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },

    // Even-part coefficients (rows 1-4) ...
    { 64, 64, 64, 64, 64, 64, 64, 64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83, 36, 83, 36, 83, 36, 83, 36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 },
    // ... and odd-part coefficients (rows 5-8), replicated per 64-bit lane.
    { 89, 18, 75, 50, 89, 18, 75, 50 },
    { 75, -50, -18, -89, 75, -50, -18, -89 },
    { 50, 75, -89, 18, 50, 75, -89, 18 },
    { 18, -89, -50, 75, 18, -89, -50, 75 },

    // Rows 9-14: same coefficient families pre-arranged with alternating
    // signs for the hadd/hsub-based second stage of dct16().
    { 83, 83, -83, -83, 36, 36, -36, -36 },
    { 36, 36, -36, -36, -83, -83, 83, 83 },
    { 89, -89, 18, -18, 75, -75, 50, -50 },
    { 75, -75, -50, 50, -18, 18, -89, 89 },
    { 50, -50, 75, -75, -89, 89, 18, -18 },
    { 18, -18, -89, 89, -50, 50, 75, -75 },
};
61 | ||
// Byte-shuffle control patterns for _mm_shuffle_epi8, used by dct16() to
// reorder the eight 16-bit words of a register.  Each int16_t holds the two
// byte indices of one destination word:
//   row 0: reverses word order (word k <- word 7-k),
//   row 1: interleaves mirrored words as [0 7 3 4 1 6 2 5],
//   rows 2/3: complementary orders used before unpacklo/unpackhi in pass 2
//             so that each 32-bit lane pairs element k with element 15-k.
ALIGN_VAR_32(static const int16_t, tab_dct_16_0[][8]) =
{
    { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 },  // 0
    { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },  // 1
    { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A },  // 2
    { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 },  // 3
};
69 | ||
// Coefficient rows for the 16-point forward DCT, arranged for _mm_madd_epi16.
// Rows 0-7 feed the odd-output stage of pass 1 (see MAKE_ODD in dct16's
// first loop); rows 8-13 feed the even outputs of pass 2; rows 14+ are
// generated by MAKE_COEF as sign-alternating pairs for the odd outputs of
// pass 2 (two table rows per coefficient line, indexed as tab and tab+1).
// dct16() indexes this table by absolute row number — do not reorder.
ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) =
{
    { 90, 87, 80, 70, 57, 43, 25, 9 },           //  0
    { 87, 57, 9, -43, -80, -90, -70, -25 },      //  1
    { 80, 9, -70, -87, -25, 57, 90, 43 },        //  2
    { 70, -43, -87, 9, 90, 25, -80, -57 },       //  3
    { 57, -80, -25, 90, -9, -87, 43, 70 },       //  4
    { 43, -90, 57, 25, -87, 70, 9, -80 },        //  5
    { 25, -70, 90, -80, 43, 9, -57, 87 },        //  6
    { 9, -25, 43, -57, 70, -80, 87, -90 },       //  7
    { 83, 83, -83, -83, 36, 36, -36, -36 },      //  8
    { 36, 36, -36, -36, -83, -83, 83, 83 },      //  9
    { 89, 89, 18, 18, 75, 75, 50, 50 },          // 10
    { 75, 75, -50, -50, -18, -18, -89, -89 },    // 11
    { 50, 50, 75, 75, -89, -89, 18, 18 },        // 12
    { 18, 18, -89, -89, -50, -50, 75, 75 },      // 13

// Expands one line of eight coefficients into TWO table rows of
// (value, -value) pairs, permuted to match the word order produced by
// tab_dct_16_0 shuffles in dct16's second pass.
#define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \
    { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) \
    }, \
    { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) },

    MAKE_COEF(90, 87, 80, 70, 57, 43, 25, 9)        // rows 14-15
    MAKE_COEF(87, 57, 9, -43, -80, -90, -70, -25)   // rows 16-17
    MAKE_COEF(80, 9, -70, -87, -25, 57, 90, 43)     // rows 18-19
    MAKE_COEF(70, -43, -87, 9, 90, 25, -80, -57)    // rows 20-21
    MAKE_COEF(57, -80, -25, 90, -9, -87, 43, 70)    // rows 22-23
    MAKE_COEF(43, -90, 57, 25, -87, 70, 9, -80)     // rows 24-25
    MAKE_COEF(25, -70, 90, -80, 43, 9, -57, 87)     // rows 26-27
    MAKE_COEF(9, -25, 43, -57, 70, -80, 87, -90)    // rows 28-29
#undef MAKE_COEF
};
102 | ||
// 16x16 forward DCT, SSSE3 path (8-bit pixel build only; this file is
// compiled under #if !HIGH_BIT_DEPTH).
//
// Two separable passes:
//   pass 1 ("DCT1") transforms the 16 input rows of `src` (row pitch
//     `stride` int16_t elements) and stores the result TRANSPOSED into
//     `tmp`: tmp[f * 16 + j] holds coefficient f of input row j.
//     Results are rounded with +4 and shifted right by 3.
//   pass 2 ("DCT2") transforms the rows of `tmp` and stores coefficient f
//     of column group i into dst[f * 16 + i], rounding with +512 and
//     shifting right by 10.
// The 4/3 and 512/10 pairs match the 8-bit HEVC 16x16 forward-transform
// scaling (offset = 1 << (shift - 1)).
//
// `src`, `dst` and `tmp` are accessed with aligned 128-bit loads/stores,
// so src/dst must be 16-byte aligned and stride a multiple of 8.
void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
    // Const: rounding offsets for the two passes.
    __m128i c_4 = _mm_set1_epi32(4);
    __m128i c_512 = _mm_set1_epi32(512);

    int i;

    // Transposed intermediate result of pass 1.
    ALIGN_VAR_32(int16_t, tmp[16 * 16]);

    __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
    __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
    __m128i T10, T11, T12, T13, T14, T15, T16, T17;
    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
    __m128i T30, T31, T32, T33, T34, T35, T36, T37;
    __m128i T40, T41, T42, T43, T44, T45, T46, T47;
    __m128i T50, T51, T52, T53;
    __m128i T60, T61, T62, T63, T64, T65, T66, T67;
    __m128i T70;

    // DCT1: process 8 input rows per iteration.
    for (i = 0; i < 16; i += 8)
    {
        // Load 8 rows; A = columns 0-7, B = columns 8-15.
        T00A = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
        T01A = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 0]);    // [17 16 15 14 13 12 11 10]
        T01B = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
        T02A = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 0]);    // [27 26 25 24 23 22 21 20]
        T02B = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
        T03A = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 0]);    // [37 36 35 34 33 32 31 30]
        T03B = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
        T04A = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 0]);    // [47 46 45 44 43 42 41 40]
        T04B = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 8]);    // [4F 4E 4D 4C 4B 4A 49 48]
        T05A = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 0]);    // [57 56 55 54 53 52 51 50]
        T05B = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 8]);    // [5F 5E 5D 5C 5B 5A 59 58]
        T06A = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 0]);    // [67 66 65 64 63 62 61 60]
        T06B = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 8]);    // [6F 6E 6D 6C 6B 6A 69 68]
        T07A = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 0]);    // [77 76 75 74 73 72 71 70]
        T07B = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 8]);    // [7F 7E 7D 7C 7B 7A 79 78]

        // Reverse the word order of each upper half so element k of B lines
        // up with element 15-k of the row.
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        // Even part: src[k] + src[15-k] per row.
        T10 = _mm_add_epi16(T00A, T00B);
        T11 = _mm_add_epi16(T01A, T01B);
        T12 = _mm_add_epi16(T02A, T02B);
        T13 = _mm_add_epi16(T03A, T03B);
        T14 = _mm_add_epi16(T04A, T04B);
        T15 = _mm_add_epi16(T05A, T05B);
        T16 = _mm_add_epi16(T06A, T06B);
        T17 = _mm_add_epi16(T07A, T07B);

        // Odd part: src[k] - src[15-k] per row (consumed by MAKE_ODD below).
        T20 = _mm_sub_epi16(T00A, T00B);
        T21 = _mm_sub_epi16(T01A, T01B);
        T22 = _mm_sub_epi16(T02A, T02B);
        T23 = _mm_sub_epi16(T03A, T03B);
        T24 = _mm_sub_epi16(T04A, T04B);
        T25 = _mm_sub_epi16(T05A, T05B);
        T26 = _mm_sub_epi16(T06A, T06B);
        T27 = _mm_sub_epi16(T07A, T07B);

        // Pair mirrored elements of the even part so the following
        // hadd/hsub steps form the 8-point butterfly.
        T30 = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T31 = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T32 = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T33 = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T34 = _mm_shuffle_epi8(T14, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T35 = _mm_shuffle_epi8(T15, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T36 = _mm_shuffle_epi8(T16, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T37 = _mm_shuffle_epi8(T17, _mm_load_si128((__m128i*)tab_dct_16_0[1]));

        T40 = _mm_hadd_epi16(T30, T31);
        T41 = _mm_hadd_epi16(T32, T33);
        T42 = _mm_hadd_epi16(T34, T35);
        T43 = _mm_hadd_epi16(T36, T37);
        T44 = _mm_hsub_epi16(T30, T31);
        T45 = _mm_hsub_epi16(T32, T33);
        T46 = _mm_hsub_epi16(T34, T35);
        T47 = _mm_hsub_epi16(T36, T37);

        T50 = _mm_hadd_epi16(T40, T41);
        T51 = _mm_hadd_epi16(T42, T43);
        T52 = _mm_hsub_epi16(T40, T41);
        T53 = _mm_hsub_epi16(T42, T43);

        // Even outputs 0, 8, 4, 12.  Each result vector packs the
        // coefficient for rows i..i+7, stored transposed into tmp.
        T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);

        T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
        T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);

        T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
        T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);

        T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
        T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);

        // Even outputs 2, 6, 10, 14 from the T44..T47 half-differences.
        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);

// Odd output `dstPos` of pass 1: dot-product of the odd part (T20..T27)
// with coefficient row `tab` of tab_dct_16_1, then round (+4 >> 3), pack
// and store transposed into tmp.
#define MAKE_ODD(tab, dstPos) \
    T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T64 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T65 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T66 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T67 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T60 = _mm_hadd_epi32(T60, T61);                                                 \
    T61 = _mm_hadd_epi32(T62, T63);                                                 \
    T62 = _mm_hadd_epi32(T64, T65);                                                 \
    T63 = _mm_hadd_epi32(T66, T67);                                                 \
    T60 = _mm_hadd_epi32(T60, T61);                                                 \
    T61 = _mm_hadd_epi32(T62, T63);                                                 \
    T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);                               \
    T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);                               \
    T70 = _mm_packs_epi32(T60, T61);                                                \
    _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);

        MAKE_ODD(0, 1);
        MAKE_ODD(1, 3);
        MAKE_ODD(2, 5);
        MAKE_ODD(3, 7);
        MAKE_ODD(4, 9);
        MAKE_ODD(5, 11);
        MAKE_ODD(6, 13);
        MAKE_ODD(7, 15);
#undef MAKE_ODD
    }

    // DCT2: process 4 rows of the transposed intermediate per iteration.
    for (i = 0; i < 16; i += 4)
    {
        T00A = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
        T01A = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 0]);    // [17 16 15 14 13 12 11 10]
        T01B = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
        T02A = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 0]);    // [27 26 25 24 23 22 21 20]
        T02B = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
        T03A = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 0]);    // [37 36 35 34 33 32 31 30]
        T03B = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]

        // Reorder halves so the unpack steps below put mirrored pairs
        // (t[k], t[15-k]) into each 32-bit lane for _mm_madd_epi16.
        T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));

        T10 = _mm_unpacklo_epi16(T00A, T00B);
        T11 = _mm_unpackhi_epi16(T00A, T00B);
        T12 = _mm_unpacklo_epi16(T01A, T01B);
        T13 = _mm_unpackhi_epi16(T01A, T01B);
        T14 = _mm_unpacklo_epi16(T02A, T02B);
        T15 = _mm_unpackhi_epi16(T02A, T02B);
        T16 = _mm_unpacklo_epi16(T03A, T03B);
        T17 = _mm_unpackhi_epi16(T03A, T03B);

        // Outputs 0 and 8: multiply-accumulate with the all-64 row, then
        // hadd (output 0) / hsub (output 8), round (+512 >> 10).
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_8[1]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hsub_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        T41 = _mm_packs_epi32(T41, T41);
        _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
        _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);

        // Output 4.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[8]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);

        // Output 12.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[9]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);

        // Output 2 (note: _mm_sub_epi32 combines the lane pairs here).
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[10]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);

        // Output 6.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[11]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);

        // Output 10.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[12]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);

        // Output 14.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[13]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);

// Odd output `dstPos` of pass 2: dot-product with the coefficient row pair
// (tab, tab + 1) of tab_dct_16_1, round (+512 >> 10), pack and store.
#define MAKE_ODD(tab, dstPos) \
    T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
    T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \
    T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   \
    T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   \
    T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       \
    T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   \
    \
    T30 = _mm_add_epi32(T20, T21);                                                  \
    T31 = _mm_add_epi32(T22, T23);                                                  \
    T32 = _mm_add_epi32(T24, T25);                                                  \
    T33 = _mm_add_epi32(T26, T27);                                                  \
    \
    T30 = _mm_hadd_epi32(T30, T31);                                                 \
    T31 = _mm_hadd_epi32(T32, T33);                                                 \
    \
    T40 = _mm_hadd_epi32(T30, T31);                                                 \
    T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);                            \
    T40 = _mm_packs_epi32(T40, T40);                                                \
    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);

        MAKE_ODD(14, 1);
        MAKE_ODD(16, 3);
        MAKE_ODD(18, 5);
        MAKE_ODD(20, 7);
        MAKE_ODD(22, 9);
        MAKE_ODD(24, 11);
        MAKE_ODD(26, 13);
        MAKE_ODD(28, 15);
#undef MAKE_ODD
    }
}
518 | ||
// Byte-shuffle control pattern (_mm_shuffle_epi8 word reordering) for the
// 32x32 DCT kernel; same encoding as tab_dct_16_0 above.
ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
{
    { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 },  // 0
};
523 | ||
524 | ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) = | |
525 | { | |
526 | { 89, -89, 18, -18, 75, -75, 50, -50 }, // 0 | |
527 | { 75, -75, -50, 50, -18, 18, -89, 89 }, // 1 | |
528 | { 50, -50, 75, -75, -89, 89, 18, -18 }, // 2 | |
529 | { 18, -18, -89, 89, -50, 50, 75, -75 }, // 3 | |
530 | ||
531 | #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ | |
532 | { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) \ | |
533 | }, \ | |
534 | ||
535 | MAKE_COEF8(90, 87, 80, 70, 57, 43, 25, 9) // 4 | |
536 | MAKE_COEF8(87, 57, 9, -43, -80, -90, -70, -25) // 5 | |
537 | MAKE_COEF8(80, 9, -70, -87, -25, 57, 90, 43) // 6 | |
538 | MAKE_COEF8(70, -43, -87, 9, 90, 25, -80, -57) // 7 | |
539 | MAKE_COEF8(57, -80, -25, 90, -9, -87, 43, 70) // 8 | |
540 | MAKE_COEF8(43, -90, 57, 25, -87, 70, 9, -80) // 9 | |
541 | MAKE_COEF8(25, -70, 90, -80, 43, 9, -57, 87) // 10 | |
542 | MAKE_COEF8(9, -25, 43, -57, 70, -80, 87, -90) // 11 | |
543 | #undef MAKE_COEF8 | |
544 | ||
545 | #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ | |
546 | { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \ | |
547 | { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) }, | |
548 | ||
549 | MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 12 | |
550 | MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) // 14 | |
551 | MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) // 16 | |
552 | MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31) // 18 | |
553 | MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38) // 20 | |
554 | MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) // 22 | |
555 | MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) // 24 | |
556 | MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61) // 26 | |
557 | MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) // 28 | |
558 | MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) // 30 | |
559 | MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78) // 32 | |
560 | MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82) // 34 | |
561 | MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) // 36 | |
562 | MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) // 38 | |
563 | MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90) // 40 | |
564 | MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90) // 42 | |
565 | #undef MAKE_COEF16 | |
566 | ||
567 | { | |
568 | 64, 64, 64, 64, 64, 64, 64, 64 | |
569 | }, // 44 | |
570 | ||
571 | { 64, 64, -64, -64, -64, -64, 64, 64 }, // 45 | |
572 | ||
573 | { 83, 83, 36, 36, -36, -36, -83, -83 }, // 46 | |
574 | { -83, -83, -36, -36, 36, 36, 83, 83 }, // 47 | |
575 | ||
576 | { 36, 36, -83, -83, 83, 83, -36, -36 }, // 48 | |
577 | { -36, -36, 83, 83, -83, -83, 36, 36 }, // 49 | |
578 | ||
579 | #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ | |
580 | { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \ | |
581 | { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \ | |
582 | { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \ | |
583 | { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) }, | |
584 | ||
585 | MAKE_COEF16(89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89) // 50 | |
586 | MAKE_COEF16(75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75) // 54 | |
587 | ||
588 | // TODO: convert below table here | |
589 | #undef MAKE_COEF16 | |
590 | ||
591 | { | |
592 | 50, 50, -89, -89, 18, 18, 75, 75 | |
593 | }, // 58 | |
594 | { -75, -75, -18, -18, 89, 89, -50, -50 }, // 59 | |
595 | { -50, -50, 89, 89, -18, -18, -75, -75 }, // 60 | |
596 | { 75, 75, 18, 18, -89, -89, 50, 50 }, // 61 | |
597 | ||
598 | { 18, 18, -50, -50, 75, 75, -89, -89 }, // 62 | |
599 | { 89, 89, -75, -75, 50, 50, -18, -18 }, // 63 | |
600 | { -18, -18, 50, 50, -75, -75, 89, 89 }, // 64 | |
601 | { -89, -89, 75, 75, -50, -50, 18, 18 }, // 65 | |
602 | ||
603 | { 90, 90, 87, 87, 80, 80, 70, 70 }, // 66 | |
604 | { 57, 57, 43, 43, 25, 25, 9, 9 }, // 67 | |
605 | { -9, -9, -25, -25, -43, -43, -57, -57 }, // 68 | |
606 | { -70, -70, -80, -80, -87, -87, -90, -90 }, // 69 | |
607 | ||
608 | { 87, 87, 57, 57, 9, 9, -43, -43 }, // 70 | |
609 | { -80, -80, -90, -90, -70, -70, -25, -25 }, // 71 | |
610 | { 25, 25, 70, 70, 90, 90, 80, 80 }, // 72 | |
611 | { 43, 43, -9, -9, -57, -57, -87, -87 }, // 73 | |
612 | ||
613 | { 80, 80, 9, 9, -70, -70, -87, -87 }, // 74 | |
614 | { -25, -25, 57, 57, 90, 90, 43, 43 }, // 75 | |
615 | { -43, -43, -90, -90, -57, -57, 25, 25 }, // 76 | |
616 | { 87, 87, 70, 70, -9, -9, -80, -80 }, // 77 | |
617 | ||
618 | { 70, 70, -43, -43, -87, -87, 9, 9 }, // 78 | |
619 | { 90, 90, 25, 25, -80, -80, -57, -57 }, // 79 | |
620 | { 57, 57, 80, 80, -25, -25, -90, -90 }, // 80 | |
621 | { -9, -9, 87, 87, 43, 43, -70, -70 }, // 81 | |
622 | ||
623 | { 57, 57, -80, -80, -25, -25, 90, 90 }, // 82 | |
624 | { -9, -9, -87, -87, 43, 43, 70, 70 }, // 83 | |
625 | { -70, -70, -43, -43, 87, 87, 9, 9 }, // 84 | |
626 | { -90, -90, 25, 25, 80, 80, -57, -57 }, // 85 | |
627 | ||
628 | { 43, 43, -90, -90, 57, 57, 25, 25 }, // 86 | |
629 | { -87, -87, 70, 70, 9, 9, -80, -80 }, // 87 | |
630 | { 80, 80, -9, -9, -70, -70, 87, 87 }, // 88 | |
631 | { -25, -25, -57, -57, 90, 90, -43, -43 }, // 89 | |
632 | ||
633 | { 25, 25, -70, -70, 90, 90, -80, -80 }, // 90 | |
634 | { 43, 43, 9, 9, -57, -57, 87, 87 }, // 91 | |
635 | { -87, -87, 57, 57, -9, -9, -43, -43 }, // 92 | |
636 | { 80, 80, -90, -90, 70, 70, -25, -25 }, // 93 | |
637 | ||
638 | { 9, 9, -25, -25, 43, 43, -57, -57 }, // 94 | |
639 | { 70, 70, -80, -80, 87, 87, -90, -90 }, // 95 | |
640 | { 90, 90, -87, -87, 80, 80, -70, -70 }, // 96 | |
641 | { 57, 57, -43, -43, 25, 25, -9, -9 }, // 97 | |
642 | ||
643 | #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ | |
644 | { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \ | |
645 | { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \ | |
646 | { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \ | |
647 | { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) }, | |
648 | ||
649 | MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 98 | |
650 | MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) //102 | |
651 | MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) //106 | |
652 | MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, +82, 88, 54, -4, -61, -90, -78, -31) //110 | |
653 | MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, +31, -46, -90, -67, 4, 73, 88, 38) //114 | |
654 | MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) //118 | |
655 | MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) //122 | |
656 | MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, +90, 13, -88, -31, 82, 46, -73, -61) //126 | |
657 | MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) //130 | |
658 | MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) //134 | |
659 | MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, +22, 67, -85, 13, 73, -82, 4, 78) //138 | |
660 | MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, +85, -78, 13, 61, -90, 54, 22, -82) //142 | |
661 | MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) //146 | |
662 | MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) //150 | |
663 | MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, +54, -31, 4, 22, -46, 67, -82, 90) //154 | |
664 | MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, +67, -73, 78, -82, 85, -88, 90, -90) //158 | |
665 | ||
666 | #undef MAKE_COEF16 | |
667 | }; | |
668 | ||
/* Forward 32x32 DCT, 8-bit pixel path (SSSE3 intrinsics).
 *
 * Two separable 1-D passes:
 *   DCT1 (rows): 4 iterations x 8 rows; each 32-sample row is folded with
 *        the even/odd (E/O) and even-even (EE) butterflies, multiplied
 *        against the 16-bit coefficient tables with pmaddwd, rounded by
 *        c_8 and shifted >> 4, then packed back to int16 in im[32][4]
 *        (im[row][quarter], transposed layout: im[k][i] holds output
 *        row k for input rows 8i..8i+7).
 *   DCT2 (cols): 8 iterations x 4 columns; done as a straight 32-tap
 *        matrix multiply against tab_dct_32_1 (see OPT_ME note below),
 *        rounded by c_1024 and shifted >> 11, 4 coefficients stored per
 *        macro invocation.
 *
 * src/dst: int16_t arrays; stride is in int16_t units. All row loads use
 * _mm_load_si128, so src and every row (stride*2 bytes apart) must be
 * 16-byte aligned.
 * NOTE(review): relies on tab_dct_16_0/tab_dct_16_1/tab_dct_8/tab_dct_32_0/
 * tab_dct_32_1 defined earlier in this file; table row indices below are
 * hard-coded to that layout -- do not reorder the tables.
 */
void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
    // Rounding constants: add (1 << (shift-1)) before the arithmetic shift.
    __m128i c_8 = _mm_set1_epi32(8);       // DCT1 rounding, shift 4
    __m128i c_1024 = _mm_set1_epi32(1024); // DCT2 rounding, shift 11

    int i;

    __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
    __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
    __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C;
    __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D;
    __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A;
    __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B;
    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
    __m128i T30, T31, T32, T33, T34, T35, T36, T37;
    __m128i T40, T41, T42, T43, T44, T45, T46, T47;
    __m128i T50, T51, T52, T53;
    __m128i T60, T61, T62, T63, T64, T65, T66, T67;
    __m128i im[32][4];  // intermediate: row-pass output, transposed

    // DCT1
    for (i = 0; i < 32 / 8; i++)
    {
        // Load 8 full rows (A/B/C/D = the four 8-sample quarters of a row).
        T00A = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 8]);    // [15 14 13 12 11 10 09 08]
        T00C = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 16]);   // [23 22 21 20 19 18 17 16]
        T00D = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 24]);   // [31 30 29 28 27 26 25 24]
        T01A = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 0]);
        T01B = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 8]);
        T01C = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 16]);
        T01D = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 24]);
        T02A = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 0]);
        T02B = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 8]);
        T02C = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 16]);
        T02D = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 24]);
        T03A = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 0]);
        T03B = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 8]);
        T03C = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 16]);
        T03D = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 24]);
        T04A = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 0]);
        T04B = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 8]);
        T04C = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 16]);
        T04D = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 24]);
        T05A = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 0]);
        T05B = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 8]);
        T05C = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 16]);
        T05D = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 24]);
        T06A = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 0]);
        T06B = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 8]);
        T06C = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 16]);
        T06D = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 24]);
        T07A = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 0]);
        T07B = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 8]);
        T07C = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 16]);
        T07D = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 24]);

        // Pre-shuffle so that A pairs with D and B with C elementwise:
        // after this, A+D / B+C give sample k with sample 31-k (butterfly).
        T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [05 02 06 01 04 03 07 00]
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [10 13 09 14 11 12 08 15]
        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [21 18 22 17 20 19 23 16]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [26 29 25 30 27 28 24 31]
        T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));

        // Even part: E[k] = x[k] + x[31-k] (16 values per row).
        T10A = _mm_add_epi16(T00A, T00D);   // [E05 E02 E06 E01 E04 E03 E07 E00]
        T10B = _mm_add_epi16(T00B, T00C);   // [E10 E13 E09 E14 E11 E12 E08 E15]
        T11A = _mm_add_epi16(T01A, T01D);
        T11B = _mm_add_epi16(T01B, T01C);
        T12A = _mm_add_epi16(T02A, T02D);
        T12B = _mm_add_epi16(T02B, T02C);
        T13A = _mm_add_epi16(T03A, T03D);
        T13B = _mm_add_epi16(T03B, T03C);
        T14A = _mm_add_epi16(T04A, T04D);
        T14B = _mm_add_epi16(T04B, T04C);
        T15A = _mm_add_epi16(T05A, T05D);
        T15B = _mm_add_epi16(T05B, T05C);
        T16A = _mm_add_epi16(T06A, T06D);
        T16B = _mm_add_epi16(T06B, T06C);
        T17A = _mm_add_epi16(T07A, T07D);
        T17B = _mm_add_epi16(T07B, T07C);

        // Odd part: O[k] = x[k] - x[31-k]; feeds the odd output rows below.
        T00A = _mm_sub_epi16(T00A, T00D);   // [O05 O02 O06 O01 O04 O03 O07 O00]
        T00B = _mm_sub_epi16(T00B, T00C);   // [O10 O13 O09 O14 O11 O12 O08 O15]
        T01A = _mm_sub_epi16(T01A, T01D);
        T01B = _mm_sub_epi16(T01B, T01C);
        T02A = _mm_sub_epi16(T02A, T02D);
        T02B = _mm_sub_epi16(T02B, T02C);
        T03A = _mm_sub_epi16(T03A, T03D);
        T03B = _mm_sub_epi16(T03B, T03C);
        T04A = _mm_sub_epi16(T04A, T04D);
        T04B = _mm_sub_epi16(T04B, T04C);
        T05A = _mm_sub_epi16(T05A, T05D);
        T05B = _mm_sub_epi16(T05B, T05C);
        T06A = _mm_sub_epi16(T06A, T06D);
        T06B = _mm_sub_epi16(T06B, T06C);
        T07A = _mm_sub_epi16(T07A, T07D);
        T07B = _mm_sub_epi16(T07B, T07C);

        // Even-even part: EE[k] = E[k] + E[15-k] (8 values per row).
        T20 = _mm_add_epi16(T10A, T10B);    // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0]
        T21 = _mm_add_epi16(T11A, T11B);
        T22 = _mm_add_epi16(T12A, T12B);
        T23 = _mm_add_epi16(T13A, T13B);
        T24 = _mm_add_epi16(T14A, T14B);
        T25 = _mm_add_epi16(T15A, T15B);
        T26 = _mm_add_epi16(T16A, T16B);
        T27 = _mm_add_epi16(T17A, T17B);

        // Output rows 0 and 16 from EE (tab_dct_8[1] = all-64 coefficients).
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[0][i] = T60;

        // hsub of the same products yields row 16 (sign-alternated sum).
        T50 = _mm_hsub_epi32(T40, T41);
        T51 = _mm_hsub_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[16][i] = T60;

        // Row 8 from EE (83/36 coefficient row).
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[8][i] = T60;

        // Row 24 from EE.
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[9]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[24][i] = T60;

        // 8-tap multiply of the value currently in T20..T27 by one
        // tab_dct_32_1 row; used first on EE (rows 4k) then on EO (rows 4k+2).
#define MAKE_ODD(tab, dstPos) \
    T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
    \
    T40 = _mm_hadd_epi32(T30, T31); \
    T41 = _mm_hadd_epi32(T32, T33); \
    T42 = _mm_hadd_epi32(T34, T35); \
    T43 = _mm_hadd_epi32(T36, T37); \
    \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        // Rows 4, 12, 20, 28 from EE.
        MAKE_ODD(0, 4);
        MAKE_ODD(1, 12);
        MAKE_ODD(2, 20);
        MAKE_ODD(3, 28);

        // Even-odd part: EO[k] = E[k] - E[15-k].
        T20 = _mm_sub_epi16(T10A, T10B);    // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0]
        T21 = _mm_sub_epi16(T11A, T11B);
        T22 = _mm_sub_epi16(T12A, T12B);
        T23 = _mm_sub_epi16(T13A, T13B);
        T24 = _mm_sub_epi16(T14A, T14B);
        T25 = _mm_sub_epi16(T15A, T15B);
        T26 = _mm_sub_epi16(T16A, T16B);
        T27 = _mm_sub_epi16(T17A, T17B);

        // Rows 2, 6, 10, ..., 30 from EO.
        MAKE_ODD(4, 2);
        MAKE_ODD(5, 6);
        MAKE_ODD(6, 10);
        MAKE_ODD(7, 14);
        MAKE_ODD(8, 18);
        MAKE_ODD(9, 22);
        MAKE_ODD(10, 26);
        MAKE_ODD(11, 30);
#undef MAKE_ODD

        // 16-tap multiply of the odd part O (T0xA/T0xB) by a pair of
        // tab_dct_32_1 rows; produces the odd output rows 1, 3, ..., 31.
#define MAKE_ODD(tab, dstPos) \
    T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T32 = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    \
    T40 = _mm_hadd_epi32(T20, T21); \
    T41 = _mm_hadd_epi32(T22, T23); \
    T42 = _mm_hadd_epi32(T24, T25); \
    T43 = _mm_hadd_epi32(T26, T27); \
    T44 = _mm_hadd_epi32(T30, T31); \
    T45 = _mm_hadd_epi32(T32, T33); \
    T46 = _mm_hadd_epi32(T34, T35); \
    T47 = _mm_hadd_epi32(T36, T37); \
    \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T52 = _mm_hadd_epi32(T44, T45); \
    T53 = _mm_hadd_epi32(T46, T47); \
    \
    T50 = _mm_hadd_epi32(T50, T51); \
    T51 = _mm_hadd_epi32(T52, T53); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        MAKE_ODD(12, 1);
        MAKE_ODD(14, 3);
        MAKE_ODD(16, 5);
        MAKE_ODD(18, 7);
        MAKE_ODD(20, 9);
        MAKE_ODD(22, 11);
        MAKE_ODD(24, 13);
        MAKE_ODD(26, 15);
        MAKE_ODD(28, 17);
        MAKE_ODD(30, 19);
        MAKE_ODD(32, 21);
        MAKE_ODD(34, 23);
        MAKE_ODD(36, 25);
        MAKE_ODD(38, 27);
        MAKE_ODD(40, 29);
        MAKE_ODD(42, 31);

#undef MAKE_ODD
    }

    // DCT2
    for (i = 0; i < 32 / 4; i++)
    {
        // OPT_ME: to avoid register spill, I use matrix multiply, have other way?
        T00A = im[i * 4 + 0][0];    // [07 06 05 04 03 02 01 00]
        T00B = im[i * 4 + 0][1];    // [15 14 13 12 11 10 09 08]
        T00C = im[i * 4 + 0][2];    // [23 22 21 20 19 18 17 16]
        T00D = im[i * 4 + 0][3];    // [31 30 29 28 27 26 25 24]
        T01A = im[i * 4 + 1][0];
        T01B = im[i * 4 + 1][1];
        T01C = im[i * 4 + 1][2];
        T01D = im[i * 4 + 1][3];
        T02A = im[i * 4 + 2][0];
        T02B = im[i * 4 + 2][1];
        T02C = im[i * 4 + 2][2];
        T02D = im[i * 4 + 2][3];
        T03A = im[i * 4 + 3][0];
        T03B = im[i * 4 + 3][1];
        T03C = im[i * 4 + 3][2];
        T03D = im[i * 4 + 3][3];

        // Reverse C and D halves so unpacking pairs sample k with 31-k.
        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [16 17 18 19 20 21 22 23]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [24 25 26 27 28 29 30 31]
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        // Interleave into (k, 31-k) 16-bit pairs so pmaddwd against the
        // duplicated/negated coefficient tables forms sums/differences.
        T10A = _mm_unpacklo_epi16(T00A, T00D);  // [28 03 29 02 30 01 31 00]
        T10B = _mm_unpackhi_epi16(T00A, T00D);  // [24 07 25 06 26 05 27 04]
        T00A = _mm_unpacklo_epi16(T00B, T00C);  // [20 11 21 10 22 09 23 08]
        T00B = _mm_unpackhi_epi16(T00B, T00C);  // [16 15 17 14 18 13 19 12]
        T11A = _mm_unpacklo_epi16(T01A, T01D);
        T11B = _mm_unpackhi_epi16(T01A, T01D);
        T01A = _mm_unpacklo_epi16(T01B, T01C);
        T01B = _mm_unpackhi_epi16(T01B, T01C);
        T12A = _mm_unpacklo_epi16(T02A, T02D);
        T12B = _mm_unpackhi_epi16(T02A, T02D);
        T02A = _mm_unpacklo_epi16(T02B, T02C);
        T02B = _mm_unpackhi_epi16(T02B, T02C);
        T13A = _mm_unpacklo_epi16(T03A, T03D);
        T13B = _mm_unpackhi_epi16(T03A, T03D);
        T03A = _mm_unpacklo_epi16(T03B, T03C);
        T03B = _mm_unpackhi_epi16(T03B, T03C);

        // Full 32-tap dot product of 4 columns against one output row's
        // coefficients (tab0..tab3 select the row's four table quarters);
        // result is rounded (>> 11) and 4 int16 coefficients are stored.
#define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
    T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));  \
    T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));  \
    T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));  \
    T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));  \
    T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));  \
    T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));  \
    T26 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));  \
    T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));  \
    T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));  \
    T31 = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));  \
    T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));  \
    T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));  \
    T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));  \
    T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));  \
    T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));  \
    T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));  \
    \
    T60 = _mm_hadd_epi32(T20, T21); \
    T61 = _mm_hadd_epi32(T22, T23); \
    T62 = _mm_hadd_epi32(T24, T25); \
    T63 = _mm_hadd_epi32(T26, T27); \
    T64 = _mm_hadd_epi32(T30, T31); \
    T65 = _mm_hadd_epi32(T32, T33); \
    T66 = _mm_hadd_epi32(T34, T35); \
    T67 = _mm_hadd_epi32(T36, T37); \
    \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T62 = _mm_hadd_epi32(T64, T65); \
    T63 = _mm_hadd_epi32(T66, T67); \
    \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    \
    T60 = _mm_hadd_epi32(T60, T61); \
    \
    T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
    T60 = _mm_packs_epi32(T60, T60); \
    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \

        // Output rows 0, 16, 8, 24 (DC and the other EE-derived rows).
        MAKE_ODD(44, 44, 44, 44, 0);
        MAKE_ODD(45, 45, 45, 45, 16);
        MAKE_ODD(46, 47, 46, 47, 8);
        MAKE_ODD(48, 49, 48, 49, 24);

        // Rows 4, 12, 20, 28.
        MAKE_ODD(50, 51, 52, 53, 4);
        MAKE_ODD(54, 55, 56, 57, 12);
        MAKE_ODD(58, 59, 60, 61, 20);
        MAKE_ODD(62, 63, 64, 65, 28);

        // Rows 2, 6, 10, 14.
        MAKE_ODD(66, 67, 68, 69, 2);
        MAKE_ODD(70, 71, 72, 73, 6);
        MAKE_ODD(74, 75, 76, 77, 10);
        MAKE_ODD(78, 79, 80, 81, 14);

        // Rows 18, 22, 26, 30.
        MAKE_ODD(82, 83, 84, 85, 18);
        MAKE_ODD(86, 87, 88, 89, 22);
        MAKE_ODD(90, 91, 92, 93, 26);
        MAKE_ODD(94, 95, 96, 97, 30);

        // Odd rows 1, 3, ..., 31.
        MAKE_ODD(98, 99, 100, 101, 1);
        MAKE_ODD(102, 103, 104, 105, 3);
        MAKE_ODD(106, 107, 108, 109, 5);
        MAKE_ODD(110, 111, 112, 113, 7);
        MAKE_ODD(114, 115, 116, 117, 9);
        MAKE_ODD(118, 119, 120, 121, 11);
        MAKE_ODD(122, 123, 124, 125, 13);
        MAKE_ODD(126, 127, 128, 129, 15);
        MAKE_ODD(130, 131, 132, 133, 17);
        MAKE_ODD(134, 135, 136, 137, 19);
        MAKE_ODD(138, 139, 140, 141, 21);
        MAKE_ODD(142, 143, 144, 145, 23);
        MAKE_ODD(146, 147, 148, 149, 25);
        MAKE_ODD(150, 151, 152, 153, 27);
        MAKE_ODD(154, 155, 156, 157, 29);
        MAKE_ODD(158, 159, 160, 161, 31);
#undef MAKE_ODD
    }
}
}
1104 | } | |
1105 | #endif // if !HIGH_BIT_DEPTH | |
1106 | ||
1107 | namespace x265 { | |
1108 | void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives &p) | |
1109 | { | |
1110 | /* Note: We have AVX2 assembly for these two functions, but since AVX2 is | |
1111 | * still somewhat rare on end-user PCs we still compile and link these SSSE3 | |
1112 | * intrinsic SIMD functions */ | |
1113 | #if !HIGH_BIT_DEPTH | |
1114 | p.dct[DCT_16x16] = dct16; | |
1115 | p.dct[DCT_32x32] = dct32; | |
1116 | #endif | |
1117 | } | |
1118 | } |