// c435b5292357f51e8cf3892f106eda2ffe555978
// [deb_x265.git] / source / common / vec / dct-sse3.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Min Chen <min.chen@multicorewareinc.com>
10 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11 * Nabajit Deka <nabajit@multicorewareinc.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 *
27 * This program is also available under a commercial proprietary license.
28 * For more information, contact us at license @ x265.com.
29 *****************************************************************************/
30
31 #include "common.h"
32 #include "primitives.h"
33 #include <xmmintrin.h> // SSE
34 #include <pmmintrin.h> // SSE3
35
36 using namespace x265;
37
38 namespace {
39 #if !HIGH_BIT_DEPTH
// Coefficient table for the 8x8 inverse DCT partial butterfly.
// Each row is one 16-bit coefficient pair replicated four times, so a row can
// be fed straight into _mm_madd_epi16 against input rows interleaved with
// _mm_unpacklo/hi_epi16 (see idct8 below).
// Rows 0-7 pair up for the odd part (inputs 1,3,5,7 -> O0..O3);
// rows 8-9 form the EE part (inputs 0,4); rows 10-11 the EO part (inputs 2,6).
ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{
    { 89, 75, 89, 75, 89, 75, 89, 75 },
    { 50, 18, 50, 18, 50, 18, 50, 18 },
    { 75, -18, 75, -18, 75, -18, 75, -18 },
    { -89, -50, -89, -50, -89, -50, -89, -50 },
    { 50, -89, 50, -89, 50, -89, 50, -89 },
    { 18, 75, 18, 75, 18, 75, 18, 75 },
    { 18, -50, 18, -50, 18, -50, 18, -50 },
    { 75, -89, 75, -89, 75, -89, 75, -89 },
    { 64, 64, 64, 64, 64, 64, 64, 64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83, 36, 83, 36, 83, 36, 83, 36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 }
};
/* 8x8 inverse DCT, two-pass partial butterfly (SSE3).
 *
 * src:    64 int32 coefficients, row-major (src[row * 8 + col]); loads are
 *         aligned, so src must be 16-byte aligned.
 * dst:    int16 output block; row r is written at dst[r * stride].
 * stride: dst row pitch in elements.
 *
 * Pass 1 transforms columns with rounding 64 and shift 7; the intermediate
 * block is transposed in registers, then pass 2 uses rounding 2048 and
 * shift 12 (rounding constants are 1 << (shift - 1), the usual
 * inverse-transform scaling for 8-bit content).  All multiply-accumulate
 * work is done at 16-bit precision via _mm_madd_epi16 against the
 * replicated coefficient pairs in tab_idct_8x8; results are clamped back
 * to 16 bits with _mm_packs_epi32.
 */
void idct8(int32_t *src, int16_t *dst, intptr_t stride)
{
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
    __m128i T00, T01, T02, T03, T04, T05, T06, T07;

    m128iAdd = _mm_set1_epi32(64);              // pass-1 rounding: 1 << (7 - 1)

    /* Odd part: load rows 1,3,5,7, narrow to int16, interleave, and
     * accumulate O0..O3 with coefficient rows 0-7 of tab_idct_8x8.
     * The l/h suffixes track the low/high four lanes of each row. */
    T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
    m128iS1 = _mm_packs_epi32(T00, T01);
    T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
    m128iS3 = _mm_packs_epi32(T00, T01);
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));

    T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
    m128iS5 = _mm_packs_epi32(T00, T01);
    T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
    m128iS7 = _mm_packs_epi32(T00, T01);
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);              // O0 = 89*s1 + 75*s3 + 50*s5 + 18*s7
    O0h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));

    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    /* EE part: rows 0 and 4 against the +-64 coefficient rows. */

    T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
    m128iS0 = _mm_packs_epi32(T00, T01);
    T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
    m128iS4 = _mm_packs_epi32(T00, T01);
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));

    EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
    EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));

    /* EO part: rows 2 and 6 against the 83/36 coefficient rows. */

    T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
    m128iS2 = _mm_packs_epi32(T00, T01);
    T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
    m128iS6 = _mm_packs_epi32(T00, T01);
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    /* E0..E3 = EE +- EO, with the rounding constant folded in up front. */
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);

    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);
    /* Outputs: row k = (E_k + O_k) >> 7, row 7-k = (E_k - O_k) >> 7,
     * saturated back to int16. */
    m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7));
    m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7));
    m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7));
    m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7));
    m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7));
    m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7));
    m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7));
    m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7));
    /* Transpose the intermediate 8x8 block in registers so that pass 2
     * (identical butterfly, larger shift) operates on the other dimension. */

    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    m128iAdd = _mm_set1_epi32(2048);            // pass-2 rounding: 1 << (12 - 1)

    /* Pass 2: same butterfly as pass 1, now on the transposed rows,
     * with shift 12. */
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
    EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));

    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);
    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12));
    m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12));
    m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12));
    m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12));
    m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12));
    m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12));
    m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12));
    m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12));

    /* Final transpose back to row order while storing.  Register layout
     * before the transpose (one row per register):
     * [07 06 05 04 03 02 01 00]
     * [17 16 15 14 13 12 11 10]
     * [27 26 25 24 23 22 21 20]
     * [37 36 35 34 33 32 31 30]
     * [47 46 45 44 43 42 41 40]
     * [57 56 55 54 53 52 51 50]
     * [67 66 65 64 63 62 61 60]
     * [77 76 75 74 73 72 71 70]
     */

    T00 = _mm_unpacklo_epi16(m128iS0, m128iS1); // [13 03 12 02 11 01 10 00]
    T01 = _mm_unpackhi_epi16(m128iS0, m128iS1); // [17 07 16 06 15 05 14 04]
    T02 = _mm_unpacklo_epi16(m128iS2, m128iS3); // [33 23 32 22 31 21 30 20]
    T03 = _mm_unpackhi_epi16(m128iS2, m128iS3); // [37 27 36 26 35 25 34 24]
    T04 = _mm_unpacklo_epi16(m128iS4, m128iS5); // [53 43 52 42 51 41 50 40]
    T05 = _mm_unpackhi_epi16(m128iS4, m128iS5); // [57 47 56 46 55 45 54 44]
    T06 = _mm_unpacklo_epi16(m128iS6, m128iS7); // [73 63 72 62 71 61 70 60]
    T07 = _mm_unpackhi_epi16(m128iS6, m128iS7); // [77 67 76 66 75 65 74 64]

    /* Each 128-bit register now holds two output half-rows; write them with
     * a 64-bit low store plus a 64-bit high store. */
    __m128i T10, T11;
    T10 = _mm_unpacklo_epi32(T00, T02);                               // [31 21 11 01 30 20 10 00]
    T11 = _mm_unpackhi_epi32(T00, T02);                               // [33 23 13 03 32 22 12 02]
    _mm_storel_epi64((__m128i*)&dst[0 * stride + 0], T10);            // [30 20 10 00]
    _mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(T10)); // [31 21 11 01]
    _mm_storel_epi64((__m128i*)&dst[2 * stride + 0], T11);            // [32 22 12 02]
    _mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(T11)); // [33 23 13 03]

    T10 = _mm_unpacklo_epi32(T04, T06);                               // [71 61 51 41 70 60 50 40]
    T11 = _mm_unpackhi_epi32(T04, T06);                               // [73 63 53 43 72 62 52 42]
    _mm_storel_epi64((__m128i*)&dst[0 * stride + 4], T10);
    _mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(T10));
    _mm_storel_epi64((__m128i*)&dst[2 * stride + 4], T11);
    _mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(T11));

    T10 = _mm_unpacklo_epi32(T01, T03);                               // [35 25 15 05 34 24 14 04]
    T11 = _mm_unpackhi_epi32(T01, T03);                               // [37 27 17 07 36 26 16 06]
    _mm_storel_epi64((__m128i*)&dst[4 * stride + 0], T10);
    _mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(T10));
    _mm_storel_epi64((__m128i*)&dst[6 * stride + 0], T11);
    _mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(T11));

    T10 = _mm_unpacklo_epi32(T05, T07);                               // [75 65 55 45 74 64 54 44]
    T11 = _mm_unpackhi_epi32(T05, T07);                               // [77 67 57 47 76 66 56 46]
    _mm_storel_epi64((__m128i*)&dst[4 * stride + 4], T10);
    _mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(T10));
    _mm_storel_epi64((__m128i*)&dst[6 * stride + 4], T11);
    _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
}
307
308 void idct16(int32_t *src, int16_t *dst, intptr_t stride)
309 {
310 const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
311 const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
312 const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039);
313 const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019);
314 const __m128i c16_p57_p87 = _mm_set1_epi32(0x00390057); //row1
315 const __m128i c16_n43_p09 = _mm_set1_epi32(0xFFD50009);
316 const __m128i c16_n90_n80 = _mm_set1_epi32(0xFFA6FFB0);
317 const __m128i c16_n25_n70 = _mm_set1_epi32(0xFFE7FFBA);
318 const __m128i c16_p09_p80 = _mm_set1_epi32(0x00090050); //row2
319 const __m128i c16_n87_n70 = _mm_set1_epi32(0xFFA9FFBA);
320 const __m128i c16_p57_n25 = _mm_set1_epi32(0x0039FFE7);
321 const __m128i c16_p43_p90 = _mm_set1_epi32(0x002B005A);
322 const __m128i c16_n43_p70 = _mm_set1_epi32(0xFFD50046); //row3
323 const __m128i c16_p09_n87 = _mm_set1_epi32(0x0009FFA9);
324 const __m128i c16_p25_p90 = _mm_set1_epi32(0x0019005A);
325 const __m128i c16_n57_n80 = _mm_set1_epi32(0xFFC7FFB0);
326 const __m128i c16_n80_p57 = _mm_set1_epi32(0xFFB00039); //row4
327 const __m128i c16_p90_n25 = _mm_set1_epi32(0x005AFFE7);
328 const __m128i c16_n87_n09 = _mm_set1_epi32(0xFFA9FFF7);
329 const __m128i c16_p70_p43 = _mm_set1_epi32(0x0046002B);
330 const __m128i c16_n90_p43 = _mm_set1_epi32(0xFFA6002B); //row5
331 const __m128i c16_p25_p57 = _mm_set1_epi32(0x00190039);
332 const __m128i c16_p70_n87 = _mm_set1_epi32(0x0046FFA9);
333 const __m128i c16_n80_p09 = _mm_set1_epi32(0xFFB00009);
334 const __m128i c16_n70_p25 = _mm_set1_epi32(0xFFBA0019); //row6
335 const __m128i c16_n80_p90 = _mm_set1_epi32(0xFFB0005A);
336 const __m128i c16_p09_p43 = _mm_set1_epi32(0x0009002B);
337 const __m128i c16_p87_n57 = _mm_set1_epi32(0x0057FFC7);
338 const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row7
339 const __m128i c16_n57_p43 = _mm_set1_epi32(0xFFC7002B);
340 const __m128i c16_n80_p70 = _mm_set1_epi32(0xFFB00046);
341 const __m128i c16_n90_p87 = _mm_set1_epi32(0xFFA60057);
342
343 const __m128i c16_p75_p89 = _mm_set1_epi32(0x004B0059);
344 const __m128i c16_p18_p50 = _mm_set1_epi32(0x00120032);
345 const __m128i c16_n18_p75 = _mm_set1_epi32(0xFFEE004B);
346 const __m128i c16_n50_n89 = _mm_set1_epi32(0xFFCEFFA7);
347 const __m128i c16_n89_p50 = _mm_set1_epi32(0xFFA70032);
348 const __m128i c16_p75_p18 = _mm_set1_epi32(0x004B0012);
349 const __m128i c16_n50_p18 = _mm_set1_epi32(0xFFCE0012);
350 const __m128i c16_n89_p75 = _mm_set1_epi32(0xFFA7004B);
351
352 const __m128i c16_p36_p83 = _mm_set1_epi32(0x00240053);
353 const __m128i c16_n83_p36 = _mm_set1_epi32(0xFFAD0024);
354
355 const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
356 const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
357 __m128i c32_rnd = _mm_set1_epi32(64);
358
359 int nShift = 7;
360
361 // DCT1
362 __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2];
363 __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2];
364 __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2];
365 __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2];
366
367 for (int i = 0; i < 2; i++)
368 {
369 const int offset = (i << 3);
370 __m128i T00, T01;
371
372 T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
373 T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
374 in00[i] = _mm_packs_epi32(T00, T01); // [07 06 05 04 03 02 01 00]
375
376 T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
377 T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
378 in01[i] = _mm_packs_epi32(T00, T01); // [17 16 15 14 13 12 11 10]
379
380 T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
381 T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
382 in02[i] = _mm_packs_epi32(T00, T01); // [27 26 25 24 23 22 21 20]
383
384 T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
385 T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
386 in03[i] = _mm_packs_epi32(T00, T01); // [37 36 35 34 33 32 31 30]
387
388 T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
389 T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
390 in04[i] = _mm_packs_epi32(T00, T01); // [47 46 45 44 43 42 41 40]
391
392 T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
393 T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
394 in05[i] = _mm_packs_epi32(T00, T01); // [57 56 55 54 53 52 51 50]
395
396 T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
397 T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
398 in06[i] = _mm_packs_epi32(T00, T01); // [67 66 65 64 63 62 61 60]
399
400 T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
401 T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
402 in07[i] = _mm_packs_epi32(T00, T01); // [77 76 75 74 73 72 71 70]
403
404 T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
405 T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
406 in08[i] = _mm_packs_epi32(T00, T01);
407
408 T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
409 T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
410 in09[i] = _mm_packs_epi32(T00, T01);
411
412 T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
413 T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
414 in10[i] = _mm_packs_epi32(T00, T01);
415
416 T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
417 T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
418 in11[i] = _mm_packs_epi32(T00, T01);
419
420 T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
421 T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
422 in12[i] = _mm_packs_epi32(T00, T01);
423
424 T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
425 T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
426 in13[i] = _mm_packs_epi32(T00, T01);
427
428 T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
429 T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
430 in14[i] = _mm_packs_epi32(T00, T01);
431
432 T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
433 T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
434 in15[i] = _mm_packs_epi32(T00, T01);
435 }
436
437 for (int pass = 0; pass < 2; pass++)
438 {
439 if (pass == 1)
440 {
441 c32_rnd = _mm_set1_epi32(2048);
442 nShift = 12;
443 }
444
445 for (int part = 0; part < 2; part++)
446 {
447 const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10]
448 const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14]
449 const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ]
450 const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ]
451 const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ]
452 const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ]
453 const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ]
454 const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ]
455 const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ]
456 const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ]
457 const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ]
458 const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ]
459 const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]row
460 const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ]
461 const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]); // [83 03 82 02 81 01 81 00] row08 row00
462 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]); // [87 07 86 06 85 05 84 04]
463
464 __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;
465 __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;
466 {
467 __m128i T00, T01;
468 #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
469 T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
470 T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
471 row = _mm_add_epi32(T00, T01);
472
473 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)
474 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)
475 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)
476 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)
477 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)
478 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)
479 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)
480 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)
481
482 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)
483 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)
484 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)
485 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)
486 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)
487 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)
488 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)
489 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)
490 #undef COMPUTE_ROW
491 }
492
493 __m128i EO0A, EO1A, EO2A, EO3A;
494 __m128i EO0B, EO1B, EO2B, EO3B;
495 EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0
496 EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));
497 EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1
498 EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));
499 EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2
500 EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));
501 EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3
502 EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));
503
504 __m128i EEO0A, EEO1A;
505 __m128i EEO0B, EEO1B;
506 EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);
507 EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);
508 EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);
509 EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);
510
511 __m128i EEE0A, EEE1A;
512 __m128i EEE0B, EEE1B;
513 EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);
514 EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);
515 EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);
516 EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);
517
518 const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0
519 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
520 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1
521 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
522 const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0
523 const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);
524 const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1
525 const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);
526
527 const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0
528 const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
529 const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1
530 const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
531 const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2
532 const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
533 const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3
534 const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
535 const __m128i E7A = _mm_sub_epi32(EE0A, EO0A); // E0 = EE0 - EO0
536 const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);
537 const __m128i E6A = _mm_sub_epi32(EE1A, EO1A); // E1 = EE1 - EO1
538 const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);
539 const __m128i E5A = _mm_sub_epi32(EE2A, EO2A); // E2 = EE2 - EO2
540 const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);
541 const __m128i E4A = _mm_sub_epi32(EE3A, EO3A); // E3 = EE3 - EO3
542 const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);
543
544 const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd
545 const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
546 const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd
547 const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
548 const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd
549 const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
550 const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd
551 const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
552 const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd
553 const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
554 const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd
555 const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
556 const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd
557 const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
558 const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd
559 const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
560
561 const __m128i T20A = _mm_add_epi32(T10A, O0A); // E0 + O0 + rnd
562 const __m128i T20B = _mm_add_epi32(T10B, O0B);
563 const __m128i T21A = _mm_add_epi32(T11A, O1A); // E1 + O1 + rnd
564 const __m128i T21B = _mm_add_epi32(T11B, O1B);
565 const __m128i T22A = _mm_add_epi32(T12A, O2A); // E2 + O2 + rnd
566 const __m128i T22B = _mm_add_epi32(T12B, O2B);
567 const __m128i T23A = _mm_add_epi32(T13A, O3A); // E3 + O3 + rnd
568 const __m128i T23B = _mm_add_epi32(T13B, O3B);
569 const __m128i T24A = _mm_add_epi32(T14A, O4A); // E4
570 const __m128i T24B = _mm_add_epi32(T14B, O4B);
571 const __m128i T25A = _mm_add_epi32(T15A, O5A); // E5
572 const __m128i T25B = _mm_add_epi32(T15B, O5B);
573 const __m128i T26A = _mm_add_epi32(T16A, O6A); // E6
574 const __m128i T26B = _mm_add_epi32(T16B, O6B);
575 const __m128i T27A = _mm_add_epi32(T17A, O7A); // E7
576 const __m128i T27B = _mm_add_epi32(T17B, O7B);
577 const __m128i T2FA = _mm_sub_epi32(T10A, O0A); // E0 - O0 + rnd
578 const __m128i T2FB = _mm_sub_epi32(T10B, O0B);
579 const __m128i T2EA = _mm_sub_epi32(T11A, O1A); // E1 - O1 + rnd
580 const __m128i T2EB = _mm_sub_epi32(T11B, O1B);
581 const __m128i T2DA = _mm_sub_epi32(T12A, O2A); // E2 - O2 + rnd
582 const __m128i T2DB = _mm_sub_epi32(T12B, O2B);
583 const __m128i T2CA = _mm_sub_epi32(T13A, O3A); // E3 - O3 + rnd
584 const __m128i T2CB = _mm_sub_epi32(T13B, O3B);
585 const __m128i T2BA = _mm_sub_epi32(T14A, O4A); // E4
586 const __m128i T2BB = _mm_sub_epi32(T14B, O4B);
587 const __m128i T2AA = _mm_sub_epi32(T15A, O5A); // E5
588 const __m128i T2AB = _mm_sub_epi32(T15B, O5B);
589 const __m128i T29A = _mm_sub_epi32(T16A, O6A); // E6
590 const __m128i T29B = _mm_sub_epi32(T16B, O6B);
591 const __m128i T28A = _mm_sub_epi32(T17A, O7A); // E7
592 const __m128i T28B = _mm_sub_epi32(T17B, O7B);
593
594 const __m128i T30A = _mm_srai_epi32(T20A, nShift); // [30 20 10 00]
595 const __m128i T30B = _mm_srai_epi32(T20B, nShift); // [70 60 50 40]
596 const __m128i T31A = _mm_srai_epi32(T21A, nShift); // [31 21 11 01]
597 const __m128i T31B = _mm_srai_epi32(T21B, nShift); // [71 61 51 41]
598 const __m128i T32A = _mm_srai_epi32(T22A, nShift); // [32 22 12 02]
599 const __m128i T32B = _mm_srai_epi32(T22B, nShift); // [72 62 52 42]
600 const __m128i T33A = _mm_srai_epi32(T23A, nShift); // [33 23 13 03]
601 const __m128i T33B = _mm_srai_epi32(T23B, nShift); // [73 63 53 43]
602 const __m128i T34A = _mm_srai_epi32(T24A, nShift); // [33 24 14 04]
603 const __m128i T34B = _mm_srai_epi32(T24B, nShift); // [74 64 54 44]
604 const __m128i T35A = _mm_srai_epi32(T25A, nShift); // [35 25 15 05]
605 const __m128i T35B = _mm_srai_epi32(T25B, nShift); // [75 65 55 45]
606 const __m128i T36A = _mm_srai_epi32(T26A, nShift); // [36 26 16 06]
607 const __m128i T36B = _mm_srai_epi32(T26B, nShift); // [76 66 56 46]
608 const __m128i T37A = _mm_srai_epi32(T27A, nShift); // [37 27 17 07]
609 const __m128i T37B = _mm_srai_epi32(T27B, nShift); // [77 67 57 47]
610
611 const __m128i T38A = _mm_srai_epi32(T28A, nShift); // [30 20 10 00] x8
612 const __m128i T38B = _mm_srai_epi32(T28B, nShift); // [70 60 50 40]
613 const __m128i T39A = _mm_srai_epi32(T29A, nShift); // [31 21 11 01] x9
614 const __m128i T39B = _mm_srai_epi32(T29B, nShift); // [71 61 51 41]
615 const __m128i T3AA = _mm_srai_epi32(T2AA, nShift); // [32 22 12 02] xA
616 const __m128i T3AB = _mm_srai_epi32(T2AB, nShift); // [72 62 52 42]
617 const __m128i T3BA = _mm_srai_epi32(T2BA, nShift); // [33 23 13 03] xB
618 const __m128i T3BB = _mm_srai_epi32(T2BB, nShift); // [73 63 53 43]
619 const __m128i T3CA = _mm_srai_epi32(T2CA, nShift); // [33 24 14 04] xC
620 const __m128i T3CB = _mm_srai_epi32(T2CB, nShift); // [74 64 54 44]
621 const __m128i T3DA = _mm_srai_epi32(T2DA, nShift); // [35 25 15 05] xD
622 const __m128i T3DB = _mm_srai_epi32(T2DB, nShift); // [75 65 55 45]
623 const __m128i T3EA = _mm_srai_epi32(T2EA, nShift); // [36 26 16 06] xE
624 const __m128i T3EB = _mm_srai_epi32(T2EB, nShift); // [76 66 56 46]
625 const __m128i T3FA = _mm_srai_epi32(T2FA, nShift); // [37 27 17 07] xF
626 const __m128i T3FB = _mm_srai_epi32(T2FB, nShift); // [77 67 57 47]
627
628 res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00]
629 res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01]
630 res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02]
631 res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03]
632 res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04]
633 res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05]
634 res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06]
635 res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07]
636
637 res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80]
638 res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81]
639 res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82]
640 res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83]
641 res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84]
642 res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85]
643 res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86]
644 res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87]
645 }
646 //transpose matrix 8x8 16bit.
647 {
648 __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
649 __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
650 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
651 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
652 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
653 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
654 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
655 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
656 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
657 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
658 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
659 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
660 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
661 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
662 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
663 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
664 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
665 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
666 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
667 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
668 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
669 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
670 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
671 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
672 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
673 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
674 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
675
676 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
677 TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
678 TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
679 TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
680
681 #undef TRANSPOSE_8x8_16BIT
682 }
683 }
684
685 _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
686 _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
687 _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
688 _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
689 _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
690 _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
691 _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
692 _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
693 _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
694 _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
695 _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
696 _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
697 _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
698 _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
699 _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
700 _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
701 _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
702 _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
703 _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
704 _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
705 _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
706 _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
707 _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
708 _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
709 _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
710 _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
711 _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
712 _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
713 _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
714 _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
715 _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
716 _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
717 }
718
719 void idct32(int32_t *src, int16_t *dst, intptr_t stride)
720 {
721 //Odd
722 const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0
723 const __m128i c16_p85_p88 = _mm_set1_epi32(0x00550058);
724 const __m128i c16_p78_p82 = _mm_set1_epi32(0x004E0052);
725 const __m128i c16_p67_p73 = _mm_set1_epi32(0x00430049);
726 const __m128i c16_p54_p61 = _mm_set1_epi32(0x0036003D);
727 const __m128i c16_p38_p46 = _mm_set1_epi32(0x0026002E);
728 const __m128i c16_p22_p31 = _mm_set1_epi32(0x0016001F);
729 const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D);
730 const __m128i c16_p82_p90 = _mm_set1_epi32(0x0052005A); //column 1
731 const __m128i c16_p46_p67 = _mm_set1_epi32(0x002E0043);
732 const __m128i c16_n04_p22 = _mm_set1_epi32(0xFFFC0016);
733 const __m128i c16_n54_n31 = _mm_set1_epi32(0xFFCAFFE1);
734 const __m128i c16_n85_n73 = _mm_set1_epi32(0xFFABFFB7);
735 const __m128i c16_n88_n90 = _mm_set1_epi32(0xFFA8FFA6);
736 const __m128i c16_n61_n78 = _mm_set1_epi32(0xFFC3FFB2);
737 const __m128i c16_n13_n38 = _mm_set1_epi32(0xFFF3FFDA);
738 const __m128i c16_p67_p88 = _mm_set1_epi32(0x00430058); //column 2
739 const __m128i c16_n13_p31 = _mm_set1_epi32(0xFFF3001F);
740 const __m128i c16_n82_n54 = _mm_set1_epi32(0xFFAEFFCA);
741 const __m128i c16_n78_n90 = _mm_set1_epi32(0xFFB2FFA6);
742 const __m128i c16_n04_n46 = _mm_set1_epi32(0xFFFCFFD2);
743 const __m128i c16_p73_p38 = _mm_set1_epi32(0x00490026);
744 const __m128i c16_p85_p90 = _mm_set1_epi32(0x0055005A);
745 const __m128i c16_p22_p61 = _mm_set1_epi32(0x0016003D);
746 const __m128i c16_p46_p85 = _mm_set1_epi32(0x002E0055); //column 3
747 const __m128i c16_n67_n13 = _mm_set1_epi32(0xFFBDFFF3);
748 const __m128i c16_n73_n90 = _mm_set1_epi32(0xFFB7FFA6);
749 const __m128i c16_p38_n22 = _mm_set1_epi32(0x0026FFEA);
750 const __m128i c16_p88_p82 = _mm_set1_epi32(0x00580052);
751 const __m128i c16_n04_p54 = _mm_set1_epi32(0xFFFC0036);
752 const __m128i c16_n90_n61 = _mm_set1_epi32(0xFFA6FFC3);
753 const __m128i c16_n31_n78 = _mm_set1_epi32(0xFFE1FFB2);
754 const __m128i c16_p22_p82 = _mm_set1_epi32(0x00160052); //column 4
755 const __m128i c16_n90_n54 = _mm_set1_epi32(0xFFA6FFCA);
756 const __m128i c16_p13_n61 = _mm_set1_epi32(0x000DFFC3);
757 const __m128i c16_p85_p78 = _mm_set1_epi32(0x0055004E);
758 const __m128i c16_n46_p31 = _mm_set1_epi32(0xFFD2001F);
759 const __m128i c16_n67_n90 = _mm_set1_epi32(0xFFBDFFA6);
760 const __m128i c16_p73_p04 = _mm_set1_epi32(0x00490004);
761 const __m128i c16_p38_p88 = _mm_set1_epi32(0x00260058);
762 const __m128i c16_n04_p78 = _mm_set1_epi32(0xFFFC004E); //column 5
763 const __m128i c16_n73_n82 = _mm_set1_epi32(0xFFB7FFAE);
764 const __m128i c16_p85_p13 = _mm_set1_epi32(0x0055000D);
765 const __m128i c16_n22_p67 = _mm_set1_epi32(0xFFEA0043);
766 const __m128i c16_n61_n88 = _mm_set1_epi32(0xFFC3FFA8);
767 const __m128i c16_p90_p31 = _mm_set1_epi32(0x005A001F);
768 const __m128i c16_n38_p54 = _mm_set1_epi32(0xFFDA0036);
769 const __m128i c16_n46_n90 = _mm_set1_epi32(0xFFD2FFA6);
770 const __m128i c16_n31_p73 = _mm_set1_epi32(0xFFE10049); //column 6
771 const __m128i c16_n22_n90 = _mm_set1_epi32(0xFFEAFFA6);
772 const __m128i c16_p67_p78 = _mm_set1_epi32(0x0043004E);
773 const __m128i c16_n90_n38 = _mm_set1_epi32(0xFFA6FFDA);
774 const __m128i c16_p82_n13 = _mm_set1_epi32(0x0052FFF3);
775 const __m128i c16_n46_p61 = _mm_set1_epi32(0xFFD2003D);
776 const __m128i c16_n04_n88 = _mm_set1_epi32(0xFFFCFFA8);
777 const __m128i c16_p54_p85 = _mm_set1_epi32(0x00360055);
778 const __m128i c16_n54_p67 = _mm_set1_epi32(0xFFCA0043); //column 7
779 const __m128i c16_p38_n78 = _mm_set1_epi32(0x0026FFB2);
780 const __m128i c16_n22_p85 = _mm_set1_epi32(0xFFEA0055);
781 const __m128i c16_p04_n90 = _mm_set1_epi32(0x0004FFA6);
782 const __m128i c16_p13_p90 = _mm_set1_epi32(0x000D005A);
783 const __m128i c16_n31_n88 = _mm_set1_epi32(0xFFE1FFA8);
784 const __m128i c16_p46_p82 = _mm_set1_epi32(0x002E0052);
785 const __m128i c16_n61_n73 = _mm_set1_epi32(0xFFC3FFB7);
786 const __m128i c16_n73_p61 = _mm_set1_epi32(0xFFB7003D); //column 8
787 const __m128i c16_p82_n46 = _mm_set1_epi32(0x0052FFD2);
788 const __m128i c16_n88_p31 = _mm_set1_epi32(0xFFA8001F);
789 const __m128i c16_p90_n13 = _mm_set1_epi32(0x005AFFF3);
790 const __m128i c16_n90_n04 = _mm_set1_epi32(0xFFA6FFFC);
791 const __m128i c16_p85_p22 = _mm_set1_epi32(0x00550016);
792 const __m128i c16_n78_n38 = _mm_set1_epi32(0xFFB2FFDA);
793 const __m128i c16_p67_p54 = _mm_set1_epi32(0x00430036);
794 const __m128i c16_n85_p54 = _mm_set1_epi32(0xFFAB0036); //column 9
795 const __m128i c16_p88_n04 = _mm_set1_epi32(0x0058FFFC);
796 const __m128i c16_n61_n46 = _mm_set1_epi32(0xFFC3FFD2);
797 const __m128i c16_p13_p82 = _mm_set1_epi32(0x000D0052);
798 const __m128i c16_p38_n90 = _mm_set1_epi32(0x0026FFA6);
799 const __m128i c16_n78_p67 = _mm_set1_epi32(0xFFB20043);
800 const __m128i c16_p90_n22 = _mm_set1_epi32(0x005AFFEA);
801 const __m128i c16_n73_n31 = _mm_set1_epi32(0xFFB7FFE1);
802 const __m128i c16_n90_p46 = _mm_set1_epi32(0xFFA6002E); //column 10
803 const __m128i c16_p54_p38 = _mm_set1_epi32(0x00360026);
804 const __m128i c16_p31_n90 = _mm_set1_epi32(0x001FFFA6);
805 const __m128i c16_n88_p61 = _mm_set1_epi32(0xFFA8003D);
806 const __m128i c16_p67_p22 = _mm_set1_epi32(0x00430016);
807 const __m128i c16_p13_n85 = _mm_set1_epi32(0x000DFFAB);
808 const __m128i c16_n82_p73 = _mm_set1_epi32(0xFFAE0049);
809 const __m128i c16_p78_p04 = _mm_set1_epi32(0x004E0004);
810 const __m128i c16_n88_p38 = _mm_set1_epi32(0xFFA80026); //column 11
811 const __m128i c16_n04_p73 = _mm_set1_epi32(0xFFFC0049);
812 const __m128i c16_p90_n67 = _mm_set1_epi32(0x005AFFBD);
813 const __m128i c16_n31_n46 = _mm_set1_epi32(0xFFE1FFD2);
814 const __m128i c16_n78_p85 = _mm_set1_epi32(0xFFB20055);
815 const __m128i c16_p61_p13 = _mm_set1_epi32(0x003D000D);
816 const __m128i c16_p54_n90 = _mm_set1_epi32(0x0036FFA6);
817 const __m128i c16_n82_p22 = _mm_set1_epi32(0xFFAE0016);
818 const __m128i c16_n78_p31 = _mm_set1_epi32(0xFFB2001F); //column 12
819 const __m128i c16_n61_p90 = _mm_set1_epi32(0xFFC3005A);
820 const __m128i c16_p54_p04 = _mm_set1_epi32(0x00360004);
821 const __m128i c16_p82_n88 = _mm_set1_epi32(0x0052FFA8);
822 const __m128i c16_n22_n38 = _mm_set1_epi32(0xFFEAFFDA);
823 const __m128i c16_n90_p73 = _mm_set1_epi32(0xFFA60049);
824 const __m128i c16_n13_p67 = _mm_set1_epi32(0xFFF30043);
825 const __m128i c16_p85_n46 = _mm_set1_epi32(0x0055FFD2);
826 const __m128i c16_n61_p22 = _mm_set1_epi32(0xFFC30016); //column 13
827 const __m128i c16_n90_p85 = _mm_set1_epi32(0xFFA60055);
828 const __m128i c16_n38_p73 = _mm_set1_epi32(0xFFDA0049);
829 const __m128i c16_p46_n04 = _mm_set1_epi32(0x002EFFFC);
830 const __m128i c16_p90_n78 = _mm_set1_epi32(0x005AFFB2);
831 const __m128i c16_p54_n82 = _mm_set1_epi32(0x0036FFAE);
832 const __m128i c16_n31_n13 = _mm_set1_epi32(0xFFE1FFF3);
833 const __m128i c16_n88_p67 = _mm_set1_epi32(0xFFA80043);
834 const __m128i c16_n38_p13 = _mm_set1_epi32(0xFFDA000D); //column 14
835 const __m128i c16_n78_p61 = _mm_set1_epi32(0xFFB2003D);
836 const __m128i c16_n90_p88 = _mm_set1_epi32(0xFFA60058);
837 const __m128i c16_n73_p85 = _mm_set1_epi32(0xFFB70055);
838 const __m128i c16_n31_p54 = _mm_set1_epi32(0xFFE10036);
839 const __m128i c16_p22_p04 = _mm_set1_epi32(0x00160004);
840 const __m128i c16_p67_n46 = _mm_set1_epi32(0x0043FFD2);
841 const __m128i c16_p90_n82 = _mm_set1_epi32(0x005AFFAE);
842 const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //column 15
843 const __m128i c16_n31_p22 = _mm_set1_epi32(0xFFE10016);
844 const __m128i c16_n46_p38 = _mm_set1_epi32(0xFFD20026);
845 const __m128i c16_n61_p54 = _mm_set1_epi32(0xFFC30036);
846 const __m128i c16_n73_p67 = _mm_set1_epi32(0xFFB70043);
847 const __m128i c16_n82_p78 = _mm_set1_epi32(0xFFAE004E);
848 const __m128i c16_n88_p85 = _mm_set1_epi32(0xFFA80055);
849 const __m128i c16_n90_p90 = _mm_set1_epi32(0xFFA6005A);
850
851 //EO
852 const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
853 const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
854 const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039);
855 const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019);
856 const __m128i c16_p57_p87 = _mm_set1_epi32(0x00390057); //row1
857 const __m128i c16_n43_p09 = _mm_set1_epi32(0xFFD50009);
858 const __m128i c16_n90_n80 = _mm_set1_epi32(0xFFA6FFB0);
859 const __m128i c16_n25_n70 = _mm_set1_epi32(0xFFE7FFBA);
860 const __m128i c16_p09_p80 = _mm_set1_epi32(0x00090050); //row2
861 const __m128i c16_n87_n70 = _mm_set1_epi32(0xFFA9FFBA);
862 const __m128i c16_p57_n25 = _mm_set1_epi32(0x0039FFE7);
863 const __m128i c16_p43_p90 = _mm_set1_epi32(0x002B005A);
864 const __m128i c16_n43_p70 = _mm_set1_epi32(0xFFD50046); //row3
865 const __m128i c16_p09_n87 = _mm_set1_epi32(0x0009FFA9);
866 const __m128i c16_p25_p90 = _mm_set1_epi32(0x0019005A);
867 const __m128i c16_n57_n80 = _mm_set1_epi32(0xFFC7FFB0);
868 const __m128i c16_n80_p57 = _mm_set1_epi32(0xFFB00039); //row4
869 const __m128i c16_p90_n25 = _mm_set1_epi32(0x005AFFE7);
870 const __m128i c16_n87_n09 = _mm_set1_epi32(0xFFA9FFF7);
871 const __m128i c16_p70_p43 = _mm_set1_epi32(0x0046002B);
872 const __m128i c16_n90_p43 = _mm_set1_epi32(0xFFA6002B); //row5
873 const __m128i c16_p25_p57 = _mm_set1_epi32(0x00190039);
874 const __m128i c16_p70_n87 = _mm_set1_epi32(0x0046FFA9);
875 const __m128i c16_n80_p09 = _mm_set1_epi32(0xFFB00009);
876 const __m128i c16_n70_p25 = _mm_set1_epi32(0xFFBA0019); //row6
877 const __m128i c16_n80_p90 = _mm_set1_epi32(0xFFB0005A);
878 const __m128i c16_p09_p43 = _mm_set1_epi32(0x0009002B);
879 const __m128i c16_p87_n57 = _mm_set1_epi32(0x0057FFC7);
880 const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row7
881 const __m128i c16_n57_p43 = _mm_set1_epi32(0xFFC7002B);
882 const __m128i c16_n80_p70 = _mm_set1_epi32(0xFFB00046);
883 const __m128i c16_n90_p87 = _mm_set1_epi32(0xFFA60057);
884 //EEO
885 const __m128i c16_p75_p89 = _mm_set1_epi32(0x004B0059);
886 const __m128i c16_p18_p50 = _mm_set1_epi32(0x00120032);
887 const __m128i c16_n18_p75 = _mm_set1_epi32(0xFFEE004B);
888 const __m128i c16_n50_n89 = _mm_set1_epi32(0xFFCEFFA7);
889 const __m128i c16_n89_p50 = _mm_set1_epi32(0xFFA70032);
890 const __m128i c16_p75_p18 = _mm_set1_epi32(0x004B0012);
891 const __m128i c16_n50_p18 = _mm_set1_epi32(0xFFCE0012);
892 const __m128i c16_n89_p75 = _mm_set1_epi32(0xFFA7004B);
893 //EEEO
894 const __m128i c16_p36_p83 = _mm_set1_epi32(0x00240053);
895 const __m128i c16_n83_p36 = _mm_set1_epi32(0xFFAD0024);
896 //EEEE
897 const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
898 const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
899 __m128i c32_rnd = _mm_set1_epi32(64);
900
901 int nShift = 7;
902
903 // DCT1
904 __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4];
905 __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4];
906 __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4];
907 __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4];
908
909 for (int i = 0; i < 4; i++)
910 {
911 const int offset = (i << 3);
912 __m128i T00, T01;
913
914 T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
915 T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
916 in00[i] = _mm_packs_epi32(T00, T01);
917
918 T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
919 T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
920 in01[i] = _mm_packs_epi32(T00, T01);
921
922 T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
923 T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
924 in02[i] = _mm_packs_epi32(T00, T01);
925
926 T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
927 T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
928 in03[i] = _mm_packs_epi32(T00, T01);
929
930 T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
931 T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
932 in04[i] = _mm_packs_epi32(T00, T01);
933
934 T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
935 T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
936 in05[i] = _mm_packs_epi32(T00, T01);
937
938 T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
939 T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
940 in06[i] = _mm_packs_epi32(T00, T01);
941
942 T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
943 T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
944 in07[i] = _mm_packs_epi32(T00, T01);
945
946 T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
947 T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
948 in08[i] = _mm_packs_epi32(T00, T01);
949
950 T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
951 T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
952 in09[i] = _mm_packs_epi32(T00, T01);
953
954 T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
955 T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
956 in10[i] = _mm_packs_epi32(T00, T01);
957
958 T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
959 T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
960 in11[i] = _mm_packs_epi32(T00, T01);
961
962 T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
963 T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
964 in12[i] = _mm_packs_epi32(T00, T01);
965
966 T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
967 T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
968 in13[i] = _mm_packs_epi32(T00, T01);
969
970 T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
971 T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
972 in14[i] = _mm_packs_epi32(T00, T01);
973
974 T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
975 T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
976 in15[i] = _mm_packs_epi32(T00, T01);
977
978 T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
979 T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
980 in16[i] = _mm_packs_epi32(T00, T01);
981
982 T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
983 T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
984 in17[i] = _mm_packs_epi32(T00, T01);
985
986 T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
987 T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
988 in18[i] = _mm_packs_epi32(T00, T01);
989
990 T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
991 T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
992 in19[i] = _mm_packs_epi32(T00, T01);
993
994 T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
995 T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
996 in20[i] = _mm_packs_epi32(T00, T01);
997
998 T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
999 T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
1000 in21[i] = _mm_packs_epi32(T00, T01);
1001
1002 T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
1003 T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
1004 in22[i] = _mm_packs_epi32(T00, T01);
1005
1006 T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
1007 T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
1008 in23[i] = _mm_packs_epi32(T00, T01);
1009
1010 T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
1011 T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
1012 in24[i] = _mm_packs_epi32(T00, T01);
1013
1014 T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
1015 T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
1016 in25[i] = _mm_packs_epi32(T00, T01);
1017
1018 T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
1019 T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
1020 in26[i] = _mm_packs_epi32(T00, T01);
1021
1022 T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
1023 T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
1024 in27[i] = _mm_packs_epi32(T00, T01);
1025
1026 T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
1027 T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
1028 in28[i] = _mm_packs_epi32(T00, T01);
1029
1030 T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
1031 T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
1032 in29[i] = _mm_packs_epi32(T00, T01);
1033
1034 T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
1035 T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
1036 in30[i] = _mm_packs_epi32(T00, T01);
1037
1038 T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
1039 T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
1040 in31[i] = _mm_packs_epi32(T00, T01);
1041 }
1042
1043 for (int pass = 0; pass < 2; pass++)
1044 {
1045 if (pass == 1)
1046 {
1047 c32_rnd = _mm_set1_epi32(2048);
1048 nShift = 12;
1049 }
1050
1051 for (int part = 0; part < 4; part++)
1052 {
1053 const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10]
1054 const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14]
1055 const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ]
1056 const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ]
1057 const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ]
1058 const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ]
1059 const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ]
1060 const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ]
1061 const __m128i T_00_04A = _mm_unpacklo_epi16(in17[part], in19[part]); // [ ]
1062 const __m128i T_00_04B = _mm_unpackhi_epi16(in17[part], in19[part]); // [ ]
1063 const __m128i T_00_05A = _mm_unpacklo_epi16(in21[part], in23[part]); // [ ]
1064 const __m128i T_00_05B = _mm_unpackhi_epi16(in21[part], in23[part]); // [ ]
1065 const __m128i T_00_06A = _mm_unpacklo_epi16(in25[part], in27[part]); // [ ]
1066 const __m128i T_00_06B = _mm_unpackhi_epi16(in25[part], in27[part]); // [ ]
1067 const __m128i T_00_07A = _mm_unpacklo_epi16(in29[part], in31[part]); //
1068 const __m128i T_00_07B = _mm_unpackhi_epi16(in29[part], in31[part]); // [ ]
1069
1070 const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ]
1071 const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ]
1072 const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ]
1073 const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ]
1074 const __m128i T_00_10A = _mm_unpacklo_epi16(in18[part], in22[part]); // [ ]
1075 const __m128i T_00_10B = _mm_unpackhi_epi16(in18[part], in22[part]); // [ ]
1076 const __m128i T_00_11A = _mm_unpacklo_epi16(in26[part], in30[part]); // [ ]
1077 const __m128i T_00_11B = _mm_unpackhi_epi16(in26[part], in30[part]); // [ ]
1078
1079 const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]
1080 const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ]
1081 const __m128i T_00_13A = _mm_unpacklo_epi16(in20[part], in28[part]); // [ ]
1082 const __m128i T_00_13B = _mm_unpackhi_epi16(in20[part], in28[part]); // [ ]
1083
1084 const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], in24[part]); //
1085 const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], in24[part]); // [ ]
1086 const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], in16[part]); //
1087 const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], in16[part]); // [ ]
1088
1089 __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A;
1090 __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B;
1091 {
1092 __m128i T00, T01, T02, T03;
1093 #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \
1094 T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \
1095 T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \
1096 T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \
1097 T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \
1098 row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03));
1099
1100 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1101 c16_p90_p90, c16_p85_p88, c16_p78_p82, c16_p67_p73, c16_p54_p61, c16_p38_p46, c16_p22_p31, c16_p04_p13, O00A)
1102 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1103 c16_p82_p90, c16_p46_p67, c16_n04_p22, c16_n54_n31, c16_n85_n73, c16_n88_n90, c16_n61_n78, c16_n13_n38, O01A)
1104 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1105 c16_p67_p88, c16_n13_p31, c16_n82_n54, c16_n78_n90, c16_n04_n46, c16_p73_p38, c16_p85_p90, c16_p22_p61, O02A)
1106 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1107 c16_p46_p85, c16_n67_n13, c16_n73_n90, c16_p38_n22, c16_p88_p82, c16_n04_p54, c16_n90_n61, c16_n31_n78, O03A)
1108 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1109 c16_p22_p82, c16_n90_n54, c16_p13_n61, c16_p85_p78, c16_n46_p31, c16_n67_n90, c16_p73_p04, c16_p38_p88, O04A)
1110 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1111 c16_n04_p78, c16_n73_n82, c16_p85_p13, c16_n22_p67, c16_n61_n88, c16_p90_p31, c16_n38_p54, c16_n46_n90, O05A)
1112 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1113 c16_n31_p73, c16_n22_n90, c16_p67_p78, c16_n90_n38, c16_p82_n13, c16_n46_p61, c16_n04_n88, c16_p54_p85, O06A)
1114 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1115 c16_n54_p67, c16_p38_n78, c16_n22_p85, c16_p04_n90, c16_p13_p90, c16_n31_n88, c16_p46_p82, c16_n61_n73, O07A)
1116 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1117 c16_n73_p61, c16_p82_n46, c16_n88_p31, c16_p90_n13, c16_n90_n04, c16_p85_p22, c16_n78_n38, c16_p67_p54, O08A)
1118 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1119 c16_n85_p54, c16_p88_n04, c16_n61_n46, c16_p13_p82, c16_p38_n90, c16_n78_p67, c16_p90_n22, c16_n73_n31, O09A)
1120 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1121 c16_n90_p46, c16_p54_p38, c16_p31_n90, c16_n88_p61, c16_p67_p22, c16_p13_n85, c16_n82_p73, c16_p78_p04, O10A)
1122 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1123 c16_n88_p38, c16_n04_p73, c16_p90_n67, c16_n31_n46, c16_n78_p85, c16_p61_p13, c16_p54_n90, c16_n82_p22, O11A)
1124 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1125 c16_n78_p31, c16_n61_p90, c16_p54_p04, c16_p82_n88, c16_n22_n38, c16_n90_p73, c16_n13_p67, c16_p85_n46, O12A)
1126 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1127 c16_n61_p22, c16_n90_p85, c16_n38_p73, c16_p46_n04, c16_p90_n78, c16_p54_n82, c16_n31_n13, c16_n88_p67, O13A)
1128 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1129 c16_n38_p13, c16_n78_p61, c16_n90_p88, c16_n73_p85, c16_n31_p54, c16_p22_p04, c16_p67_n46, c16_p90_n82, O14A)
1130 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
1131 c16_n13_p04, c16_n31_p22, c16_n46_p38, c16_n61_p54, c16_n73_p67, c16_n82_p78, c16_n88_p85, c16_n90_p90, O15A)
1132
1133 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1134 c16_p90_p90, c16_p85_p88, c16_p78_p82, c16_p67_p73, c16_p54_p61, c16_p38_p46, c16_p22_p31, c16_p04_p13, O00B)
1135 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1136 c16_p82_p90, c16_p46_p67, c16_n04_p22, c16_n54_n31, c16_n85_n73, c16_n88_n90, c16_n61_n78, c16_n13_n38, O01B)
1137 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1138 c16_p67_p88, c16_n13_p31, c16_n82_n54, c16_n78_n90, c16_n04_n46, c16_p73_p38, c16_p85_p90, c16_p22_p61, O02B)
1139 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1140 c16_p46_p85, c16_n67_n13, c16_n73_n90, c16_p38_n22, c16_p88_p82, c16_n04_p54, c16_n90_n61, c16_n31_n78, O03B)
1141 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1142 c16_p22_p82, c16_n90_n54, c16_p13_n61, c16_p85_p78, c16_n46_p31, c16_n67_n90, c16_p73_p04, c16_p38_p88, O04B)
1143 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1144 c16_n04_p78, c16_n73_n82, c16_p85_p13, c16_n22_p67, c16_n61_n88, c16_p90_p31, c16_n38_p54, c16_n46_n90, O05B)
1145 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1146 c16_n31_p73, c16_n22_n90, c16_p67_p78, c16_n90_n38, c16_p82_n13, c16_n46_p61, c16_n04_n88, c16_p54_p85, O06B)
1147 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1148 c16_n54_p67, c16_p38_n78, c16_n22_p85, c16_p04_n90, c16_p13_p90, c16_n31_n88, c16_p46_p82, c16_n61_n73, O07B)
1149 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1150 c16_n73_p61, c16_p82_n46, c16_n88_p31, c16_p90_n13, c16_n90_n04, c16_p85_p22, c16_n78_n38, c16_p67_p54, O08B)
1151 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1152 c16_n85_p54, c16_p88_n04, c16_n61_n46, c16_p13_p82, c16_p38_n90, c16_n78_p67, c16_p90_n22, c16_n73_n31, O09B)
1153 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1154 c16_n90_p46, c16_p54_p38, c16_p31_n90, c16_n88_p61, c16_p67_p22, c16_p13_n85, c16_n82_p73, c16_p78_p04, O10B)
1155 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1156 c16_n88_p38, c16_n04_p73, c16_p90_n67, c16_n31_n46, c16_n78_p85, c16_p61_p13, c16_p54_n90, c16_n82_p22, O11B)
1157 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1158 c16_n78_p31, c16_n61_p90, c16_p54_p04, c16_p82_n88, c16_n22_n38, c16_n90_p73, c16_n13_p67, c16_p85_n46, O12B)
1159 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1160 c16_n61_p22, c16_n90_p85, c16_n38_p73, c16_p46_n04, c16_p90_n78, c16_p54_n82, c16_n31_n13, c16_n88_p67, O13B)
1161 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1162 c16_n38_p13, c16_n78_p61, c16_n90_p88, c16_n73_p85, c16_n31_p54, c16_p22_p04, c16_p67_n46, c16_p90_n82, O14B)
1163 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1164 c16_n13_p04, c16_n31_p22, c16_n46_p38, c16_n61_p54, c16_n73_p67, c16_n82_p78, c16_n88_p85, c16_n90_p90, O15B)
1165
1166 #undef COMPUTE_ROW
1167 }
1168
1169 __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A;
1170 __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B;
1171 {
1172 __m128i T00, T01;
1173 #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \
1174 T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \
1175 T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \
1176 row = _mm_add_epi32(T00, T01);
1177
1178 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, EO0A)
1179 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, EO1A)
1180 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, EO2A)
1181 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, EO3A)
1182 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, EO4A)
1183 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, EO5A)
1184 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, EO6A)
1185 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, EO7A)
1186
1187 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, EO0B)
1188 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, EO1B)
1189 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, EO2B)
1190 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, EO3B)
1191 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, EO4B)
1192 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, EO5B)
1193 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, EO6B)
1194 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, EO7B)
1195 #undef COMPUTE_ROW
1196 }
1197
1198 const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p75_p89), _mm_madd_epi16(T_00_13A, c16_p18_p50)); // EEO0
1199 const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p75_p89), _mm_madd_epi16(T_00_13B, c16_p18_p50));
1200 const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n18_p75), _mm_madd_epi16(T_00_13A, c16_n50_n89)); // EEO1
1201 const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n18_p75), _mm_madd_epi16(T_00_13B, c16_n50_n89));
1202 const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n89_p50), _mm_madd_epi16(T_00_13A, c16_p75_p18)); // EEO2
1203 const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n89_p50), _mm_madd_epi16(T_00_13B, c16_p75_p18));
1204 const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n50_p18), _mm_madd_epi16(T_00_13A, c16_n89_p75)); // EEO3
1205 const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n50_p18), _mm_madd_epi16(T_00_13B, c16_n89_p75));
1206
1207 const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p36_p83);
1208 const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p36_p83);
1209 const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n83_p36);
1210 const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n83_p36);
1211
1212 const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p64_p64);
1213 const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p64_p64);
1214 const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n64_p64);
1215 const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n64_p64);
1216
1217 const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0
1218 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B);
1219 const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1
1220 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B);
1221 const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0
1222 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B);
1223 const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1
1224 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B);
1225
1226 const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0
1227 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
1228 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1
1229 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
1230 const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0
1231 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B);
1232 const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1
1233 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B);
1234 const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0
1235 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B);
1236 const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1
1237 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B);
1238 const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0
1239 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B);
1240 const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1
1241 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B);
1242
1243 const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0
1244 const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
1245 const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1
1246 const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
1247 const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2
1248 const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
1249 const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3
1250 const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
1251 const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 =
1252 const __m128i E4B = _mm_add_epi32(EE4B, EO4B);
1253 const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 =
1254 const __m128i E5B = _mm_add_epi32(EE5B, EO5B);
1255 const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 =
1256 const __m128i E6B = _mm_add_epi32(EE6B, EO6B);
1257 const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 =
1258 const __m128i E7B = _mm_add_epi32(EE7B, EO7B);
1259 const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0
1260 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B);
1261 const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1
1262 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B);
1263 const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2
1264 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B);
1265 const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3
1266 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B);
1267 const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB =
1268 const __m128i EBB = _mm_sub_epi32(EE4B, EO4B);
1269 const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA =
1270 const __m128i EAB = _mm_sub_epi32(EE5B, EO5B);
1271 const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 =
1272 const __m128i E9B = _mm_sub_epi32(EE6B, EO6B);
1273 const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 =
1274 const __m128i E8B = _mm_sub_epi32(EE7B, EO7B);
1275
1276 const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd
1277 const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
1278 const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd
1279 const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
1280 const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd
1281 const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
1282 const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd
1283 const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
1284 const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd
1285 const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
1286 const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd
1287 const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
1288 const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd
1289 const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
1290 const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd
1291 const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
1292 const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd
1293 const __m128i T18B = _mm_add_epi32(E8B, c32_rnd);
1294 const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd
1295 const __m128i T19B = _mm_add_epi32(E9B, c32_rnd);
1296 const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd
1297 const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd);
1298 const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd
1299 const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd);
1300 const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd
1301 const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd);
1302 const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd
1303 const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd);
1304 const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd
1305 const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd);
1306 const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd
1307 const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd);
1308
1309 const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd
1310 const __m128i T2_00B = _mm_add_epi32(T10B, O00B);
1311 const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd
1312 const __m128i T2_01B = _mm_add_epi32(T11B, O01B);
1313 const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd
1314 const __m128i T2_02B = _mm_add_epi32(T12B, O02B);
1315 const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd
1316 const __m128i T2_03B = _mm_add_epi32(T13B, O03B);
1317 const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4
1318 const __m128i T2_04B = _mm_add_epi32(T14B, O04B);
1319 const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5
1320 const __m128i T2_05B = _mm_add_epi32(T15B, O05B);
1321 const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6
1322 const __m128i T2_06B = _mm_add_epi32(T16B, O06B);
1323 const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7
1324 const __m128i T2_07B = _mm_add_epi32(T17B, O07B);
1325 const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8
1326 const __m128i T2_08B = _mm_add_epi32(T18B, O08B);
1327 const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9
1328 const __m128i T2_09B = _mm_add_epi32(T19B, O09B);
1329 const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10
1330 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B);
1331 const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11
1332 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B);
1333 const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12
1334 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B);
1335 const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13
1336 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B);
1337 const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14
1338 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B);
1339 const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15
1340 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B);
1341 const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd
1342 const __m128i T2_31B = _mm_sub_epi32(T10B, O00B);
1343 const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd
1344 const __m128i T2_30B = _mm_sub_epi32(T11B, O01B);
1345 const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd
1346 const __m128i T2_29B = _mm_sub_epi32(T12B, O02B);
1347 const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd
1348 const __m128i T2_28B = _mm_sub_epi32(T13B, O03B);
1349 const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4
1350 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B);
1351 const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5
1352 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B);
1353 const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6
1354 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B);
1355 const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7
1356 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B);
1357 const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); //
1358 const __m128i T2_23B = _mm_sub_epi32(T18B, O08B);
1359 const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); //
1360 const __m128i T2_22B = _mm_sub_epi32(T19B, O09B);
1361 const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); //
1362 const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B);
1363 const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); //
1364 const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B);
1365 const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); //
1366 const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B);
1367 const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); //
1368 const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B);
1369 const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); //
1370 const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B);
1371 const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); //
1372 const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B);
1373
1374 const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00]
1375 const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40]
1376 const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01]
1377 const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41]
1378 const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02]
1379 const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42]
1380 const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03]
1381 const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43]
1382 const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04]
1383 const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44]
1384 const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05]
1385 const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45]
1386 const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06]
1387 const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46]
1388 const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07]
1389 const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47]
1390 const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8
1391 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40]
1392 const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9
1393 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41]
1394 const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA
1395 const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42]
1396 const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB
1397 const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43]
1398 const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC
1399 const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44]
1400 const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD
1401 const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45]
1402 const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE
1403 const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46]
1404 const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF
1405 const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47]
1406
1407 const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00]
1408 const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40]
1409 const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01]
1410 const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41]
1411 const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02]
1412 const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42]
1413 const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03]
1414 const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43]
1415 const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04]
1416 const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44]
1417 const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05]
1418 const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45]
1419 const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06]
1420 const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46]
1421 const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07]
1422 const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47]
1423 const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8
1424 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40]
1425 const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9
1426 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41]
1427 const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA
1428 const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42]
1429 const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB
1430 const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43]
1431 const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC
1432 const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44]
1433 const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD
1434 const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45]
1435 const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE
1436 const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46]
1437 const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF
1438 const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47]
1439
1440 res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00]
1441 res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01]
1442 res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02]
1443 res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03]
1444 res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04]
1445 res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05]
1446 res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06]
1447 res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07]
1448 res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80]
1449 res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81]
1450 res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82]
1451 res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83]
1452 res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84]
1453 res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85]
1454 res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86]
1455 res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87]
1456 res16[part] = _mm_packs_epi32(T3_16A, T3_16B);
1457 res17[part] = _mm_packs_epi32(T3_17A, T3_17B);
1458 res18[part] = _mm_packs_epi32(T3_18A, T3_18B);
1459 res19[part] = _mm_packs_epi32(T3_19A, T3_19B);
1460 res20[part] = _mm_packs_epi32(T3_20A, T3_20B);
1461 res21[part] = _mm_packs_epi32(T3_21A, T3_21B);
1462 res22[part] = _mm_packs_epi32(T3_22A, T3_22B);
1463 res23[part] = _mm_packs_epi32(T3_23A, T3_23B);
1464 res24[part] = _mm_packs_epi32(T3_24A, T3_24B);
1465 res25[part] = _mm_packs_epi32(T3_25A, T3_25B);
1466 res26[part] = _mm_packs_epi32(T3_26A, T3_26B);
1467 res27[part] = _mm_packs_epi32(T3_27A, T3_27B);
1468 res28[part] = _mm_packs_epi32(T3_28A, T3_28B);
1469 res29[part] = _mm_packs_epi32(T3_29A, T3_29B);
1470 res30[part] = _mm_packs_epi32(T3_30A, T3_30B);
1471 res31[part] = _mm_packs_epi32(T3_31A, T3_31B);
1472 }
1473 //transpose matrix 8x8 16bit.
1474 {
1475 __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
1476 __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
1477 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
1478 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
1479 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
1480 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
1481 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
1482 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
1483 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
1484 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
1485 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
1486 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
1487 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
1488 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
1489 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
1490 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
1491 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
1492 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
1493 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
1494 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
1495 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
1496 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
1497 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
1498 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
1499 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
1500 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
1501 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
1502
1503 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
1504 TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
1505 TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0])
1506 TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0])
1507
1508 TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
1509 TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
1510 TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1])
1511 TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1])
1512
1513 TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2])
1514 TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2])
1515 TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2])
1516 TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2])
1517
1518 TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3])
1519 TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3])
1520 TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3])
1521 TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3])
1522
1523 #undef TRANSPOSE_8x8_16BIT
1524 }
1525 }
1526
1527 // Add
1528 for (int i = 0; i < 2; i++)
1529 {
1530 #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
1531 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
1532 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
1533 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
1534 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
1535 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
1536 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
1537 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
1538 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
1539 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
1540 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
1541 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
1542 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
1543 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
1544 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
1545 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
1546 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
1547
1548 const int k = i * 2;
1549 STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16)
1550 STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k], in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16)
1551 STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k], in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16)
1552 STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k], in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16)
1553 #undef STORE_LINE
1554 }
1555 }
1556
1557 #endif // if !HIGH_BIT_DEPTH
1558 }
1559
namespace x265 {
// Register the SSE3 intrinsic inverse-DCT implementations (defined above in
// this file) into the encoder's primitive function table.  Only built for
// 8-bit pixel configurations (see HIGH_BIT_DEPTH guard below).
void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives &p)
{
    /* Note: We have AVX2 assembly for these functions, but since AVX2 is
     * still somewhat rare on end-user PCs we still compile and link these SSE3
     * intrinsic SIMD functions */
#if !HIGH_BIT_DEPTH
    p.idct[IDCT_8x8] = idct8;     // 8x8 inverse transform
    p.idct[IDCT_16x16] = idct16;  // 16x16 inverse transform
    p.idct[IDCT_32x32] = idct32;  // 32x32 inverse transform
#endif
}
}