1/*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Min Chen <min.chen@multicorewareinc.com>
10 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11 * Nabajit Deka <nabajit@multicorewareinc.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 *
27 * This program is also available under a commercial proprietary license.
28 * For more information, contact us at license @ x265.com.
29 *****************************************************************************/
30
31#include "common.h"
32#include "primitives.h"
33#include <xmmintrin.h> // SSE
34#include <pmmintrin.h> // SSE3
35
36using namespace x265;
37
38namespace {
39#if !HIGH_BIT_DEPTH
40ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
41{
42 { 89, 75, 89, 75, 89, 75, 89, 75 },
43 { 50, 18, 50, 18, 50, 18, 50, 18 },
44 { 75, -18, 75, -18, 75, -18, 75, -18 },
45 { -89, -50, -89, -50, -89, -50, -89, -50 },
46 { 50, -89, 50, -89, 50, -89, 50, -89 },
47 { 18, 75, 18, 75, 18, 75, 18, 75 },
48 { 18, -50, 18, -50, 18, -50, 18, -50 },
49 { 75, -89, 75, -89, 75, -89, 75, -89 },
50 { 64, 64, 64, 64, 64, 64, 64, 64 },
51 { 64, -64, 64, -64, 64, -64, 64, -64 },
52 { 83, 36, 83, 36, 83, 36, 83, 36 },
53 { 36, -83, 36, -83, 36, -83, 36, -83 }
54};
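// Each row of tab_idct_8x8 holds one pair of HEVC inverse-transform coefficients
// repeated four times, so a single _mm_madd_epi16 against an unpacked pair of input
// rows produces four 32-bit partial sums at once. Row 0 ({89, 75}), for example,
// multiplied against _mm_unpacklo_epi16(src_row1, src_row3) yields
// 89 * src[8 + i] + 75 * src[24 + i] for i = 0..3, the first half of the odd term O[0].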
void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
56{
57 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
58 __m128i T00, T01, T02, T03, T04, T05, T06, T07;
59
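    // Pass 1 rounding: add 1 << (7 - 1) = 64 before the >> 7 shift; pass 2 below uses
    // 2048 and a shift of 12, the HEVC inverse-transform values for 8-bit depth.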
60 m128iAdd = _mm_set1_epi32(64);
61
62 m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
63 m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
64 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
65 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
66 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
67 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
68
69 m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
70 m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
71 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
72 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
73 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
74 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
75 O0l = _mm_add_epi32(E1l, E2l);
76 O0h = _mm_add_epi32(E1h, E2h);
77
78 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
79 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
80 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
81 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
82
83 O1l = _mm_add_epi32(E1l, E2l);
84 O1h = _mm_add_epi32(E1h, E2h);
85
86 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
87 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
88 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
89 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
90 O2l = _mm_add_epi32(E1l, E2l);
91 O2h = _mm_add_epi32(E1h, E2h);
92
93 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
94 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
95 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
96 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
97 O3h = _mm_add_epi32(E1h, E2h);
98 O3l = _mm_add_epi32(E1l, E2l);
99
100 /* ------- */
101
102 m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
103 m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
104 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
105 EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
106 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
107 EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
108
109 EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
110 EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
111
112 /* ------- */
113
114 m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
115 m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
116 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
117 E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
118 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
119 E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
120 E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
121 E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
122 E0l = _mm_add_epi32(EE0l, E00l);
123 E0l = _mm_add_epi32(E0l, m128iAdd);
124 E0h = _mm_add_epi32(EE0h, E00h);
125 E0h = _mm_add_epi32(E0h, m128iAdd);
126 E3l = _mm_sub_epi32(EE0l, E00l);
127 E3l = _mm_add_epi32(E3l, m128iAdd);
128 E3h = _mm_sub_epi32(EE0h, E00h);
129 E3h = _mm_add_epi32(E3h, m128iAdd);
130
131 E1l = _mm_add_epi32(EE1l, E01l);
132 E1l = _mm_add_epi32(E1l, m128iAdd);
133 E1h = _mm_add_epi32(EE1h, E01h);
134 E1h = _mm_add_epi32(E1h, m128iAdd);
135 E2l = _mm_sub_epi32(EE1l, E01l);
136 E2l = _mm_add_epi32(E2l, m128iAdd);
137 E2h = _mm_sub_epi32(EE1h, E01h);
138 E2h = _mm_add_epi32(E2h, m128iAdd);
139 m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7));
140 m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7));
141 m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7));
142 m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7));
143 m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7));
144 m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7));
145 m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7));
146 m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7));
    /* Transpose the 8x8 block between the two passes */
148
149 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
150 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
151 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
152 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
153 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
154 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
155 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
156 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
157 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
158 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
159 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
160 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
161 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
162 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
163 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
164 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
165 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
166 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
167 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
168 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
169 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
170 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
171 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
172 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
173
174 m128iAdd = _mm_set1_epi32(2048);
175
176 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
177 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
178 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
179 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
180 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
181 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
182 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
183 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
184 O0l = _mm_add_epi32(E1l, E2l);
185 O0h = _mm_add_epi32(E1h, E2h);
186 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
187 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
188 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
189 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
190 O1l = _mm_add_epi32(E1l, E2l);
191 O1h = _mm_add_epi32(E1h, E2h);
192 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
193 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
194 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
195 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
196 O2l = _mm_add_epi32(E1l, E2l);
197 O2h = _mm_add_epi32(E1h, E2h);
198 E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
199 E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
200 E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
201 E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
202 O3h = _mm_add_epi32(E1h, E2h);
203 O3l = _mm_add_epi32(E1l, E2l);
204
205 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
206 EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
207 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
208 EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
209 EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
210 EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
211
212 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
213 E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
214 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
215 E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
216 E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
217 E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
218 E0l = _mm_add_epi32(EE0l, E00l);
219 E0l = _mm_add_epi32(E0l, m128iAdd);
220 E0h = _mm_add_epi32(EE0h, E00h);
221 E0h = _mm_add_epi32(E0h, m128iAdd);
222 E3l = _mm_sub_epi32(EE0l, E00l);
223 E3l = _mm_add_epi32(E3l, m128iAdd);
224 E3h = _mm_sub_epi32(EE0h, E00h);
225 E3h = _mm_add_epi32(E3h, m128iAdd);
226 E1l = _mm_add_epi32(EE1l, E01l);
227 E1l = _mm_add_epi32(E1l, m128iAdd);
228 E1h = _mm_add_epi32(EE1h, E01h);
229 E1h = _mm_add_epi32(E1h, m128iAdd);
230 E2l = _mm_sub_epi32(EE1l, E01l);
231 E2l = _mm_add_epi32(E2l, m128iAdd);
232 E2h = _mm_sub_epi32(EE1h, E01h);
233 E2h = _mm_add_epi32(E2h, m128iAdd);
234
235 m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12));
236 m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12));
237 m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12));
238 m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12));
239 m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12));
240 m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12));
241 m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12));
242 m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12));
243
244 // [07 06 05 04 03 02 01 00]
245 // [17 16 15 14 13 12 11 10]
246 // [27 26 25 24 23 22 21 20]
247 // [37 36 35 34 33 32 31 30]
248 // [47 46 45 44 43 42 41 40]
249 // [57 56 55 54 53 52 51 50]
250 // [67 66 65 64 63 62 61 60]
251 // [77 76 75 74 73 72 71 70]
252
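    // Final transpose and store: the epi16/epi32 unpacks below regroup the eight packed
    // rows so that each 64-bit half holds four consecutive output samples, which are then
    // written to dst half a row at a time using the caller's stride (in int16_t units).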
253 T00 = _mm_unpacklo_epi16(m128iS0, m128iS1); // [13 03 12 02 11 01 10 00]
254 T01 = _mm_unpackhi_epi16(m128iS0, m128iS1); // [17 07 16 06 15 05 14 04]
255 T02 = _mm_unpacklo_epi16(m128iS2, m128iS3); // [33 23 32 22 31 21 30 20]
256 T03 = _mm_unpackhi_epi16(m128iS2, m128iS3); // [37 27 36 26 35 25 34 24]
257 T04 = _mm_unpacklo_epi16(m128iS4, m128iS5); // [53 43 52 42 51 41 50 40]
258 T05 = _mm_unpackhi_epi16(m128iS4, m128iS5); // [57 47 56 46 55 45 54 44]
259 T06 = _mm_unpacklo_epi16(m128iS6, m128iS7); // [73 63 72 62 71 61 70 60]
260 T07 = _mm_unpackhi_epi16(m128iS6, m128iS7); // [77 67 76 66 75 65 74 64]
261
262 __m128i T10, T11;
263 T10 = _mm_unpacklo_epi32(T00, T02); // [31 21 11 01 30 20 10 00]
264 T11 = _mm_unpackhi_epi32(T00, T02); // [33 23 13 03 32 22 12 02]
265 _mm_storel_epi64((__m128i*)&dst[0 * stride + 0], T10); // [30 20 10 00]
266 _mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(T10)); // [31 21 11 01]
267 _mm_storel_epi64((__m128i*)&dst[2 * stride + 0], T11); // [32 22 12 02]
268 _mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(T11)); // [33 23 13 03]
269
270 T10 = _mm_unpacklo_epi32(T04, T06); // [71 61 51 41 70 60 50 40]
271 T11 = _mm_unpackhi_epi32(T04, T06); // [73 63 53 43 72 62 52 42]
272 _mm_storel_epi64((__m128i*)&dst[0 * stride + 4], T10);
273 _mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(T10));
274 _mm_storel_epi64((__m128i*)&dst[2 * stride + 4], T11);
275 _mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(T11));
276
277 T10 = _mm_unpacklo_epi32(T01, T03); // [35 25 15 05 34 24 14 04]
278 T11 = _mm_unpackhi_epi32(T01, T03); // [37 27 17 07 36 26 16 06]
279 _mm_storel_epi64((__m128i*)&dst[4 * stride + 0], T10);
280 _mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(T10));
281 _mm_storel_epi64((__m128i*)&dst[6 * stride + 0], T11);
282 _mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(T11));
283
284 T10 = _mm_unpacklo_epi32(T05, T07); // [75 65 55 45 74 64 54 44]
    T11 = _mm_unpackhi_epi32(T05, T07); // [77 67 57 47 76 66 56 46]
286 _mm_storel_epi64((__m128i*)&dst[4 * stride + 4], T10);
287 _mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(T10));
288 _mm_storel_epi64((__m128i*)&dst[6 * stride + 4], T11);
289 _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
290}
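// For reference, the vector routine above matches a plain scalar partial butterfly
// applied twice: once per column with shift 7, then, after the in-register transpose,
// once more with shift 12, each pass saturating to int16_t exactly as _mm_packs_epi32
// does. The helper below is an illustrative sketch only -- it is not registered in the
// x265 primitive table and nothing in this file calls it; idct8_ref_pass and g_t8_ref
// are placeholder names, not existing x265 identifiers.
inline void idct8_ref_pass(const int16_t* src, int16_t* dst, int shift)
{
    // HEVC 8x8 transform matrix; the odd and even entries are the same constants
    // packed pairwise into tab_idct_8x8 above.
    static const int16_t g_t8_ref[8][8] =
    {
        { 64,  64,  64,  64,  64,  64,  64,  64 },
        { 89,  75,  50,  18, -18, -50, -75, -89 },
        { 83,  36, -36, -83, -83, -36,  36,  83 },
        { 75, -18, -89, -50,  50,  89,  18, -75 },
        { 64, -64, -64,  64,  64, -64, -64,  64 },
        { 50, -89,  18,  75, -75, -18,  89, -50 },
        { 36, -83,  83, -36, -36,  83, -83,  36 },
        { 18, -50,  75, -89,  89, -75,  50, -18 },
    };
    const int add = 1 << (shift - 1);

    for (int col = 0; col < 8; col++)
    {
        for (int row = 0; row < 8; row++)
        {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += g_t8_ref[k][row] * src[k * 8 + col]; // transform applied down each column

            int val = (sum + add) >> shift;
            if (val < -32768) val = -32768; // saturate like _mm_packs_epi32
            if (val > 32767)  val = 32767;
            dst[row * 8 + col] = (int16_t)val;
        }
    }
}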
291
void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
293{
    const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); // row0: 90 in the low halves, 87 in the high halves
295 const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
296 const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039);
297 const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019);
298 const __m128i c16_p57_p87 = _mm_set1_epi32(0x00390057); //row1
299 const __m128i c16_n43_p09 = _mm_set1_epi32(0xFFD50009);
300 const __m128i c16_n90_n80 = _mm_set1_epi32(0xFFA6FFB0);
301 const __m128i c16_n25_n70 = _mm_set1_epi32(0xFFE7FFBA);
302 const __m128i c16_p09_p80 = _mm_set1_epi32(0x00090050); //row2
303 const __m128i c16_n87_n70 = _mm_set1_epi32(0xFFA9FFBA);
304 const __m128i c16_p57_n25 = _mm_set1_epi32(0x0039FFE7);
305 const __m128i c16_p43_p90 = _mm_set1_epi32(0x002B005A);
306 const __m128i c16_n43_p70 = _mm_set1_epi32(0xFFD50046); //row3
307 const __m128i c16_p09_n87 = _mm_set1_epi32(0x0009FFA9);
308 const __m128i c16_p25_p90 = _mm_set1_epi32(0x0019005A);
309 const __m128i c16_n57_n80 = _mm_set1_epi32(0xFFC7FFB0);
310 const __m128i c16_n80_p57 = _mm_set1_epi32(0xFFB00039); //row4
311 const __m128i c16_p90_n25 = _mm_set1_epi32(0x005AFFE7);
312 const __m128i c16_n87_n09 = _mm_set1_epi32(0xFFA9FFF7);
313 const __m128i c16_p70_p43 = _mm_set1_epi32(0x0046002B);
314 const __m128i c16_n90_p43 = _mm_set1_epi32(0xFFA6002B); //row5
315 const __m128i c16_p25_p57 = _mm_set1_epi32(0x00190039);
316 const __m128i c16_p70_n87 = _mm_set1_epi32(0x0046FFA9);
317 const __m128i c16_n80_p09 = _mm_set1_epi32(0xFFB00009);
318 const __m128i c16_n70_p25 = _mm_set1_epi32(0xFFBA0019); //row6
319 const __m128i c16_n80_p90 = _mm_set1_epi32(0xFFB0005A);
320 const __m128i c16_p09_p43 = _mm_set1_epi32(0x0009002B);
321 const __m128i c16_p87_n57 = _mm_set1_epi32(0x0057FFC7);
322 const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row7
323 const __m128i c16_n57_p43 = _mm_set1_epi32(0xFFC7002B);
324 const __m128i c16_n80_p70 = _mm_set1_epi32(0xFFB00046);
325 const __m128i c16_n90_p87 = _mm_set1_epi32(0xFFA60057);
326
327 const __m128i c16_p75_p89 = _mm_set1_epi32(0x004B0059);
328 const __m128i c16_p18_p50 = _mm_set1_epi32(0x00120032);
329 const __m128i c16_n18_p75 = _mm_set1_epi32(0xFFEE004B);
330 const __m128i c16_n50_n89 = _mm_set1_epi32(0xFFCEFFA7);
331 const __m128i c16_n89_p50 = _mm_set1_epi32(0xFFA70032);
332 const __m128i c16_p75_p18 = _mm_set1_epi32(0x004B0012);
333 const __m128i c16_n50_p18 = _mm_set1_epi32(0xFFCE0012);
334 const __m128i c16_n89_p75 = _mm_set1_epi32(0xFFA7004B);
335
336 const __m128i c16_p36_p83 = _mm_set1_epi32(0x00240053);
337 const __m128i c16_n83_p36 = _mm_set1_epi32(0xFFAD0024);
338
339 const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
340 const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
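    // Naming convention for the packed constants above: c16_pAA_pBB (n for negative)
    // stores BB in the low 16 bits and AA in the high 16 bits of every 32-bit lane, so
    // _mm_madd_epi16 against an unpacklo/unpackhi pair (x, y) computes BB*x + AA*y per
    // lane. For example, c16_p87_p90 (0x0057005A) applied to
    // _mm_unpacklo_epi16(in01, in03) gives 90*in01[i] + 87*in03[i].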
341 __m128i c32_rnd = _mm_set1_epi32(64);
342
343 int nShift = 7;
344
345 // DCT1
346 __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2];
347 __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2];
348 __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2];
349 __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2];
350
351 for (int i = 0; i < 2; i++)
352 {
353 const int offset = (i << 3);
354 in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]
355 in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]
356 in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]
357 in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]
358 in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]
359 in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]
360 in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]
361 in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]
362 in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
363 in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
364 in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
365 in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
366 in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
367 in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
368 in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
369 in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
370 }
371
372 for (int pass = 0; pass < 2; pass++)
373 {
374 if (pass == 1)
375 {
376 c32_rnd = _mm_set1_epi32(2048);
377 nShift = 12;
378 }
379
380 for (int part = 0; part < 2; part++)
381 {
382 const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10]
383 const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14]
384 const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ]
385 const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ]
386 const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ]
387 const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ]
388 const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ]
389 const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ]
390 const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ]
391 const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ]
392 const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ]
393 const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ]
394 const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]row
395 const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ]
            const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]); // [83 03 82 02 81 01 80 00] row08 row00
397 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]); // [87 07 86 06 85 05 84 04]
398
399 __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;
400 __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;
401 {
402 __m128i T00, T01;
403#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
404 T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
405 T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
406 row = _mm_add_epi32(T00, T01);
407
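                // COMPUTE_ROW builds one odd-part output of the 16-point butterfly for four
                // columns at a time: each _mm_madd_epi16 contributes two coefficient*sample
                // products per 32-bit lane and the three adds combine all eight odd inputs.
                // The first expansion below, for instance, computes
                // O[0] = 90*in01 + 87*in03 + 80*in05 + 70*in07 + 57*in09 + 43*in11 + 25*in13 + 9*in15.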
408 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)
409 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)
410 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)
411 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)
412 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)
413 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)
414 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)
415 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)
416
417 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)
418 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)
419 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)
420 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)
421 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)
422 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)
423 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)
424 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)
425#undef COMPUTE_ROW
426 }
427
428 __m128i EO0A, EO1A, EO2A, EO3A;
429 __m128i EO0B, EO1B, EO2B, EO3B;
430 EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0
431 EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));
432 EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1
433 EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));
434 EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2
435 EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));
436 EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3
437 EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));
438
439 __m128i EEO0A, EEO1A;
440 __m128i EEO0B, EEO1B;
441 EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);
442 EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);
443 EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);
444 EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);
445
446 __m128i EEE0A, EEE1A;
447 __m128i EEE0B, EEE1B;
448 EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);
449 EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);
450 EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);
451 EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);
452
453 const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0
454 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
455 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1
456 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
            const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE3 = EEE0 - EEO0
            const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);
            const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE2 = EEE1 - EEO1
            const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);
461
462 const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0
463 const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
464 const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1
465 const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
466 const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2
467 const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
468 const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3
469 const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
            const __m128i E7A = _mm_sub_epi32(EE0A, EO0A); // E7 = EE0 - EO0
            const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);
            const __m128i E6A = _mm_sub_epi32(EE1A, EO1A); // E6 = EE1 - EO1
            const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);
            const __m128i E5A = _mm_sub_epi32(EE2A, EO2A); // E5 = EE2 - EO2
            const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);
            const __m128i E4A = _mm_sub_epi32(EE3A, EO3A); // E4 = EE3 - EO3
            const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);
478
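            // Standard butterfly recombination for this pass: output row k is
            // (E[k] + O[k] + rnd) >> nShift and output row 15-k is (E[k] - O[k] + rnd) >> nShift,
            // which is what the T1x/T2x/T3x blocks below evaluate.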
479 const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd
480 const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
481 const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd
482 const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
483 const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd
484 const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
485 const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd
486 const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
487 const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd
488 const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
489 const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd
490 const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
491 const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd
492 const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
493 const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd
494 const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
495
496 const __m128i T20A = _mm_add_epi32(T10A, O0A); // E0 + O0 + rnd
497 const __m128i T20B = _mm_add_epi32(T10B, O0B);
498 const __m128i T21A = _mm_add_epi32(T11A, O1A); // E1 + O1 + rnd
499 const __m128i T21B = _mm_add_epi32(T11B, O1B);
500 const __m128i T22A = _mm_add_epi32(T12A, O2A); // E2 + O2 + rnd
501 const __m128i T22B = _mm_add_epi32(T12B, O2B);
502 const __m128i T23A = _mm_add_epi32(T13A, O3A); // E3 + O3 + rnd
503 const __m128i T23B = _mm_add_epi32(T13B, O3B);
504 const __m128i T24A = _mm_add_epi32(T14A, O4A); // E4
505 const __m128i T24B = _mm_add_epi32(T14B, O4B);
506 const __m128i T25A = _mm_add_epi32(T15A, O5A); // E5
507 const __m128i T25B = _mm_add_epi32(T15B, O5B);
508 const __m128i T26A = _mm_add_epi32(T16A, O6A); // E6
509 const __m128i T26B = _mm_add_epi32(T16B, O6B);
510 const __m128i T27A = _mm_add_epi32(T17A, O7A); // E7
511 const __m128i T27B = _mm_add_epi32(T17B, O7B);
512 const __m128i T2FA = _mm_sub_epi32(T10A, O0A); // E0 - O0 + rnd
513 const __m128i T2FB = _mm_sub_epi32(T10B, O0B);
514 const __m128i T2EA = _mm_sub_epi32(T11A, O1A); // E1 - O1 + rnd
515 const __m128i T2EB = _mm_sub_epi32(T11B, O1B);
516 const __m128i T2DA = _mm_sub_epi32(T12A, O2A); // E2 - O2 + rnd
517 const __m128i T2DB = _mm_sub_epi32(T12B, O2B);
518 const __m128i T2CA = _mm_sub_epi32(T13A, O3A); // E3 - O3 + rnd
519 const __m128i T2CB = _mm_sub_epi32(T13B, O3B);
520 const __m128i T2BA = _mm_sub_epi32(T14A, O4A); // E4
521 const __m128i T2BB = _mm_sub_epi32(T14B, O4B);
522 const __m128i T2AA = _mm_sub_epi32(T15A, O5A); // E5
523 const __m128i T2AB = _mm_sub_epi32(T15B, O5B);
524 const __m128i T29A = _mm_sub_epi32(T16A, O6A); // E6
525 const __m128i T29B = _mm_sub_epi32(T16B, O6B);
526 const __m128i T28A = _mm_sub_epi32(T17A, O7A); // E7
527 const __m128i T28B = _mm_sub_epi32(T17B, O7B);
528
529 const __m128i T30A = _mm_srai_epi32(T20A, nShift); // [30 20 10 00]
530 const __m128i T30B = _mm_srai_epi32(T20B, nShift); // [70 60 50 40]
531 const __m128i T31A = _mm_srai_epi32(T21A, nShift); // [31 21 11 01]
532 const __m128i T31B = _mm_srai_epi32(T21B, nShift); // [71 61 51 41]
533 const __m128i T32A = _mm_srai_epi32(T22A, nShift); // [32 22 12 02]
534 const __m128i T32B = _mm_srai_epi32(T22B, nShift); // [72 62 52 42]
535 const __m128i T33A = _mm_srai_epi32(T23A, nShift); // [33 23 13 03]
536 const __m128i T33B = _mm_srai_epi32(T23B, nShift); // [73 63 53 43]
            const __m128i T34A = _mm_srai_epi32(T24A, nShift); // [34 24 14 04]
538 const __m128i T34B = _mm_srai_epi32(T24B, nShift); // [74 64 54 44]
539 const __m128i T35A = _mm_srai_epi32(T25A, nShift); // [35 25 15 05]
540 const __m128i T35B = _mm_srai_epi32(T25B, nShift); // [75 65 55 45]
541 const __m128i T36A = _mm_srai_epi32(T26A, nShift); // [36 26 16 06]
542 const __m128i T36B = _mm_srai_epi32(T26B, nShift); // [76 66 56 46]
543 const __m128i T37A = _mm_srai_epi32(T27A, nShift); // [37 27 17 07]
544 const __m128i T37B = _mm_srai_epi32(T27B, nShift); // [77 67 57 47]
545
546 const __m128i T38A = _mm_srai_epi32(T28A, nShift); // [30 20 10 00] x8
547 const __m128i T38B = _mm_srai_epi32(T28B, nShift); // [70 60 50 40]
548 const __m128i T39A = _mm_srai_epi32(T29A, nShift); // [31 21 11 01] x9
549 const __m128i T39B = _mm_srai_epi32(T29B, nShift); // [71 61 51 41]
550 const __m128i T3AA = _mm_srai_epi32(T2AA, nShift); // [32 22 12 02] xA
551 const __m128i T3AB = _mm_srai_epi32(T2AB, nShift); // [72 62 52 42]
552 const __m128i T3BA = _mm_srai_epi32(T2BA, nShift); // [33 23 13 03] xB
553 const __m128i T3BB = _mm_srai_epi32(T2BB, nShift); // [73 63 53 43]
            const __m128i T3CA = _mm_srai_epi32(T2CA, nShift); // [34 24 14 04] xC
555 const __m128i T3CB = _mm_srai_epi32(T2CB, nShift); // [74 64 54 44]
556 const __m128i T3DA = _mm_srai_epi32(T2DA, nShift); // [35 25 15 05] xD
557 const __m128i T3DB = _mm_srai_epi32(T2DB, nShift); // [75 65 55 45]
558 const __m128i T3EA = _mm_srai_epi32(T2EA, nShift); // [36 26 16 06] xE
559 const __m128i T3EB = _mm_srai_epi32(T2EB, nShift); // [76 66 56 46]
560 const __m128i T3FA = _mm_srai_epi32(T2FA, nShift); // [37 27 17 07] xF
561 const __m128i T3FB = _mm_srai_epi32(T2FB, nShift); // [77 67 57 47]
562
563 res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00]
564 res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01]
565 res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02]
566 res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03]
567 res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04]
568 res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05]
569 res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06]
570 res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07]
571
572 res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80]
573 res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81]
574 res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82]
575 res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83]
576 res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84]
577 res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85]
578 res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86]
579 res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87]
580 }
        // transpose each 8x8 block of 16-bit coefficients
582 {
583 __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
584 __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
585#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
586 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
587 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
588 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
589 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
590 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
591 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
592 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
593 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
594 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
595 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
596 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
597 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
598 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
599 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
600 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
601 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
602 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
603 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
604 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
605 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
606 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
607 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
608 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
609 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
610
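            // TRANSPOSE_8x8_16BIT is a conventional SSE transpose: successive 16-bit, 32-bit
            // and 64-bit unpacks swap the rows and columns of an 8x8 block of int16_t,
            // leaving the transposed rows in O0..O7.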
611 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
612 TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
613 TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
614 TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
615
616#undef TRANSPOSE_8x8_16BIT
617 }
618 }
619
620 _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
621 _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
622 _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
623 _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
624 _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
625 _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
626 _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
627 _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
628 _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
629 _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
630 _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
631 _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
632 _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
633 _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
634 _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
635 _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
636 _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
637 _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
638 _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
639 _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
640 _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
641 _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
642 _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
643 _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
644 _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
645 _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
646 _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
647 _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
648 _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
649 _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
650 _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
651 _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
652}
653
void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
655{
656 //Odd
657 const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0
658 const __m128i c16_p85_p88 = _mm_set1_epi32(0x00550058);
659 const __m128i c16_p78_p82 = _mm_set1_epi32(0x004E0052);
660 const __m128i c16_p67_p73 = _mm_set1_epi32(0x00430049);
661 const __m128i c16_p54_p61 = _mm_set1_epi32(0x0036003D);
662 const __m128i c16_p38_p46 = _mm_set1_epi32(0x0026002E);
663 const __m128i c16_p22_p31 = _mm_set1_epi32(0x0016001F);
664 const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D);
665 const __m128i c16_p82_p90 = _mm_set1_epi32(0x0052005A); //column 1
666 const __m128i c16_p46_p67 = _mm_set1_epi32(0x002E0043);
667 const __m128i c16_n04_p22 = _mm_set1_epi32(0xFFFC0016);
668 const __m128i c16_n54_n31 = _mm_set1_epi32(0xFFCAFFE1);
669 const __m128i c16_n85_n73 = _mm_set1_epi32(0xFFABFFB7);
670 const __m128i c16_n88_n90 = _mm_set1_epi32(0xFFA8FFA6);
671 const __m128i c16_n61_n78 = _mm_set1_epi32(0xFFC3FFB2);
672 const __m128i c16_n13_n38 = _mm_set1_epi32(0xFFF3FFDA);
673 const __m128i c16_p67_p88 = _mm_set1_epi32(0x00430058); //column 2
674 const __m128i c16_n13_p31 = _mm_set1_epi32(0xFFF3001F);
675 const __m128i c16_n82_n54 = _mm_set1_epi32(0xFFAEFFCA);
676 const __m128i c16_n78_n90 = _mm_set1_epi32(0xFFB2FFA6);
677 const __m128i c16_n04_n46 = _mm_set1_epi32(0xFFFCFFD2);
678 const __m128i c16_p73_p38 = _mm_set1_epi32(0x00490026);
679 const __m128i c16_p85_p90 = _mm_set1_epi32(0x0055005A);
680 const __m128i c16_p22_p61 = _mm_set1_epi32(0x0016003D);
681 const __m128i c16_p46_p85 = _mm_set1_epi32(0x002E0055); //column 3
682 const __m128i c16_n67_n13 = _mm_set1_epi32(0xFFBDFFF3);
683 const __m128i c16_n73_n90 = _mm_set1_epi32(0xFFB7FFA6);
684 const __m128i c16_p38_n22 = _mm_set1_epi32(0x0026FFEA);
685 const __m128i c16_p88_p82 = _mm_set1_epi32(0x00580052);
686 const __m128i c16_n04_p54 = _mm_set1_epi32(0xFFFC0036);
687 const __m128i c16_n90_n61 = _mm_set1_epi32(0xFFA6FFC3);
688 const __m128i c16_n31_n78 = _mm_set1_epi32(0xFFE1FFB2);
689 const __m128i c16_p22_p82 = _mm_set1_epi32(0x00160052); //column 4
690 const __m128i c16_n90_n54 = _mm_set1_epi32(0xFFA6FFCA);
691 const __m128i c16_p13_n61 = _mm_set1_epi32(0x000DFFC3);
692 const __m128i c16_p85_p78 = _mm_set1_epi32(0x0055004E);
693 const __m128i c16_n46_p31 = _mm_set1_epi32(0xFFD2001F);
694 const __m128i c16_n67_n90 = _mm_set1_epi32(0xFFBDFFA6);
695 const __m128i c16_p73_p04 = _mm_set1_epi32(0x00490004);
696 const __m128i c16_p38_p88 = _mm_set1_epi32(0x00260058);
697 const __m128i c16_n04_p78 = _mm_set1_epi32(0xFFFC004E); //column 5
698 const __m128i c16_n73_n82 = _mm_set1_epi32(0xFFB7FFAE);
699 const __m128i c16_p85_p13 = _mm_set1_epi32(0x0055000D);
700 const __m128i c16_n22_p67 = _mm_set1_epi32(0xFFEA0043);
701 const __m128i c16_n61_n88 = _mm_set1_epi32(0xFFC3FFA8);
702 const __m128i c16_p90_p31 = _mm_set1_epi32(0x005A001F);
703 const __m128i c16_n38_p54 = _mm_set1_epi32(0xFFDA0036);
704 const __m128i c16_n46_n90 = _mm_set1_epi32(0xFFD2FFA6);
705 const __m128i c16_n31_p73 = _mm_set1_epi32(0xFFE10049); //column 6
706 const __m128i c16_n22_n90 = _mm_set1_epi32(0xFFEAFFA6);
707 const __m128i c16_p67_p78 = _mm_set1_epi32(0x0043004E);
708 const __m128i c16_n90_n38 = _mm_set1_epi32(0xFFA6FFDA);
709 const __m128i c16_p82_n13 = _mm_set1_epi32(0x0052FFF3);
710 const __m128i c16_n46_p61 = _mm_set1_epi32(0xFFD2003D);
711 const __m128i c16_n04_n88 = _mm_set1_epi32(0xFFFCFFA8);
712 const __m128i c16_p54_p85 = _mm_set1_epi32(0x00360055);
713 const __m128i c16_n54_p67 = _mm_set1_epi32(0xFFCA0043); //column 7
714 const __m128i c16_p38_n78 = _mm_set1_epi32(0x0026FFB2);
715 const __m128i c16_n22_p85 = _mm_set1_epi32(0xFFEA0055);
716 const __m128i c16_p04_n90 = _mm_set1_epi32(0x0004FFA6);
717 const __m128i c16_p13_p90 = _mm_set1_epi32(0x000D005A);
718 const __m128i c16_n31_n88 = _mm_set1_epi32(0xFFE1FFA8);
719 const __m128i c16_p46_p82 = _mm_set1_epi32(0x002E0052);
720 const __m128i c16_n61_n73 = _mm_set1_epi32(0xFFC3FFB7);
721 const __m128i c16_n73_p61 = _mm_set1_epi32(0xFFB7003D); //column 8
722 const __m128i c16_p82_n46 = _mm_set1_epi32(0x0052FFD2);
723 const __m128i c16_n88_p31 = _mm_set1_epi32(0xFFA8001F);
724 const __m128i c16_p90_n13 = _mm_set1_epi32(0x005AFFF3);
725 const __m128i c16_n90_n04 = _mm_set1_epi32(0xFFA6FFFC);
726 const __m128i c16_p85_p22 = _mm_set1_epi32(0x00550016);
727 const __m128i c16_n78_n38 = _mm_set1_epi32(0xFFB2FFDA);
728 const __m128i c16_p67_p54 = _mm_set1_epi32(0x00430036);
729 const __m128i c16_n85_p54 = _mm_set1_epi32(0xFFAB0036); //column 9
730 const __m128i c16_p88_n04 = _mm_set1_epi32(0x0058FFFC);
731 const __m128i c16_n61_n46 = _mm_set1_epi32(0xFFC3FFD2);
732 const __m128i c16_p13_p82 = _mm_set1_epi32(0x000D0052);
733 const __m128i c16_p38_n90 = _mm_set1_epi32(0x0026FFA6);
734 const __m128i c16_n78_p67 = _mm_set1_epi32(0xFFB20043);
735 const __m128i c16_p90_n22 = _mm_set1_epi32(0x005AFFEA);
736 const __m128i c16_n73_n31 = _mm_set1_epi32(0xFFB7FFE1);
737 const __m128i c16_n90_p46 = _mm_set1_epi32(0xFFA6002E); //column 10
738 const __m128i c16_p54_p38 = _mm_set1_epi32(0x00360026);
739 const __m128i c16_p31_n90 = _mm_set1_epi32(0x001FFFA6);
740 const __m128i c16_n88_p61 = _mm_set1_epi32(0xFFA8003D);
741 const __m128i c16_p67_p22 = _mm_set1_epi32(0x00430016);
742 const __m128i c16_p13_n85 = _mm_set1_epi32(0x000DFFAB);
743 const __m128i c16_n82_p73 = _mm_set1_epi32(0xFFAE0049);
744 const __m128i c16_p78_p04 = _mm_set1_epi32(0x004E0004);
745 const __m128i c16_n88_p38 = _mm_set1_epi32(0xFFA80026); //column 11
746 const __m128i c16_n04_p73 = _mm_set1_epi32(0xFFFC0049);
747 const __m128i c16_p90_n67 = _mm_set1_epi32(0x005AFFBD);
748 const __m128i c16_n31_n46 = _mm_set1_epi32(0xFFE1FFD2);
749 const __m128i c16_n78_p85 = _mm_set1_epi32(0xFFB20055);
750 const __m128i c16_p61_p13 = _mm_set1_epi32(0x003D000D);
751 const __m128i c16_p54_n90 = _mm_set1_epi32(0x0036FFA6);
752 const __m128i c16_n82_p22 = _mm_set1_epi32(0xFFAE0016);
753 const __m128i c16_n78_p31 = _mm_set1_epi32(0xFFB2001F); //column 12
754 const __m128i c16_n61_p90 = _mm_set1_epi32(0xFFC3005A);
755 const __m128i c16_p54_p04 = _mm_set1_epi32(0x00360004);
756 const __m128i c16_p82_n88 = _mm_set1_epi32(0x0052FFA8);
757 const __m128i c16_n22_n38 = _mm_set1_epi32(0xFFEAFFDA);
758 const __m128i c16_n90_p73 = _mm_set1_epi32(0xFFA60049);
759 const __m128i c16_n13_p67 = _mm_set1_epi32(0xFFF30043);
760 const __m128i c16_p85_n46 = _mm_set1_epi32(0x0055FFD2);
761 const __m128i c16_n61_p22 = _mm_set1_epi32(0xFFC30016); //column 13
762 const __m128i c16_n90_p85 = _mm_set1_epi32(0xFFA60055);
763 const __m128i c16_n38_p73 = _mm_set1_epi32(0xFFDA0049);
764 const __m128i c16_p46_n04 = _mm_set1_epi32(0x002EFFFC);
765 const __m128i c16_p90_n78 = _mm_set1_epi32(0x005AFFB2);
766 const __m128i c16_p54_n82 = _mm_set1_epi32(0x0036FFAE);
767 const __m128i c16_n31_n13 = _mm_set1_epi32(0xFFE1FFF3);
768 const __m128i c16_n88_p67 = _mm_set1_epi32(0xFFA80043);
769 const __m128i c16_n38_p13 = _mm_set1_epi32(0xFFDA000D); //column 14
770 const __m128i c16_n78_p61 = _mm_set1_epi32(0xFFB2003D);
771 const __m128i c16_n90_p88 = _mm_set1_epi32(0xFFA60058);
772 const __m128i c16_n73_p85 = _mm_set1_epi32(0xFFB70055);
773 const __m128i c16_n31_p54 = _mm_set1_epi32(0xFFE10036);
774 const __m128i c16_p22_p04 = _mm_set1_epi32(0x00160004);
775 const __m128i c16_p67_n46 = _mm_set1_epi32(0x0043FFD2);
776 const __m128i c16_p90_n82 = _mm_set1_epi32(0x005AFFAE);
777 const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //column 15
778 const __m128i c16_n31_p22 = _mm_set1_epi32(0xFFE10016);
779 const __m128i c16_n46_p38 = _mm_set1_epi32(0xFFD20026);
780 const __m128i c16_n61_p54 = _mm_set1_epi32(0xFFC30036);
781 const __m128i c16_n73_p67 = _mm_set1_epi32(0xFFB70043);
782 const __m128i c16_n82_p78 = _mm_set1_epi32(0xFFAE004E);
783 const __m128i c16_n88_p85 = _mm_set1_epi32(0xFFA80055);
784 const __m128i c16_n90_p90 = _mm_set1_epi32(0xFFA6005A);
785
786 //EO
    const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); // row0: 90 in the low halves, 87 in the high halves
788 const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
789 const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039);
790 const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019);
791 const __m128i c16_p57_p87 = _mm_set1_epi32(0x00390057); //row1
792 const __m128i c16_n43_p09 = _mm_set1_epi32(0xFFD50009);
793 const __m128i c16_n90_n80 = _mm_set1_epi32(0xFFA6FFB0);
794 const __m128i c16_n25_n70 = _mm_set1_epi32(0xFFE7FFBA);
795 const __m128i c16_p09_p80 = _mm_set1_epi32(0x00090050); //row2
796 const __m128i c16_n87_n70 = _mm_set1_epi32(0xFFA9FFBA);
797 const __m128i c16_p57_n25 = _mm_set1_epi32(0x0039FFE7);
798 const __m128i c16_p43_p90 = _mm_set1_epi32(0x002B005A);
799 const __m128i c16_n43_p70 = _mm_set1_epi32(0xFFD50046); //row3
800 const __m128i c16_p09_n87 = _mm_set1_epi32(0x0009FFA9);
801 const __m128i c16_p25_p90 = _mm_set1_epi32(0x0019005A);
802 const __m128i c16_n57_n80 = _mm_set1_epi32(0xFFC7FFB0);
803 const __m128i c16_n80_p57 = _mm_set1_epi32(0xFFB00039); //row4
804 const __m128i c16_p90_n25 = _mm_set1_epi32(0x005AFFE7);
805 const __m128i c16_n87_n09 = _mm_set1_epi32(0xFFA9FFF7);
806 const __m128i c16_p70_p43 = _mm_set1_epi32(0x0046002B);
807 const __m128i c16_n90_p43 = _mm_set1_epi32(0xFFA6002B); //row5
808 const __m128i c16_p25_p57 = _mm_set1_epi32(0x00190039);
809 const __m128i c16_p70_n87 = _mm_set1_epi32(0x0046FFA9);
810 const __m128i c16_n80_p09 = _mm_set1_epi32(0xFFB00009);
811 const __m128i c16_n70_p25 = _mm_set1_epi32(0xFFBA0019); //row6
812 const __m128i c16_n80_p90 = _mm_set1_epi32(0xFFB0005A);
813 const __m128i c16_p09_p43 = _mm_set1_epi32(0x0009002B);
814 const __m128i c16_p87_n57 = _mm_set1_epi32(0x0057FFC7);
815 const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row7
816 const __m128i c16_n57_p43 = _mm_set1_epi32(0xFFC7002B);
817 const __m128i c16_n80_p70 = _mm_set1_epi32(0xFFB00046);
818 const __m128i c16_n90_p87 = _mm_set1_epi32(0xFFA60057);
819 //EEO
820 const __m128i c16_p75_p89 = _mm_set1_epi32(0x004B0059);
821 const __m128i c16_p18_p50 = _mm_set1_epi32(0x00120032);
822 const __m128i c16_n18_p75 = _mm_set1_epi32(0xFFEE004B);
823 const __m128i c16_n50_n89 = _mm_set1_epi32(0xFFCEFFA7);
824 const __m128i c16_n89_p50 = _mm_set1_epi32(0xFFA70032);
825 const __m128i c16_p75_p18 = _mm_set1_epi32(0x004B0012);
826 const __m128i c16_n50_p18 = _mm_set1_epi32(0xFFCE0012);
827 const __m128i c16_n89_p75 = _mm_set1_epi32(0xFFA7004B);
828 //EEEO
829 const __m128i c16_p36_p83 = _mm_set1_epi32(0x00240053);
830 const __m128i c16_n83_p36 = _mm_set1_epi32(0xFFAD0024);
831 //EEEE
832 const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
833 const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
834 __m128i c32_rnd = _mm_set1_epi32(64);
835
836 int nShift = 7;
837
838 // DCT1
839 __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4];
840 __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4];
841 __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4];
842 __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4];
843
844 for (int i = 0; i < 4; i++)
845 {
846 const int offset = (i << 3);
847 in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
848 in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
849 in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
850 in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
851 in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
852 in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
853 in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
854 in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
855 in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
856 in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
857 in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
858 in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
859 in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
860 in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
861 in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
862 in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
863 in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
864 in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
865 in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
866 in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
867 in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
868 in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
869 in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
870 in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
871 in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
872 in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
873 in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
874 in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
875 in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
876 in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
877 in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
878 in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
879 }
880
881 for (int pass = 0; pass < 2; pass++)
882 {
883 if (pass == 1)
884 {
885 c32_rnd = _mm_set1_epi32(2048);
886 nShift = 12;
887 }
888
889 for (int part = 0; part < 4; part++)
890 {
891 const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10]
892 const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14]
893 const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ]
894 const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ]
895 const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ]
896 const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ]
897 const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ]
898 const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ]
899 const __m128i T_00_04A = _mm_unpacklo_epi16(in17[part], in19[part]); // [ ]
900 const __m128i T_00_04B = _mm_unpackhi_epi16(in17[part], in19[part]); // [ ]
901 const __m128i T_00_05A = _mm_unpacklo_epi16(in21[part], in23[part]); // [ ]
902 const __m128i T_00_05B = _mm_unpackhi_epi16(in21[part], in23[part]); // [ ]
903 const __m128i T_00_06A = _mm_unpacklo_epi16(in25[part], in27[part]); // [ ]
904 const __m128i T_00_06B = _mm_unpackhi_epi16(in25[part], in27[part]); // [ ]
905 const __m128i T_00_07A = _mm_unpacklo_epi16(in29[part], in31[part]); //
906 const __m128i T_00_07B = _mm_unpackhi_epi16(in29[part], in31[part]); // [ ]
907
908 const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ]
909 const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ]
910 const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ]
911 const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ]
912 const __m128i T_00_10A = _mm_unpacklo_epi16(in18[part], in22[part]); // [ ]
913 const __m128i T_00_10B = _mm_unpackhi_epi16(in18[part], in22[part]); // [ ]
914 const __m128i T_00_11A = _mm_unpacklo_epi16(in26[part], in30[part]); // [ ]
915 const __m128i T_00_11B = _mm_unpackhi_epi16(in26[part], in30[part]); // [ ]
916
917 const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]
918 const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ]
919 const __m128i T_00_13A = _mm_unpacklo_epi16(in20[part], in28[part]); // [ ]
920 const __m128i T_00_13B = _mm_unpackhi_epi16(in20[part], in28[part]); // [ ]
921
922 const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], in24[part]); //
923 const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], in24[part]); // [ ]
924 const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], in16[part]); //
925 const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], in16[part]); // [ ]
926
927 __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A;
928 __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B;
929 {
930 __m128i T00, T01, T02, T03;
931#define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \
932 T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \
933 T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \
934 T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \
935 T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \
936 row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03));
937
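                // Same scheme as the 16-point COMPUTE_ROW above, widened to the sixteen odd
                // inputs of the 32-point transform: eight madds and seven adds per output row.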
938 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
939 c16_p90_p90, c16_p85_p88, c16_p78_p82, c16_p67_p73, c16_p54_p61, c16_p38_p46, c16_p22_p31, c16_p04_p13, O00A)
940 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
941 c16_p82_p90, c16_p46_p67, c16_n04_p22, c16_n54_n31, c16_n85_n73, c16_n88_n90, c16_n61_n78, c16_n13_n38, O01A)
942 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
943 c16_p67_p88, c16_n13_p31, c16_n82_n54, c16_n78_n90, c16_n04_n46, c16_p73_p38, c16_p85_p90, c16_p22_p61, O02A)
944 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
945 c16_p46_p85, c16_n67_n13, c16_n73_n90, c16_p38_n22, c16_p88_p82, c16_n04_p54, c16_n90_n61, c16_n31_n78, O03A)
946 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
947 c16_p22_p82, c16_n90_n54, c16_p13_n61, c16_p85_p78, c16_n46_p31, c16_n67_n90, c16_p73_p04, c16_p38_p88, O04A)
948 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
949 c16_n04_p78, c16_n73_n82, c16_p85_p13, c16_n22_p67, c16_n61_n88, c16_p90_p31, c16_n38_p54, c16_n46_n90, O05A)
950 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
951 c16_n31_p73, c16_n22_n90, c16_p67_p78, c16_n90_n38, c16_p82_n13, c16_n46_p61, c16_n04_n88, c16_p54_p85, O06A)
952 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
953 c16_n54_p67, c16_p38_n78, c16_n22_p85, c16_p04_n90, c16_p13_p90, c16_n31_n88, c16_p46_p82, c16_n61_n73, O07A)
954 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
955 c16_n73_p61, c16_p82_n46, c16_n88_p31, c16_p90_n13, c16_n90_n04, c16_p85_p22, c16_n78_n38, c16_p67_p54, O08A)
956 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
957 c16_n85_p54, c16_p88_n04, c16_n61_n46, c16_p13_p82, c16_p38_n90, c16_n78_p67, c16_p90_n22, c16_n73_n31, O09A)
958 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
959 c16_n90_p46, c16_p54_p38, c16_p31_n90, c16_n88_p61, c16_p67_p22, c16_p13_n85, c16_n82_p73, c16_p78_p04, O10A)
960 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
961 c16_n88_p38, c16_n04_p73, c16_p90_n67, c16_n31_n46, c16_n78_p85, c16_p61_p13, c16_p54_n90, c16_n82_p22, O11A)
962 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
963 c16_n78_p31, c16_n61_p90, c16_p54_p04, c16_p82_n88, c16_n22_n38, c16_n90_p73, c16_n13_p67, c16_p85_n46, O12A)
964 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
965 c16_n61_p22, c16_n90_p85, c16_n38_p73, c16_p46_n04, c16_p90_n78, c16_p54_n82, c16_n31_n13, c16_n88_p67, O13A)
966 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
967 c16_n38_p13, c16_n78_p61, c16_n90_p88, c16_n73_p85, c16_n31_p54, c16_p22_p04, c16_p67_n46, c16_p90_n82, O14A)
968 COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \
969 c16_n13_p04, c16_n31_p22, c16_n46_p38, c16_n61_p54, c16_n73_p67, c16_n82_p78, c16_n88_p85, c16_n90_p90, O15A)
970
971 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
972 c16_p90_p90, c16_p85_p88, c16_p78_p82, c16_p67_p73, c16_p54_p61, c16_p38_p46, c16_p22_p31, c16_p04_p13, O00B)
973 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
974 c16_p82_p90, c16_p46_p67, c16_n04_p22, c16_n54_n31, c16_n85_n73, c16_n88_n90, c16_n61_n78, c16_n13_n38, O01B)
975 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
976 c16_p67_p88, c16_n13_p31, c16_n82_n54, c16_n78_n90, c16_n04_n46, c16_p73_p38, c16_p85_p90, c16_p22_p61, O02B)
977 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
978 c16_p46_p85, c16_n67_n13, c16_n73_n90, c16_p38_n22, c16_p88_p82, c16_n04_p54, c16_n90_n61, c16_n31_n78, O03B)
979 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
980 c16_p22_p82, c16_n90_n54, c16_p13_n61, c16_p85_p78, c16_n46_p31, c16_n67_n90, c16_p73_p04, c16_p38_p88, O04B)
981 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
982 c16_n04_p78, c16_n73_n82, c16_p85_p13, c16_n22_p67, c16_n61_n88, c16_p90_p31, c16_n38_p54, c16_n46_n90, O05B)
983 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
984 c16_n31_p73, c16_n22_n90, c16_p67_p78, c16_n90_n38, c16_p82_n13, c16_n46_p61, c16_n04_n88, c16_p54_p85, O06B)
985 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
986 c16_n54_p67, c16_p38_n78, c16_n22_p85, c16_p04_n90, c16_p13_p90, c16_n31_n88, c16_p46_p82, c16_n61_n73, O07B)
987 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
988 c16_n73_p61, c16_p82_n46, c16_n88_p31, c16_p90_n13, c16_n90_n04, c16_p85_p22, c16_n78_n38, c16_p67_p54, O08B)
989 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
990 c16_n85_p54, c16_p88_n04, c16_n61_n46, c16_p13_p82, c16_p38_n90, c16_n78_p67, c16_p90_n22, c16_n73_n31, O09B)
991 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
992 c16_n90_p46, c16_p54_p38, c16_p31_n90, c16_n88_p61, c16_p67_p22, c16_p13_n85, c16_n82_p73, c16_p78_p04, O10B)
993 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
994 c16_n88_p38, c16_n04_p73, c16_p90_n67, c16_n31_n46, c16_n78_p85, c16_p61_p13, c16_p54_n90, c16_n82_p22, O11B)
995 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
996 c16_n78_p31, c16_n61_p90, c16_p54_p04, c16_p82_n88, c16_n22_n38, c16_n90_p73, c16_n13_p67, c16_p85_n46, O12B)
997 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
998 c16_n61_p22, c16_n90_p85, c16_n38_p73, c16_p46_n04, c16_p90_n78, c16_p54_n82, c16_n31_n13, c16_n88_p67, O13B)
999 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1000 c16_n38_p13, c16_n78_p61, c16_n90_p88, c16_n73_p85, c16_n31_p54, c16_p22_p04, c16_p67_n46, c16_p90_n82, O14B)
1001 COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \
1002 c16_n13_p04, c16_n31_p22, c16_n46_p38, c16_n61_p54, c16_n73_p67, c16_n82_p78, c16_n88_p85, c16_n90_p90, O15B)
1003
1004#undef COMPUTE_ROW
1005 }
1006
1007 __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A;
1008 __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B;
1009 {
1010 __m128i T00, T01;
1011#define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \
1012 T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \
1013 T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \
1014 row = _mm_add_epi32(T00, T01);
1015
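            // Even-odd part EO[k] (k = 0..7): the odd half of the embedded 16-point IDCT,
            // i.e. dot products of coefficient rows 2, 6, 10, ..., 30 with the 16-point odd
            // coefficients (90, 87, 80, 70, 57, 43, 25, 9), again split into A/B column halves.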
1016 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, EO0A)
1017 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, EO1A)
1018 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, EO2A)
1019 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, EO3A)
1020 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, EO4A)
1021 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, EO5A)
1022 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, EO6A)
1023 COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, EO7A)
1024
1025 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, EO0B)
1026 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, EO1B)
1027 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, EO2B)
1028 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, EO3B)
1029 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, EO4B)
1030 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, EO5B)
1031 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, EO6B)
1032 COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, EO7B)
1033#undef COMPUTE_ROW
1034 }
1035
1036 const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p75_p89), _mm_madd_epi16(T_00_13A, c16_p18_p50)); // EEO0
1037 const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p75_p89), _mm_madd_epi16(T_00_13B, c16_p18_p50));
1038 const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n18_p75), _mm_madd_epi16(T_00_13A, c16_n50_n89)); // EEO1
1039 const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n18_p75), _mm_madd_epi16(T_00_13B, c16_n50_n89));
1040 const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n89_p50), _mm_madd_epi16(T_00_13A, c16_p75_p18)); // EEO2
1041 const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n89_p50), _mm_madd_epi16(T_00_13B, c16_p75_p18));
1042 const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n50_p18), _mm_madd_epi16(T_00_13A, c16_n89_p75)); // EEO3
1043 const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n50_p18), _mm_madd_epi16(T_00_13B, c16_n89_p75));
1044
1045 const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p36_p83);
1046 const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p36_p83);
1047 const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n83_p36);
1048 const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n83_p36);
1049
1050 const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p64_p64);
1051 const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p64_p64);
1052 const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n64_p64);
1053 const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n64_p64);
1054
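            // The remaining even terms follow the usual HEVC partial butterfly: EEO (from rows
            // 4, 12, 20, 28), EEEO (rows 8, 24) and EEEE (rows 0, 16) computed above are folded
            // back together below as, per lane:
            //   EEE[k] = EEEE[k] + EEEO[k],  EEE[3 - k] = EEEE[k] - EEEO[k]   (k = 0..1)
            //   EE[k]  = EEE[k]  + EEO[k],   EE[7 - k]  = EEE[k]  - EEO[k]    (k = 0..3)
            //   E[k]   = EE[k]   + EO[k],    E[15 - k]  = EE[k]   - EO[k]     (k = 0..7)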
1055 const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0
1056 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B);
1057 const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1
1058 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B);
1059 const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE3 = EEEE0 - EEEO0
1060 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B);
1061 const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE2 = EEEE1 - EEEO1
1062 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B);
1063
1064 const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0
1065 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
1066 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1
1067 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
1068 const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE2 + EEO2
1069 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B);
1070 const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE3 + EEO3
1071 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B);
1072 const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0
1073 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B);
1074 const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1
1075 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B);
1076 const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE2 - EEO2
1077 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B);
1078 const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE3 - EEO3
1079 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B);
1080
1081 const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0
1082 const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
1083 const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1
1084 const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
1085 const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2
1086 const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
1087 const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3
1088 const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
1089 const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = EE4 + EO4
1090 const __m128i E4B = _mm_add_epi32(EE4B, EO4B);
1091 const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = EE5 + EO5
1092 const __m128i E5B = _mm_add_epi32(EE5B, EO5B);
1093 const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = EE6 + EO6
1094 const __m128i E6B = _mm_add_epi32(EE6B, EO6B);
1095 const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = EE7 + EO7
1096 const __m128i E7B = _mm_add_epi32(EE7B, EO7B);
1097 const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0
1098 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B);
1099 const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1
1100 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B);
1101 const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2
1102 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B);
1103 const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3
1104 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B);
1105 const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = EE4 - EO4
1106 const __m128i EBB = _mm_sub_epi32(EE4B, EO4B);
1107 const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = EE5 - EO5
1108 const __m128i EAB = _mm_sub_epi32(EE5B, EO5B);
1109 const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = EE6 - EO6
1110 const __m128i E9B = _mm_sub_epi32(EE6B, EO6B);
1111 const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = EE7 - EO7
1112 const __m128i E8B = _mm_sub_epi32(EE7B, EO7B);
1113
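            // Bias every even term with the rounding constant before the final odd/even
            // recombination; c32_rnd and nShift are chosen earlier in this function and are
            // expected to differ between the two IDCT passes.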
1114 const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd
1115 const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
1116 const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd
1117 const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
1118 const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd
1119 const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
1120 const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd
1121 const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
1122 const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd
1123 const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
1124 const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd
1125 const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
1126 const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd
1127 const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
1128 const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd
1129 const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
1130 const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd
1131 const __m128i T18B = _mm_add_epi32(E8B, c32_rnd);
1132 const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd
1133 const __m128i T19B = _mm_add_epi32(E9B, c32_rnd);
1134 const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd
1135 const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd);
1136 const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd
1137 const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd);
1138 const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd
1139 const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd);
1140 const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd
1141 const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd);
1142 const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd
1143 const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd);
1144 const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd
1145 const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd);
1146
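            // Recombine the odd and even parts for this 8-column slice:
            //   output row k       = (E[k] + O[k] + rnd) >> nShift
            //   output row 31 - k  = (E[k] - O[k] + rnd) >> nShift   (k = 0..15)
            // which is what the T2_xx additions/subtractions and the T3_xx shifts below compute
            // before packing back to 16 bits.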
1147 const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd
1148 const __m128i T2_00B = _mm_add_epi32(T10B, O00B);
1149 const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd
1150 const __m128i T2_01B = _mm_add_epi32(T11B, O01B);
1151 const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd
1152 const __m128i T2_02B = _mm_add_epi32(T12B, O02B);
1153 const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd
1154 const __m128i T2_03B = _mm_add_epi32(T13B, O03B);
1155 const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 + O4 + rnd
1156 const __m128i T2_04B = _mm_add_epi32(T14B, O04B);
1157 const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 + O5 + rnd
1158 const __m128i T2_05B = _mm_add_epi32(T15B, O05B);
1159 const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 + O6 + rnd
1160 const __m128i T2_06B = _mm_add_epi32(T16B, O06B);
1161 const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 + O7 + rnd
1162 const __m128i T2_07B = _mm_add_epi32(T17B, O07B);
1163 const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 + O8 + rnd
1164 const __m128i T2_08B = _mm_add_epi32(T18B, O08B);
1165 const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 + O9 + rnd
1166 const __m128i T2_09B = _mm_add_epi32(T19B, O09B);
1167 const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 + O10 + rnd
1168 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B);
1169 const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 + O11 + rnd
1170 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B);
1171 const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 + O12 + rnd
1172 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B);
1173 const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 + O13 + rnd
1174 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B);
1175 const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 + O14 + rnd
1176 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B);
1177 const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 + O15 + rnd
1178 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B);
1179 const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd
1180 const __m128i T2_31B = _mm_sub_epi32(T10B, O00B);
1181 const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd
1182 const __m128i T2_30B = _mm_sub_epi32(T11B, O01B);
1183 const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd
1184 const __m128i T2_29B = _mm_sub_epi32(T12B, O02B);
1185 const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd
1186 const __m128i T2_28B = _mm_sub_epi32(T13B, O03B);
1187 const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 - O4 + rnd
1188 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B);
1189 const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 - O5 + rnd
1190 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B);
1191 const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 - O6 + rnd
1192 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B);
1193 const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 - O7 + rnd
1194 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B);
1195 const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // E8 - O8 + rnd
1196 const __m128i T2_23B = _mm_sub_epi32(T18B, O08B);
1197 const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // E9 - O9 + rnd
1198 const __m128i T2_22B = _mm_sub_epi32(T19B, O09B);
1199 const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // E10 - O10 + rnd
1200 const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B);
1201 const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // E11 - O11 + rnd
1202 const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B);
1203 const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // E12 - O12 + rnd
1204 const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B);
1205 const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // E13 - O13 + rnd
1206 const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B);
1207 const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // E14 - O14 + rnd
1208 const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B);
1209 const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // E15 - O15 + rnd
1210 const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B);
1211
1212 const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00]
1213 const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40]
1214 const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01]
1215 const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41]
1216 const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02]
1217 const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42]
1218 const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03]
1219 const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43]
1220 const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [34 24 14 04]
1221 const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44]
1222 const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05]
1223 const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45]
1224 const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06]
1225 const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46]
1226 const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07]
1227 const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47]
1228 const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8
1229 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40]
1230 const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9
1231 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41]
1232 const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA
1233 const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42]
1234 const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB
1235 const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43]
1236 const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [34 24 14 04] xC
1237 const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44]
1238 const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD
1239 const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45]
1240 const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE
1241 const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46]
1242 const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF
1243 const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47]
1244
1245 const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00]
1246 const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40]
1247 const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01]
1248 const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41]
1249 const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02]
1250 const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42]
1251 const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03]
1252 const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43]
1253 const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [34 24 14 04]
1254 const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44]
1255 const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05]
1256 const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45]
1257 const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06]
1258 const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46]
1259 const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07]
1260 const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47]
1261 const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8
1262 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40]
1263 const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9
1264 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41]
1265 const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA
1266 const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42]
1267 const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB
1268 const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43]
1269 const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [34 24 14 04] xC
1270 const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44]
1271 const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD
1272 const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45]
1273 const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE
1274 const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46]
1275 const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF
1276 const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47]
1277
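            // _mm_packs_epi32 saturates the shifted 32-bit sums back to int16_t and
            // concatenates each A half with its B half, so every res_xx register again holds
            // one full 8-lane row of the current slice (see the element maps in the comments
            // on the lines below).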
1278 res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00]
1279 res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01]
1280 res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02]
1281 res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03]
1282 res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04]
1283 res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05]
1284 res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06]
1285 res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07]
1286 res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80]
1287 res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81]
1288 res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82]
1289 res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83]
1290 res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84]
1291 res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85]
1292 res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86]
1293 res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87]
1294 res16[part] = _mm_packs_epi32(T3_16A, T3_16B);
1295 res17[part] = _mm_packs_epi32(T3_17A, T3_17B);
1296 res18[part] = _mm_packs_epi32(T3_18A, T3_18B);
1297 res19[part] = _mm_packs_epi32(T3_19A, T3_19B);
1298 res20[part] = _mm_packs_epi32(T3_20A, T3_20B);
1299 res21[part] = _mm_packs_epi32(T3_21A, T3_21B);
1300 res22[part] = _mm_packs_epi32(T3_22A, T3_22B);
1301 res23[part] = _mm_packs_epi32(T3_23A, T3_23B);
1302 res24[part] = _mm_packs_epi32(T3_24A, T3_24B);
1303 res25[part] = _mm_packs_epi32(T3_25A, T3_25B);
1304 res26[part] = _mm_packs_epi32(T3_26A, T3_26B);
1305 res27[part] = _mm_packs_epi32(T3_27A, T3_27B);
1306 res28[part] = _mm_packs_epi32(T3_28A, T3_28B);
1307 res29[part] = _mm_packs_epi32(T3_29A, T3_29B);
1308 res30[part] = _mm_packs_epi32(T3_30A, T3_30B);
1309 res31[part] = _mm_packs_epi32(T3_31A, T3_31B);
1310 }
1311 // transpose the result, one 8x8 block of 16-bit coefficients at a time
1312 {
1313 __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
1314 __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
1315#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
1316 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
1317 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
1318 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
1319 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
1320 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
1321 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
1322 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
1323 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
1324 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
1325 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
1326 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
1327 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
1328 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
1329 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
1330 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
1331 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
1332 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
1333 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
1334 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
1335 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
1336 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
1337 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
1338 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
1339 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
1340
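            // TRANSPOSE_8x8_16BIT is the standard three-stage SSE transpose: 16-bit unpacks
            // interleave neighbouring rows, 32-bit unpacks interleave those pairs, and the
            // final 64-bit unpacks yield the eight transposed rows. The sixteen invocations
            // below transpose the whole 32x32 block one 8x8 tile at a time, writing back into
            // in00..in31 for the next pass (or for the final store).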
1341 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
1342 TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
1343 TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0])
1344 TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0])
1345
1346 TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
1347 TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
1348 TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1])
1349 TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1])
1350
1351 TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2])
1352 TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2])
1353 TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2])
1354 TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2])
1355
1356 TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3])
1357 TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3])
1358 TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3])
1359 TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3])
1360
1361#undef TRANSPOSE_8x8_16BIT
1362 }
1363 }
1364
1365 // Store the reconstructed rows to dst
1366 for (int i = 0; i < 2; i++)
1367 {
1368#define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
1369 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
1370 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
1371 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
1372 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
1373 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
1374 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
1375 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
1376 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
1377 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
1378 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
1379 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
1380 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
1381 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
1382 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
1383 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
1384 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
1385
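        // Each STORE_LINE writes an 8-row x 16-column tile with two unaligned stores per row;
        // offsetV selects the row band (0, 8, 16, 24) and offsetH = i * 16 selects the left or
        // right half of the 32 columns, so the two loop iterations cover the whole 32x32 output.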
1386 const int k = i * 2;
1387 STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16)
1388 STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k], in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16)
1389 STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k], in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16)
1390 STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k], in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16)
1391#undef STORE_LINE
1392 }
1393}
1394
1395#endif // if !HIGH_BIT_DEPTH
1396}
1397
1398namespace x265 {
1399void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives &p)
1400{
1401 /* Note: We have AVX2 assembly for these functions, but since AVX2 is
1402 * still somewhat rare on end-user PCs, we continue to compile and link
1403 * these SSE3 intrinsic SIMD functions */
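    /* Illustrative usage sketch (hypothetical buffer names; destination stride is
     * given in int16_t elements):
     *
     *   ALIGN_VAR_32(int16_t, coeff[32 * 32]);   // dequantized coefficients
     *   ALIGN_VAR_32(int16_t, resi[32 * 32]);    // reconstructed residual
     *   p.idct[IDCT_32x32](coeff, resi, 32);
     */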
1404#if !HIGH_BIT_DEPTH
1405 p.idct[IDCT_8x8] = idct8;
1406 p.idct[IDCT_16x16] = idct16;
1407 p.idct[IDCT_32x32] = idct32;
1408#endif
1409}
1410}