c435b5292357f51e8cf3892f106eda2ffe555978
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Min Chen <min.chen@multicorewareinc.com>
10 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11 * Nabajit Deka <nabajit@multicorewareinc.com>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 * This program is also available under a commercial proprietary license.
28 * For more information, contact us at license @ x265.com.
29 *****************************************************************************/
32 #include "primitives.h"
33 #include <xmmintrin.h> // SSE
34 #include <pmmintrin.h> // SSE3
// Coefficient table for the 8x8 inverse DCT butterflies, 32-byte aligned.
// Each row holds one pair of HEVC transform coefficients replicated four
// times so that _mm_madd_epi16 can multiply-accumulate two interleaved
// 16-bit inputs per 32-bit lane:
//   rows 0-7 : odd-part coefficients  (89/75/50/18 family)
//   rows 8-9 : even-even part         (+/-64)
//   rows 10-11: even-odd part         (83/36 family)
ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{
    { 89, 75, 89, 75, 89, 75, 89, 75 },
    { 50, 18, 50, 18, 50, 18, 50, 18 },
    { 75, -18, 75, -18, 75, -18, 75, -18 },
    { -89, -50, -89, -50, -89, -50, -89, -50 },
    { 50, -89, 50, -89, 50, -89, 50, -89 },
    { 18, 75, 18, 75, 18, 75, 18, 75 },
    { 18, -50, 18, -50, 18, -50, 18, -50 },
    { 75, -89, 75, -89, 75, -89, 75, -89 },
    { 64, 64, 64, 64, 64, 64, 64, 64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83, 36, 83, 36, 83, 36, 83, 36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 }
};
/* 8x8 inverse DCT (HEVC partial butterfly), SSE2/SSE3 intrinsics.
 *
 * src:    8x8 block of 32-bit coefficients, row-major, 16-byte aligned
 *         (read with aligned _mm_load_si128).
 * dst:    output 8x8 block of 16-bit residual samples.
 * stride: dst row pitch in int16_t elements.
 *
 * Structure: two butterfly passes with an in-register 8x8 transpose
 * between them and after the second pass.  Pass 1 rounds by 64 and
 * shifts right by 7; pass 2 rounds by 2048 and shifts right by 12
 * (the HEVC shift pair — pass-2 shift value presumably fixed for 8-bit
 * depth builds; confirm against the build's bit-depth configuration).
 * Inputs are narrowed to 16 bits with _mm_packs_epi32 so each
 * _mm_madd_epi16 computes two coefficient MACs per 32-bit lane.
 */
void idct8(int32_t *src, int16_t *dst, intptr_t stride)
{
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
    __m128i T00, T01, T02, T03, T04, T05, T06, T07;

    m128iAdd = _mm_set1_epi32(64);   // pass-1 rounding term (1 << (7 - 1))

    /* ---- Pass 1 (operates on rows) ----
     * Odd part: rows 1,3,5,7 -> O0..O3 (l = columns 0-3, h = columns 4-7). */
    T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
    m128iS1 = _mm_packs_epi32(T00, T01);        // row 1, clamped to int16
    T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
    m128iS3 = _mm_packs_epi32(T00, T01);        // row 3
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);   // interleave rows 1/3
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));

    T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
    m128iS5 = _mm_packs_epi32(T00, T01);        // row 5
    T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
    m128iS7 = _mm_packs_epi32(T00, T01);        // row 7
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);   // interleave rows 5/7
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));

    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    /* Even-even part: rows 0 and 4 -> EE0/EE1. */
    T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
    m128iS0 = _mm_packs_epi32(T00, T01);        // row 0
    T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
    m128iS4 = _mm_packs_epi32(T00, T01);        // row 4
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));

    EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
    EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));

    /* Even-odd part: rows 2 and 6 -> E00/E01. */
    T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
    m128iS2 = _mm_packs_epi32(T00, T01);        // row 2
    T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
    T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
    m128iS6 = _mm_packs_epi32(T00, T01);        // row 6
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));

    /* Even butterfly + rounding: E = EE +/- E0x, with the round constant
     * folded in now so the final step is just (E +/- O) >> shift. */
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);

    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    /* Final butterfly of pass 1: out[k] = (E[k] + O[k]) >> 7,
     * out[7-k] = (E[k] - O[k]) >> 7, saturated back to int16. */
    m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7));
    m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7));
    m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7));
    m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7));
    m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7));
    m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7));
    m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7));
    m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7));

    /* In-register 8x8 transpose of m128iS0..m128iS7 (16-bit lanes),
     * done with three levels of unpack; E*/O* registers are reused as
     * scratch here. */
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    /* ---- Pass 2 (operates on columns, now held as rows) ---- */
    m128iAdd = _mm_set1_epi32(2048);   // pass-2 rounding term (1 << (12 - 1))

    /* Odd part, same coefficient rows as pass 1. */
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    /* Even-even part. */
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
    EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));

    /* Even-odd part. */
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);
    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    /* Final butterfly of pass 2: shift 12 this time. */
    m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12));
    m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12));
    m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12));
    m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12));
    m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12));
    m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12));
    m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12));
    m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12));

    /* Transpose back and store 4 samples at a time to dst.
     * Register layout before the transpose:
     * [07 06 05 04 03 02 01 00]
     * [17 16 15 14 13 12 11 10]
     * [27 26 25 24 23 22 21 20]
     * [37 36 35 34 33 32 31 30]
     * [47 46 45 44 43 42 41 40]
     * [57 56 55 54 53 52 51 50]
     * [67 66 65 64 63 62 61 60]
     * [77 76 75 74 73 72 71 70]
     */
    T00 = _mm_unpacklo_epi16(m128iS0, m128iS1);     // [13 03 12 02 11 01 10 00]
    T01 = _mm_unpackhi_epi16(m128iS0, m128iS1);     // [17 07 16 06 15 05 14 04]
    T02 = _mm_unpacklo_epi16(m128iS2, m128iS3);     // [33 23 32 22 31 21 30 20]
    T03 = _mm_unpackhi_epi16(m128iS2, m128iS3);     // [37 27 36 26 35 25 34 24]
    T04 = _mm_unpacklo_epi16(m128iS4, m128iS5);     // [53 43 52 42 51 41 50 40]
    T05 = _mm_unpackhi_epi16(m128iS4, m128iS5);     // [57 47 56 46 55 45 54 44]
    T06 = _mm_unpacklo_epi16(m128iS6, m128iS7);     // [73 63 72 62 71 61 70 60]
    T07 = _mm_unpackhi_epi16(m128iS6, m128iS7);     // [77 67 76 66 75 65 74 64]

    __m128i T10, T11;

    /* Left half (columns 0-3) of output rows 0-3. */
    T10 = _mm_unpacklo_epi32(T00, T02);             // [31 21 11 01 30 20 10 00]
    T11 = _mm_unpackhi_epi32(T00, T02);             // [33 23 13 03 32 22 12 02]
    _mm_storel_epi64((__m128i*)&dst[0 * stride + 0], T10);                       // [30 20 10 00]
    _mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(T10));          // [31 21 11 01]
    _mm_storel_epi64((__m128i*)&dst[2 * stride + 0], T11);                       // [32 22 12 02]
    _mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(T11));          // [33 23 13 03]

    /* Right half (columns 4-7) of output rows 0-3. */
    T10 = _mm_unpacklo_epi32(T04, T06);             // [71 61 51 41 70 60 50 40]
    T11 = _mm_unpackhi_epi32(T04, T06);             // [73 63 53 43 72 62 52 42]
    _mm_storel_epi64((__m128i*)&dst[0 * stride + 4], T10);
    _mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(T10));
    _mm_storel_epi64((__m128i*)&dst[2 * stride + 4], T11);
    _mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(T11));

    /* Left half of output rows 4-7. */
    T10 = _mm_unpacklo_epi32(T01, T03);             // [35 25 15 05 34 24 14 04]
    T11 = _mm_unpackhi_epi32(T01, T03);             // [37 27 17 07 36 26 16 06]
    _mm_storel_epi64((__m128i*)&dst[4 * stride + 0], T10);
    _mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(T10));
    _mm_storel_epi64((__m128i*)&dst[6 * stride + 0], T11);
    _mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(T11));

    /* Right half of output rows 4-7. */
    T10 = _mm_unpacklo_epi32(T05, T07);             // [75 65 55 45 74 64 54 44]
    T11 = _mm_unpackhi_epi32(T05, T07);             // [77 67 57 47 76 66 56 46]
    _mm_storel_epi64((__m128i*)&dst[4 * stride + 4], T10);
    _mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(T10));
    _mm_storel_epi64((__m128i*)&dst[6 * stride + 4], T11);
    _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
}
308 void idct16(int32_t *src
, int16_t *dst
, intptr_t stride
)
310 const __m128i c16_p87_p90
= _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
311 const __m128i c16_p70_p80
= _mm_set1_epi32(0x00460050);
312 const __m128i c16_p43_p57
= _mm_set1_epi32(0x002B0039);
313 const __m128i c16_p09_p25
= _mm_set1_epi32(0x00090019);
314 const __m128i c16_p57_p87
= _mm_set1_epi32(0x00390057); //row1
315 const __m128i c16_n43_p09
= _mm_set1_epi32(0xFFD50009);
316 const __m128i c16_n90_n80
= _mm_set1_epi32(0xFFA6FFB0);
317 const __m128i c16_n25_n70
= _mm_set1_epi32(0xFFE7FFBA);
318 const __m128i c16_p09_p80
= _mm_set1_epi32(0x00090050); //row2
319 const __m128i c16_n87_n70
= _mm_set1_epi32(0xFFA9FFBA);
320 const __m128i c16_p57_n25
= _mm_set1_epi32(0x0039FFE7);
321 const __m128i c16_p43_p90
= _mm_set1_epi32(0x002B005A);
322 const __m128i c16_n43_p70
= _mm_set1_epi32(0xFFD50046); //row3
323 const __m128i c16_p09_n87
= _mm_set1_epi32(0x0009FFA9);
324 const __m128i c16_p25_p90
= _mm_set1_epi32(0x0019005A);
325 const __m128i c16_n57_n80
= _mm_set1_epi32(0xFFC7FFB0);
326 const __m128i c16_n80_p57
= _mm_set1_epi32(0xFFB00039); //row4
327 const __m128i c16_p90_n25
= _mm_set1_epi32(0x005AFFE7);
328 const __m128i c16_n87_n09
= _mm_set1_epi32(0xFFA9FFF7);
329 const __m128i c16_p70_p43
= _mm_set1_epi32(0x0046002B);
330 const __m128i c16_n90_p43
= _mm_set1_epi32(0xFFA6002B); //row5
331 const __m128i c16_p25_p57
= _mm_set1_epi32(0x00190039);
332 const __m128i c16_p70_n87
= _mm_set1_epi32(0x0046FFA9);
333 const __m128i c16_n80_p09
= _mm_set1_epi32(0xFFB00009);
334 const __m128i c16_n70_p25
= _mm_set1_epi32(0xFFBA0019); //row6
335 const __m128i c16_n80_p90
= _mm_set1_epi32(0xFFB0005A);
336 const __m128i c16_p09_p43
= _mm_set1_epi32(0x0009002B);
337 const __m128i c16_p87_n57
= _mm_set1_epi32(0x0057FFC7);
338 const __m128i c16_n25_p09
= _mm_set1_epi32(0xFFE70009); //row7
339 const __m128i c16_n57_p43
= _mm_set1_epi32(0xFFC7002B);
340 const __m128i c16_n80_p70
= _mm_set1_epi32(0xFFB00046);
341 const __m128i c16_n90_p87
= _mm_set1_epi32(0xFFA60057);
343 const __m128i c16_p75_p89
= _mm_set1_epi32(0x004B0059);
344 const __m128i c16_p18_p50
= _mm_set1_epi32(0x00120032);
345 const __m128i c16_n18_p75
= _mm_set1_epi32(0xFFEE004B);
346 const __m128i c16_n50_n89
= _mm_set1_epi32(0xFFCEFFA7);
347 const __m128i c16_n89_p50
= _mm_set1_epi32(0xFFA70032);
348 const __m128i c16_p75_p18
= _mm_set1_epi32(0x004B0012);
349 const __m128i c16_n50_p18
= _mm_set1_epi32(0xFFCE0012);
350 const __m128i c16_n89_p75
= _mm_set1_epi32(0xFFA7004B);
352 const __m128i c16_p36_p83
= _mm_set1_epi32(0x00240053);
353 const __m128i c16_n83_p36
= _mm_set1_epi32(0xFFAD0024);
355 const __m128i c16_n64_p64
= _mm_set1_epi32(0xFFC00040);
356 const __m128i c16_p64_p64
= _mm_set1_epi32(0x00400040);
357 __m128i c32_rnd
= _mm_set1_epi32(64);
362 __m128i in00
[2], in01
[2], in02
[2], in03
[2], in04
[2], in05
[2], in06
[2], in07
[2];
363 __m128i in08
[2], in09
[2], in10
[2], in11
[2], in12
[2], in13
[2], in14
[2], in15
[2];
364 __m128i res00
[2], res01
[2], res02
[2], res03
[2], res04
[2], res05
[2], res06
[2], res07
[2];
365 __m128i res08
[2], res09
[2], res10
[2], res11
[2], res12
[2], res13
[2], res14
[2], res15
[2];
367 for (int i
= 0; i
< 2; i
++)
369 const int offset
= (i
<< 3);
372 T00
= _mm_loadu_si128((const __m128i
*)&src
[0 * 16 + offset
]);
373 T01
= _mm_loadu_si128((const __m128i
*)&src
[0 * 16 + offset
+ 4]);
374 in00
[i
] = _mm_packs_epi32(T00
, T01
); // [07 06 05 04 03 02 01 00]
376 T00
= _mm_loadu_si128((const __m128i
*)&src
[1 * 16 + offset
]);
377 T01
= _mm_loadu_si128((const __m128i
*)&src
[1 * 16 + offset
+ 4]);
378 in01
[i
] = _mm_packs_epi32(T00
, T01
); // [17 16 15 14 13 12 11 10]
380 T00
= _mm_loadu_si128((const __m128i
*)&src
[2 * 16 + offset
]);
381 T01
= _mm_loadu_si128((const __m128i
*)&src
[2 * 16 + offset
+ 4]);
382 in02
[i
] = _mm_packs_epi32(T00
, T01
); // [27 26 25 24 23 22 21 20]
384 T00
= _mm_loadu_si128((const __m128i
*)&src
[3 * 16 + offset
]);
385 T01
= _mm_loadu_si128((const __m128i
*)&src
[3 * 16 + offset
+ 4]);
386 in03
[i
] = _mm_packs_epi32(T00
, T01
); // [37 36 35 34 33 32 31 30]
388 T00
= _mm_loadu_si128((const __m128i
*)&src
[4 * 16 + offset
]);
389 T01
= _mm_loadu_si128((const __m128i
*)&src
[4 * 16 + offset
+ 4]);
390 in04
[i
] = _mm_packs_epi32(T00
, T01
); // [47 46 45 44 43 42 41 40]
392 T00
= _mm_loadu_si128((const __m128i
*)&src
[5 * 16 + offset
]);
393 T01
= _mm_loadu_si128((const __m128i
*)&src
[5 * 16 + offset
+ 4]);
394 in05
[i
] = _mm_packs_epi32(T00
, T01
); // [57 56 55 54 53 52 51 50]
396 T00
= _mm_loadu_si128((const __m128i
*)&src
[6 * 16 + offset
]);
397 T01
= _mm_loadu_si128((const __m128i
*)&src
[6 * 16 + offset
+ 4]);
398 in06
[i
] = _mm_packs_epi32(T00
, T01
); // [67 66 65 64 63 62 61 60]
400 T00
= _mm_loadu_si128((const __m128i
*)&src
[7 * 16 + offset
]);
401 T01
= _mm_loadu_si128((const __m128i
*)&src
[7 * 16 + offset
+ 4]);
402 in07
[i
] = _mm_packs_epi32(T00
, T01
); // [77 76 75 74 73 72 71 70]
404 T00
= _mm_loadu_si128((const __m128i
*)&src
[8 * 16 + offset
]);
405 T01
= _mm_loadu_si128((const __m128i
*)&src
[8 * 16 + offset
+ 4]);
406 in08
[i
] = _mm_packs_epi32(T00
, T01
);
408 T00
= _mm_loadu_si128((const __m128i
*)&src
[9 * 16 + offset
]);
409 T01
= _mm_loadu_si128((const __m128i
*)&src
[9 * 16 + offset
+ 4]);
410 in09
[i
] = _mm_packs_epi32(T00
, T01
);
412 T00
= _mm_loadu_si128((const __m128i
*)&src
[10 * 16 + offset
]);
413 T01
= _mm_loadu_si128((const __m128i
*)&src
[10 * 16 + offset
+ 4]);
414 in10
[i
] = _mm_packs_epi32(T00
, T01
);
416 T00
= _mm_loadu_si128((const __m128i
*)&src
[11 * 16 + offset
]);
417 T01
= _mm_loadu_si128((const __m128i
*)&src
[11 * 16 + offset
+ 4]);
418 in11
[i
] = _mm_packs_epi32(T00
, T01
);
420 T00
= _mm_loadu_si128((const __m128i
*)&src
[12 * 16 + offset
]);
421 T01
= _mm_loadu_si128((const __m128i
*)&src
[12 * 16 + offset
+ 4]);
422 in12
[i
] = _mm_packs_epi32(T00
, T01
);
424 T00
= _mm_loadu_si128((const __m128i
*)&src
[13 * 16 + offset
]);
425 T01
= _mm_loadu_si128((const __m128i
*)&src
[13 * 16 + offset
+ 4]);
426 in13
[i
] = _mm_packs_epi32(T00
, T01
);
428 T00
= _mm_loadu_si128((const __m128i
*)&src
[14 * 16 + offset
]);
429 T01
= _mm_loadu_si128((const __m128i
*)&src
[14 * 16 + offset
+ 4]);
430 in14
[i
] = _mm_packs_epi32(T00
, T01
);
432 T00
= _mm_loadu_si128((const __m128i
*)&src
[15 * 16 + offset
]);
433 T01
= _mm_loadu_si128((const __m128i
*)&src
[15 * 16 + offset
+ 4]);
434 in15
[i
] = _mm_packs_epi32(T00
, T01
);
437 for (int pass
= 0; pass
< 2; pass
++)
441 c32_rnd
= _mm_set1_epi32(2048);
445 for (int part
= 0; part
< 2; part
++)
447 const __m128i T_00_00A
= _mm_unpacklo_epi16(in01
[part
], in03
[part
]); // [33 13 32 12 31 11 30 10]
448 const __m128i T_00_00B
= _mm_unpackhi_epi16(in01
[part
], in03
[part
]); // [37 17 36 16 35 15 34 14]
449 const __m128i T_00_01A
= _mm_unpacklo_epi16(in05
[part
], in07
[part
]); // [ ]
450 const __m128i T_00_01B
= _mm_unpackhi_epi16(in05
[part
], in07
[part
]); // [ ]
451 const __m128i T_00_02A
= _mm_unpacklo_epi16(in09
[part
], in11
[part
]); // [ ]
452 const __m128i T_00_02B
= _mm_unpackhi_epi16(in09
[part
], in11
[part
]); // [ ]
453 const __m128i T_00_03A
= _mm_unpacklo_epi16(in13
[part
], in15
[part
]); // [ ]
454 const __m128i T_00_03B
= _mm_unpackhi_epi16(in13
[part
], in15
[part
]); // [ ]
455 const __m128i T_00_04A
= _mm_unpacklo_epi16(in02
[part
], in06
[part
]); // [ ]
456 const __m128i T_00_04B
= _mm_unpackhi_epi16(in02
[part
], in06
[part
]); // [ ]
457 const __m128i T_00_05A
= _mm_unpacklo_epi16(in10
[part
], in14
[part
]); // [ ]
458 const __m128i T_00_05B
= _mm_unpackhi_epi16(in10
[part
], in14
[part
]); // [ ]
459 const __m128i T_00_06A
= _mm_unpacklo_epi16(in04
[part
], in12
[part
]); // [ ]row
460 const __m128i T_00_06B
= _mm_unpackhi_epi16(in04
[part
], in12
[part
]); // [ ]
461 const __m128i T_00_07A
= _mm_unpacklo_epi16(in00
[part
], in08
[part
]); // [83 03 82 02 81 01 81 00] row08 row00
462 const __m128i T_00_07B
= _mm_unpackhi_epi16(in00
[part
], in08
[part
]); // [87 07 86 06 85 05 84 04]
464 __m128i O0A
, O1A
, O2A
, O3A
, O4A
, O5A
, O6A
, O7A
;
465 __m128i O0B
, O1B
, O2B
, O3B
, O4B
, O5B
, O6B
, O7B
;
468 #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
469 T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
470 T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
471 row = _mm_add_epi32(T00, T01);
473 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, O0A
)
474 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, O1A
)
475 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, O2A
)
476 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, O3A
)
477 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, O4A
)
478 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, O5A
)
479 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, O6A
)
480 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, O7A
)
482 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, O0B
)
483 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, O1B
)
484 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, O2B
)
485 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, O3B
)
486 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, O4B
)
487 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, O5B
)
488 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, O6B
)
489 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, O7B
)
493 __m128i EO0A
, EO1A
, EO2A
, EO3A
;
494 __m128i EO0B
, EO1B
, EO2B
, EO3B
;
495 EO0A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_p75_p89
), _mm_madd_epi16(T_00_05A
, c16_p18_p50
)); // EO0
496 EO0B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_p75_p89
), _mm_madd_epi16(T_00_05B
, c16_p18_p50
));
497 EO1A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_n18_p75
), _mm_madd_epi16(T_00_05A
, c16_n50_n89
)); // EO1
498 EO1B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_n18_p75
), _mm_madd_epi16(T_00_05B
, c16_n50_n89
));
499 EO2A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_n89_p50
), _mm_madd_epi16(T_00_05A
, c16_p75_p18
)); // EO2
500 EO2B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_n89_p50
), _mm_madd_epi16(T_00_05B
, c16_p75_p18
));
501 EO3A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_n50_p18
), _mm_madd_epi16(T_00_05A
, c16_n89_p75
)); // EO3
502 EO3B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_n50_p18
), _mm_madd_epi16(T_00_05B
, c16_n89_p75
));
504 __m128i EEO0A
, EEO1A
;
505 __m128i EEO0B
, EEO1B
;
506 EEO0A
= _mm_madd_epi16(T_00_06A
, c16_p36_p83
);
507 EEO0B
= _mm_madd_epi16(T_00_06B
, c16_p36_p83
);
508 EEO1A
= _mm_madd_epi16(T_00_06A
, c16_n83_p36
);
509 EEO1B
= _mm_madd_epi16(T_00_06B
, c16_n83_p36
);
511 __m128i EEE0A
, EEE1A
;
512 __m128i EEE0B
, EEE1B
;
513 EEE0A
= _mm_madd_epi16(T_00_07A
, c16_p64_p64
);
514 EEE0B
= _mm_madd_epi16(T_00_07B
, c16_p64_p64
);
515 EEE1A
= _mm_madd_epi16(T_00_07A
, c16_n64_p64
);
516 EEE1B
= _mm_madd_epi16(T_00_07B
, c16_n64_p64
);
518 const __m128i EE0A
= _mm_add_epi32(EEE0A
, EEO0A
); // EE0 = EEE0 + EEO0
519 const __m128i EE0B
= _mm_add_epi32(EEE0B
, EEO0B
);
520 const __m128i EE1A
= _mm_add_epi32(EEE1A
, EEO1A
); // EE1 = EEE1 + EEO1
521 const __m128i EE1B
= _mm_add_epi32(EEE1B
, EEO1B
);
522 const __m128i EE3A
= _mm_sub_epi32(EEE0A
, EEO0A
); // EE2 = EEE0 - EEO0
523 const __m128i EE3B
= _mm_sub_epi32(EEE0B
, EEO0B
);
524 const __m128i EE2A
= _mm_sub_epi32(EEE1A
, EEO1A
); // EE3 = EEE1 - EEO1
525 const __m128i EE2B
= _mm_sub_epi32(EEE1B
, EEO1B
);
527 const __m128i E0A
= _mm_add_epi32(EE0A
, EO0A
); // E0 = EE0 + EO0
528 const __m128i E0B
= _mm_add_epi32(EE0B
, EO0B
);
529 const __m128i E1A
= _mm_add_epi32(EE1A
, EO1A
); // E1 = EE1 + EO1
530 const __m128i E1B
= _mm_add_epi32(EE1B
, EO1B
);
531 const __m128i E2A
= _mm_add_epi32(EE2A
, EO2A
); // E2 = EE2 + EO2
532 const __m128i E2B
= _mm_add_epi32(EE2B
, EO2B
);
533 const __m128i E3A
= _mm_add_epi32(EE3A
, EO3A
); // E3 = EE3 + EO3
534 const __m128i E3B
= _mm_add_epi32(EE3B
, EO3B
);
535 const __m128i E7A
= _mm_sub_epi32(EE0A
, EO0A
); // E0 = EE0 - EO0
536 const __m128i E7B
= _mm_sub_epi32(EE0B
, EO0B
);
537 const __m128i E6A
= _mm_sub_epi32(EE1A
, EO1A
); // E1 = EE1 - EO1
538 const __m128i E6B
= _mm_sub_epi32(EE1B
, EO1B
);
539 const __m128i E5A
= _mm_sub_epi32(EE2A
, EO2A
); // E2 = EE2 - EO2
540 const __m128i E5B
= _mm_sub_epi32(EE2B
, EO2B
);
541 const __m128i E4A
= _mm_sub_epi32(EE3A
, EO3A
); // E3 = EE3 - EO3
542 const __m128i E4B
= _mm_sub_epi32(EE3B
, EO3B
);
544 const __m128i T10A
= _mm_add_epi32(E0A
, c32_rnd
); // E0 + rnd
545 const __m128i T10B
= _mm_add_epi32(E0B
, c32_rnd
);
546 const __m128i T11A
= _mm_add_epi32(E1A
, c32_rnd
); // E1 + rnd
547 const __m128i T11B
= _mm_add_epi32(E1B
, c32_rnd
);
548 const __m128i T12A
= _mm_add_epi32(E2A
, c32_rnd
); // E2 + rnd
549 const __m128i T12B
= _mm_add_epi32(E2B
, c32_rnd
);
550 const __m128i T13A
= _mm_add_epi32(E3A
, c32_rnd
); // E3 + rnd
551 const __m128i T13B
= _mm_add_epi32(E3B
, c32_rnd
);
552 const __m128i T14A
= _mm_add_epi32(E4A
, c32_rnd
); // E4 + rnd
553 const __m128i T14B
= _mm_add_epi32(E4B
, c32_rnd
);
554 const __m128i T15A
= _mm_add_epi32(E5A
, c32_rnd
); // E5 + rnd
555 const __m128i T15B
= _mm_add_epi32(E5B
, c32_rnd
);
556 const __m128i T16A
= _mm_add_epi32(E6A
, c32_rnd
); // E6 + rnd
557 const __m128i T16B
= _mm_add_epi32(E6B
, c32_rnd
);
558 const __m128i T17A
= _mm_add_epi32(E7A
, c32_rnd
); // E7 + rnd
559 const __m128i T17B
= _mm_add_epi32(E7B
, c32_rnd
);
561 const __m128i T20A
= _mm_add_epi32(T10A
, O0A
); // E0 + O0 + rnd
562 const __m128i T20B
= _mm_add_epi32(T10B
, O0B
);
563 const __m128i T21A
= _mm_add_epi32(T11A
, O1A
); // E1 + O1 + rnd
564 const __m128i T21B
= _mm_add_epi32(T11B
, O1B
);
565 const __m128i T22A
= _mm_add_epi32(T12A
, O2A
); // E2 + O2 + rnd
566 const __m128i T22B
= _mm_add_epi32(T12B
, O2B
);
567 const __m128i T23A
= _mm_add_epi32(T13A
, O3A
); // E3 + O3 + rnd
568 const __m128i T23B
= _mm_add_epi32(T13B
, O3B
);
569 const __m128i T24A
= _mm_add_epi32(T14A
, O4A
); // E4
570 const __m128i T24B
= _mm_add_epi32(T14B
, O4B
);
571 const __m128i T25A
= _mm_add_epi32(T15A
, O5A
); // E5
572 const __m128i T25B
= _mm_add_epi32(T15B
, O5B
);
573 const __m128i T26A
= _mm_add_epi32(T16A
, O6A
); // E6
574 const __m128i T26B
= _mm_add_epi32(T16B
, O6B
);
575 const __m128i T27A
= _mm_add_epi32(T17A
, O7A
); // E7
576 const __m128i T27B
= _mm_add_epi32(T17B
, O7B
);
577 const __m128i T2FA
= _mm_sub_epi32(T10A
, O0A
); // E0 - O0 + rnd
578 const __m128i T2FB
= _mm_sub_epi32(T10B
, O0B
);
579 const __m128i T2EA
= _mm_sub_epi32(T11A
, O1A
); // E1 - O1 + rnd
580 const __m128i T2EB
= _mm_sub_epi32(T11B
, O1B
);
581 const __m128i T2DA
= _mm_sub_epi32(T12A
, O2A
); // E2 - O2 + rnd
582 const __m128i T2DB
= _mm_sub_epi32(T12B
, O2B
);
583 const __m128i T2CA
= _mm_sub_epi32(T13A
, O3A
); // E3 - O3 + rnd
584 const __m128i T2CB
= _mm_sub_epi32(T13B
, O3B
);
585 const __m128i T2BA
= _mm_sub_epi32(T14A
, O4A
); // E4
586 const __m128i T2BB
= _mm_sub_epi32(T14B
, O4B
);
587 const __m128i T2AA
= _mm_sub_epi32(T15A
, O5A
); // E5
588 const __m128i T2AB
= _mm_sub_epi32(T15B
, O5B
);
589 const __m128i T29A
= _mm_sub_epi32(T16A
, O6A
); // E6
590 const __m128i T29B
= _mm_sub_epi32(T16B
, O6B
);
591 const __m128i T28A
= _mm_sub_epi32(T17A
, O7A
); // E7
592 const __m128i T28B
= _mm_sub_epi32(T17B
, O7B
);
594 const __m128i T30A
= _mm_srai_epi32(T20A
, nShift
); // [30 20 10 00]
595 const __m128i T30B
= _mm_srai_epi32(T20B
, nShift
); // [70 60 50 40]
596 const __m128i T31A
= _mm_srai_epi32(T21A
, nShift
); // [31 21 11 01]
597 const __m128i T31B
= _mm_srai_epi32(T21B
, nShift
); // [71 61 51 41]
598 const __m128i T32A
= _mm_srai_epi32(T22A
, nShift
); // [32 22 12 02]
599 const __m128i T32B
= _mm_srai_epi32(T22B
, nShift
); // [72 62 52 42]
600 const __m128i T33A
= _mm_srai_epi32(T23A
, nShift
); // [33 23 13 03]
601 const __m128i T33B
= _mm_srai_epi32(T23B
, nShift
); // [73 63 53 43]
602 const __m128i T34A
= _mm_srai_epi32(T24A
, nShift
); // [33 24 14 04]
603 const __m128i T34B
= _mm_srai_epi32(T24B
, nShift
); // [74 64 54 44]
604 const __m128i T35A
= _mm_srai_epi32(T25A
, nShift
); // [35 25 15 05]
605 const __m128i T35B
= _mm_srai_epi32(T25B
, nShift
); // [75 65 55 45]
606 const __m128i T36A
= _mm_srai_epi32(T26A
, nShift
); // [36 26 16 06]
607 const __m128i T36B
= _mm_srai_epi32(T26B
, nShift
); // [76 66 56 46]
608 const __m128i T37A
= _mm_srai_epi32(T27A
, nShift
); // [37 27 17 07]
609 const __m128i T37B
= _mm_srai_epi32(T27B
, nShift
); // [77 67 57 47]
611 const __m128i T38A
= _mm_srai_epi32(T28A
, nShift
); // [30 20 10 00] x8
612 const __m128i T38B
= _mm_srai_epi32(T28B
, nShift
); // [70 60 50 40]
613 const __m128i T39A
= _mm_srai_epi32(T29A
, nShift
); // [31 21 11 01] x9
614 const __m128i T39B
= _mm_srai_epi32(T29B
, nShift
); // [71 61 51 41]
615 const __m128i T3AA
= _mm_srai_epi32(T2AA
, nShift
); // [32 22 12 02] xA
616 const __m128i T3AB
= _mm_srai_epi32(T2AB
, nShift
); // [72 62 52 42]
617 const __m128i T3BA
= _mm_srai_epi32(T2BA
, nShift
); // [33 23 13 03] xB
618 const __m128i T3BB
= _mm_srai_epi32(T2BB
, nShift
); // [73 63 53 43]
619 const __m128i T3CA
= _mm_srai_epi32(T2CA
, nShift
); // [33 24 14 04] xC
620 const __m128i T3CB
= _mm_srai_epi32(T2CB
, nShift
); // [74 64 54 44]
621 const __m128i T3DA
= _mm_srai_epi32(T2DA
, nShift
); // [35 25 15 05] xD
622 const __m128i T3DB
= _mm_srai_epi32(T2DB
, nShift
); // [75 65 55 45]
623 const __m128i T3EA
= _mm_srai_epi32(T2EA
, nShift
); // [36 26 16 06] xE
624 const __m128i T3EB
= _mm_srai_epi32(T2EB
, nShift
); // [76 66 56 46]
625 const __m128i T3FA
= _mm_srai_epi32(T2FA
, nShift
); // [37 27 17 07] xF
626 const __m128i T3FB
= _mm_srai_epi32(T2FB
, nShift
); // [77 67 57 47]
628 res00
[part
] = _mm_packs_epi32(T30A
, T30B
); // [70 60 50 40 30 20 10 00]
629 res01
[part
] = _mm_packs_epi32(T31A
, T31B
); // [71 61 51 41 31 21 11 01]
630 res02
[part
] = _mm_packs_epi32(T32A
, T32B
); // [72 62 52 42 32 22 12 02]
631 res03
[part
] = _mm_packs_epi32(T33A
, T33B
); // [73 63 53 43 33 23 13 03]
632 res04
[part
] = _mm_packs_epi32(T34A
, T34B
); // [74 64 54 44 34 24 14 04]
633 res05
[part
] = _mm_packs_epi32(T35A
, T35B
); // [75 65 55 45 35 25 15 05]
634 res06
[part
] = _mm_packs_epi32(T36A
, T36B
); // [76 66 56 46 36 26 16 06]
635 res07
[part
] = _mm_packs_epi32(T37A
, T37B
); // [77 67 57 47 37 27 17 07]
637 res08
[part
] = _mm_packs_epi32(T38A
, T38B
); // [A0 ... 80]
638 res09
[part
] = _mm_packs_epi32(T39A
, T39B
); // [A1 ... 81]
639 res10
[part
] = _mm_packs_epi32(T3AA
, T3AB
); // [A2 ... 82]
640 res11
[part
] = _mm_packs_epi32(T3BA
, T3BB
); // [A3 ... 83]
641 res12
[part
] = _mm_packs_epi32(T3CA
, T3CB
); // [A4 ... 84]
642 res13
[part
] = _mm_packs_epi32(T3DA
, T3DB
); // [A5 ... 85]
643 res14
[part
] = _mm_packs_epi32(T3EA
, T3EB
); // [A6 ... 86]
644 res15
[part
] = _mm_packs_epi32(T3FA
, T3FB
); // [A7 ... 87]
646 //transpose matrix 8x8 16bit.
648 __m128i tr0_0
, tr0_1
, tr0_2
, tr0_3
, tr0_4
, tr0_5
, tr0_6
, tr0_7
;
649 __m128i tr1_0
, tr1_1
, tr1_2
, tr1_3
, tr1_4
, tr1_5
, tr1_6
, tr1_7
;
650 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
651 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
652 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
653 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
654 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
655 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
656 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
657 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
658 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
659 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
660 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
661 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
662 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
663 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
664 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
665 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
666 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
667 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
668 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
669 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
670 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
671 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
672 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
673 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
674 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
676 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
677 TRANSPOSE_8x8_16BIT(res08
[0], res09
[0], res10
[0], res11
[0], res12
[0], res13
[0], res14
[0], res15
[0], in00
[1], in01
[1], in02
[1], in03
[1], in04
[1], in05
[1], in06
[1], in07
[1])
678 TRANSPOSE_8x8_16BIT(res00
[1], res01
[1], res02
[1], res03
[1], res04
[1], res05
[1], res06
[1], res07
[1], in08
[0], in09
[0], in10
[0], in11
[0], in12
[0], in13
[0], in14
[0], in15
[0])
679 TRANSPOSE_8x8_16BIT(res08
[1], res09
[1], res10
[1], res11
[1], res12
[1], res13
[1], res14
[1], res15
[1], in08
[1], in09
[1], in10
[1], in11
[1], in12
[1], in13
[1], in14
[1], in15
[1])
681 #undef TRANSPOSE_8x8_16BIT
685 _mm_store_si128((__m128i
*)&dst
[0 * stride
+ 0], in00
[0]);
686 _mm_store_si128((__m128i
*)&dst
[0 * stride
+ 8], in00
[1]);
687 _mm_store_si128((__m128i
*)&dst
[1 * stride
+ 0], in01
[0]);
688 _mm_store_si128((__m128i
*)&dst
[1 * stride
+ 8], in01
[1]);
689 _mm_store_si128((__m128i
*)&dst
[2 * stride
+ 0], in02
[0]);
690 _mm_store_si128((__m128i
*)&dst
[2 * stride
+ 8], in02
[1]);
691 _mm_store_si128((__m128i
*)&dst
[3 * stride
+ 0], in03
[0]);
692 _mm_store_si128((__m128i
*)&dst
[3 * stride
+ 8], in03
[1]);
693 _mm_store_si128((__m128i
*)&dst
[4 * stride
+ 0], in04
[0]);
694 _mm_store_si128((__m128i
*)&dst
[4 * stride
+ 8], in04
[1]);
695 _mm_store_si128((__m128i
*)&dst
[5 * stride
+ 0], in05
[0]);
696 _mm_store_si128((__m128i
*)&dst
[5 * stride
+ 8], in05
[1]);
697 _mm_store_si128((__m128i
*)&dst
[6 * stride
+ 0], in06
[0]);
698 _mm_store_si128((__m128i
*)&dst
[6 * stride
+ 8], in06
[1]);
699 _mm_store_si128((__m128i
*)&dst
[7 * stride
+ 0], in07
[0]);
700 _mm_store_si128((__m128i
*)&dst
[7 * stride
+ 8], in07
[1]);
701 _mm_store_si128((__m128i
*)&dst
[8 * stride
+ 0], in08
[0]);
702 _mm_store_si128((__m128i
*)&dst
[8 * stride
+ 8], in08
[1]);
703 _mm_store_si128((__m128i
*)&dst
[9 * stride
+ 0], in09
[0]);
704 _mm_store_si128((__m128i
*)&dst
[9 * stride
+ 8], in09
[1]);
705 _mm_store_si128((__m128i
*)&dst
[10 * stride
+ 0], in10
[0]);
706 _mm_store_si128((__m128i
*)&dst
[10 * stride
+ 8], in10
[1]);
707 _mm_store_si128((__m128i
*)&dst
[11 * stride
+ 0], in11
[0]);
708 _mm_store_si128((__m128i
*)&dst
[11 * stride
+ 8], in11
[1]);
709 _mm_store_si128((__m128i
*)&dst
[12 * stride
+ 0], in12
[0]);
710 _mm_store_si128((__m128i
*)&dst
[12 * stride
+ 8], in12
[1]);
711 _mm_store_si128((__m128i
*)&dst
[13 * stride
+ 0], in13
[0]);
712 _mm_store_si128((__m128i
*)&dst
[13 * stride
+ 8], in13
[1]);
713 _mm_store_si128((__m128i
*)&dst
[14 * stride
+ 0], in14
[0]);
714 _mm_store_si128((__m128i
*)&dst
[14 * stride
+ 8], in14
[1]);
715 _mm_store_si128((__m128i
*)&dst
[15 * stride
+ 0], in15
[0]);
716 _mm_store_si128((__m128i
*)&dst
[15 * stride
+ 8], in15
[1]);
719 void idct32(int32_t *src
, int16_t *dst
, intptr_t stride
)
722 const __m128i c16_p90_p90
= _mm_set1_epi32(0x005A005A); //column 0
723 const __m128i c16_p85_p88
= _mm_set1_epi32(0x00550058);
724 const __m128i c16_p78_p82
= _mm_set1_epi32(0x004E0052);
725 const __m128i c16_p67_p73
= _mm_set1_epi32(0x00430049);
726 const __m128i c16_p54_p61
= _mm_set1_epi32(0x0036003D);
727 const __m128i c16_p38_p46
= _mm_set1_epi32(0x0026002E);
728 const __m128i c16_p22_p31
= _mm_set1_epi32(0x0016001F);
729 const __m128i c16_p04_p13
= _mm_set1_epi32(0x0004000D);
730 const __m128i c16_p82_p90
= _mm_set1_epi32(0x0052005A); //column 1
731 const __m128i c16_p46_p67
= _mm_set1_epi32(0x002E0043);
732 const __m128i c16_n04_p22
= _mm_set1_epi32(0xFFFC0016);
733 const __m128i c16_n54_n31
= _mm_set1_epi32(0xFFCAFFE1);
734 const __m128i c16_n85_n73
= _mm_set1_epi32(0xFFABFFB7);
735 const __m128i c16_n88_n90
= _mm_set1_epi32(0xFFA8FFA6);
736 const __m128i c16_n61_n78
= _mm_set1_epi32(0xFFC3FFB2);
737 const __m128i c16_n13_n38
= _mm_set1_epi32(0xFFF3FFDA);
738 const __m128i c16_p67_p88
= _mm_set1_epi32(0x00430058); //column 2
739 const __m128i c16_n13_p31
= _mm_set1_epi32(0xFFF3001F);
740 const __m128i c16_n82_n54
= _mm_set1_epi32(0xFFAEFFCA);
741 const __m128i c16_n78_n90
= _mm_set1_epi32(0xFFB2FFA6);
742 const __m128i c16_n04_n46
= _mm_set1_epi32(0xFFFCFFD2);
743 const __m128i c16_p73_p38
= _mm_set1_epi32(0x00490026);
744 const __m128i c16_p85_p90
= _mm_set1_epi32(0x0055005A);
745 const __m128i c16_p22_p61
= _mm_set1_epi32(0x0016003D);
746 const __m128i c16_p46_p85
= _mm_set1_epi32(0x002E0055); //column 3
747 const __m128i c16_n67_n13
= _mm_set1_epi32(0xFFBDFFF3);
748 const __m128i c16_n73_n90
= _mm_set1_epi32(0xFFB7FFA6);
749 const __m128i c16_p38_n22
= _mm_set1_epi32(0x0026FFEA);
750 const __m128i c16_p88_p82
= _mm_set1_epi32(0x00580052);
751 const __m128i c16_n04_p54
= _mm_set1_epi32(0xFFFC0036);
752 const __m128i c16_n90_n61
= _mm_set1_epi32(0xFFA6FFC3);
753 const __m128i c16_n31_n78
= _mm_set1_epi32(0xFFE1FFB2);
754 const __m128i c16_p22_p82
= _mm_set1_epi32(0x00160052); //column 4
755 const __m128i c16_n90_n54
= _mm_set1_epi32(0xFFA6FFCA);
756 const __m128i c16_p13_n61
= _mm_set1_epi32(0x000DFFC3);
757 const __m128i c16_p85_p78
= _mm_set1_epi32(0x0055004E);
758 const __m128i c16_n46_p31
= _mm_set1_epi32(0xFFD2001F);
759 const __m128i c16_n67_n90
= _mm_set1_epi32(0xFFBDFFA6);
760 const __m128i c16_p73_p04
= _mm_set1_epi32(0x00490004);
761 const __m128i c16_p38_p88
= _mm_set1_epi32(0x00260058);
762 const __m128i c16_n04_p78
= _mm_set1_epi32(0xFFFC004E); //column 5
763 const __m128i c16_n73_n82
= _mm_set1_epi32(0xFFB7FFAE);
764 const __m128i c16_p85_p13
= _mm_set1_epi32(0x0055000D);
765 const __m128i c16_n22_p67
= _mm_set1_epi32(0xFFEA0043);
766 const __m128i c16_n61_n88
= _mm_set1_epi32(0xFFC3FFA8);
767 const __m128i c16_p90_p31
= _mm_set1_epi32(0x005A001F);
768 const __m128i c16_n38_p54
= _mm_set1_epi32(0xFFDA0036);
769 const __m128i c16_n46_n90
= _mm_set1_epi32(0xFFD2FFA6);
770 const __m128i c16_n31_p73
= _mm_set1_epi32(0xFFE10049); //column 6
771 const __m128i c16_n22_n90
= _mm_set1_epi32(0xFFEAFFA6);
772 const __m128i c16_p67_p78
= _mm_set1_epi32(0x0043004E);
773 const __m128i c16_n90_n38
= _mm_set1_epi32(0xFFA6FFDA);
774 const __m128i c16_p82_n13
= _mm_set1_epi32(0x0052FFF3);
775 const __m128i c16_n46_p61
= _mm_set1_epi32(0xFFD2003D);
776 const __m128i c16_n04_n88
= _mm_set1_epi32(0xFFFCFFA8);
777 const __m128i c16_p54_p85
= _mm_set1_epi32(0x00360055);
778 const __m128i c16_n54_p67
= _mm_set1_epi32(0xFFCA0043); //column 7
779 const __m128i c16_p38_n78
= _mm_set1_epi32(0x0026FFB2);
780 const __m128i c16_n22_p85
= _mm_set1_epi32(0xFFEA0055);
781 const __m128i c16_p04_n90
= _mm_set1_epi32(0x0004FFA6);
782 const __m128i c16_p13_p90
= _mm_set1_epi32(0x000D005A);
783 const __m128i c16_n31_n88
= _mm_set1_epi32(0xFFE1FFA8);
784 const __m128i c16_p46_p82
= _mm_set1_epi32(0x002E0052);
785 const __m128i c16_n61_n73
= _mm_set1_epi32(0xFFC3FFB7);
786 const __m128i c16_n73_p61
= _mm_set1_epi32(0xFFB7003D); //column 8
787 const __m128i c16_p82_n46
= _mm_set1_epi32(0x0052FFD2);
788 const __m128i c16_n88_p31
= _mm_set1_epi32(0xFFA8001F);
789 const __m128i c16_p90_n13
= _mm_set1_epi32(0x005AFFF3);
790 const __m128i c16_n90_n04
= _mm_set1_epi32(0xFFA6FFFC);
791 const __m128i c16_p85_p22
= _mm_set1_epi32(0x00550016);
792 const __m128i c16_n78_n38
= _mm_set1_epi32(0xFFB2FFDA);
793 const __m128i c16_p67_p54
= _mm_set1_epi32(0x00430036);
794 const __m128i c16_n85_p54
= _mm_set1_epi32(0xFFAB0036); //column 9
795 const __m128i c16_p88_n04
= _mm_set1_epi32(0x0058FFFC);
796 const __m128i c16_n61_n46
= _mm_set1_epi32(0xFFC3FFD2);
797 const __m128i c16_p13_p82
= _mm_set1_epi32(0x000D0052);
798 const __m128i c16_p38_n90
= _mm_set1_epi32(0x0026FFA6);
799 const __m128i c16_n78_p67
= _mm_set1_epi32(0xFFB20043);
800 const __m128i c16_p90_n22
= _mm_set1_epi32(0x005AFFEA);
801 const __m128i c16_n73_n31
= _mm_set1_epi32(0xFFB7FFE1);
802 const __m128i c16_n90_p46
= _mm_set1_epi32(0xFFA6002E); //column 10
803 const __m128i c16_p54_p38
= _mm_set1_epi32(0x00360026);
804 const __m128i c16_p31_n90
= _mm_set1_epi32(0x001FFFA6);
805 const __m128i c16_n88_p61
= _mm_set1_epi32(0xFFA8003D);
806 const __m128i c16_p67_p22
= _mm_set1_epi32(0x00430016);
807 const __m128i c16_p13_n85
= _mm_set1_epi32(0x000DFFAB);
808 const __m128i c16_n82_p73
= _mm_set1_epi32(0xFFAE0049);
809 const __m128i c16_p78_p04
= _mm_set1_epi32(0x004E0004);
810 const __m128i c16_n88_p38
= _mm_set1_epi32(0xFFA80026); //column 11
811 const __m128i c16_n04_p73
= _mm_set1_epi32(0xFFFC0049);
812 const __m128i c16_p90_n67
= _mm_set1_epi32(0x005AFFBD);
813 const __m128i c16_n31_n46
= _mm_set1_epi32(0xFFE1FFD2);
814 const __m128i c16_n78_p85
= _mm_set1_epi32(0xFFB20055);
815 const __m128i c16_p61_p13
= _mm_set1_epi32(0x003D000D);
816 const __m128i c16_p54_n90
= _mm_set1_epi32(0x0036FFA6);
817 const __m128i c16_n82_p22
= _mm_set1_epi32(0xFFAE0016);
818 const __m128i c16_n78_p31
= _mm_set1_epi32(0xFFB2001F); //column 12
819 const __m128i c16_n61_p90
= _mm_set1_epi32(0xFFC3005A);
820 const __m128i c16_p54_p04
= _mm_set1_epi32(0x00360004);
821 const __m128i c16_p82_n88
= _mm_set1_epi32(0x0052FFA8);
822 const __m128i c16_n22_n38
= _mm_set1_epi32(0xFFEAFFDA);
823 const __m128i c16_n90_p73
= _mm_set1_epi32(0xFFA60049);
824 const __m128i c16_n13_p67
= _mm_set1_epi32(0xFFF30043);
825 const __m128i c16_p85_n46
= _mm_set1_epi32(0x0055FFD2);
826 const __m128i c16_n61_p22
= _mm_set1_epi32(0xFFC30016); //column 13
827 const __m128i c16_n90_p85
= _mm_set1_epi32(0xFFA60055);
828 const __m128i c16_n38_p73
= _mm_set1_epi32(0xFFDA0049);
829 const __m128i c16_p46_n04
= _mm_set1_epi32(0x002EFFFC);
830 const __m128i c16_p90_n78
= _mm_set1_epi32(0x005AFFB2);
831 const __m128i c16_p54_n82
= _mm_set1_epi32(0x0036FFAE);
832 const __m128i c16_n31_n13
= _mm_set1_epi32(0xFFE1FFF3);
833 const __m128i c16_n88_p67
= _mm_set1_epi32(0xFFA80043);
834 const __m128i c16_n38_p13
= _mm_set1_epi32(0xFFDA000D); //column 14
835 const __m128i c16_n78_p61
= _mm_set1_epi32(0xFFB2003D);
836 const __m128i c16_n90_p88
= _mm_set1_epi32(0xFFA60058);
837 const __m128i c16_n73_p85
= _mm_set1_epi32(0xFFB70055);
838 const __m128i c16_n31_p54
= _mm_set1_epi32(0xFFE10036);
839 const __m128i c16_p22_p04
= _mm_set1_epi32(0x00160004);
840 const __m128i c16_p67_n46
= _mm_set1_epi32(0x0043FFD2);
841 const __m128i c16_p90_n82
= _mm_set1_epi32(0x005AFFAE);
842 const __m128i c16_n13_p04
= _mm_set1_epi32(0xFFF30004); //column 15
843 const __m128i c16_n31_p22
= _mm_set1_epi32(0xFFE10016);
844 const __m128i c16_n46_p38
= _mm_set1_epi32(0xFFD20026);
845 const __m128i c16_n61_p54
= _mm_set1_epi32(0xFFC30036);
846 const __m128i c16_n73_p67
= _mm_set1_epi32(0xFFB70043);
847 const __m128i c16_n82_p78
= _mm_set1_epi32(0xFFAE004E);
848 const __m128i c16_n88_p85
= _mm_set1_epi32(0xFFA80055);
849 const __m128i c16_n90_p90
= _mm_set1_epi32(0xFFA6005A);
852 const __m128i c16_p87_p90
= _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
853 const __m128i c16_p70_p80
= _mm_set1_epi32(0x00460050);
854 const __m128i c16_p43_p57
= _mm_set1_epi32(0x002B0039);
855 const __m128i c16_p09_p25
= _mm_set1_epi32(0x00090019);
856 const __m128i c16_p57_p87
= _mm_set1_epi32(0x00390057); //row1
857 const __m128i c16_n43_p09
= _mm_set1_epi32(0xFFD50009);
858 const __m128i c16_n90_n80
= _mm_set1_epi32(0xFFA6FFB0);
859 const __m128i c16_n25_n70
= _mm_set1_epi32(0xFFE7FFBA);
860 const __m128i c16_p09_p80
= _mm_set1_epi32(0x00090050); //row2
861 const __m128i c16_n87_n70
= _mm_set1_epi32(0xFFA9FFBA);
862 const __m128i c16_p57_n25
= _mm_set1_epi32(0x0039FFE7);
863 const __m128i c16_p43_p90
= _mm_set1_epi32(0x002B005A);
864 const __m128i c16_n43_p70
= _mm_set1_epi32(0xFFD50046); //row3
865 const __m128i c16_p09_n87
= _mm_set1_epi32(0x0009FFA9);
866 const __m128i c16_p25_p90
= _mm_set1_epi32(0x0019005A);
867 const __m128i c16_n57_n80
= _mm_set1_epi32(0xFFC7FFB0);
868 const __m128i c16_n80_p57
= _mm_set1_epi32(0xFFB00039); //row4
869 const __m128i c16_p90_n25
= _mm_set1_epi32(0x005AFFE7);
870 const __m128i c16_n87_n09
= _mm_set1_epi32(0xFFA9FFF7);
871 const __m128i c16_p70_p43
= _mm_set1_epi32(0x0046002B);
872 const __m128i c16_n90_p43
= _mm_set1_epi32(0xFFA6002B); //row5
873 const __m128i c16_p25_p57
= _mm_set1_epi32(0x00190039);
874 const __m128i c16_p70_n87
= _mm_set1_epi32(0x0046FFA9);
875 const __m128i c16_n80_p09
= _mm_set1_epi32(0xFFB00009);
876 const __m128i c16_n70_p25
= _mm_set1_epi32(0xFFBA0019); //row6
877 const __m128i c16_n80_p90
= _mm_set1_epi32(0xFFB0005A);
878 const __m128i c16_p09_p43
= _mm_set1_epi32(0x0009002B);
879 const __m128i c16_p87_n57
= _mm_set1_epi32(0x0057FFC7);
880 const __m128i c16_n25_p09
= _mm_set1_epi32(0xFFE70009); //row7
881 const __m128i c16_n57_p43
= _mm_set1_epi32(0xFFC7002B);
882 const __m128i c16_n80_p70
= _mm_set1_epi32(0xFFB00046);
883 const __m128i c16_n90_p87
= _mm_set1_epi32(0xFFA60057);
885 const __m128i c16_p75_p89
= _mm_set1_epi32(0x004B0059);
886 const __m128i c16_p18_p50
= _mm_set1_epi32(0x00120032);
887 const __m128i c16_n18_p75
= _mm_set1_epi32(0xFFEE004B);
888 const __m128i c16_n50_n89
= _mm_set1_epi32(0xFFCEFFA7);
889 const __m128i c16_n89_p50
= _mm_set1_epi32(0xFFA70032);
890 const __m128i c16_p75_p18
= _mm_set1_epi32(0x004B0012);
891 const __m128i c16_n50_p18
= _mm_set1_epi32(0xFFCE0012);
892 const __m128i c16_n89_p75
= _mm_set1_epi32(0xFFA7004B);
894 const __m128i c16_p36_p83
= _mm_set1_epi32(0x00240053);
895 const __m128i c16_n83_p36
= _mm_set1_epi32(0xFFAD0024);
897 const __m128i c16_n64_p64
= _mm_set1_epi32(0xFFC00040);
898 const __m128i c16_p64_p64
= _mm_set1_epi32(0x00400040);
899 __m128i c32_rnd
= _mm_set1_epi32(64);
904 __m128i in00
[4], in01
[4], in02
[4], in03
[4], in04
[4], in05
[4], in06
[4], in07
[4], in08
[4], in09
[4], in10
[4], in11
[4], in12
[4], in13
[4], in14
[4], in15
[4];
905 __m128i in16
[4], in17
[4], in18
[4], in19
[4], in20
[4], in21
[4], in22
[4], in23
[4], in24
[4], in25
[4], in26
[4], in27
[4], in28
[4], in29
[4], in30
[4], in31
[4];
906 __m128i res00
[4], res01
[4], res02
[4], res03
[4], res04
[4], res05
[4], res06
[4], res07
[4], res08
[4], res09
[4], res10
[4], res11
[4], res12
[4], res13
[4], res14
[4], res15
[4];
907 __m128i res16
[4], res17
[4], res18
[4], res19
[4], res20
[4], res21
[4], res22
[4], res23
[4], res24
[4], res25
[4], res26
[4], res27
[4], res28
[4], res29
[4], res30
[4], res31
[4];
909 for (int i
= 0; i
< 4; i
++)
911 const int offset
= (i
<< 3);
914 T00
= _mm_loadu_si128((const __m128i
*)&src
[0 * 32 + offset
]);
915 T01
= _mm_loadu_si128((const __m128i
*)&src
[0 * 32 + offset
+ 4]);
916 in00
[i
] = _mm_packs_epi32(T00
, T01
);
918 T00
= _mm_loadu_si128((const __m128i
*)&src
[1 * 32 + offset
]);
919 T01
= _mm_loadu_si128((const __m128i
*)&src
[1 * 32 + offset
+ 4]);
920 in01
[i
] = _mm_packs_epi32(T00
, T01
);
922 T00
= _mm_loadu_si128((const __m128i
*)&src
[2 * 32 + offset
]);
923 T01
= _mm_loadu_si128((const __m128i
*)&src
[2 * 32 + offset
+ 4]);
924 in02
[i
] = _mm_packs_epi32(T00
, T01
);
926 T00
= _mm_loadu_si128((const __m128i
*)&src
[3 * 32 + offset
]);
927 T01
= _mm_loadu_si128((const __m128i
*)&src
[3 * 32 + offset
+ 4]);
928 in03
[i
] = _mm_packs_epi32(T00
, T01
);
930 T00
= _mm_loadu_si128((const __m128i
*)&src
[4 * 32 + offset
]);
931 T01
= _mm_loadu_si128((const __m128i
*)&src
[4 * 32 + offset
+ 4]);
932 in04
[i
] = _mm_packs_epi32(T00
, T01
);
934 T00
= _mm_loadu_si128((const __m128i
*)&src
[5 * 32 + offset
]);
935 T01
= _mm_loadu_si128((const __m128i
*)&src
[5 * 32 + offset
+ 4]);
936 in05
[i
] = _mm_packs_epi32(T00
, T01
);
938 T00
= _mm_loadu_si128((const __m128i
*)&src
[6 * 32 + offset
]);
939 T01
= _mm_loadu_si128((const __m128i
*)&src
[6 * 32 + offset
+ 4]);
940 in06
[i
] = _mm_packs_epi32(T00
, T01
);
942 T00
= _mm_loadu_si128((const __m128i
*)&src
[7 * 32 + offset
]);
943 T01
= _mm_loadu_si128((const __m128i
*)&src
[7 * 32 + offset
+ 4]);
944 in07
[i
] = _mm_packs_epi32(T00
, T01
);
946 T00
= _mm_loadu_si128((const __m128i
*)&src
[8 * 32 + offset
]);
947 T01
= _mm_loadu_si128((const __m128i
*)&src
[8 * 32 + offset
+ 4]);
948 in08
[i
] = _mm_packs_epi32(T00
, T01
);
950 T00
= _mm_loadu_si128((const __m128i
*)&src
[9 * 32 + offset
]);
951 T01
= _mm_loadu_si128((const __m128i
*)&src
[9 * 32 + offset
+ 4]);
952 in09
[i
] = _mm_packs_epi32(T00
, T01
);
954 T00
= _mm_loadu_si128((const __m128i
*)&src
[10 * 32 + offset
]);
955 T01
= _mm_loadu_si128((const __m128i
*)&src
[10 * 32 + offset
+ 4]);
956 in10
[i
] = _mm_packs_epi32(T00
, T01
);
958 T00
= _mm_loadu_si128((const __m128i
*)&src
[11 * 32 + offset
]);
959 T01
= _mm_loadu_si128((const __m128i
*)&src
[11 * 32 + offset
+ 4]);
960 in11
[i
] = _mm_packs_epi32(T00
, T01
);
962 T00
= _mm_loadu_si128((const __m128i
*)&src
[12 * 32 + offset
]);
963 T01
= _mm_loadu_si128((const __m128i
*)&src
[12 * 32 + offset
+ 4]);
964 in12
[i
] = _mm_packs_epi32(T00
, T01
);
966 T00
= _mm_loadu_si128((const __m128i
*)&src
[13 * 32 + offset
]);
967 T01
= _mm_loadu_si128((const __m128i
*)&src
[13 * 32 + offset
+ 4]);
968 in13
[i
] = _mm_packs_epi32(T00
, T01
);
970 T00
= _mm_loadu_si128((const __m128i
*)&src
[14 * 32 + offset
]);
971 T01
= _mm_loadu_si128((const __m128i
*)&src
[14 * 32 + offset
+ 4]);
972 in14
[i
] = _mm_packs_epi32(T00
, T01
);
974 T00
= _mm_loadu_si128((const __m128i
*)&src
[15 * 32 + offset
]);
975 T01
= _mm_loadu_si128((const __m128i
*)&src
[15 * 32 + offset
+ 4]);
976 in15
[i
] = _mm_packs_epi32(T00
, T01
);
978 T00
= _mm_loadu_si128((const __m128i
*)&src
[16 * 32 + offset
]);
979 T01
= _mm_loadu_si128((const __m128i
*)&src
[16 * 32 + offset
+ 4]);
980 in16
[i
] = _mm_packs_epi32(T00
, T01
);
982 T00
= _mm_loadu_si128((const __m128i
*)&src
[17 * 32 + offset
]);
983 T01
= _mm_loadu_si128((const __m128i
*)&src
[17 * 32 + offset
+ 4]);
984 in17
[i
] = _mm_packs_epi32(T00
, T01
);
986 T00
= _mm_loadu_si128((const __m128i
*)&src
[18 * 32 + offset
]);
987 T01
= _mm_loadu_si128((const __m128i
*)&src
[18 * 32 + offset
+ 4]);
988 in18
[i
] = _mm_packs_epi32(T00
, T01
);
990 T00
= _mm_loadu_si128((const __m128i
*)&src
[19 * 32 + offset
]);
991 T01
= _mm_loadu_si128((const __m128i
*)&src
[19 * 32 + offset
+ 4]);
992 in19
[i
] = _mm_packs_epi32(T00
, T01
);
994 T00
= _mm_loadu_si128((const __m128i
*)&src
[20 * 32 + offset
]);
995 T01
= _mm_loadu_si128((const __m128i
*)&src
[20 * 32 + offset
+ 4]);
996 in20
[i
] = _mm_packs_epi32(T00
, T01
);
998 T00
= _mm_loadu_si128((const __m128i
*)&src
[21 * 32 + offset
]);
999 T01
= _mm_loadu_si128((const __m128i
*)&src
[21 * 32 + offset
+ 4]);
1000 in21
[i
] = _mm_packs_epi32(T00
, T01
);
1002 T00
= _mm_loadu_si128((const __m128i
*)&src
[22 * 32 + offset
]);
1003 T01
= _mm_loadu_si128((const __m128i
*)&src
[22 * 32 + offset
+ 4]);
1004 in22
[i
] = _mm_packs_epi32(T00
, T01
);
1006 T00
= _mm_loadu_si128((const __m128i
*)&src
[23 * 32 + offset
]);
1007 T01
= _mm_loadu_si128((const __m128i
*)&src
[23 * 32 + offset
+ 4]);
1008 in23
[i
] = _mm_packs_epi32(T00
, T01
);
1010 T00
= _mm_loadu_si128((const __m128i
*)&src
[24 * 32 + offset
]);
1011 T01
= _mm_loadu_si128((const __m128i
*)&src
[24 * 32 + offset
+ 4]);
1012 in24
[i
] = _mm_packs_epi32(T00
, T01
);
1014 T00
= _mm_loadu_si128((const __m128i
*)&src
[25 * 32 + offset
]);
1015 T01
= _mm_loadu_si128((const __m128i
*)&src
[25 * 32 + offset
+ 4]);
1016 in25
[i
] = _mm_packs_epi32(T00
, T01
);
1018 T00
= _mm_loadu_si128((const __m128i
*)&src
[26 * 32 + offset
]);
1019 T01
= _mm_loadu_si128((const __m128i
*)&src
[26 * 32 + offset
+ 4]);
1020 in26
[i
] = _mm_packs_epi32(T00
, T01
);
1022 T00
= _mm_loadu_si128((const __m128i
*)&src
[27 * 32 + offset
]);
1023 T01
= _mm_loadu_si128((const __m128i
*)&src
[27 * 32 + offset
+ 4]);
1024 in27
[i
] = _mm_packs_epi32(T00
, T01
);
1026 T00
= _mm_loadu_si128((const __m128i
*)&src
[28 * 32 + offset
]);
1027 T01
= _mm_loadu_si128((const __m128i
*)&src
[28 * 32 + offset
+ 4]);
1028 in28
[i
] = _mm_packs_epi32(T00
, T01
);
1030 T00
= _mm_loadu_si128((const __m128i
*)&src
[29 * 32 + offset
]);
1031 T01
= _mm_loadu_si128((const __m128i
*)&src
[29 * 32 + offset
+ 4]);
1032 in29
[i
] = _mm_packs_epi32(T00
, T01
);
1034 T00
= _mm_loadu_si128((const __m128i
*)&src
[30 * 32 + offset
]);
1035 T01
= _mm_loadu_si128((const __m128i
*)&src
[30 * 32 + offset
+ 4]);
1036 in30
[i
] = _mm_packs_epi32(T00
, T01
);
1038 T00
= _mm_loadu_si128((const __m128i
*)&src
[31 * 32 + offset
]);
1039 T01
= _mm_loadu_si128((const __m128i
*)&src
[31 * 32 + offset
+ 4]);
1040 in31
[i
] = _mm_packs_epi32(T00
, T01
);
1043 for (int pass
= 0; pass
< 2; pass
++)
1047 c32_rnd
= _mm_set1_epi32(2048);
1051 for (int part
= 0; part
< 4; part
++)
1053 const __m128i T_00_00A
= _mm_unpacklo_epi16(in01
[part
], in03
[part
]); // [33 13 32 12 31 11 30 10]
1054 const __m128i T_00_00B
= _mm_unpackhi_epi16(in01
[part
], in03
[part
]); // [37 17 36 16 35 15 34 14]
1055 const __m128i T_00_01A
= _mm_unpacklo_epi16(in05
[part
], in07
[part
]); // [ ]
1056 const __m128i T_00_01B
= _mm_unpackhi_epi16(in05
[part
], in07
[part
]); // [ ]
1057 const __m128i T_00_02A
= _mm_unpacklo_epi16(in09
[part
], in11
[part
]); // [ ]
1058 const __m128i T_00_02B
= _mm_unpackhi_epi16(in09
[part
], in11
[part
]); // [ ]
1059 const __m128i T_00_03A
= _mm_unpacklo_epi16(in13
[part
], in15
[part
]); // [ ]
1060 const __m128i T_00_03B
= _mm_unpackhi_epi16(in13
[part
], in15
[part
]); // [ ]
1061 const __m128i T_00_04A
= _mm_unpacklo_epi16(in17
[part
], in19
[part
]); // [ ]
1062 const __m128i T_00_04B
= _mm_unpackhi_epi16(in17
[part
], in19
[part
]); // [ ]
1063 const __m128i T_00_05A
= _mm_unpacklo_epi16(in21
[part
], in23
[part
]); // [ ]
1064 const __m128i T_00_05B
= _mm_unpackhi_epi16(in21
[part
], in23
[part
]); // [ ]
1065 const __m128i T_00_06A
= _mm_unpacklo_epi16(in25
[part
], in27
[part
]); // [ ]
1066 const __m128i T_00_06B
= _mm_unpackhi_epi16(in25
[part
], in27
[part
]); // [ ]
1067 const __m128i T_00_07A
= _mm_unpacklo_epi16(in29
[part
], in31
[part
]); //
1068 const __m128i T_00_07B
= _mm_unpackhi_epi16(in29
[part
], in31
[part
]); // [ ]
1070 const __m128i T_00_08A
= _mm_unpacklo_epi16(in02
[part
], in06
[part
]); // [ ]
1071 const __m128i T_00_08B
= _mm_unpackhi_epi16(in02
[part
], in06
[part
]); // [ ]
1072 const __m128i T_00_09A
= _mm_unpacklo_epi16(in10
[part
], in14
[part
]); // [ ]
1073 const __m128i T_00_09B
= _mm_unpackhi_epi16(in10
[part
], in14
[part
]); // [ ]
1074 const __m128i T_00_10A
= _mm_unpacklo_epi16(in18
[part
], in22
[part
]); // [ ]
1075 const __m128i T_00_10B
= _mm_unpackhi_epi16(in18
[part
], in22
[part
]); // [ ]
1076 const __m128i T_00_11A
= _mm_unpacklo_epi16(in26
[part
], in30
[part
]); // [ ]
1077 const __m128i T_00_11B
= _mm_unpackhi_epi16(in26
[part
], in30
[part
]); // [ ]
1079 const __m128i T_00_12A
= _mm_unpacklo_epi16(in04
[part
], in12
[part
]); // [ ]
1080 const __m128i T_00_12B
= _mm_unpackhi_epi16(in04
[part
], in12
[part
]); // [ ]
1081 const __m128i T_00_13A
= _mm_unpacklo_epi16(in20
[part
], in28
[part
]); // [ ]
1082 const __m128i T_00_13B
= _mm_unpackhi_epi16(in20
[part
], in28
[part
]); // [ ]
1084 const __m128i T_00_14A
= _mm_unpacklo_epi16(in08
[part
], in24
[part
]); //
1085 const __m128i T_00_14B
= _mm_unpackhi_epi16(in08
[part
], in24
[part
]); // [ ]
1086 const __m128i T_00_15A
= _mm_unpacklo_epi16(in00
[part
], in16
[part
]); //
1087 const __m128i T_00_15B
= _mm_unpackhi_epi16(in00
[part
], in16
[part
]); // [ ]
1089 __m128i O00A
, O01A
, O02A
, O03A
, O04A
, O05A
, O06A
, O07A
, O08A
, O09A
, O10A
, O11A
, O12A
, O13A
, O14A
, O15A
;
1090 __m128i O00B
, O01B
, O02B
, O03B
, O04B
, O05B
, O06B
, O07B
, O08B
, O09B
, O10B
, O11B
, O12B
, O13B
, O14B
, O15B
;
1092 __m128i T00
, T01
, T02
, T03
;
1093 #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \
1094 T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \
1095 T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \
1096 T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \
1097 T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \
1098 row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03));
1100 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1101 c16_p90_p90
, c16_p85_p88
, c16_p78_p82
, c16_p67_p73
, c16_p54_p61
, c16_p38_p46
, c16_p22_p31
, c16_p04_p13
, O00A
)
1102 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1103 c16_p82_p90
, c16_p46_p67
, c16_n04_p22
, c16_n54_n31
, c16_n85_n73
, c16_n88_n90
, c16_n61_n78
, c16_n13_n38
, O01A
)
1104 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1105 c16_p67_p88
, c16_n13_p31
, c16_n82_n54
, c16_n78_n90
, c16_n04_n46
, c16_p73_p38
, c16_p85_p90
, c16_p22_p61
, O02A
)
1106 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1107 c16_p46_p85
, c16_n67_n13
, c16_n73_n90
, c16_p38_n22
, c16_p88_p82
, c16_n04_p54
, c16_n90_n61
, c16_n31_n78
, O03A
)
1108 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1109 c16_p22_p82
, c16_n90_n54
, c16_p13_n61
, c16_p85_p78
, c16_n46_p31
, c16_n67_n90
, c16_p73_p04
, c16_p38_p88
, O04A
)
1110 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1111 c16_n04_p78
, c16_n73_n82
, c16_p85_p13
, c16_n22_p67
, c16_n61_n88
, c16_p90_p31
, c16_n38_p54
, c16_n46_n90
, O05A
)
1112 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1113 c16_n31_p73
, c16_n22_n90
, c16_p67_p78
, c16_n90_n38
, c16_p82_n13
, c16_n46_p61
, c16_n04_n88
, c16_p54_p85
, O06A
)
1114 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1115 c16_n54_p67
, c16_p38_n78
, c16_n22_p85
, c16_p04_n90
, c16_p13_p90
, c16_n31_n88
, c16_p46_p82
, c16_n61_n73
, O07A
)
1116 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1117 c16_n73_p61
, c16_p82_n46
, c16_n88_p31
, c16_p90_n13
, c16_n90_n04
, c16_p85_p22
, c16_n78_n38
, c16_p67_p54
, O08A
)
1118 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1119 c16_n85_p54
, c16_p88_n04
, c16_n61_n46
, c16_p13_p82
, c16_p38_n90
, c16_n78_p67
, c16_p90_n22
, c16_n73_n31
, O09A
)
1120 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1121 c16_n90_p46
, c16_p54_p38
, c16_p31_n90
, c16_n88_p61
, c16_p67_p22
, c16_p13_n85
, c16_n82_p73
, c16_p78_p04
, O10A
)
1122 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1123 c16_n88_p38
, c16_n04_p73
, c16_p90_n67
, c16_n31_n46
, c16_n78_p85
, c16_p61_p13
, c16_p54_n90
, c16_n82_p22
, O11A
)
1124 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1125 c16_n78_p31
, c16_n61_p90
, c16_p54_p04
, c16_p82_n88
, c16_n22_n38
, c16_n90_p73
, c16_n13_p67
, c16_p85_n46
, O12A
)
1126 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1127 c16_n61_p22
, c16_n90_p85
, c16_n38_p73
, c16_p46_n04
, c16_p90_n78
, c16_p54_n82
, c16_n31_n13
, c16_n88_p67
, O13A
)
1128 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1129 c16_n38_p13
, c16_n78_p61
, c16_n90_p88
, c16_n73_p85
, c16_n31_p54
, c16_p22_p04
, c16_p67_n46
, c16_p90_n82
, O14A
)
1130 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
1131 c16_n13_p04
, c16_n31_p22
, c16_n46_p38
, c16_n61_p54
, c16_n73_p67
, c16_n82_p78
, c16_n88_p85
, c16_n90_p90
, O15A
)
1133 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1134 c16_p90_p90
, c16_p85_p88
, c16_p78_p82
, c16_p67_p73
, c16_p54_p61
, c16_p38_p46
, c16_p22_p31
, c16_p04_p13
, O00B
)
1135 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1136 c16_p82_p90
, c16_p46_p67
, c16_n04_p22
, c16_n54_n31
, c16_n85_n73
, c16_n88_n90
, c16_n61_n78
, c16_n13_n38
, O01B
)
1137 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1138 c16_p67_p88
, c16_n13_p31
, c16_n82_n54
, c16_n78_n90
, c16_n04_n46
, c16_p73_p38
, c16_p85_p90
, c16_p22_p61
, O02B
)
1139 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1140 c16_p46_p85
, c16_n67_n13
, c16_n73_n90
, c16_p38_n22
, c16_p88_p82
, c16_n04_p54
, c16_n90_n61
, c16_n31_n78
, O03B
)
1141 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1142 c16_p22_p82
, c16_n90_n54
, c16_p13_n61
, c16_p85_p78
, c16_n46_p31
, c16_n67_n90
, c16_p73_p04
, c16_p38_p88
, O04B
)
1143 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1144 c16_n04_p78
, c16_n73_n82
, c16_p85_p13
, c16_n22_p67
, c16_n61_n88
, c16_p90_p31
, c16_n38_p54
, c16_n46_n90
, O05B
)
1145 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1146 c16_n31_p73
, c16_n22_n90
, c16_p67_p78
, c16_n90_n38
, c16_p82_n13
, c16_n46_p61
, c16_n04_n88
, c16_p54_p85
, O06B
)
1147 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1148 c16_n54_p67
, c16_p38_n78
, c16_n22_p85
, c16_p04_n90
, c16_p13_p90
, c16_n31_n88
, c16_p46_p82
, c16_n61_n73
, O07B
)
1149 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1150 c16_n73_p61
, c16_p82_n46
, c16_n88_p31
, c16_p90_n13
, c16_n90_n04
, c16_p85_p22
, c16_n78_n38
, c16_p67_p54
, O08B
)
1151 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1152 c16_n85_p54
, c16_p88_n04
, c16_n61_n46
, c16_p13_p82
, c16_p38_n90
, c16_n78_p67
, c16_p90_n22
, c16_n73_n31
, O09B
)
1153 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1154 c16_n90_p46
, c16_p54_p38
, c16_p31_n90
, c16_n88_p61
, c16_p67_p22
, c16_p13_n85
, c16_n82_p73
, c16_p78_p04
, O10B
)
1155 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1156 c16_n88_p38
, c16_n04_p73
, c16_p90_n67
, c16_n31_n46
, c16_n78_p85
, c16_p61_p13
, c16_p54_n90
, c16_n82_p22
, O11B
)
1157 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1158 c16_n78_p31
, c16_n61_p90
, c16_p54_p04
, c16_p82_n88
, c16_n22_n38
, c16_n90_p73
, c16_n13_p67
, c16_p85_n46
, O12B
)
1159 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1160 c16_n61_p22
, c16_n90_p85
, c16_n38_p73
, c16_p46_n04
, c16_p90_n78
, c16_p54_n82
, c16_n31_n13
, c16_n88_p67
, O13B
)
1161 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1162 c16_n38_p13
, c16_n78_p61
, c16_n90_p88
, c16_n73_p85
, c16_n31_p54
, c16_p22_p04
, c16_p67_n46
, c16_p90_n82
, O14B
)
1163 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1164 c16_n13_p04
, c16_n31_p22
, c16_n46_p38
, c16_n61_p54
, c16_n73_p67
, c16_n82_p78
, c16_n88_p85
, c16_n90_p90
, O15B
)
1169 __m128i EO0A
, EO1A
, EO2A
, EO3A
, EO4A
, EO5A
, EO6A
, EO7A
;
1170 __m128i EO0B
, EO1B
, EO2B
, EO3B
, EO4B
, EO5B
, EO6B
, EO7B
;
/* COMPUTE_ROW (EO stage): one dot-product row of the even-odd butterfly.
 * Each _mm_madd_epi16 multiplies four packed 16-bit sample pairs by four
 * 16-bit coefficient pairs and horizontally adds into 32-bit lanes; the
 * two partial sums are then accumulated into 'row'.
 * NOTE(review): uses T00/T01 temporaries declared by the enclosing
 * function, and redefines the earlier 16-coefficient COMPUTE_ROW used for
 * the O-stage above — confirm an #undef precedes this in the full file. */
1173 #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \
1174 T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \
1175 T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \
1176 row = _mm_add_epi32(T00, T01);
1178 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, EO0A
)
1179 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, EO1A
)
1180 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, EO2A
)
1181 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, EO3A
)
1182 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, EO4A
)
1183 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, EO5A
)
1184 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, EO6A
)
1185 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, EO7A
)
1187 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, EO0B
)
1188 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, EO1B
)
1189 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, EO2B
)
1190 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, EO3B
)
1191 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, EO4B
)
1192 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, EO5B
)
1193 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, EO6B
)
1194 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, EO7B
)
1198 const __m128i EEO0A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_p75_p89
), _mm_madd_epi16(T_00_13A
, c16_p18_p50
)); // EEO0
1199 const __m128i EEO0B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_p75_p89
), _mm_madd_epi16(T_00_13B
, c16_p18_p50
));
1200 const __m128i EEO1A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_n18_p75
), _mm_madd_epi16(T_00_13A
, c16_n50_n89
)); // EEO1
1201 const __m128i EEO1B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_n18_p75
), _mm_madd_epi16(T_00_13B
, c16_n50_n89
));
1202 const __m128i EEO2A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_n89_p50
), _mm_madd_epi16(T_00_13A
, c16_p75_p18
)); // EEO2
1203 const __m128i EEO2B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_n89_p50
), _mm_madd_epi16(T_00_13B
, c16_p75_p18
));
1204 const __m128i EEO3A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_n50_p18
), _mm_madd_epi16(T_00_13A
, c16_n89_p75
)); // EEO3
1205 const __m128i EEO3B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_n50_p18
), _mm_madd_epi16(T_00_13B
, c16_n89_p75
));
1207 const __m128i EEEO0A
= _mm_madd_epi16(T_00_14A
, c16_p36_p83
);
1208 const __m128i EEEO0B
= _mm_madd_epi16(T_00_14B
, c16_p36_p83
);
1209 const __m128i EEEO1A
= _mm_madd_epi16(T_00_14A
, c16_n83_p36
);
1210 const __m128i EEEO1B
= _mm_madd_epi16(T_00_14B
, c16_n83_p36
);
1212 const __m128i EEEE0A
= _mm_madd_epi16(T_00_15A
, c16_p64_p64
);
1213 const __m128i EEEE0B
= _mm_madd_epi16(T_00_15B
, c16_p64_p64
);
1214 const __m128i EEEE1A
= _mm_madd_epi16(T_00_15A
, c16_n64_p64
);
1215 const __m128i EEEE1B
= _mm_madd_epi16(T_00_15B
, c16_n64_p64
);
1217 const __m128i EEE0A
= _mm_add_epi32(EEEE0A
, EEEO0A
); // EEE0 = EEEE0 + EEEO0
1218 const __m128i EEE0B
= _mm_add_epi32(EEEE0B
, EEEO0B
);
1219 const __m128i EEE1A
= _mm_add_epi32(EEEE1A
, EEEO1A
); // EEE1 = EEEE1 + EEEO1
1220 const __m128i EEE1B
= _mm_add_epi32(EEEE1B
, EEEO1B
);
1221 const __m128i EEE3A
= _mm_sub_epi32(EEEE0A
, EEEO0A
); // EEE2 = EEEE0 - EEEO0
1222 const __m128i EEE3B
= _mm_sub_epi32(EEEE0B
, EEEO0B
);
1223 const __m128i EEE2A
= _mm_sub_epi32(EEEE1A
, EEEO1A
); // EEE3 = EEEE1 - EEEO1
1224 const __m128i EEE2B
= _mm_sub_epi32(EEEE1B
, EEEO1B
);
1226 const __m128i EE0A
= _mm_add_epi32(EEE0A
, EEO0A
); // EE0 = EEE0 + EEO0
1227 const __m128i EE0B
= _mm_add_epi32(EEE0B
, EEO0B
);
1228 const __m128i EE1A
= _mm_add_epi32(EEE1A
, EEO1A
); // EE1 = EEE1 + EEO1
1229 const __m128i EE1B
= _mm_add_epi32(EEE1B
, EEO1B
);
1230 const __m128i EE2A
= _mm_add_epi32(EEE2A
, EEO2A
); // EE2 = EEE0 + EEO0
1231 const __m128i EE2B
= _mm_add_epi32(EEE2B
, EEO2B
);
1232 const __m128i EE3A
= _mm_add_epi32(EEE3A
, EEO3A
); // EE3 = EEE1 + EEO1
1233 const __m128i EE3B
= _mm_add_epi32(EEE3B
, EEO3B
);
1234 const __m128i EE7A
= _mm_sub_epi32(EEE0A
, EEO0A
); // EE7 = EEE0 - EEO0
1235 const __m128i EE7B
= _mm_sub_epi32(EEE0B
, EEO0B
);
1236 const __m128i EE6A
= _mm_sub_epi32(EEE1A
, EEO1A
); // EE6 = EEE1 - EEO1
1237 const __m128i EE6B
= _mm_sub_epi32(EEE1B
, EEO1B
);
1238 const __m128i EE5A
= _mm_sub_epi32(EEE2A
, EEO2A
); // EE5 = EEE0 - EEO0
1239 const __m128i EE5B
= _mm_sub_epi32(EEE2B
, EEO2B
);
1240 const __m128i EE4A
= _mm_sub_epi32(EEE3A
, EEO3A
); // EE4 = EEE1 - EEO1
1241 const __m128i EE4B
= _mm_sub_epi32(EEE3B
, EEO3B
);
1243 const __m128i E0A
= _mm_add_epi32(EE0A
, EO0A
); // E0 = EE0 + EO0
1244 const __m128i E0B
= _mm_add_epi32(EE0B
, EO0B
);
1245 const __m128i E1A
= _mm_add_epi32(EE1A
, EO1A
); // E1 = EE1 + EO1
1246 const __m128i E1B
= _mm_add_epi32(EE1B
, EO1B
);
1247 const __m128i E2A
= _mm_add_epi32(EE2A
, EO2A
); // E2 = EE2 + EO2
1248 const __m128i E2B
= _mm_add_epi32(EE2B
, EO2B
);
1249 const __m128i E3A
= _mm_add_epi32(EE3A
, EO3A
); // E3 = EE3 + EO3
1250 const __m128i E3B
= _mm_add_epi32(EE3B
, EO3B
);
1251 const __m128i E4A
= _mm_add_epi32(EE4A
, EO4A
); // E4 =
1252 const __m128i E4B
= _mm_add_epi32(EE4B
, EO4B
);
1253 const __m128i E5A
= _mm_add_epi32(EE5A
, EO5A
); // E5 =
1254 const __m128i E5B
= _mm_add_epi32(EE5B
, EO5B
);
1255 const __m128i E6A
= _mm_add_epi32(EE6A
, EO6A
); // E6 =
1256 const __m128i E6B
= _mm_add_epi32(EE6B
, EO6B
);
1257 const __m128i E7A
= _mm_add_epi32(EE7A
, EO7A
); // E7 =
1258 const __m128i E7B
= _mm_add_epi32(EE7B
, EO7B
);
1259 const __m128i EFA
= _mm_sub_epi32(EE0A
, EO0A
); // EF = EE0 - EO0
1260 const __m128i EFB
= _mm_sub_epi32(EE0B
, EO0B
);
1261 const __m128i EEA
= _mm_sub_epi32(EE1A
, EO1A
); // EE = EE1 - EO1
1262 const __m128i EEB
= _mm_sub_epi32(EE1B
, EO1B
);
1263 const __m128i EDA
= _mm_sub_epi32(EE2A
, EO2A
); // ED = EE2 - EO2
1264 const __m128i EDB
= _mm_sub_epi32(EE2B
, EO2B
);
1265 const __m128i ECA
= _mm_sub_epi32(EE3A
, EO3A
); // EC = EE3 - EO3
1266 const __m128i ECB
= _mm_sub_epi32(EE3B
, EO3B
);
1267 const __m128i EBA
= _mm_sub_epi32(EE4A
, EO4A
); // EB =
1268 const __m128i EBB
= _mm_sub_epi32(EE4B
, EO4B
);
1269 const __m128i EAA
= _mm_sub_epi32(EE5A
, EO5A
); // EA =
1270 const __m128i EAB
= _mm_sub_epi32(EE5B
, EO5B
);
1271 const __m128i E9A
= _mm_sub_epi32(EE6A
, EO6A
); // E9 =
1272 const __m128i E9B
= _mm_sub_epi32(EE6B
, EO6B
);
1273 const __m128i E8A
= _mm_sub_epi32(EE7A
, EO7A
); // E8 =
1274 const __m128i E8B
= _mm_sub_epi32(EE7B
, EO7B
);
1276 const __m128i T10A
= _mm_add_epi32(E0A
, c32_rnd
); // E0 + rnd
1277 const __m128i T10B
= _mm_add_epi32(E0B
, c32_rnd
);
1278 const __m128i T11A
= _mm_add_epi32(E1A
, c32_rnd
); // E1 + rnd
1279 const __m128i T11B
= _mm_add_epi32(E1B
, c32_rnd
);
1280 const __m128i T12A
= _mm_add_epi32(E2A
, c32_rnd
); // E2 + rnd
1281 const __m128i T12B
= _mm_add_epi32(E2B
, c32_rnd
);
1282 const __m128i T13A
= _mm_add_epi32(E3A
, c32_rnd
); // E3 + rnd
1283 const __m128i T13B
= _mm_add_epi32(E3B
, c32_rnd
);
1284 const __m128i T14A
= _mm_add_epi32(E4A
, c32_rnd
); // E4 + rnd
1285 const __m128i T14B
= _mm_add_epi32(E4B
, c32_rnd
);
1286 const __m128i T15A
= _mm_add_epi32(E5A
, c32_rnd
); // E5 + rnd
1287 const __m128i T15B
= _mm_add_epi32(E5B
, c32_rnd
);
1288 const __m128i T16A
= _mm_add_epi32(E6A
, c32_rnd
); // E6 + rnd
1289 const __m128i T16B
= _mm_add_epi32(E6B
, c32_rnd
);
1290 const __m128i T17A
= _mm_add_epi32(E7A
, c32_rnd
); // E7 + rnd
1291 const __m128i T17B
= _mm_add_epi32(E7B
, c32_rnd
);
1292 const __m128i T18A
= _mm_add_epi32(E8A
, c32_rnd
); // E8 + rnd
1293 const __m128i T18B
= _mm_add_epi32(E8B
, c32_rnd
);
1294 const __m128i T19A
= _mm_add_epi32(E9A
, c32_rnd
); // E9 + rnd
1295 const __m128i T19B
= _mm_add_epi32(E9B
, c32_rnd
);
1296 const __m128i T1AA
= _mm_add_epi32(EAA
, c32_rnd
); // E10 + rnd
1297 const __m128i T1AB
= _mm_add_epi32(EAB
, c32_rnd
);
1298 const __m128i T1BA
= _mm_add_epi32(EBA
, c32_rnd
); // E11 + rnd
1299 const __m128i T1BB
= _mm_add_epi32(EBB
, c32_rnd
);
1300 const __m128i T1CA
= _mm_add_epi32(ECA
, c32_rnd
); // E12 + rnd
1301 const __m128i T1CB
= _mm_add_epi32(ECB
, c32_rnd
);
1302 const __m128i T1DA
= _mm_add_epi32(EDA
, c32_rnd
); // E13 + rnd
1303 const __m128i T1DB
= _mm_add_epi32(EDB
, c32_rnd
);
1304 const __m128i T1EA
= _mm_add_epi32(EEA
, c32_rnd
); // E14 + rnd
1305 const __m128i T1EB
= _mm_add_epi32(EEB
, c32_rnd
);
1306 const __m128i T1FA
= _mm_add_epi32(EFA
, c32_rnd
); // E15 + rnd
1307 const __m128i T1FB
= _mm_add_epi32(EFB
, c32_rnd
);
1309 const __m128i T2_00A
= _mm_add_epi32(T10A
, O00A
); // E0 + O0 + rnd
1310 const __m128i T2_00B
= _mm_add_epi32(T10B
, O00B
);
1311 const __m128i T2_01A
= _mm_add_epi32(T11A
, O01A
); // E1 + O1 + rnd
1312 const __m128i T2_01B
= _mm_add_epi32(T11B
, O01B
);
1313 const __m128i T2_02A
= _mm_add_epi32(T12A
, O02A
); // E2 + O2 + rnd
1314 const __m128i T2_02B
= _mm_add_epi32(T12B
, O02B
);
1315 const __m128i T2_03A
= _mm_add_epi32(T13A
, O03A
); // E3 + O3 + rnd
1316 const __m128i T2_03B
= _mm_add_epi32(T13B
, O03B
);
1317 const __m128i T2_04A
= _mm_add_epi32(T14A
, O04A
); // E4
1318 const __m128i T2_04B
= _mm_add_epi32(T14B
, O04B
);
1319 const __m128i T2_05A
= _mm_add_epi32(T15A
, O05A
); // E5
1320 const __m128i T2_05B
= _mm_add_epi32(T15B
, O05B
);
1321 const __m128i T2_06A
= _mm_add_epi32(T16A
, O06A
); // E6
1322 const __m128i T2_06B
= _mm_add_epi32(T16B
, O06B
);
1323 const __m128i T2_07A
= _mm_add_epi32(T17A
, O07A
); // E7
1324 const __m128i T2_07B
= _mm_add_epi32(T17B
, O07B
);
1325 const __m128i T2_08A
= _mm_add_epi32(T18A
, O08A
); // E8
1326 const __m128i T2_08B
= _mm_add_epi32(T18B
, O08B
);
1327 const __m128i T2_09A
= _mm_add_epi32(T19A
, O09A
); // E9
1328 const __m128i T2_09B
= _mm_add_epi32(T19B
, O09B
);
1329 const __m128i T2_10A
= _mm_add_epi32(T1AA
, O10A
); // E10
1330 const __m128i T2_10B
= _mm_add_epi32(T1AB
, O10B
);
1331 const __m128i T2_11A
= _mm_add_epi32(T1BA
, O11A
); // E11
1332 const __m128i T2_11B
= _mm_add_epi32(T1BB
, O11B
);
1333 const __m128i T2_12A
= _mm_add_epi32(T1CA
, O12A
); // E12
1334 const __m128i T2_12B
= _mm_add_epi32(T1CB
, O12B
);
1335 const __m128i T2_13A
= _mm_add_epi32(T1DA
, O13A
); // E13
1336 const __m128i T2_13B
= _mm_add_epi32(T1DB
, O13B
);
1337 const __m128i T2_14A
= _mm_add_epi32(T1EA
, O14A
); // E14
1338 const __m128i T2_14B
= _mm_add_epi32(T1EB
, O14B
);
1339 const __m128i T2_15A
= _mm_add_epi32(T1FA
, O15A
); // E15
1340 const __m128i T2_15B
= _mm_add_epi32(T1FB
, O15B
);
1341 const __m128i T2_31A
= _mm_sub_epi32(T10A
, O00A
); // E0 - O0 + rnd
1342 const __m128i T2_31B
= _mm_sub_epi32(T10B
, O00B
);
1343 const __m128i T2_30A
= _mm_sub_epi32(T11A
, O01A
); // E1 - O1 + rnd
1344 const __m128i T2_30B
= _mm_sub_epi32(T11B
, O01B
);
1345 const __m128i T2_29A
= _mm_sub_epi32(T12A
, O02A
); // E2 - O2 + rnd
1346 const __m128i T2_29B
= _mm_sub_epi32(T12B
, O02B
);
1347 const __m128i T2_28A
= _mm_sub_epi32(T13A
, O03A
); // E3 - O3 + rnd
1348 const __m128i T2_28B
= _mm_sub_epi32(T13B
, O03B
);
1349 const __m128i T2_27A
= _mm_sub_epi32(T14A
, O04A
); // E4
1350 const __m128i T2_27B
= _mm_sub_epi32(T14B
, O04B
);
1351 const __m128i T2_26A
= _mm_sub_epi32(T15A
, O05A
); // E5
1352 const __m128i T2_26B
= _mm_sub_epi32(T15B
, O05B
);
1353 const __m128i T2_25A
= _mm_sub_epi32(T16A
, O06A
); // E6
1354 const __m128i T2_25B
= _mm_sub_epi32(T16B
, O06B
);
1355 const __m128i T2_24A
= _mm_sub_epi32(T17A
, O07A
); // E7
1356 const __m128i T2_24B
= _mm_sub_epi32(T17B
, O07B
);
1357 const __m128i T2_23A
= _mm_sub_epi32(T18A
, O08A
); //
1358 const __m128i T2_23B
= _mm_sub_epi32(T18B
, O08B
);
1359 const __m128i T2_22A
= _mm_sub_epi32(T19A
, O09A
); //
1360 const __m128i T2_22B
= _mm_sub_epi32(T19B
, O09B
);
1361 const __m128i T2_21A
= _mm_sub_epi32(T1AA
, O10A
); //
1362 const __m128i T2_21B
= _mm_sub_epi32(T1AB
, O10B
);
1363 const __m128i T2_20A
= _mm_sub_epi32(T1BA
, O11A
); //
1364 const __m128i T2_20B
= _mm_sub_epi32(T1BB
, O11B
);
1365 const __m128i T2_19A
= _mm_sub_epi32(T1CA
, O12A
); //
1366 const __m128i T2_19B
= _mm_sub_epi32(T1CB
, O12B
);
1367 const __m128i T2_18A
= _mm_sub_epi32(T1DA
, O13A
); //
1368 const __m128i T2_18B
= _mm_sub_epi32(T1DB
, O13B
);
1369 const __m128i T2_17A
= _mm_sub_epi32(T1EA
, O14A
); //
1370 const __m128i T2_17B
= _mm_sub_epi32(T1EB
, O14B
);
1371 const __m128i T2_16A
= _mm_sub_epi32(T1FA
, O15A
); //
1372 const __m128i T2_16B
= _mm_sub_epi32(T1FB
, O15B
);
1374 const __m128i T3_00A
= _mm_srai_epi32(T2_00A
, nShift
); // [30 20 10 00]
1375 const __m128i T3_00B
= _mm_srai_epi32(T2_00B
, nShift
); // [70 60 50 40]
1376 const __m128i T3_01A
= _mm_srai_epi32(T2_01A
, nShift
); // [31 21 11 01]
1377 const __m128i T3_01B
= _mm_srai_epi32(T2_01B
, nShift
); // [71 61 51 41]
1378 const __m128i T3_02A
= _mm_srai_epi32(T2_02A
, nShift
); // [32 22 12 02]
1379 const __m128i T3_02B
= _mm_srai_epi32(T2_02B
, nShift
); // [72 62 52 42]
1380 const __m128i T3_03A
= _mm_srai_epi32(T2_03A
, nShift
); // [33 23 13 03]
1381 const __m128i T3_03B
= _mm_srai_epi32(T2_03B
, nShift
); // [73 63 53 43]
1382 const __m128i T3_04A
= _mm_srai_epi32(T2_04A
, nShift
); // [33 24 14 04]
1383 const __m128i T3_04B
= _mm_srai_epi32(T2_04B
, nShift
); // [74 64 54 44]
1384 const __m128i T3_05A
= _mm_srai_epi32(T2_05A
, nShift
); // [35 25 15 05]
1385 const __m128i T3_05B
= _mm_srai_epi32(T2_05B
, nShift
); // [75 65 55 45]
1386 const __m128i T3_06A
= _mm_srai_epi32(T2_06A
, nShift
); // [36 26 16 06]
1387 const __m128i T3_06B
= _mm_srai_epi32(T2_06B
, nShift
); // [76 66 56 46]
1388 const __m128i T3_07A
= _mm_srai_epi32(T2_07A
, nShift
); // [37 27 17 07]
1389 const __m128i T3_07B
= _mm_srai_epi32(T2_07B
, nShift
); // [77 67 57 47]
1390 const __m128i T3_08A
= _mm_srai_epi32(T2_08A
, nShift
); // [30 20 10 00] x8
1391 const __m128i T3_08B
= _mm_srai_epi32(T2_08B
, nShift
); // [70 60 50 40]
1392 const __m128i T3_09A
= _mm_srai_epi32(T2_09A
, nShift
); // [31 21 11 01] x9
1393 const __m128i T3_09B
= _mm_srai_epi32(T2_09B
, nShift
); // [71 61 51 41]
1394 const __m128i T3_10A
= _mm_srai_epi32(T2_10A
, nShift
); // [32 22 12 02] xA
1395 const __m128i T3_10B
= _mm_srai_epi32(T2_10B
, nShift
); // [72 62 52 42]
1396 const __m128i T3_11A
= _mm_srai_epi32(T2_11A
, nShift
); // [33 23 13 03] xB
1397 const __m128i T3_11B
= _mm_srai_epi32(T2_11B
, nShift
); // [73 63 53 43]
1398 const __m128i T3_12A
= _mm_srai_epi32(T2_12A
, nShift
); // [33 24 14 04] xC
1399 const __m128i T3_12B
= _mm_srai_epi32(T2_12B
, nShift
); // [74 64 54 44]
1400 const __m128i T3_13A
= _mm_srai_epi32(T2_13A
, nShift
); // [35 25 15 05] xD
1401 const __m128i T3_13B
= _mm_srai_epi32(T2_13B
, nShift
); // [75 65 55 45]
1402 const __m128i T3_14A
= _mm_srai_epi32(T2_14A
, nShift
); // [36 26 16 06] xE
1403 const __m128i T3_14B
= _mm_srai_epi32(T2_14B
, nShift
); // [76 66 56 46]
1404 const __m128i T3_15A
= _mm_srai_epi32(T2_15A
, nShift
); // [37 27 17 07] xF
1405 const __m128i T3_15B
= _mm_srai_epi32(T2_15B
, nShift
); // [77 67 57 47]
1407 const __m128i T3_16A
= _mm_srai_epi32(T2_16A
, nShift
); // [30 20 10 00]
1408 const __m128i T3_16B
= _mm_srai_epi32(T2_16B
, nShift
); // [70 60 50 40]
1409 const __m128i T3_17A
= _mm_srai_epi32(T2_17A
, nShift
); // [31 21 11 01]
1410 const __m128i T3_17B
= _mm_srai_epi32(T2_17B
, nShift
); // [71 61 51 41]
1411 const __m128i T3_18A
= _mm_srai_epi32(T2_18A
, nShift
); // [32 22 12 02]
1412 const __m128i T3_18B
= _mm_srai_epi32(T2_18B
, nShift
); // [72 62 52 42]
1413 const __m128i T3_19A
= _mm_srai_epi32(T2_19A
, nShift
); // [33 23 13 03]
1414 const __m128i T3_19B
= _mm_srai_epi32(T2_19B
, nShift
); // [73 63 53 43]
1415 const __m128i T3_20A
= _mm_srai_epi32(T2_20A
, nShift
); // [33 24 14 04]
1416 const __m128i T3_20B
= _mm_srai_epi32(T2_20B
, nShift
); // [74 64 54 44]
1417 const __m128i T3_21A
= _mm_srai_epi32(T2_21A
, nShift
); // [35 25 15 05]
1418 const __m128i T3_21B
= _mm_srai_epi32(T2_21B
, nShift
); // [75 65 55 45]
1419 const __m128i T3_22A
= _mm_srai_epi32(T2_22A
, nShift
); // [36 26 16 06]
1420 const __m128i T3_22B
= _mm_srai_epi32(T2_22B
, nShift
); // [76 66 56 46]
1421 const __m128i T3_23A
= _mm_srai_epi32(T2_23A
, nShift
); // [37 27 17 07]
1422 const __m128i T3_23B
= _mm_srai_epi32(T2_23B
, nShift
); // [77 67 57 47]
1423 const __m128i T3_24A
= _mm_srai_epi32(T2_24A
, nShift
); // [30 20 10 00] x8
1424 const __m128i T3_24B
= _mm_srai_epi32(T2_24B
, nShift
); // [70 60 50 40]
1425 const __m128i T3_25A
= _mm_srai_epi32(T2_25A
, nShift
); // [31 21 11 01] x9
1426 const __m128i T3_25B
= _mm_srai_epi32(T2_25B
, nShift
); // [71 61 51 41]
1427 const __m128i T3_26A
= _mm_srai_epi32(T2_26A
, nShift
); // [32 22 12 02] xA
1428 const __m128i T3_26B
= _mm_srai_epi32(T2_26B
, nShift
); // [72 62 52 42]
1429 const __m128i T3_27A
= _mm_srai_epi32(T2_27A
, nShift
); // [33 23 13 03] xB
1430 const __m128i T3_27B
= _mm_srai_epi32(T2_27B
, nShift
); // [73 63 53 43]
1431 const __m128i T3_28A
= _mm_srai_epi32(T2_28A
, nShift
); // [33 24 14 04] xC
1432 const __m128i T3_28B
= _mm_srai_epi32(T2_28B
, nShift
); // [74 64 54 44]
1433 const __m128i T3_29A
= _mm_srai_epi32(T2_29A
, nShift
); // [35 25 15 05] xD
1434 const __m128i T3_29B
= _mm_srai_epi32(T2_29B
, nShift
); // [75 65 55 45]
1435 const __m128i T3_30A
= _mm_srai_epi32(T2_30A
, nShift
); // [36 26 16 06] xE
1436 const __m128i T3_30B
= _mm_srai_epi32(T2_30B
, nShift
); // [76 66 56 46]
1437 const __m128i T3_31A
= _mm_srai_epi32(T2_31A
, nShift
); // [37 27 17 07] xF
1438 const __m128i T3_31B
= _mm_srai_epi32(T2_31B
, nShift
); // [77 67 57 47]
1440 res00
[part
] = _mm_packs_epi32(T3_00A
, T3_00B
); // [70 60 50 40 30 20 10 00]
1441 res01
[part
] = _mm_packs_epi32(T3_01A
, T3_01B
); // [71 61 51 41 31 21 11 01]
1442 res02
[part
] = _mm_packs_epi32(T3_02A
, T3_02B
); // [72 62 52 42 32 22 12 02]
1443 res03
[part
] = _mm_packs_epi32(T3_03A
, T3_03B
); // [73 63 53 43 33 23 13 03]
1444 res04
[part
] = _mm_packs_epi32(T3_04A
, T3_04B
); // [74 64 54 44 34 24 14 04]
1445 res05
[part
] = _mm_packs_epi32(T3_05A
, T3_05B
); // [75 65 55 45 35 25 15 05]
1446 res06
[part
] = _mm_packs_epi32(T3_06A
, T3_06B
); // [76 66 56 46 36 26 16 06]
1447 res07
[part
] = _mm_packs_epi32(T3_07A
, T3_07B
); // [77 67 57 47 37 27 17 07]
1448 res08
[part
] = _mm_packs_epi32(T3_08A
, T3_08B
); // [A0 ... 80]
1449 res09
[part
] = _mm_packs_epi32(T3_09A
, T3_09B
); // [A1 ... 81]
1450 res10
[part
] = _mm_packs_epi32(T3_10A
, T3_10B
); // [A2 ... 82]
1451 res11
[part
] = _mm_packs_epi32(T3_11A
, T3_11B
); // [A3 ... 83]
1452 res12
[part
] = _mm_packs_epi32(T3_12A
, T3_12B
); // [A4 ... 84]
1453 res13
[part
] = _mm_packs_epi32(T3_13A
, T3_13B
); // [A5 ... 85]
1454 res14
[part
] = _mm_packs_epi32(T3_14A
, T3_14B
); // [A6 ... 86]
1455 res15
[part
] = _mm_packs_epi32(T3_15A
, T3_15B
); // [A7 ... 87]
1456 res16
[part
] = _mm_packs_epi32(T3_16A
, T3_16B
);
1457 res17
[part
] = _mm_packs_epi32(T3_17A
, T3_17B
);
1458 res18
[part
] = _mm_packs_epi32(T3_18A
, T3_18B
);
1459 res19
[part
] = _mm_packs_epi32(T3_19A
, T3_19B
);
1460 res20
[part
] = _mm_packs_epi32(T3_20A
, T3_20B
);
1461 res21
[part
] = _mm_packs_epi32(T3_21A
, T3_21B
);
1462 res22
[part
] = _mm_packs_epi32(T3_22A
, T3_22B
);
1463 res23
[part
] = _mm_packs_epi32(T3_23A
, T3_23B
);
1464 res24
[part
] = _mm_packs_epi32(T3_24A
, T3_24B
);
1465 res25
[part
] = _mm_packs_epi32(T3_25A
, T3_25B
);
1466 res26
[part
] = _mm_packs_epi32(T3_26A
, T3_26B
);
1467 res27
[part
] = _mm_packs_epi32(T3_27A
, T3_27B
);
1468 res28
[part
] = _mm_packs_epi32(T3_28A
, T3_28B
);
1469 res29
[part
] = _mm_packs_epi32(T3_29A
, T3_29B
);
1470 res30
[part
] = _mm_packs_epi32(T3_30A
, T3_30B
);
1471 res31
[part
] = _mm_packs_epi32(T3_31A
, T3_31B
);
1473 //transpose matrix 8x8 16bit.
1475 __m128i tr0_0
, tr0_1
, tr0_2
, tr0_3
, tr0_4
, tr0_5
, tr0_6
, tr0_7
;
1476 __m128i tr1_0
, tr1_1
, tr1_2
, tr1_3
, tr1_4
, tr1_5
, tr1_6
, tr1_7
;
/* TRANSPOSE_8x8_16BIT: transpose an 8x8 matrix of 16-bit values held in
 * eight __m128i rows I0..I7, writing the transposed rows to O0..O7.
 * Classic three-level unpack: interleave 16-bit pairs, then 32-bit pairs,
 * then 64-bit halves. Uses tr0_*/tr1_* scratch registers declared by the
 * enclosing function. The final '\' continues into what was a blank line
 * in the original file (line 1502), so it is harmless. */
1477 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
1478 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
1479 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
1480 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
1481 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
1482 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
1483 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
1484 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
1485 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
1486 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
1487 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
1488 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
1489 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
1490 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
1491 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
1492 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
1493 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
1494 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
1495 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
1496 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
1497 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
1498 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
1499 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
1500 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
1501 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
1503 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
1504 TRANSPOSE_8x8_16BIT(res00
[1], res01
[1], res02
[1], res03
[1], res04
[1], res05
[1], res06
[1], res07
[1], in08
[0], in09
[0], in10
[0], in11
[0], in12
[0], in13
[0], in14
[0], in15
[0])
1505 TRANSPOSE_8x8_16BIT(res00
[2], res01
[2], res02
[2], res03
[2], res04
[2], res05
[2], res06
[2], res07
[2], in16
[0], in17
[0], in18
[0], in19
[0], in20
[0], in21
[0], in22
[0], in23
[0])
1506 TRANSPOSE_8x8_16BIT(res00
[3], res01
[3], res02
[3], res03
[3], res04
[3], res05
[3], res06
[3], res07
[3], in24
[0], in25
[0], in26
[0], in27
[0], in28
[0], in29
[0], in30
[0], in31
[0])
1508 TRANSPOSE_8x8_16BIT(res08
[0], res09
[0], res10
[0], res11
[0], res12
[0], res13
[0], res14
[0], res15
[0], in00
[1], in01
[1], in02
[1], in03
[1], in04
[1], in05
[1], in06
[1], in07
[1])
1509 TRANSPOSE_8x8_16BIT(res08
[1], res09
[1], res10
[1], res11
[1], res12
[1], res13
[1], res14
[1], res15
[1], in08
[1], in09
[1], in10
[1], in11
[1], in12
[1], in13
[1], in14
[1], in15
[1])
1510 TRANSPOSE_8x8_16BIT(res08
[2], res09
[2], res10
[2], res11
[2], res12
[2], res13
[2], res14
[2], res15
[2], in16
[1], in17
[1], in18
[1], in19
[1], in20
[1], in21
[1], in22
[1], in23
[1])
1511 TRANSPOSE_8x8_16BIT(res08
[3], res09
[3], res10
[3], res11
[3], res12
[3], res13
[3], res14
[3], res15
[3], in24
[1], in25
[1], in26
[1], in27
[1], in28
[1], in29
[1], in30
[1], in31
[1])
1513 TRANSPOSE_8x8_16BIT(res16
[0], res17
[0], res18
[0], res19
[0], res20
[0], res21
[0], res22
[0], res23
[0], in00
[2], in01
[2], in02
[2], in03
[2], in04
[2], in05
[2], in06
[2], in07
[2])
1514 TRANSPOSE_8x8_16BIT(res16
[1], res17
[1], res18
[1], res19
[1], res20
[1], res21
[1], res22
[1], res23
[1], in08
[2], in09
[2], in10
[2], in11
[2], in12
[2], in13
[2], in14
[2], in15
[2])
1515 TRANSPOSE_8x8_16BIT(res16
[2], res17
[2], res18
[2], res19
[2], res20
[2], res21
[2], res22
[2], res23
[2], in16
[2], in17
[2], in18
[2], in19
[2], in20
[2], in21
[2], in22
[2], in23
[2])
1516 TRANSPOSE_8x8_16BIT(res16
[3], res17
[3], res18
[3], res19
[3], res20
[3], res21
[3], res22
[3], res23
[3], in24
[2], in25
[2], in26
[2], in27
[2], in28
[2], in29
[2], in30
[2], in31
[2])
1518 TRANSPOSE_8x8_16BIT(res24
[0], res25
[0], res26
[0], res27
[0], res28
[0], res29
[0], res30
[0], res31
[0], in00
[3], in01
[3], in02
[3], in03
[3], in04
[3], in05
[3], in06
[3], in07
[3])
1519 TRANSPOSE_8x8_16BIT(res24
[1], res25
[1], res26
[1], res27
[1], res28
[1], res29
[1], res30
[1], res31
[1], in08
[3], in09
[3], in10
[3], in11
[3], in12
[3], in13
[3], in14
[3], in15
[3])
1520 TRANSPOSE_8x8_16BIT(res24
[2], res25
[2], res26
[2], res27
[2], res28
[2], res29
[2], res30
[2], res31
[2], in16
[3], in17
[3], in18
[3], in19
[3], in20
[3], in21
[3], in22
[3], in23
[3])
1521 TRANSPOSE_8x8_16BIT(res24
[3], res25
[3], res26
[3], res27
[3], res28
[3], res29
[3], res30
[3], res31
[3], in24
[3], in25
[3], in26
[3], in27
[3], in28
[3], in29
[3], in30
[3], in31
[3])
1523 #undef TRANSPOSE_8x8_16BIT
1528 for (int i
= 0; i
< 2; i
++)
/* STORE_LINE: write eight output rows to dst. Each row is 16 coefficients
 * wide and is stored as two unaligned 128-bit halves: L0..L7 cover
 * columns [offsetH, offsetH+7] and H0..H7 cover [offsetH+8, offsetH+15],
 * for rows offsetV..offsetV+7.
 * NOTE(review): dst element type is not visible in this chunk; the +8
 * column step implies 16-bit coefficients (8 per __m128i) — confirm dst
 * is int16_t* in the full file. */
1530 #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
1531 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
1532 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
1533 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
1534 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
1535 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
1536 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
1537 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
1538 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
1539 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
1540 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
1541 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
1542 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
1543 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
1544 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
1545 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
1546 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
1548 const int k
= i
* 2;
1549 STORE_LINE(in00
[k
], in01
[k
], in02
[k
], in03
[k
], in04
[k
], in05
[k
], in06
[k
], in07
[k
], in00
[k
+ 1], in01
[k
+ 1], in02
[k
+ 1], in03
[k
+ 1], in04
[k
+ 1], in05
[k
+ 1], in06
[k
+ 1], in07
[k
+ 1], 0, i
* 16)
1550 STORE_LINE(in08
[k
], in09
[k
], in10
[k
], in11
[k
], in12
[k
], in13
[k
], in14
[k
], in15
[k
], in08
[k
+ 1], in09
[k
+ 1], in10
[k
+ 1], in11
[k
+ 1], in12
[k
+ 1], in13
[k
+ 1], in14
[k
+ 1], in15
[k
+ 1], 8, i
* 16)
1551 STORE_LINE(in16
[k
], in17
[k
], in18
[k
], in19
[k
], in20
[k
], in21
[k
], in22
[k
], in23
[k
], in16
[k
+ 1], in17
[k
+ 1], in18
[k
+ 1], in19
[k
+ 1], in20
[k
+ 1], in21
[k
+ 1], in22
[k
+ 1], in23
[k
+ 1], 16, i
* 16)
1552 STORE_LINE(in24
[k
], in25
[k
], in26
[k
], in27
[k
], in28
[k
], in29
[k
], in30
[k
], in31
[k
], in24
[k
+ 1], in25
[k
+ 1], in26
[k
+ 1], in27
[k
+ 1], in28
[k
+ 1], in29
[k
+ 1], in30
[k
+ 1], in31
[k
+ 1], 24, i
* 16)
1557 #endif // if !HIGH_BIT_DEPTH
1561 void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives
&p
)
1563 /* Note: We have AVX2 assembly for these two functions, but since AVX2 is
1564 * still somewhat rare on end-user PCs we still compile and link these SSE3
1565 * intrinsic SIMD functions */
1567 p
.idct
[IDCT_8x8
] = idct8
;
1568 p
.idct
[IDCT_16x16
] = idct16
;
1569 p
.idct
[IDCT_32x32
] = idct32
;