1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Min Chen <min.chen@multicorewareinc.com>
10 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11 * Nabajit Deka <nabajit@multicorewareinc.com>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 * This program is also available under a commercial proprietary license.
28 * For more information, contact us at license @ x265.com.
29 *****************************************************************************/
32 #include "primitives.h"
33 #include <xmmintrin.h> // SSE
34 #include <pmmintrin.h> // SSE3
40 ALIGN_VAR_32(static const int16_t, tab_idct_8x8
[12][8]) =
42 { 89, 75, 89, 75, 89, 75, 89, 75 },
43 { 50, 18, 50, 18, 50, 18, 50, 18 },
44 { 75, -18, 75, -18, 75, -18, 75, -18 },
45 { -89, -50, -89, -50, -89, -50, -89, -50 },
46 { 50, -89, 50, -89, 50, -89, 50, -89 },
47 { 18, 75, 18, 75, 18, 75, 18, 75 },
48 { 18, -50, 18, -50, 18, -50, 18, -50 },
49 { 75, -89, 75, -89, 75, -89, 75, -89 },
50 { 64, 64, 64, 64, 64, 64, 64, 64 },
51 { 64, -64, 64, -64, 64, -64, 64, -64 },
52 { 83, 36, 83, 36, 83, 36, 83, 36 },
53 { 36, -83, 36, -83, 36, -83, 36, -83 }
55 void idct8(const int16_t* src
, int16_t* dst
, intptr_t stride
)
57 __m128i m128iS0
, m128iS1
, m128iS2
, m128iS3
, m128iS4
, m128iS5
, m128iS6
, m128iS7
, m128iAdd
, m128Tmp0
, m128Tmp1
, m128Tmp2
, m128Tmp3
, E0h
, E1h
, E2h
, E3h
, E0l
, E1l
, E2l
, E3l
, O0h
, O1h
, O2h
, O3h
, O0l
, O1l
, O2l
, O3l
, EE0l
, EE1l
, E00l
, E01l
, EE0h
, EE1h
, E00h
, E01h
;
58 __m128i T00
, T01
, T02
, T03
, T04
, T05
, T06
, T07
;
60 m128iAdd
= _mm_set1_epi32(64);
62 m128iS1
= _mm_load_si128((__m128i
*)&src
[8 + 0]);
63 m128iS3
= _mm_load_si128((__m128i
*)&src
[24 + 0]);
64 m128Tmp0
= _mm_unpacklo_epi16(m128iS1
, m128iS3
);
65 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[0])));
66 m128Tmp1
= _mm_unpackhi_epi16(m128iS1
, m128iS3
);
67 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[0])));
69 m128iS5
= _mm_load_si128((__m128i
*)&src
[40 + 0]);
70 m128iS7
= _mm_load_si128((__m128i
*)&src
[56 + 0]);
71 m128Tmp2
= _mm_unpacklo_epi16(m128iS5
, m128iS7
);
72 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[1])));
73 m128Tmp3
= _mm_unpackhi_epi16(m128iS5
, m128iS7
);
74 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[1])));
75 O0l
= _mm_add_epi32(E1l
, E2l
);
76 O0h
= _mm_add_epi32(E1h
, E2h
);
78 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[2])));
79 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[2])));
80 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[3])));
81 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[3])));
83 O1l
= _mm_add_epi32(E1l
, E2l
);
84 O1h
= _mm_add_epi32(E1h
, E2h
);
86 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[4])));
87 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[4])));
88 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[5])));
89 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[5])));
90 O2l
= _mm_add_epi32(E1l
, E2l
);
91 O2h
= _mm_add_epi32(E1h
, E2h
);
93 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[6])));
94 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[6])));
95 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[7])));
96 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[7])));
97 O3h
= _mm_add_epi32(E1h
, E2h
);
98 O3l
= _mm_add_epi32(E1l
, E2l
);
102 m128iS0
= _mm_load_si128((__m128i
*)&src
[0 + 0]);
103 m128iS4
= _mm_load_si128((__m128i
*)&src
[32 + 0]);
104 m128Tmp0
= _mm_unpacklo_epi16(m128iS0
, m128iS4
);
105 EE0l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[8])));
106 m128Tmp1
= _mm_unpackhi_epi16(m128iS0
, m128iS4
);
107 EE0h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[8])));
109 EE1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[9])));
110 EE1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[9])));
114 m128iS2
= _mm_load_si128((__m128i
*)&src
[16 + 0]);
115 m128iS6
= _mm_load_si128((__m128i
*)&src
[48 + 0]);
116 m128Tmp0
= _mm_unpacklo_epi16(m128iS2
, m128iS6
);
117 E00l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[10])));
118 m128Tmp1
= _mm_unpackhi_epi16(m128iS2
, m128iS6
);
119 E00h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[10])));
120 E01l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[11])));
121 E01h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[11])));
122 E0l
= _mm_add_epi32(EE0l
, E00l
);
123 E0l
= _mm_add_epi32(E0l
, m128iAdd
);
124 E0h
= _mm_add_epi32(EE0h
, E00h
);
125 E0h
= _mm_add_epi32(E0h
, m128iAdd
);
126 E3l
= _mm_sub_epi32(EE0l
, E00l
);
127 E3l
= _mm_add_epi32(E3l
, m128iAdd
);
128 E3h
= _mm_sub_epi32(EE0h
, E00h
);
129 E3h
= _mm_add_epi32(E3h
, m128iAdd
);
131 E1l
= _mm_add_epi32(EE1l
, E01l
);
132 E1l
= _mm_add_epi32(E1l
, m128iAdd
);
133 E1h
= _mm_add_epi32(EE1h
, E01h
);
134 E1h
= _mm_add_epi32(E1h
, m128iAdd
);
135 E2l
= _mm_sub_epi32(EE1l
, E01l
);
136 E2l
= _mm_add_epi32(E2l
, m128iAdd
);
137 E2h
= _mm_sub_epi32(EE1h
, E01h
);
138 E2h
= _mm_add_epi32(E2h
, m128iAdd
);
139 m128iS0
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l
, O0l
), 7), _mm_srai_epi32(_mm_add_epi32(E0h
, O0h
), 7));
140 m128iS1
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l
, O1l
), 7), _mm_srai_epi32(_mm_add_epi32(E1h
, O1h
), 7));
141 m128iS2
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l
, O2l
), 7), _mm_srai_epi32(_mm_add_epi32(E2h
, O2h
), 7));
142 m128iS3
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l
, O3l
), 7), _mm_srai_epi32(_mm_add_epi32(E3h
, O3h
), 7));
143 m128iS4
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l
, O3l
), 7), _mm_srai_epi32(_mm_sub_epi32(E3h
, O3h
), 7));
144 m128iS5
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l
, O2l
), 7), _mm_srai_epi32(_mm_sub_epi32(E2h
, O2h
), 7));
145 m128iS6
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l
, O1l
), 7), _mm_srai_epi32(_mm_sub_epi32(E1h
, O1h
), 7));
146 m128iS7
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l
, O0l
), 7), _mm_srai_epi32(_mm_sub_epi32(E0h
, O0h
), 7));
149 E0l
= _mm_unpacklo_epi16(m128iS0
, m128iS4
);
150 E1l
= _mm_unpacklo_epi16(m128iS1
, m128iS5
);
151 E2l
= _mm_unpacklo_epi16(m128iS2
, m128iS6
);
152 E3l
= _mm_unpacklo_epi16(m128iS3
, m128iS7
);
153 O0l
= _mm_unpackhi_epi16(m128iS0
, m128iS4
);
154 O1l
= _mm_unpackhi_epi16(m128iS1
, m128iS5
);
155 O2l
= _mm_unpackhi_epi16(m128iS2
, m128iS6
);
156 O3l
= _mm_unpackhi_epi16(m128iS3
, m128iS7
);
157 m128Tmp0
= _mm_unpacklo_epi16(E0l
, E2l
);
158 m128Tmp1
= _mm_unpacklo_epi16(E1l
, E3l
);
159 m128iS0
= _mm_unpacklo_epi16(m128Tmp0
, m128Tmp1
);
160 m128iS1
= _mm_unpackhi_epi16(m128Tmp0
, m128Tmp1
);
161 m128Tmp2
= _mm_unpackhi_epi16(E0l
, E2l
);
162 m128Tmp3
= _mm_unpackhi_epi16(E1l
, E3l
);
163 m128iS2
= _mm_unpacklo_epi16(m128Tmp2
, m128Tmp3
);
164 m128iS3
= _mm_unpackhi_epi16(m128Tmp2
, m128Tmp3
);
165 m128Tmp0
= _mm_unpacklo_epi16(O0l
, O2l
);
166 m128Tmp1
= _mm_unpacklo_epi16(O1l
, O3l
);
167 m128iS4
= _mm_unpacklo_epi16(m128Tmp0
, m128Tmp1
);
168 m128iS5
= _mm_unpackhi_epi16(m128Tmp0
, m128Tmp1
);
169 m128Tmp2
= _mm_unpackhi_epi16(O0l
, O2l
);
170 m128Tmp3
= _mm_unpackhi_epi16(O1l
, O3l
);
171 m128iS6
= _mm_unpacklo_epi16(m128Tmp2
, m128Tmp3
);
172 m128iS7
= _mm_unpackhi_epi16(m128Tmp2
, m128Tmp3
);
174 m128iAdd
= _mm_set1_epi32(2048);
176 m128Tmp0
= _mm_unpacklo_epi16(m128iS1
, m128iS3
);
177 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[0])));
178 m128Tmp1
= _mm_unpackhi_epi16(m128iS1
, m128iS3
);
179 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[0])));
180 m128Tmp2
= _mm_unpacklo_epi16(m128iS5
, m128iS7
);
181 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[1])));
182 m128Tmp3
= _mm_unpackhi_epi16(m128iS5
, m128iS7
);
183 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[1])));
184 O0l
= _mm_add_epi32(E1l
, E2l
);
185 O0h
= _mm_add_epi32(E1h
, E2h
);
186 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[2])));
187 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[2])));
188 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[3])));
189 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[3])));
190 O1l
= _mm_add_epi32(E1l
, E2l
);
191 O1h
= _mm_add_epi32(E1h
, E2h
);
192 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[4])));
193 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[4])));
194 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[5])));
195 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[5])));
196 O2l
= _mm_add_epi32(E1l
, E2l
);
197 O2h
= _mm_add_epi32(E1h
, E2h
);
198 E1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[6])));
199 E1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[6])));
200 E2l
= _mm_madd_epi16(m128Tmp2
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[7])));
201 E2h
= _mm_madd_epi16(m128Tmp3
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[7])));
202 O3h
= _mm_add_epi32(E1h
, E2h
);
203 O3l
= _mm_add_epi32(E1l
, E2l
);
205 m128Tmp0
= _mm_unpacklo_epi16(m128iS0
, m128iS4
);
206 EE0l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[8])));
207 m128Tmp1
= _mm_unpackhi_epi16(m128iS0
, m128iS4
);
208 EE0h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[8])));
209 EE1l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[9])));
210 EE1h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[9])));
212 m128Tmp0
= _mm_unpacklo_epi16(m128iS2
, m128iS6
);
213 E00l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[10])));
214 m128Tmp1
= _mm_unpackhi_epi16(m128iS2
, m128iS6
);
215 E00h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[10])));
216 E01l
= _mm_madd_epi16(m128Tmp0
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[11])));
217 E01h
= _mm_madd_epi16(m128Tmp1
, _mm_load_si128((__m128i
*)(tab_idct_8x8
[11])));
218 E0l
= _mm_add_epi32(EE0l
, E00l
);
219 E0l
= _mm_add_epi32(E0l
, m128iAdd
);
220 E0h
= _mm_add_epi32(EE0h
, E00h
);
221 E0h
= _mm_add_epi32(E0h
, m128iAdd
);
222 E3l
= _mm_sub_epi32(EE0l
, E00l
);
223 E3l
= _mm_add_epi32(E3l
, m128iAdd
);
224 E3h
= _mm_sub_epi32(EE0h
, E00h
);
225 E3h
= _mm_add_epi32(E3h
, m128iAdd
);
226 E1l
= _mm_add_epi32(EE1l
, E01l
);
227 E1l
= _mm_add_epi32(E1l
, m128iAdd
);
228 E1h
= _mm_add_epi32(EE1h
, E01h
);
229 E1h
= _mm_add_epi32(E1h
, m128iAdd
);
230 E2l
= _mm_sub_epi32(EE1l
, E01l
);
231 E2l
= _mm_add_epi32(E2l
, m128iAdd
);
232 E2h
= _mm_sub_epi32(EE1h
, E01h
);
233 E2h
= _mm_add_epi32(E2h
, m128iAdd
);
235 m128iS0
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l
, O0l
), 12), _mm_srai_epi32(_mm_add_epi32(E0h
, O0h
), 12));
236 m128iS1
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l
, O1l
), 12), _mm_srai_epi32(_mm_add_epi32(E1h
, O1h
), 12));
237 m128iS2
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l
, O2l
), 12), _mm_srai_epi32(_mm_add_epi32(E2h
, O2h
), 12));
238 m128iS3
= _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l
, O3l
), 12), _mm_srai_epi32(_mm_add_epi32(E3h
, O3h
), 12));
239 m128iS4
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l
, O3l
), 12), _mm_srai_epi32(_mm_sub_epi32(E3h
, O3h
), 12));
240 m128iS5
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l
, O2l
), 12), _mm_srai_epi32(_mm_sub_epi32(E2h
, O2h
), 12));
241 m128iS6
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l
, O1l
), 12), _mm_srai_epi32(_mm_sub_epi32(E1h
, O1h
), 12));
242 m128iS7
= _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l
, O0l
), 12), _mm_srai_epi32(_mm_sub_epi32(E0h
, O0h
), 12));
244 // [07 06 05 04 03 02 01 00]
245 // [17 16 15 14 13 12 11 10]
246 // [27 26 25 24 23 22 21 20]
247 // [37 36 35 34 33 32 31 30]
248 // [47 46 45 44 43 42 41 40]
249 // [57 56 55 54 53 52 51 50]
250 // [67 66 65 64 63 62 61 60]
251 // [77 76 75 74 73 72 71 70]
253 T00
= _mm_unpacklo_epi16(m128iS0
, m128iS1
); // [13 03 12 02 11 01 10 00]
254 T01
= _mm_unpackhi_epi16(m128iS0
, m128iS1
); // [17 07 16 06 15 05 14 04]
255 T02
= _mm_unpacklo_epi16(m128iS2
, m128iS3
); // [33 23 32 22 31 21 30 20]
256 T03
= _mm_unpackhi_epi16(m128iS2
, m128iS3
); // [37 27 36 26 35 25 34 24]
257 T04
= _mm_unpacklo_epi16(m128iS4
, m128iS5
); // [53 43 52 42 51 41 50 40]
258 T05
= _mm_unpackhi_epi16(m128iS4
, m128iS5
); // [57 47 56 46 55 45 54 44]
259 T06
= _mm_unpacklo_epi16(m128iS6
, m128iS7
); // [73 63 72 62 71 61 70 60]
260 T07
= _mm_unpackhi_epi16(m128iS6
, m128iS7
); // [77 67 76 66 75 65 74 64]
263 T10
= _mm_unpacklo_epi32(T00
, T02
); // [31 21 11 01 30 20 10 00]
264 T11
= _mm_unpackhi_epi32(T00
, T02
); // [33 23 13 03 32 22 12 02]
265 _mm_storel_epi64((__m128i
*)&dst
[0 * stride
+ 0], T10
); // [30 20 10 00]
266 _mm_storeh_pi((__m64
*)&dst
[1 * stride
+ 0], _mm_castsi128_ps(T10
)); // [31 21 11 01]
267 _mm_storel_epi64((__m128i
*)&dst
[2 * stride
+ 0], T11
); // [32 22 12 02]
268 _mm_storeh_pi((__m64
*)&dst
[3 * stride
+ 0], _mm_castsi128_ps(T11
)); // [33 23 13 03]
270 T10
= _mm_unpacklo_epi32(T04
, T06
); // [71 61 51 41 70 60 50 40]
271 T11
= _mm_unpackhi_epi32(T04
, T06
); // [73 63 53 43 72 62 52 42]
272 _mm_storel_epi64((__m128i
*)&dst
[0 * stride
+ 4], T10
);
273 _mm_storeh_pi((__m64
*)&dst
[1 * stride
+ 4], _mm_castsi128_ps(T10
));
274 _mm_storel_epi64((__m128i
*)&dst
[2 * stride
+ 4], T11
);
275 _mm_storeh_pi((__m64
*)&dst
[3 * stride
+ 4], _mm_castsi128_ps(T11
));
277 T10
= _mm_unpacklo_epi32(T01
, T03
); // [35 25 15 05 34 24 14 04]
278 T11
= _mm_unpackhi_epi32(T01
, T03
); // [37 27 17 07 36 26 16 06]
279 _mm_storel_epi64((__m128i
*)&dst
[4 * stride
+ 0], T10
);
280 _mm_storeh_pi((__m64
*)&dst
[5 * stride
+ 0], _mm_castsi128_ps(T10
));
281 _mm_storel_epi64((__m128i
*)&dst
[6 * stride
+ 0], T11
);
282 _mm_storeh_pi((__m64
*)&dst
[7 * stride
+ 0], _mm_castsi128_ps(T11
));
284 T10
= _mm_unpacklo_epi32(T05
, T07
); // [75 65 55 45 74 64 54 44]
285 T11
= _mm_unpackhi_epi32(T05
, T07
); // [77 67 57 47 76 56 46 36]
286 _mm_storel_epi64((__m128i
*)&dst
[4 * stride
+ 4], T10
);
287 _mm_storeh_pi((__m64
*)&dst
[5 * stride
+ 4], _mm_castsi128_ps(T10
));
288 _mm_storel_epi64((__m128i
*)&dst
[6 * stride
+ 4], T11
);
289 _mm_storeh_pi((__m64
*)&dst
[7 * stride
+ 4], _mm_castsi128_ps(T11
));
292 void idct16(const int16_t *src
, int16_t *dst
, intptr_t stride
)
294 const __m128i c16_p87_p90
= _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
295 const __m128i c16_p70_p80
= _mm_set1_epi32(0x00460050);
296 const __m128i c16_p43_p57
= _mm_set1_epi32(0x002B0039);
297 const __m128i c16_p09_p25
= _mm_set1_epi32(0x00090019);
298 const __m128i c16_p57_p87
= _mm_set1_epi32(0x00390057); //row1
299 const __m128i c16_n43_p09
= _mm_set1_epi32(0xFFD50009);
300 const __m128i c16_n90_n80
= _mm_set1_epi32(0xFFA6FFB0);
301 const __m128i c16_n25_n70
= _mm_set1_epi32(0xFFE7FFBA);
302 const __m128i c16_p09_p80
= _mm_set1_epi32(0x00090050); //row2
303 const __m128i c16_n87_n70
= _mm_set1_epi32(0xFFA9FFBA);
304 const __m128i c16_p57_n25
= _mm_set1_epi32(0x0039FFE7);
305 const __m128i c16_p43_p90
= _mm_set1_epi32(0x002B005A);
306 const __m128i c16_n43_p70
= _mm_set1_epi32(0xFFD50046); //row3
307 const __m128i c16_p09_n87
= _mm_set1_epi32(0x0009FFA9);
308 const __m128i c16_p25_p90
= _mm_set1_epi32(0x0019005A);
309 const __m128i c16_n57_n80
= _mm_set1_epi32(0xFFC7FFB0);
310 const __m128i c16_n80_p57
= _mm_set1_epi32(0xFFB00039); //row4
311 const __m128i c16_p90_n25
= _mm_set1_epi32(0x005AFFE7);
312 const __m128i c16_n87_n09
= _mm_set1_epi32(0xFFA9FFF7);
313 const __m128i c16_p70_p43
= _mm_set1_epi32(0x0046002B);
314 const __m128i c16_n90_p43
= _mm_set1_epi32(0xFFA6002B); //row5
315 const __m128i c16_p25_p57
= _mm_set1_epi32(0x00190039);
316 const __m128i c16_p70_n87
= _mm_set1_epi32(0x0046FFA9);
317 const __m128i c16_n80_p09
= _mm_set1_epi32(0xFFB00009);
318 const __m128i c16_n70_p25
= _mm_set1_epi32(0xFFBA0019); //row6
319 const __m128i c16_n80_p90
= _mm_set1_epi32(0xFFB0005A);
320 const __m128i c16_p09_p43
= _mm_set1_epi32(0x0009002B);
321 const __m128i c16_p87_n57
= _mm_set1_epi32(0x0057FFC7);
322 const __m128i c16_n25_p09
= _mm_set1_epi32(0xFFE70009); //row7
323 const __m128i c16_n57_p43
= _mm_set1_epi32(0xFFC7002B);
324 const __m128i c16_n80_p70
= _mm_set1_epi32(0xFFB00046);
325 const __m128i c16_n90_p87
= _mm_set1_epi32(0xFFA60057);
327 const __m128i c16_p75_p89
= _mm_set1_epi32(0x004B0059);
328 const __m128i c16_p18_p50
= _mm_set1_epi32(0x00120032);
329 const __m128i c16_n18_p75
= _mm_set1_epi32(0xFFEE004B);
330 const __m128i c16_n50_n89
= _mm_set1_epi32(0xFFCEFFA7);
331 const __m128i c16_n89_p50
= _mm_set1_epi32(0xFFA70032);
332 const __m128i c16_p75_p18
= _mm_set1_epi32(0x004B0012);
333 const __m128i c16_n50_p18
= _mm_set1_epi32(0xFFCE0012);
334 const __m128i c16_n89_p75
= _mm_set1_epi32(0xFFA7004B);
336 const __m128i c16_p36_p83
= _mm_set1_epi32(0x00240053);
337 const __m128i c16_n83_p36
= _mm_set1_epi32(0xFFAD0024);
339 const __m128i c16_n64_p64
= _mm_set1_epi32(0xFFC00040);
340 const __m128i c16_p64_p64
= _mm_set1_epi32(0x00400040);
341 __m128i c32_rnd
= _mm_set1_epi32(64);
346 __m128i in00
[2], in01
[2], in02
[2], in03
[2], in04
[2], in05
[2], in06
[2], in07
[2];
347 __m128i in08
[2], in09
[2], in10
[2], in11
[2], in12
[2], in13
[2], in14
[2], in15
[2];
348 __m128i res00
[2], res01
[2], res02
[2], res03
[2], res04
[2], res05
[2], res06
[2], res07
[2];
349 __m128i res08
[2], res09
[2], res10
[2], res11
[2], res12
[2], res13
[2], res14
[2], res15
[2];
351 for (int i
= 0; i
< 2; i
++)
353 const int offset
= (i
<< 3);
354 in00
[i
] = _mm_loadu_si128((const __m128i
*)&src
[0 * 16 + offset
]); // [07 06 05 04 03 02 01 00]
355 in01
[i
] = _mm_loadu_si128((const __m128i
*)&src
[1 * 16 + offset
]); // [17 16 15 14 13 12 11 10]
356 in02
[i
] = _mm_loadu_si128((const __m128i
*)&src
[2 * 16 + offset
]); // [27 26 25 24 23 22 21 20]
357 in03
[i
] = _mm_loadu_si128((const __m128i
*)&src
[3 * 16 + offset
]); // [37 36 35 34 33 32 31 30]
358 in04
[i
] = _mm_loadu_si128((const __m128i
*)&src
[4 * 16 + offset
]); // [47 46 45 44 43 42 41 40]
359 in05
[i
] = _mm_loadu_si128((const __m128i
*)&src
[5 * 16 + offset
]); // [57 56 55 54 53 52 51 50]
360 in06
[i
] = _mm_loadu_si128((const __m128i
*)&src
[6 * 16 + offset
]); // [67 66 65 64 63 62 61 60]
361 in07
[i
] = _mm_loadu_si128((const __m128i
*)&src
[7 * 16 + offset
]); // [77 76 75 74 73 72 71 70]
362 in08
[i
] = _mm_loadu_si128((const __m128i
*)&src
[8 * 16 + offset
]);
363 in09
[i
] = _mm_loadu_si128((const __m128i
*)&src
[9 * 16 + offset
]);
364 in10
[i
] = _mm_loadu_si128((const __m128i
*)&src
[10 * 16 + offset
]);
365 in11
[i
] = _mm_loadu_si128((const __m128i
*)&src
[11 * 16 + offset
]);
366 in12
[i
] = _mm_loadu_si128((const __m128i
*)&src
[12 * 16 + offset
]);
367 in13
[i
] = _mm_loadu_si128((const __m128i
*)&src
[13 * 16 + offset
]);
368 in14
[i
] = _mm_loadu_si128((const __m128i
*)&src
[14 * 16 + offset
]);
369 in15
[i
] = _mm_loadu_si128((const __m128i
*)&src
[15 * 16 + offset
]);
372 for (int pass
= 0; pass
< 2; pass
++)
376 c32_rnd
= _mm_set1_epi32(2048);
380 for (int part
= 0; part
< 2; part
++)
382 const __m128i T_00_00A
= _mm_unpacklo_epi16(in01
[part
], in03
[part
]); // [33 13 32 12 31 11 30 10]
383 const __m128i T_00_00B
= _mm_unpackhi_epi16(in01
[part
], in03
[part
]); // [37 17 36 16 35 15 34 14]
384 const __m128i T_00_01A
= _mm_unpacklo_epi16(in05
[part
], in07
[part
]); // [ ]
385 const __m128i T_00_01B
= _mm_unpackhi_epi16(in05
[part
], in07
[part
]); // [ ]
386 const __m128i T_00_02A
= _mm_unpacklo_epi16(in09
[part
], in11
[part
]); // [ ]
387 const __m128i T_00_02B
= _mm_unpackhi_epi16(in09
[part
], in11
[part
]); // [ ]
388 const __m128i T_00_03A
= _mm_unpacklo_epi16(in13
[part
], in15
[part
]); // [ ]
389 const __m128i T_00_03B
= _mm_unpackhi_epi16(in13
[part
], in15
[part
]); // [ ]
390 const __m128i T_00_04A
= _mm_unpacklo_epi16(in02
[part
], in06
[part
]); // [ ]
391 const __m128i T_00_04B
= _mm_unpackhi_epi16(in02
[part
], in06
[part
]); // [ ]
392 const __m128i T_00_05A
= _mm_unpacklo_epi16(in10
[part
], in14
[part
]); // [ ]
393 const __m128i T_00_05B
= _mm_unpackhi_epi16(in10
[part
], in14
[part
]); // [ ]
394 const __m128i T_00_06A
= _mm_unpacklo_epi16(in04
[part
], in12
[part
]); // [ ]row
395 const __m128i T_00_06B
= _mm_unpackhi_epi16(in04
[part
], in12
[part
]); // [ ]
396 const __m128i T_00_07A
= _mm_unpacklo_epi16(in00
[part
], in08
[part
]); // [83 03 82 02 81 01 80 00] row08 row00
397 const __m128i T_00_07B
= _mm_unpackhi_epi16(in00
[part
], in08
[part
]); // [87 07 86 06 85 05 84 04]
399 __m128i O0A
, O1A
, O2A
, O3A
, O4A
, O5A
, O6A
, O7A
;
400 __m128i O0B
, O1B
, O2B
, O3B
, O4B
, O5B
, O6B
, O7B
;
403 #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
404 T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
405 T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
406 row = _mm_add_epi32(T00, T01);
408 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, O0A
)
409 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, O1A
)
410 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, O2A
)
411 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, O3A
)
412 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, O4A
)
413 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, O5A
)
414 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, O6A
)
415 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, O7A
)
417 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, O0B
)
418 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, O1B
)
419 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, O2B
)
420 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, O3B
)
421 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, O4B
)
422 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, O5B
)
423 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, O6B
)
424 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, O7B
)
428 __m128i EO0A
, EO1A
, EO2A
, EO3A
;
429 __m128i EO0B
, EO1B
, EO2B
, EO3B
;
430 EO0A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_p75_p89
), _mm_madd_epi16(T_00_05A
, c16_p18_p50
)); // EO0
431 EO0B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_p75_p89
), _mm_madd_epi16(T_00_05B
, c16_p18_p50
));
432 EO1A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_n18_p75
), _mm_madd_epi16(T_00_05A
, c16_n50_n89
)); // EO1
433 EO1B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_n18_p75
), _mm_madd_epi16(T_00_05B
, c16_n50_n89
));
434 EO2A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_n89_p50
), _mm_madd_epi16(T_00_05A
, c16_p75_p18
)); // EO2
435 EO2B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_n89_p50
), _mm_madd_epi16(T_00_05B
, c16_p75_p18
));
436 EO3A
= _mm_add_epi32(_mm_madd_epi16(T_00_04A
, c16_n50_p18
), _mm_madd_epi16(T_00_05A
, c16_n89_p75
)); // EO3
437 EO3B
= _mm_add_epi32(_mm_madd_epi16(T_00_04B
, c16_n50_p18
), _mm_madd_epi16(T_00_05B
, c16_n89_p75
));
439 __m128i EEO0A
, EEO1A
;
440 __m128i EEO0B
, EEO1B
;
441 EEO0A
= _mm_madd_epi16(T_00_06A
, c16_p36_p83
);
442 EEO0B
= _mm_madd_epi16(T_00_06B
, c16_p36_p83
);
443 EEO1A
= _mm_madd_epi16(T_00_06A
, c16_n83_p36
);
444 EEO1B
= _mm_madd_epi16(T_00_06B
, c16_n83_p36
);
446 __m128i EEE0A
, EEE1A
;
447 __m128i EEE0B
, EEE1B
;
448 EEE0A
= _mm_madd_epi16(T_00_07A
, c16_p64_p64
);
449 EEE0B
= _mm_madd_epi16(T_00_07B
, c16_p64_p64
);
450 EEE1A
= _mm_madd_epi16(T_00_07A
, c16_n64_p64
);
451 EEE1B
= _mm_madd_epi16(T_00_07B
, c16_n64_p64
);
453 const __m128i EE0A
= _mm_add_epi32(EEE0A
, EEO0A
); // EE0 = EEE0 + EEO0
454 const __m128i EE0B
= _mm_add_epi32(EEE0B
, EEO0B
);
455 const __m128i EE1A
= _mm_add_epi32(EEE1A
, EEO1A
); // EE1 = EEE1 + EEO1
456 const __m128i EE1B
= _mm_add_epi32(EEE1B
, EEO1B
);
457 const __m128i EE3A
= _mm_sub_epi32(EEE0A
, EEO0A
); // EE3 = EEE0 - EEO0
458 const __m128i EE3B
= _mm_sub_epi32(EEE0B
, EEO0B
);
459 const __m128i EE2A
= _mm_sub_epi32(EEE1A
, EEO1A
); // EE2 = EEE1 - EEO1
460 const __m128i EE2B
= _mm_sub_epi32(EEE1B
, EEO1B
);
462 const __m128i E0A
= _mm_add_epi32(EE0A
, EO0A
); // E0 = EE0 + EO0
463 const __m128i E0B
= _mm_add_epi32(EE0B
, EO0B
);
464 const __m128i E1A
= _mm_add_epi32(EE1A
, EO1A
); // E1 = EE1 + EO1
465 const __m128i E1B
= _mm_add_epi32(EE1B
, EO1B
);
466 const __m128i E2A
= _mm_add_epi32(EE2A
, EO2A
); // E2 = EE2 + EO2
467 const __m128i E2B
= _mm_add_epi32(EE2B
, EO2B
);
468 const __m128i E3A
= _mm_add_epi32(EE3A
, EO3A
); // E3 = EE3 + EO3
469 const __m128i E3B
= _mm_add_epi32(EE3B
, EO3B
);
470 const __m128i E7A
= _mm_sub_epi32(EE0A
, EO0A
); // E7 = EE0 - EO0
471 const __m128i E7B
= _mm_sub_epi32(EE0B
, EO0B
);
472 const __m128i E6A
= _mm_sub_epi32(EE1A
, EO1A
); // E6 = EE1 - EO1
473 const __m128i E6B
= _mm_sub_epi32(EE1B
, EO1B
);
474 const __m128i E5A
= _mm_sub_epi32(EE2A
, EO2A
); // E5 = EE2 - EO2
475 const __m128i E5B
= _mm_sub_epi32(EE2B
, EO2B
);
476 const __m128i E4A
= _mm_sub_epi32(EE3A
, EO3A
); // E4 = EE3 - EO3
477 const __m128i E4B
= _mm_sub_epi32(EE3B
, EO3B
);
479 const __m128i T10A
= _mm_add_epi32(E0A
, c32_rnd
); // E0 + rnd
480 const __m128i T10B
= _mm_add_epi32(E0B
, c32_rnd
);
481 const __m128i T11A
= _mm_add_epi32(E1A
, c32_rnd
); // E1 + rnd
482 const __m128i T11B
= _mm_add_epi32(E1B
, c32_rnd
);
483 const __m128i T12A
= _mm_add_epi32(E2A
, c32_rnd
); // E2 + rnd
484 const __m128i T12B
= _mm_add_epi32(E2B
, c32_rnd
);
485 const __m128i T13A
= _mm_add_epi32(E3A
, c32_rnd
); // E3 + rnd
486 const __m128i T13B
= _mm_add_epi32(E3B
, c32_rnd
);
487 const __m128i T14A
= _mm_add_epi32(E4A
, c32_rnd
); // E4 + rnd
488 const __m128i T14B
= _mm_add_epi32(E4B
, c32_rnd
);
489 const __m128i T15A
= _mm_add_epi32(E5A
, c32_rnd
); // E5 + rnd
490 const __m128i T15B
= _mm_add_epi32(E5B
, c32_rnd
);
491 const __m128i T16A
= _mm_add_epi32(E6A
, c32_rnd
); // E6 + rnd
492 const __m128i T16B
= _mm_add_epi32(E6B
, c32_rnd
);
493 const __m128i T17A
= _mm_add_epi32(E7A
, c32_rnd
); // E7 + rnd
494 const __m128i T17B
= _mm_add_epi32(E7B
, c32_rnd
);
496 const __m128i T20A
= _mm_add_epi32(T10A
, O0A
); // E0 + O0 + rnd
497 const __m128i T20B
= _mm_add_epi32(T10B
, O0B
);
498 const __m128i T21A
= _mm_add_epi32(T11A
, O1A
); // E1 + O1 + rnd
499 const __m128i T21B
= _mm_add_epi32(T11B
, O1B
);
500 const __m128i T22A
= _mm_add_epi32(T12A
, O2A
); // E2 + O2 + rnd
501 const __m128i T22B
= _mm_add_epi32(T12B
, O2B
);
502 const __m128i T23A
= _mm_add_epi32(T13A
, O3A
); // E3 + O3 + rnd
503 const __m128i T23B
= _mm_add_epi32(T13B
, O3B
);
504 const __m128i T24A
= _mm_add_epi32(T14A
, O4A
); // E4
505 const __m128i T24B
= _mm_add_epi32(T14B
, O4B
);
506 const __m128i T25A
= _mm_add_epi32(T15A
, O5A
); // E5
507 const __m128i T25B
= _mm_add_epi32(T15B
, O5B
);
508 const __m128i T26A
= _mm_add_epi32(T16A
, O6A
); // E6
509 const __m128i T26B
= _mm_add_epi32(T16B
, O6B
);
510 const __m128i T27A
= _mm_add_epi32(T17A
, O7A
); // E7
511 const __m128i T27B
= _mm_add_epi32(T17B
, O7B
);
512 const __m128i T2FA
= _mm_sub_epi32(T10A
, O0A
); // E0 - O0 + rnd
513 const __m128i T2FB
= _mm_sub_epi32(T10B
, O0B
);
514 const __m128i T2EA
= _mm_sub_epi32(T11A
, O1A
); // E1 - O1 + rnd
515 const __m128i T2EB
= _mm_sub_epi32(T11B
, O1B
);
516 const __m128i T2DA
= _mm_sub_epi32(T12A
, O2A
); // E2 - O2 + rnd
517 const __m128i T2DB
= _mm_sub_epi32(T12B
, O2B
);
518 const __m128i T2CA
= _mm_sub_epi32(T13A
, O3A
); // E3 - O3 + rnd
519 const __m128i T2CB
= _mm_sub_epi32(T13B
, O3B
);
520 const __m128i T2BA
= _mm_sub_epi32(T14A
, O4A
); // E4
521 const __m128i T2BB
= _mm_sub_epi32(T14B
, O4B
);
522 const __m128i T2AA
= _mm_sub_epi32(T15A
, O5A
); // E5
523 const __m128i T2AB
= _mm_sub_epi32(T15B
, O5B
);
524 const __m128i T29A
= _mm_sub_epi32(T16A
, O6A
); // E6
525 const __m128i T29B
= _mm_sub_epi32(T16B
, O6B
);
526 const __m128i T28A
= _mm_sub_epi32(T17A
, O7A
); // E7
527 const __m128i T28B
= _mm_sub_epi32(T17B
, O7B
);
529 const __m128i T30A
= _mm_srai_epi32(T20A
, nShift
); // [30 20 10 00]
530 const __m128i T30B
= _mm_srai_epi32(T20B
, nShift
); // [70 60 50 40]
531 const __m128i T31A
= _mm_srai_epi32(T21A
, nShift
); // [31 21 11 01]
532 const __m128i T31B
= _mm_srai_epi32(T21B
, nShift
); // [71 61 51 41]
533 const __m128i T32A
= _mm_srai_epi32(T22A
, nShift
); // [32 22 12 02]
534 const __m128i T32B
= _mm_srai_epi32(T22B
, nShift
); // [72 62 52 42]
535 const __m128i T33A
= _mm_srai_epi32(T23A
, nShift
); // [33 23 13 03]
536 const __m128i T33B
= _mm_srai_epi32(T23B
, nShift
); // [73 63 53 43]
537 const __m128i T34A
= _mm_srai_epi32(T24A
, nShift
); // [34 24 14 04]
538 const __m128i T34B
= _mm_srai_epi32(T24B
, nShift
); // [74 64 54 44]
539 const __m128i T35A
= _mm_srai_epi32(T25A
, nShift
); // [35 25 15 05]
540 const __m128i T35B
= _mm_srai_epi32(T25B
, nShift
); // [75 65 55 45]
541 const __m128i T36A
= _mm_srai_epi32(T26A
, nShift
); // [36 26 16 06]
542 const __m128i T36B
= _mm_srai_epi32(T26B
, nShift
); // [76 66 56 46]
543 const __m128i T37A
= _mm_srai_epi32(T27A
, nShift
); // [37 27 17 07]
544 const __m128i T37B
= _mm_srai_epi32(T27B
, nShift
); // [77 67 57 47]
546 const __m128i T38A
= _mm_srai_epi32(T28A
, nShift
); // [30 20 10 00] x8
547 const __m128i T38B
= _mm_srai_epi32(T28B
, nShift
); // [70 60 50 40]
548 const __m128i T39A
= _mm_srai_epi32(T29A
, nShift
); // [31 21 11 01] x9
549 const __m128i T39B
= _mm_srai_epi32(T29B
, nShift
); // [71 61 51 41]
550 const __m128i T3AA
= _mm_srai_epi32(T2AA
, nShift
); // [32 22 12 02] xA
551 const __m128i T3AB
= _mm_srai_epi32(T2AB
, nShift
); // [72 62 52 42]
552 const __m128i T3BA
= _mm_srai_epi32(T2BA
, nShift
); // [33 23 13 03] xB
553 const __m128i T3BB
= _mm_srai_epi32(T2BB
, nShift
); // [73 63 53 43]
554 const __m128i T3CA
= _mm_srai_epi32(T2CA
, nShift
); // [34 24 14 04] xC
555 const __m128i T3CB
= _mm_srai_epi32(T2CB
, nShift
); // [74 64 54 44]
556 const __m128i T3DA
= _mm_srai_epi32(T2DA
, nShift
); // [35 25 15 05] xD
557 const __m128i T3DB
= _mm_srai_epi32(T2DB
, nShift
); // [75 65 55 45]
558 const __m128i T3EA
= _mm_srai_epi32(T2EA
, nShift
); // [36 26 16 06] xE
559 const __m128i T3EB
= _mm_srai_epi32(T2EB
, nShift
); // [76 66 56 46]
560 const __m128i T3FA
= _mm_srai_epi32(T2FA
, nShift
); // [37 27 17 07] xF
561 const __m128i T3FB
= _mm_srai_epi32(T2FB
, nShift
); // [77 67 57 47]
563 res00
[part
] = _mm_packs_epi32(T30A
, T30B
); // [70 60 50 40 30 20 10 00]
564 res01
[part
] = _mm_packs_epi32(T31A
, T31B
); // [71 61 51 41 31 21 11 01]
565 res02
[part
] = _mm_packs_epi32(T32A
, T32B
); // [72 62 52 42 32 22 12 02]
566 res03
[part
] = _mm_packs_epi32(T33A
, T33B
); // [73 63 53 43 33 23 13 03]
567 res04
[part
] = _mm_packs_epi32(T34A
, T34B
); // [74 64 54 44 34 24 14 04]
568 res05
[part
] = _mm_packs_epi32(T35A
, T35B
); // [75 65 55 45 35 25 15 05]
569 res06
[part
] = _mm_packs_epi32(T36A
, T36B
); // [76 66 56 46 36 26 16 06]
570 res07
[part
] = _mm_packs_epi32(T37A
, T37B
); // [77 67 57 47 37 27 17 07]
572 res08
[part
] = _mm_packs_epi32(T38A
, T38B
); // [A0 ... 80]
573 res09
[part
] = _mm_packs_epi32(T39A
, T39B
); // [A1 ... 81]
574 res10
[part
] = _mm_packs_epi32(T3AA
, T3AB
); // [A2 ... 82]
575 res11
[part
] = _mm_packs_epi32(T3BA
, T3BB
); // [A3 ... 83]
576 res12
[part
] = _mm_packs_epi32(T3CA
, T3CB
); // [A4 ... 84]
577 res13
[part
] = _mm_packs_epi32(T3DA
, T3DB
); // [A5 ... 85]
578 res14
[part
] = _mm_packs_epi32(T3EA
, T3EB
); // [A6 ... 86]
579 res15
[part
] = _mm_packs_epi32(T3FA
, T3FB
); // [A7 ... 87]
581 //transpose matrix 8x8 16bit.
583 __m128i tr0_0
, tr0_1
, tr0_2
, tr0_3
, tr0_4
, tr0_5
, tr0_6
, tr0_7
;
584 __m128i tr1_0
, tr1_1
, tr1_2
, tr1_3
, tr1_4
, tr1_5
, tr1_6
, tr1_7
;
585 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
586 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
587 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
588 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
589 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
590 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
591 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
592 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
593 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
594 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
595 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
596 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
597 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
598 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
599 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
600 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
601 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
602 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
603 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
604 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
605 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
606 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
607 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
608 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
609 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
611 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
612 TRANSPOSE_8x8_16BIT(res08
[0], res09
[0], res10
[0], res11
[0], res12
[0], res13
[0], res14
[0], res15
[0], in00
[1], in01
[1], in02
[1], in03
[1], in04
[1], in05
[1], in06
[1], in07
[1])
613 TRANSPOSE_8x8_16BIT(res00
[1], res01
[1], res02
[1], res03
[1], res04
[1], res05
[1], res06
[1], res07
[1], in08
[0], in09
[0], in10
[0], in11
[0], in12
[0], in13
[0], in14
[0], in15
[0])
614 TRANSPOSE_8x8_16BIT(res08
[1], res09
[1], res10
[1], res11
[1], res12
[1], res13
[1], res14
[1], res15
[1], in08
[1], in09
[1], in10
[1], in11
[1], in12
[1], in13
[1], in14
[1], in15
[1])
616 #undef TRANSPOSE_8x8_16BIT
620 _mm_store_si128((__m128i
*)&dst
[0 * stride
+ 0], in00
[0]);
621 _mm_store_si128((__m128i
*)&dst
[0 * stride
+ 8], in00
[1]);
622 _mm_store_si128((__m128i
*)&dst
[1 * stride
+ 0], in01
[0]);
623 _mm_store_si128((__m128i
*)&dst
[1 * stride
+ 8], in01
[1]);
624 _mm_store_si128((__m128i
*)&dst
[2 * stride
+ 0], in02
[0]);
625 _mm_store_si128((__m128i
*)&dst
[2 * stride
+ 8], in02
[1]);
626 _mm_store_si128((__m128i
*)&dst
[3 * stride
+ 0], in03
[0]);
627 _mm_store_si128((__m128i
*)&dst
[3 * stride
+ 8], in03
[1]);
628 _mm_store_si128((__m128i
*)&dst
[4 * stride
+ 0], in04
[0]);
629 _mm_store_si128((__m128i
*)&dst
[4 * stride
+ 8], in04
[1]);
630 _mm_store_si128((__m128i
*)&dst
[5 * stride
+ 0], in05
[0]);
631 _mm_store_si128((__m128i
*)&dst
[5 * stride
+ 8], in05
[1]);
632 _mm_store_si128((__m128i
*)&dst
[6 * stride
+ 0], in06
[0]);
633 _mm_store_si128((__m128i
*)&dst
[6 * stride
+ 8], in06
[1]);
634 _mm_store_si128((__m128i
*)&dst
[7 * stride
+ 0], in07
[0]);
635 _mm_store_si128((__m128i
*)&dst
[7 * stride
+ 8], in07
[1]);
636 _mm_store_si128((__m128i
*)&dst
[8 * stride
+ 0], in08
[0]);
637 _mm_store_si128((__m128i
*)&dst
[8 * stride
+ 8], in08
[1]);
638 _mm_store_si128((__m128i
*)&dst
[9 * stride
+ 0], in09
[0]);
639 _mm_store_si128((__m128i
*)&dst
[9 * stride
+ 8], in09
[1]);
640 _mm_store_si128((__m128i
*)&dst
[10 * stride
+ 0], in10
[0]);
641 _mm_store_si128((__m128i
*)&dst
[10 * stride
+ 8], in10
[1]);
642 _mm_store_si128((__m128i
*)&dst
[11 * stride
+ 0], in11
[0]);
643 _mm_store_si128((__m128i
*)&dst
[11 * stride
+ 8], in11
[1]);
644 _mm_store_si128((__m128i
*)&dst
[12 * stride
+ 0], in12
[0]);
645 _mm_store_si128((__m128i
*)&dst
[12 * stride
+ 8], in12
[1]);
646 _mm_store_si128((__m128i
*)&dst
[13 * stride
+ 0], in13
[0]);
647 _mm_store_si128((__m128i
*)&dst
[13 * stride
+ 8], in13
[1]);
648 _mm_store_si128((__m128i
*)&dst
[14 * stride
+ 0], in14
[0]);
649 _mm_store_si128((__m128i
*)&dst
[14 * stride
+ 8], in14
[1]);
650 _mm_store_si128((__m128i
*)&dst
[15 * stride
+ 0], in15
[0]);
651 _mm_store_si128((__m128i
*)&dst
[15 * stride
+ 8], in15
[1]);
654 void idct32(const int16_t *src
, int16_t *dst
, intptr_t stride
)
657 const __m128i c16_p90_p90
= _mm_set1_epi32(0x005A005A); //column 0
658 const __m128i c16_p85_p88
= _mm_set1_epi32(0x00550058);
659 const __m128i c16_p78_p82
= _mm_set1_epi32(0x004E0052);
660 const __m128i c16_p67_p73
= _mm_set1_epi32(0x00430049);
661 const __m128i c16_p54_p61
= _mm_set1_epi32(0x0036003D);
662 const __m128i c16_p38_p46
= _mm_set1_epi32(0x0026002E);
663 const __m128i c16_p22_p31
= _mm_set1_epi32(0x0016001F);
664 const __m128i c16_p04_p13
= _mm_set1_epi32(0x0004000D);
665 const __m128i c16_p82_p90
= _mm_set1_epi32(0x0052005A); //column 1
666 const __m128i c16_p46_p67
= _mm_set1_epi32(0x002E0043);
667 const __m128i c16_n04_p22
= _mm_set1_epi32(0xFFFC0016);
668 const __m128i c16_n54_n31
= _mm_set1_epi32(0xFFCAFFE1);
669 const __m128i c16_n85_n73
= _mm_set1_epi32(0xFFABFFB7);
670 const __m128i c16_n88_n90
= _mm_set1_epi32(0xFFA8FFA6);
671 const __m128i c16_n61_n78
= _mm_set1_epi32(0xFFC3FFB2);
672 const __m128i c16_n13_n38
= _mm_set1_epi32(0xFFF3FFDA);
673 const __m128i c16_p67_p88
= _mm_set1_epi32(0x00430058); //column 2
674 const __m128i c16_n13_p31
= _mm_set1_epi32(0xFFF3001F);
675 const __m128i c16_n82_n54
= _mm_set1_epi32(0xFFAEFFCA);
676 const __m128i c16_n78_n90
= _mm_set1_epi32(0xFFB2FFA6);
677 const __m128i c16_n04_n46
= _mm_set1_epi32(0xFFFCFFD2);
678 const __m128i c16_p73_p38
= _mm_set1_epi32(0x00490026);
679 const __m128i c16_p85_p90
= _mm_set1_epi32(0x0055005A);
680 const __m128i c16_p22_p61
= _mm_set1_epi32(0x0016003D);
681 const __m128i c16_p46_p85
= _mm_set1_epi32(0x002E0055); //column 3
682 const __m128i c16_n67_n13
= _mm_set1_epi32(0xFFBDFFF3);
683 const __m128i c16_n73_n90
= _mm_set1_epi32(0xFFB7FFA6);
684 const __m128i c16_p38_n22
= _mm_set1_epi32(0x0026FFEA);
685 const __m128i c16_p88_p82
= _mm_set1_epi32(0x00580052);
686 const __m128i c16_n04_p54
= _mm_set1_epi32(0xFFFC0036);
687 const __m128i c16_n90_n61
= _mm_set1_epi32(0xFFA6FFC3);
688 const __m128i c16_n31_n78
= _mm_set1_epi32(0xFFE1FFB2);
689 const __m128i c16_p22_p82
= _mm_set1_epi32(0x00160052); //column 4
690 const __m128i c16_n90_n54
= _mm_set1_epi32(0xFFA6FFCA);
691 const __m128i c16_p13_n61
= _mm_set1_epi32(0x000DFFC3);
692 const __m128i c16_p85_p78
= _mm_set1_epi32(0x0055004E);
693 const __m128i c16_n46_p31
= _mm_set1_epi32(0xFFD2001F);
694 const __m128i c16_n67_n90
= _mm_set1_epi32(0xFFBDFFA6);
695 const __m128i c16_p73_p04
= _mm_set1_epi32(0x00490004);
696 const __m128i c16_p38_p88
= _mm_set1_epi32(0x00260058);
697 const __m128i c16_n04_p78
= _mm_set1_epi32(0xFFFC004E); //column 5
698 const __m128i c16_n73_n82
= _mm_set1_epi32(0xFFB7FFAE);
699 const __m128i c16_p85_p13
= _mm_set1_epi32(0x0055000D);
700 const __m128i c16_n22_p67
= _mm_set1_epi32(0xFFEA0043);
701 const __m128i c16_n61_n88
= _mm_set1_epi32(0xFFC3FFA8);
702 const __m128i c16_p90_p31
= _mm_set1_epi32(0x005A001F);
703 const __m128i c16_n38_p54
= _mm_set1_epi32(0xFFDA0036);
704 const __m128i c16_n46_n90
= _mm_set1_epi32(0xFFD2FFA6);
705 const __m128i c16_n31_p73
= _mm_set1_epi32(0xFFE10049); //column 6
706 const __m128i c16_n22_n90
= _mm_set1_epi32(0xFFEAFFA6);
707 const __m128i c16_p67_p78
= _mm_set1_epi32(0x0043004E);
708 const __m128i c16_n90_n38
= _mm_set1_epi32(0xFFA6FFDA);
709 const __m128i c16_p82_n13
= _mm_set1_epi32(0x0052FFF3);
710 const __m128i c16_n46_p61
= _mm_set1_epi32(0xFFD2003D);
711 const __m128i c16_n04_n88
= _mm_set1_epi32(0xFFFCFFA8);
712 const __m128i c16_p54_p85
= _mm_set1_epi32(0x00360055);
713 const __m128i c16_n54_p67
= _mm_set1_epi32(0xFFCA0043); //column 7
714 const __m128i c16_p38_n78
= _mm_set1_epi32(0x0026FFB2);
715 const __m128i c16_n22_p85
= _mm_set1_epi32(0xFFEA0055);
716 const __m128i c16_p04_n90
= _mm_set1_epi32(0x0004FFA6);
717 const __m128i c16_p13_p90
= _mm_set1_epi32(0x000D005A);
718 const __m128i c16_n31_n88
= _mm_set1_epi32(0xFFE1FFA8);
719 const __m128i c16_p46_p82
= _mm_set1_epi32(0x002E0052);
720 const __m128i c16_n61_n73
= _mm_set1_epi32(0xFFC3FFB7);
721 const __m128i c16_n73_p61
= _mm_set1_epi32(0xFFB7003D); //column 8
722 const __m128i c16_p82_n46
= _mm_set1_epi32(0x0052FFD2);
723 const __m128i c16_n88_p31
= _mm_set1_epi32(0xFFA8001F);
724 const __m128i c16_p90_n13
= _mm_set1_epi32(0x005AFFF3);
725 const __m128i c16_n90_n04
= _mm_set1_epi32(0xFFA6FFFC);
726 const __m128i c16_p85_p22
= _mm_set1_epi32(0x00550016);
727 const __m128i c16_n78_n38
= _mm_set1_epi32(0xFFB2FFDA);
728 const __m128i c16_p67_p54
= _mm_set1_epi32(0x00430036);
729 const __m128i c16_n85_p54
= _mm_set1_epi32(0xFFAB0036); //column 9
730 const __m128i c16_p88_n04
= _mm_set1_epi32(0x0058FFFC);
731 const __m128i c16_n61_n46
= _mm_set1_epi32(0xFFC3FFD2);
732 const __m128i c16_p13_p82
= _mm_set1_epi32(0x000D0052);
733 const __m128i c16_p38_n90
= _mm_set1_epi32(0x0026FFA6);
734 const __m128i c16_n78_p67
= _mm_set1_epi32(0xFFB20043);
735 const __m128i c16_p90_n22
= _mm_set1_epi32(0x005AFFEA);
736 const __m128i c16_n73_n31
= _mm_set1_epi32(0xFFB7FFE1);
737 const __m128i c16_n90_p46
= _mm_set1_epi32(0xFFA6002E); //column 10
738 const __m128i c16_p54_p38
= _mm_set1_epi32(0x00360026);
739 const __m128i c16_p31_n90
= _mm_set1_epi32(0x001FFFA6);
740 const __m128i c16_n88_p61
= _mm_set1_epi32(0xFFA8003D);
741 const __m128i c16_p67_p22
= _mm_set1_epi32(0x00430016);
742 const __m128i c16_p13_n85
= _mm_set1_epi32(0x000DFFAB);
743 const __m128i c16_n82_p73
= _mm_set1_epi32(0xFFAE0049);
744 const __m128i c16_p78_p04
= _mm_set1_epi32(0x004E0004);
745 const __m128i c16_n88_p38
= _mm_set1_epi32(0xFFA80026); //column 11
746 const __m128i c16_n04_p73
= _mm_set1_epi32(0xFFFC0049);
747 const __m128i c16_p90_n67
= _mm_set1_epi32(0x005AFFBD);
748 const __m128i c16_n31_n46
= _mm_set1_epi32(0xFFE1FFD2);
749 const __m128i c16_n78_p85
= _mm_set1_epi32(0xFFB20055);
750 const __m128i c16_p61_p13
= _mm_set1_epi32(0x003D000D);
751 const __m128i c16_p54_n90
= _mm_set1_epi32(0x0036FFA6);
752 const __m128i c16_n82_p22
= _mm_set1_epi32(0xFFAE0016);
753 const __m128i c16_n78_p31
= _mm_set1_epi32(0xFFB2001F); //column 12
754 const __m128i c16_n61_p90
= _mm_set1_epi32(0xFFC3005A);
755 const __m128i c16_p54_p04
= _mm_set1_epi32(0x00360004);
756 const __m128i c16_p82_n88
= _mm_set1_epi32(0x0052FFA8);
757 const __m128i c16_n22_n38
= _mm_set1_epi32(0xFFEAFFDA);
758 const __m128i c16_n90_p73
= _mm_set1_epi32(0xFFA60049);
759 const __m128i c16_n13_p67
= _mm_set1_epi32(0xFFF30043);
760 const __m128i c16_p85_n46
= _mm_set1_epi32(0x0055FFD2);
761 const __m128i c16_n61_p22
= _mm_set1_epi32(0xFFC30016); //column 13
762 const __m128i c16_n90_p85
= _mm_set1_epi32(0xFFA60055);
763 const __m128i c16_n38_p73
= _mm_set1_epi32(0xFFDA0049);
764 const __m128i c16_p46_n04
= _mm_set1_epi32(0x002EFFFC);
765 const __m128i c16_p90_n78
= _mm_set1_epi32(0x005AFFB2);
766 const __m128i c16_p54_n82
= _mm_set1_epi32(0x0036FFAE);
767 const __m128i c16_n31_n13
= _mm_set1_epi32(0xFFE1FFF3);
768 const __m128i c16_n88_p67
= _mm_set1_epi32(0xFFA80043);
769 const __m128i c16_n38_p13
= _mm_set1_epi32(0xFFDA000D); //column 14
770 const __m128i c16_n78_p61
= _mm_set1_epi32(0xFFB2003D);
771 const __m128i c16_n90_p88
= _mm_set1_epi32(0xFFA60058);
772 const __m128i c16_n73_p85
= _mm_set1_epi32(0xFFB70055);
773 const __m128i c16_n31_p54
= _mm_set1_epi32(0xFFE10036);
774 const __m128i c16_p22_p04
= _mm_set1_epi32(0x00160004);
775 const __m128i c16_p67_n46
= _mm_set1_epi32(0x0043FFD2);
776 const __m128i c16_p90_n82
= _mm_set1_epi32(0x005AFFAE);
777 const __m128i c16_n13_p04
= _mm_set1_epi32(0xFFF30004); //column 15
778 const __m128i c16_n31_p22
= _mm_set1_epi32(0xFFE10016);
779 const __m128i c16_n46_p38
= _mm_set1_epi32(0xFFD20026);
780 const __m128i c16_n61_p54
= _mm_set1_epi32(0xFFC30036);
781 const __m128i c16_n73_p67
= _mm_set1_epi32(0xFFB70043);
782 const __m128i c16_n82_p78
= _mm_set1_epi32(0xFFAE004E);
783 const __m128i c16_n88_p85
= _mm_set1_epi32(0xFFA80055);
784 const __m128i c16_n90_p90
= _mm_set1_epi32(0xFFA6005A);
787 const __m128i c16_p87_p90
= _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
788 const __m128i c16_p70_p80
= _mm_set1_epi32(0x00460050);
789 const __m128i c16_p43_p57
= _mm_set1_epi32(0x002B0039);
790 const __m128i c16_p09_p25
= _mm_set1_epi32(0x00090019);
791 const __m128i c16_p57_p87
= _mm_set1_epi32(0x00390057); //row1
792 const __m128i c16_n43_p09
= _mm_set1_epi32(0xFFD50009);
793 const __m128i c16_n90_n80
= _mm_set1_epi32(0xFFA6FFB0);
794 const __m128i c16_n25_n70
= _mm_set1_epi32(0xFFE7FFBA);
795 const __m128i c16_p09_p80
= _mm_set1_epi32(0x00090050); //row2
796 const __m128i c16_n87_n70
= _mm_set1_epi32(0xFFA9FFBA);
797 const __m128i c16_p57_n25
= _mm_set1_epi32(0x0039FFE7);
798 const __m128i c16_p43_p90
= _mm_set1_epi32(0x002B005A);
799 const __m128i c16_n43_p70
= _mm_set1_epi32(0xFFD50046); //row3
800 const __m128i c16_p09_n87
= _mm_set1_epi32(0x0009FFA9);
801 const __m128i c16_p25_p90
= _mm_set1_epi32(0x0019005A);
802 const __m128i c16_n57_n80
= _mm_set1_epi32(0xFFC7FFB0);
803 const __m128i c16_n80_p57
= _mm_set1_epi32(0xFFB00039); //row4
804 const __m128i c16_p90_n25
= _mm_set1_epi32(0x005AFFE7);
805 const __m128i c16_n87_n09
= _mm_set1_epi32(0xFFA9FFF7);
806 const __m128i c16_p70_p43
= _mm_set1_epi32(0x0046002B);
807 const __m128i c16_n90_p43
= _mm_set1_epi32(0xFFA6002B); //row5
808 const __m128i c16_p25_p57
= _mm_set1_epi32(0x00190039);
809 const __m128i c16_p70_n87
= _mm_set1_epi32(0x0046FFA9);
810 const __m128i c16_n80_p09
= _mm_set1_epi32(0xFFB00009);
811 const __m128i c16_n70_p25
= _mm_set1_epi32(0xFFBA0019); //row6
812 const __m128i c16_n80_p90
= _mm_set1_epi32(0xFFB0005A);
813 const __m128i c16_p09_p43
= _mm_set1_epi32(0x0009002B);
814 const __m128i c16_p87_n57
= _mm_set1_epi32(0x0057FFC7);
815 const __m128i c16_n25_p09
= _mm_set1_epi32(0xFFE70009); //row7
816 const __m128i c16_n57_p43
= _mm_set1_epi32(0xFFC7002B);
817 const __m128i c16_n80_p70
= _mm_set1_epi32(0xFFB00046);
818 const __m128i c16_n90_p87
= _mm_set1_epi32(0xFFA60057);
820 const __m128i c16_p75_p89
= _mm_set1_epi32(0x004B0059);
821 const __m128i c16_p18_p50
= _mm_set1_epi32(0x00120032);
822 const __m128i c16_n18_p75
= _mm_set1_epi32(0xFFEE004B);
823 const __m128i c16_n50_n89
= _mm_set1_epi32(0xFFCEFFA7);
824 const __m128i c16_n89_p50
= _mm_set1_epi32(0xFFA70032);
825 const __m128i c16_p75_p18
= _mm_set1_epi32(0x004B0012);
826 const __m128i c16_n50_p18
= _mm_set1_epi32(0xFFCE0012);
827 const __m128i c16_n89_p75
= _mm_set1_epi32(0xFFA7004B);
829 const __m128i c16_p36_p83
= _mm_set1_epi32(0x00240053);
830 const __m128i c16_n83_p36
= _mm_set1_epi32(0xFFAD0024);
832 const __m128i c16_n64_p64
= _mm_set1_epi32(0xFFC00040);
833 const __m128i c16_p64_p64
= _mm_set1_epi32(0x00400040);
834 __m128i c32_rnd
= _mm_set1_epi32(64);
839 __m128i in00
[4], in01
[4], in02
[4], in03
[4], in04
[4], in05
[4], in06
[4], in07
[4], in08
[4], in09
[4], in10
[4], in11
[4], in12
[4], in13
[4], in14
[4], in15
[4];
840 __m128i in16
[4], in17
[4], in18
[4], in19
[4], in20
[4], in21
[4], in22
[4], in23
[4], in24
[4], in25
[4], in26
[4], in27
[4], in28
[4], in29
[4], in30
[4], in31
[4];
841 __m128i res00
[4], res01
[4], res02
[4], res03
[4], res04
[4], res05
[4], res06
[4], res07
[4], res08
[4], res09
[4], res10
[4], res11
[4], res12
[4], res13
[4], res14
[4], res15
[4];
842 __m128i res16
[4], res17
[4], res18
[4], res19
[4], res20
[4], res21
[4], res22
[4], res23
[4], res24
[4], res25
[4], res26
[4], res27
[4], res28
[4], res29
[4], res30
[4], res31
[4];
844 for (int i
= 0; i
< 4; i
++)
846 const int offset
= (i
<< 3);
847 in00
[i
] = _mm_loadu_si128((const __m128i
*)&src
[0 * 32 + offset
]);
848 in01
[i
] = _mm_loadu_si128((const __m128i
*)&src
[1 * 32 + offset
]);
849 in02
[i
] = _mm_loadu_si128((const __m128i
*)&src
[2 * 32 + offset
]);
850 in03
[i
] = _mm_loadu_si128((const __m128i
*)&src
[3 * 32 + offset
]);
851 in04
[i
] = _mm_loadu_si128((const __m128i
*)&src
[4 * 32 + offset
]);
852 in05
[i
] = _mm_loadu_si128((const __m128i
*)&src
[5 * 32 + offset
]);
853 in06
[i
] = _mm_loadu_si128((const __m128i
*)&src
[6 * 32 + offset
]);
854 in07
[i
] = _mm_loadu_si128((const __m128i
*)&src
[7 * 32 + offset
]);
855 in08
[i
] = _mm_loadu_si128((const __m128i
*)&src
[8 * 32 + offset
]);
856 in09
[i
] = _mm_loadu_si128((const __m128i
*)&src
[9 * 32 + offset
]);
857 in10
[i
] = _mm_loadu_si128((const __m128i
*)&src
[10 * 32 + offset
]);
858 in11
[i
] = _mm_loadu_si128((const __m128i
*)&src
[11 * 32 + offset
]);
859 in12
[i
] = _mm_loadu_si128((const __m128i
*)&src
[12 * 32 + offset
]);
860 in13
[i
] = _mm_loadu_si128((const __m128i
*)&src
[13 * 32 + offset
]);
861 in14
[i
] = _mm_loadu_si128((const __m128i
*)&src
[14 * 32 + offset
]);
862 in15
[i
] = _mm_loadu_si128((const __m128i
*)&src
[15 * 32 + offset
]);
863 in16
[i
] = _mm_loadu_si128((const __m128i
*)&src
[16 * 32 + offset
]);
864 in17
[i
] = _mm_loadu_si128((const __m128i
*)&src
[17 * 32 + offset
]);
865 in18
[i
] = _mm_loadu_si128((const __m128i
*)&src
[18 * 32 + offset
]);
866 in19
[i
] = _mm_loadu_si128((const __m128i
*)&src
[19 * 32 + offset
]);
867 in20
[i
] = _mm_loadu_si128((const __m128i
*)&src
[20 * 32 + offset
]);
868 in21
[i
] = _mm_loadu_si128((const __m128i
*)&src
[21 * 32 + offset
]);
869 in22
[i
] = _mm_loadu_si128((const __m128i
*)&src
[22 * 32 + offset
]);
870 in23
[i
] = _mm_loadu_si128((const __m128i
*)&src
[23 * 32 + offset
]);
871 in24
[i
] = _mm_loadu_si128((const __m128i
*)&src
[24 * 32 + offset
]);
872 in25
[i
] = _mm_loadu_si128((const __m128i
*)&src
[25 * 32 + offset
]);
873 in26
[i
] = _mm_loadu_si128((const __m128i
*)&src
[26 * 32 + offset
]);
874 in27
[i
] = _mm_loadu_si128((const __m128i
*)&src
[27 * 32 + offset
]);
875 in28
[i
] = _mm_loadu_si128((const __m128i
*)&src
[28 * 32 + offset
]);
876 in29
[i
] = _mm_loadu_si128((const __m128i
*)&src
[29 * 32 + offset
]);
877 in30
[i
] = _mm_loadu_si128((const __m128i
*)&src
[30 * 32 + offset
]);
878 in31
[i
] = _mm_loadu_si128((const __m128i
*)&src
[31 * 32 + offset
]);
881 for (int pass
= 0; pass
< 2; pass
++)
885 c32_rnd
= _mm_set1_epi32(2048);
889 for (int part
= 0; part
< 4; part
++)
891 const __m128i T_00_00A
= _mm_unpacklo_epi16(in01
[part
], in03
[part
]); // [33 13 32 12 31 11 30 10]
892 const __m128i T_00_00B
= _mm_unpackhi_epi16(in01
[part
], in03
[part
]); // [37 17 36 16 35 15 34 14]
893 const __m128i T_00_01A
= _mm_unpacklo_epi16(in05
[part
], in07
[part
]); // [ ]
894 const __m128i T_00_01B
= _mm_unpackhi_epi16(in05
[part
], in07
[part
]); // [ ]
895 const __m128i T_00_02A
= _mm_unpacklo_epi16(in09
[part
], in11
[part
]); // [ ]
896 const __m128i T_00_02B
= _mm_unpackhi_epi16(in09
[part
], in11
[part
]); // [ ]
897 const __m128i T_00_03A
= _mm_unpacklo_epi16(in13
[part
], in15
[part
]); // [ ]
898 const __m128i T_00_03B
= _mm_unpackhi_epi16(in13
[part
], in15
[part
]); // [ ]
899 const __m128i T_00_04A
= _mm_unpacklo_epi16(in17
[part
], in19
[part
]); // [ ]
900 const __m128i T_00_04B
= _mm_unpackhi_epi16(in17
[part
], in19
[part
]); // [ ]
901 const __m128i T_00_05A
= _mm_unpacklo_epi16(in21
[part
], in23
[part
]); // [ ]
902 const __m128i T_00_05B
= _mm_unpackhi_epi16(in21
[part
], in23
[part
]); // [ ]
903 const __m128i T_00_06A
= _mm_unpacklo_epi16(in25
[part
], in27
[part
]); // [ ]
904 const __m128i T_00_06B
= _mm_unpackhi_epi16(in25
[part
], in27
[part
]); // [ ]
905 const __m128i T_00_07A
= _mm_unpacklo_epi16(in29
[part
], in31
[part
]); //
906 const __m128i T_00_07B
= _mm_unpackhi_epi16(in29
[part
], in31
[part
]); // [ ]
908 const __m128i T_00_08A
= _mm_unpacklo_epi16(in02
[part
], in06
[part
]); // [ ]
909 const __m128i T_00_08B
= _mm_unpackhi_epi16(in02
[part
], in06
[part
]); // [ ]
910 const __m128i T_00_09A
= _mm_unpacklo_epi16(in10
[part
], in14
[part
]); // [ ]
911 const __m128i T_00_09B
= _mm_unpackhi_epi16(in10
[part
], in14
[part
]); // [ ]
912 const __m128i T_00_10A
= _mm_unpacklo_epi16(in18
[part
], in22
[part
]); // [ ]
913 const __m128i T_00_10B
= _mm_unpackhi_epi16(in18
[part
], in22
[part
]); // [ ]
914 const __m128i T_00_11A
= _mm_unpacklo_epi16(in26
[part
], in30
[part
]); // [ ]
915 const __m128i T_00_11B
= _mm_unpackhi_epi16(in26
[part
], in30
[part
]); // [ ]
917 const __m128i T_00_12A
= _mm_unpacklo_epi16(in04
[part
], in12
[part
]); // [ ]
918 const __m128i T_00_12B
= _mm_unpackhi_epi16(in04
[part
], in12
[part
]); // [ ]
919 const __m128i T_00_13A
= _mm_unpacklo_epi16(in20
[part
], in28
[part
]); // [ ]
920 const __m128i T_00_13B
= _mm_unpackhi_epi16(in20
[part
], in28
[part
]); // [ ]
922 const __m128i T_00_14A
= _mm_unpacklo_epi16(in08
[part
], in24
[part
]); //
923 const __m128i T_00_14B
= _mm_unpackhi_epi16(in08
[part
], in24
[part
]); // [ ]
924 const __m128i T_00_15A
= _mm_unpacklo_epi16(in00
[part
], in16
[part
]); //
925 const __m128i T_00_15B
= _mm_unpackhi_epi16(in00
[part
], in16
[part
]); // [ ]
927 __m128i O00A
, O01A
, O02A
, O03A
, O04A
, O05A
, O06A
, O07A
, O08A
, O09A
, O10A
, O11A
, O12A
, O13A
, O14A
, O15A
;
928 __m128i O00B
, O01B
, O02B
, O03B
, O04B
, O05B
, O06B
, O07B
, O08B
, O09B
, O10B
, O11B
, O12B
, O13B
, O14B
, O15B
;
930 __m128i T00
, T01
, T02
, T03
;
931 #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \
932 T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \
933 T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \
934 T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \
935 T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \
936 row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03));
938 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
939 c16_p90_p90
, c16_p85_p88
, c16_p78_p82
, c16_p67_p73
, c16_p54_p61
, c16_p38_p46
, c16_p22_p31
, c16_p04_p13
, O00A
)
940 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
941 c16_p82_p90
, c16_p46_p67
, c16_n04_p22
, c16_n54_n31
, c16_n85_n73
, c16_n88_n90
, c16_n61_n78
, c16_n13_n38
, O01A
)
942 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
943 c16_p67_p88
, c16_n13_p31
, c16_n82_n54
, c16_n78_n90
, c16_n04_n46
, c16_p73_p38
, c16_p85_p90
, c16_p22_p61
, O02A
)
944 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
945 c16_p46_p85
, c16_n67_n13
, c16_n73_n90
, c16_p38_n22
, c16_p88_p82
, c16_n04_p54
, c16_n90_n61
, c16_n31_n78
, O03A
)
946 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
947 c16_p22_p82
, c16_n90_n54
, c16_p13_n61
, c16_p85_p78
, c16_n46_p31
, c16_n67_n90
, c16_p73_p04
, c16_p38_p88
, O04A
)
948 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
949 c16_n04_p78
, c16_n73_n82
, c16_p85_p13
, c16_n22_p67
, c16_n61_n88
, c16_p90_p31
, c16_n38_p54
, c16_n46_n90
, O05A
)
950 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
951 c16_n31_p73
, c16_n22_n90
, c16_p67_p78
, c16_n90_n38
, c16_p82_n13
, c16_n46_p61
, c16_n04_n88
, c16_p54_p85
, O06A
)
952 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
953 c16_n54_p67
, c16_p38_n78
, c16_n22_p85
, c16_p04_n90
, c16_p13_p90
, c16_n31_n88
, c16_p46_p82
, c16_n61_n73
, O07A
)
954 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
955 c16_n73_p61
, c16_p82_n46
, c16_n88_p31
, c16_p90_n13
, c16_n90_n04
, c16_p85_p22
, c16_n78_n38
, c16_p67_p54
, O08A
)
956 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
957 c16_n85_p54
, c16_p88_n04
, c16_n61_n46
, c16_p13_p82
, c16_p38_n90
, c16_n78_p67
, c16_p90_n22
, c16_n73_n31
, O09A
)
958 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
959 c16_n90_p46
, c16_p54_p38
, c16_p31_n90
, c16_n88_p61
, c16_p67_p22
, c16_p13_n85
, c16_n82_p73
, c16_p78_p04
, O10A
)
960 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
961 c16_n88_p38
, c16_n04_p73
, c16_p90_n67
, c16_n31_n46
, c16_n78_p85
, c16_p61_p13
, c16_p54_n90
, c16_n82_p22
, O11A
)
962 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
963 c16_n78_p31
, c16_n61_p90
, c16_p54_p04
, c16_p82_n88
, c16_n22_n38
, c16_n90_p73
, c16_n13_p67
, c16_p85_n46
, O12A
)
964 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
965 c16_n61_p22
, c16_n90_p85
, c16_n38_p73
, c16_p46_n04
, c16_p90_n78
, c16_p54_n82
, c16_n31_n13
, c16_n88_p67
, O13A
)
966 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
967 c16_n38_p13
, c16_n78_p61
, c16_n90_p88
, c16_n73_p85
, c16_n31_p54
, c16_p22_p04
, c16_p67_n46
, c16_p90_n82
, O14A
)
968 COMPUTE_ROW(T_00_00A
, T_00_01A
, T_00_02A
, T_00_03A
, T_00_04A
, T_00_05A
, T_00_06A
, T_00_07A
, \
969 c16_n13_p04
, c16_n31_p22
, c16_n46_p38
, c16_n61_p54
, c16_n73_p67
, c16_n82_p78
, c16_n88_p85
, c16_n90_p90
, O15A
)
971 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
972 c16_p90_p90
, c16_p85_p88
, c16_p78_p82
, c16_p67_p73
, c16_p54_p61
, c16_p38_p46
, c16_p22_p31
, c16_p04_p13
, O00B
)
973 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
974 c16_p82_p90
, c16_p46_p67
, c16_n04_p22
, c16_n54_n31
, c16_n85_n73
, c16_n88_n90
, c16_n61_n78
, c16_n13_n38
, O01B
)
975 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
976 c16_p67_p88
, c16_n13_p31
, c16_n82_n54
, c16_n78_n90
, c16_n04_n46
, c16_p73_p38
, c16_p85_p90
, c16_p22_p61
, O02B
)
977 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
978 c16_p46_p85
, c16_n67_n13
, c16_n73_n90
, c16_p38_n22
, c16_p88_p82
, c16_n04_p54
, c16_n90_n61
, c16_n31_n78
, O03B
)
979 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
980 c16_p22_p82
, c16_n90_n54
, c16_p13_n61
, c16_p85_p78
, c16_n46_p31
, c16_n67_n90
, c16_p73_p04
, c16_p38_p88
, O04B
)
981 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
982 c16_n04_p78
, c16_n73_n82
, c16_p85_p13
, c16_n22_p67
, c16_n61_n88
, c16_p90_p31
, c16_n38_p54
, c16_n46_n90
, O05B
)
983 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
984 c16_n31_p73
, c16_n22_n90
, c16_p67_p78
, c16_n90_n38
, c16_p82_n13
, c16_n46_p61
, c16_n04_n88
, c16_p54_p85
, O06B
)
985 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
986 c16_n54_p67
, c16_p38_n78
, c16_n22_p85
, c16_p04_n90
, c16_p13_p90
, c16_n31_n88
, c16_p46_p82
, c16_n61_n73
, O07B
)
987 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
988 c16_n73_p61
, c16_p82_n46
, c16_n88_p31
, c16_p90_n13
, c16_n90_n04
, c16_p85_p22
, c16_n78_n38
, c16_p67_p54
, O08B
)
989 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
990 c16_n85_p54
, c16_p88_n04
, c16_n61_n46
, c16_p13_p82
, c16_p38_n90
, c16_n78_p67
, c16_p90_n22
, c16_n73_n31
, O09B
)
991 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
992 c16_n90_p46
, c16_p54_p38
, c16_p31_n90
, c16_n88_p61
, c16_p67_p22
, c16_p13_n85
, c16_n82_p73
, c16_p78_p04
, O10B
)
993 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
994 c16_n88_p38
, c16_n04_p73
, c16_p90_n67
, c16_n31_n46
, c16_n78_p85
, c16_p61_p13
, c16_p54_n90
, c16_n82_p22
, O11B
)
995 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
996 c16_n78_p31
, c16_n61_p90
, c16_p54_p04
, c16_p82_n88
, c16_n22_n38
, c16_n90_p73
, c16_n13_p67
, c16_p85_n46
, O12B
)
997 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
998 c16_n61_p22
, c16_n90_p85
, c16_n38_p73
, c16_p46_n04
, c16_p90_n78
, c16_p54_n82
, c16_n31_n13
, c16_n88_p67
, O13B
)
999 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1000 c16_n38_p13
, c16_n78_p61
, c16_n90_p88
, c16_n73_p85
, c16_n31_p54
, c16_p22_p04
, c16_p67_n46
, c16_p90_n82
, O14B
)
1001 COMPUTE_ROW(T_00_00B
, T_00_01B
, T_00_02B
, T_00_03B
, T_00_04B
, T_00_05B
, T_00_06B
, T_00_07B
, \
1002 c16_n13_p04
, c16_n31_p22
, c16_n46_p38
, c16_n61_p54
, c16_n73_p67
, c16_n82_p78
, c16_n88_p85
, c16_n90_p90
, O15B
)
1007 __m128i EO0A
, EO1A
, EO2A
, EO3A
, EO4A
, EO5A
, EO6A
, EO7A
;
1008 __m128i EO0B
, EO1B
, EO2B
, EO3B
, EO4B
, EO5B
, EO6B
, EO7B
;
1011 #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \
1012 T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \
1013 T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \
1014 row = _mm_add_epi32(T00, T01);
1016 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, EO0A
)
1017 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, EO1A
)
1018 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, EO2A
)
1019 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, EO3A
)
1020 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, EO4A
)
1021 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, EO5A
)
1022 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, EO6A
)
1023 COMPUTE_ROW(T_00_08A
, T_00_09A
, T_00_10A
, T_00_11A
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, EO7A
)
1025 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_p87_p90
, c16_p70_p80
, c16_p43_p57
, c16_p09_p25
, EO0B
)
1026 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_p57_p87
, c16_n43_p09
, c16_n90_n80
, c16_n25_n70
, EO1B
)
1027 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_p09_p80
, c16_n87_n70
, c16_p57_n25
, c16_p43_p90
, EO2B
)
1028 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n43_p70
, c16_p09_n87
, c16_p25_p90
, c16_n57_n80
, EO3B
)
1029 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n80_p57
, c16_p90_n25
, c16_n87_n09
, c16_p70_p43
, EO4B
)
1030 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n90_p43
, c16_p25_p57
, c16_p70_n87
, c16_n80_p09
, EO5B
)
1031 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n70_p25
, c16_n80_p90
, c16_p09_p43
, c16_p87_n57
, EO6B
)
1032 COMPUTE_ROW(T_00_08B
, T_00_09B
, T_00_10B
, T_00_11B
, c16_n25_p09
, c16_n57_p43
, c16_n80_p70
, c16_n90_p87
, EO7B
)
1036 const __m128i EEO0A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_p75_p89
), _mm_madd_epi16(T_00_13A
, c16_p18_p50
)); // EEO0
1037 const __m128i EEO0B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_p75_p89
), _mm_madd_epi16(T_00_13B
, c16_p18_p50
));
1038 const __m128i EEO1A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_n18_p75
), _mm_madd_epi16(T_00_13A
, c16_n50_n89
)); // EEO1
1039 const __m128i EEO1B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_n18_p75
), _mm_madd_epi16(T_00_13B
, c16_n50_n89
));
1040 const __m128i EEO2A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_n89_p50
), _mm_madd_epi16(T_00_13A
, c16_p75_p18
)); // EEO2
1041 const __m128i EEO2B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_n89_p50
), _mm_madd_epi16(T_00_13B
, c16_p75_p18
));
1042 const __m128i EEO3A
= _mm_add_epi32(_mm_madd_epi16(T_00_12A
, c16_n50_p18
), _mm_madd_epi16(T_00_13A
, c16_n89_p75
)); // EEO3
1043 const __m128i EEO3B
= _mm_add_epi32(_mm_madd_epi16(T_00_12B
, c16_n50_p18
), _mm_madd_epi16(T_00_13B
, c16_n89_p75
));
1045 const __m128i EEEO0A
= _mm_madd_epi16(T_00_14A
, c16_p36_p83
);
1046 const __m128i EEEO0B
= _mm_madd_epi16(T_00_14B
, c16_p36_p83
);
1047 const __m128i EEEO1A
= _mm_madd_epi16(T_00_14A
, c16_n83_p36
);
1048 const __m128i EEEO1B
= _mm_madd_epi16(T_00_14B
, c16_n83_p36
);
1050 const __m128i EEEE0A
= _mm_madd_epi16(T_00_15A
, c16_p64_p64
);
1051 const __m128i EEEE0B
= _mm_madd_epi16(T_00_15B
, c16_p64_p64
);
1052 const __m128i EEEE1A
= _mm_madd_epi16(T_00_15A
, c16_n64_p64
);
1053 const __m128i EEEE1B
= _mm_madd_epi16(T_00_15B
, c16_n64_p64
);
1055 const __m128i EEE0A
= _mm_add_epi32(EEEE0A
, EEEO0A
); // EEE0 = EEEE0 + EEEO0
1056 const __m128i EEE0B
= _mm_add_epi32(EEEE0B
, EEEO0B
);
1057 const __m128i EEE1A
= _mm_add_epi32(EEEE1A
, EEEO1A
); // EEE1 = EEEE1 + EEEO1
1058 const __m128i EEE1B
= _mm_add_epi32(EEEE1B
, EEEO1B
);
1059 const __m128i EEE3A
= _mm_sub_epi32(EEEE0A
, EEEO0A
); // EEE3 = EEEE0 - EEEO0
1060 const __m128i EEE3B
= _mm_sub_epi32(EEEE0B
, EEEO0B
);
1061 const __m128i EEE2A
= _mm_sub_epi32(EEEE1A
, EEEO1A
); // EEE2 = EEEE1 - EEEO1
1062 const __m128i EEE2B
= _mm_sub_epi32(EEEE1B
, EEEO1B
);
1064 const __m128i EE0A
= _mm_add_epi32(EEE0A
, EEO0A
); // EE0 = EEE0 + EEO0
1065 const __m128i EE0B
= _mm_add_epi32(EEE0B
, EEO0B
);
1066 const __m128i EE1A
= _mm_add_epi32(EEE1A
, EEO1A
); // EE1 = EEE1 + EEO1
1067 const __m128i EE1B
= _mm_add_epi32(EEE1B
, EEO1B
);
1068 const __m128i EE2A
= _mm_add_epi32(EEE2A
, EEO2A
); // EE2 = EEE2 + EEO2
1069 const __m128i EE2B
= _mm_add_epi32(EEE2B
, EEO2B
);
1070 const __m128i EE3A
= _mm_add_epi32(EEE3A
, EEO3A
); // EE3 = EEE3 + EEO3
1071 const __m128i EE3B
= _mm_add_epi32(EEE3B
, EEO3B
);
1072 const __m128i EE7A
= _mm_sub_epi32(EEE0A
, EEO0A
); // EE7 = EEE0 - EEO0
1073 const __m128i EE7B
= _mm_sub_epi32(EEE0B
, EEO0B
);
1074 const __m128i EE6A
= _mm_sub_epi32(EEE1A
, EEO1A
); // EE6 = EEE1 - EEO1
1075 const __m128i EE6B
= _mm_sub_epi32(EEE1B
, EEO1B
);
1076 const __m128i EE5A
= _mm_sub_epi32(EEE2A
, EEO2A
); // EE5 = EEE2 - EEO2
1077 const __m128i EE5B
= _mm_sub_epi32(EEE2B
, EEO2B
);
1078 const __m128i EE4A
= _mm_sub_epi32(EEE3A
, EEO3A
); // EE4 = EEE3 - EEO3
1079 const __m128i EE4B
= _mm_sub_epi32(EEE3B
, EEO3B
);
1081 const __m128i E0A
= _mm_add_epi32(EE0A
, EO0A
); // E0 = EE0 + EO0
1082 const __m128i E0B
= _mm_add_epi32(EE0B
, EO0B
);
1083 const __m128i E1A
= _mm_add_epi32(EE1A
, EO1A
); // E1 = EE1 + EO1
1084 const __m128i E1B
= _mm_add_epi32(EE1B
, EO1B
);
1085 const __m128i E2A
= _mm_add_epi32(EE2A
, EO2A
); // E2 = EE2 + EO2
1086 const __m128i E2B
= _mm_add_epi32(EE2B
, EO2B
);
1087 const __m128i E3A
= _mm_add_epi32(EE3A
, EO3A
); // E3 = EE3 + EO3
1088 const __m128i E3B
= _mm_add_epi32(EE3B
, EO3B
);
1089 const __m128i E4A
= _mm_add_epi32(EE4A
, EO4A
); // E4 = EE4 + EO4
1090 const __m128i E4B
= _mm_add_epi32(EE4B
, EO4B
);
1091 const __m128i E5A
= _mm_add_epi32(EE5A
, EO5A
); // E5 = EE5 + EO5
1092 const __m128i E5B
= _mm_add_epi32(EE5B
, EO5B
);
1093 const __m128i E6A
= _mm_add_epi32(EE6A
, EO6A
); // E6 = EE6 + EO6
1094 const __m128i E6B
= _mm_add_epi32(EE6B
, EO6B
);
1095 const __m128i E7A
= _mm_add_epi32(EE7A
, EO7A
); // E7 = EE7 + EO7
1096 const __m128i E7B
= _mm_add_epi32(EE7B
, EO7B
);
1097 const __m128i EFA
= _mm_sub_epi32(EE0A
, EO0A
); // EF = EE0 - EO0
1098 const __m128i EFB
= _mm_sub_epi32(EE0B
, EO0B
);
1099 const __m128i EEA
= _mm_sub_epi32(EE1A
, EO1A
); // EE = EE1 - EO1
1100 const __m128i EEB
= _mm_sub_epi32(EE1B
, EO1B
);
1101 const __m128i EDA
= _mm_sub_epi32(EE2A
, EO2A
); // ED = EE2 - EO2
1102 const __m128i EDB
= _mm_sub_epi32(EE2B
, EO2B
);
1103 const __m128i ECA
= _mm_sub_epi32(EE3A
, EO3A
); // EC = EE3 - EO3
1104 const __m128i ECB
= _mm_sub_epi32(EE3B
, EO3B
);
1105 const __m128i EBA
= _mm_sub_epi32(EE4A
, EO4A
); // EB = EE4 - EO4
1106 const __m128i EBB
= _mm_sub_epi32(EE4B
, EO4B
);
1107 const __m128i EAA
= _mm_sub_epi32(EE5A
, EO5A
); // EA = EE5 - EO5
1108 const __m128i EAB
= _mm_sub_epi32(EE5B
, EO5B
);
1109 const __m128i E9A
= _mm_sub_epi32(EE6A
, EO6A
); // E9 = EE6 - EO6
1110 const __m128i E9B
= _mm_sub_epi32(EE6B
, EO6B
);
1111 const __m128i E8A
= _mm_sub_epi32(EE7A
, EO7A
); // E8 = EE7 - EO7
1112 const __m128i E8B
= _mm_sub_epi32(EE7B
, EO7B
);
1114 const __m128i T10A
= _mm_add_epi32(E0A
, c32_rnd
); // E0 + rnd
1115 const __m128i T10B
= _mm_add_epi32(E0B
, c32_rnd
);
1116 const __m128i T11A
= _mm_add_epi32(E1A
, c32_rnd
); // E1 + rnd
1117 const __m128i T11B
= _mm_add_epi32(E1B
, c32_rnd
);
1118 const __m128i T12A
= _mm_add_epi32(E2A
, c32_rnd
); // E2 + rnd
1119 const __m128i T12B
= _mm_add_epi32(E2B
, c32_rnd
);
1120 const __m128i T13A
= _mm_add_epi32(E3A
, c32_rnd
); // E3 + rnd
1121 const __m128i T13B
= _mm_add_epi32(E3B
, c32_rnd
);
1122 const __m128i T14A
= _mm_add_epi32(E4A
, c32_rnd
); // E4 + rnd
1123 const __m128i T14B
= _mm_add_epi32(E4B
, c32_rnd
);
1124 const __m128i T15A
= _mm_add_epi32(E5A
, c32_rnd
); // E5 + rnd
1125 const __m128i T15B
= _mm_add_epi32(E5B
, c32_rnd
);
1126 const __m128i T16A
= _mm_add_epi32(E6A
, c32_rnd
); // E6 + rnd
1127 const __m128i T16B
= _mm_add_epi32(E6B
, c32_rnd
);
1128 const __m128i T17A
= _mm_add_epi32(E7A
, c32_rnd
); // E7 + rnd
1129 const __m128i T17B
= _mm_add_epi32(E7B
, c32_rnd
);
1130 const __m128i T18A
= _mm_add_epi32(E8A
, c32_rnd
); // E8 + rnd
1131 const __m128i T18B
= _mm_add_epi32(E8B
, c32_rnd
);
1132 const __m128i T19A
= _mm_add_epi32(E9A
, c32_rnd
); // E9 + rnd
1133 const __m128i T19B
= _mm_add_epi32(E9B
, c32_rnd
);
1134 const __m128i T1AA
= _mm_add_epi32(EAA
, c32_rnd
); // E10 + rnd
1135 const __m128i T1AB
= _mm_add_epi32(EAB
, c32_rnd
);
1136 const __m128i T1BA
= _mm_add_epi32(EBA
, c32_rnd
); // E11 + rnd
1137 const __m128i T1BB
= _mm_add_epi32(EBB
, c32_rnd
);
1138 const __m128i T1CA
= _mm_add_epi32(ECA
, c32_rnd
); // E12 + rnd
1139 const __m128i T1CB
= _mm_add_epi32(ECB
, c32_rnd
);
1140 const __m128i T1DA
= _mm_add_epi32(EDA
, c32_rnd
); // E13 + rnd
1141 const __m128i T1DB
= _mm_add_epi32(EDB
, c32_rnd
);
1142 const __m128i T1EA
= _mm_add_epi32(EEA
, c32_rnd
); // E14 + rnd
1143 const __m128i T1EB
= _mm_add_epi32(EEB
, c32_rnd
);
1144 const __m128i T1FA
= _mm_add_epi32(EFA
, c32_rnd
); // E15 + rnd
1145 const __m128i T1FB
= _mm_add_epi32(EFB
, c32_rnd
);
1147 const __m128i T2_00A
= _mm_add_epi32(T10A
, O00A
); // E0 + O0 + rnd
1148 const __m128i T2_00B
= _mm_add_epi32(T10B
, O00B
);
1149 const __m128i T2_01A
= _mm_add_epi32(T11A
, O01A
); // E1 + O1 + rnd
1150 const __m128i T2_01B
= _mm_add_epi32(T11B
, O01B
);
1151 const __m128i T2_02A
= _mm_add_epi32(T12A
, O02A
); // E2 + O2 + rnd
1152 const __m128i T2_02B
= _mm_add_epi32(T12B
, O02B
);
1153 const __m128i T2_03A
= _mm_add_epi32(T13A
, O03A
); // E3 + O3 + rnd
1154 const __m128i T2_03B
= _mm_add_epi32(T13B
, O03B
);
1155 const __m128i T2_04A
= _mm_add_epi32(T14A
, O04A
); // E4
1156 const __m128i T2_04B
= _mm_add_epi32(T14B
, O04B
);
1157 const __m128i T2_05A
= _mm_add_epi32(T15A
, O05A
); // E5
1158 const __m128i T2_05B
= _mm_add_epi32(T15B
, O05B
);
1159 const __m128i T2_06A
= _mm_add_epi32(T16A
, O06A
); // E6
1160 const __m128i T2_06B
= _mm_add_epi32(T16B
, O06B
);
1161 const __m128i T2_07A
= _mm_add_epi32(T17A
, O07A
); // E7
1162 const __m128i T2_07B
= _mm_add_epi32(T17B
, O07B
);
1163 const __m128i T2_08A
= _mm_add_epi32(T18A
, O08A
); // E8
1164 const __m128i T2_08B
= _mm_add_epi32(T18B
, O08B
);
1165 const __m128i T2_09A
= _mm_add_epi32(T19A
, O09A
); // E9
1166 const __m128i T2_09B
= _mm_add_epi32(T19B
, O09B
);
1167 const __m128i T2_10A
= _mm_add_epi32(T1AA
, O10A
); // E10
1168 const __m128i T2_10B
= _mm_add_epi32(T1AB
, O10B
);
1169 const __m128i T2_11A
= _mm_add_epi32(T1BA
, O11A
); // E11
1170 const __m128i T2_11B
= _mm_add_epi32(T1BB
, O11B
);
1171 const __m128i T2_12A
= _mm_add_epi32(T1CA
, O12A
); // E12
1172 const __m128i T2_12B
= _mm_add_epi32(T1CB
, O12B
);
1173 const __m128i T2_13A
= _mm_add_epi32(T1DA
, O13A
); // E13
1174 const __m128i T2_13B
= _mm_add_epi32(T1DB
, O13B
);
1175 const __m128i T2_14A
= _mm_add_epi32(T1EA
, O14A
); // E14
1176 const __m128i T2_14B
= _mm_add_epi32(T1EB
, O14B
);
1177 const __m128i T2_15A
= _mm_add_epi32(T1FA
, O15A
); // E15
1178 const __m128i T2_15B
= _mm_add_epi32(T1FB
, O15B
);
1179 const __m128i T2_31A
= _mm_sub_epi32(T10A
, O00A
); // E0 - O0 + rnd
1180 const __m128i T2_31B
= _mm_sub_epi32(T10B
, O00B
);
1181 const __m128i T2_30A
= _mm_sub_epi32(T11A
, O01A
); // E1 - O1 + rnd
1182 const __m128i T2_30B
= _mm_sub_epi32(T11B
, O01B
);
1183 const __m128i T2_29A
= _mm_sub_epi32(T12A
, O02A
); // E2 - O2 + rnd
1184 const __m128i T2_29B
= _mm_sub_epi32(T12B
, O02B
);
1185 const __m128i T2_28A
= _mm_sub_epi32(T13A
, O03A
); // E3 - O3 + rnd
1186 const __m128i T2_28B
= _mm_sub_epi32(T13B
, O03B
);
1187 const __m128i T2_27A
= _mm_sub_epi32(T14A
, O04A
); // E4
1188 const __m128i T2_27B
= _mm_sub_epi32(T14B
, O04B
);
1189 const __m128i T2_26A
= _mm_sub_epi32(T15A
, O05A
); // E5
1190 const __m128i T2_26B
= _mm_sub_epi32(T15B
, O05B
);
1191 const __m128i T2_25A
= _mm_sub_epi32(T16A
, O06A
); // E6
1192 const __m128i T2_25B
= _mm_sub_epi32(T16B
, O06B
);
1193 const __m128i T2_24A
= _mm_sub_epi32(T17A
, O07A
); // E7
1194 const __m128i T2_24B
= _mm_sub_epi32(T17B
, O07B
);
1195 const __m128i T2_23A
= _mm_sub_epi32(T18A
, O08A
); //
1196 const __m128i T2_23B
= _mm_sub_epi32(T18B
, O08B
);
1197 const __m128i T2_22A
= _mm_sub_epi32(T19A
, O09A
); //
1198 const __m128i T2_22B
= _mm_sub_epi32(T19B
, O09B
);
1199 const __m128i T2_21A
= _mm_sub_epi32(T1AA
, O10A
); //
1200 const __m128i T2_21B
= _mm_sub_epi32(T1AB
, O10B
);
1201 const __m128i T2_20A
= _mm_sub_epi32(T1BA
, O11A
); //
1202 const __m128i T2_20B
= _mm_sub_epi32(T1BB
, O11B
);
1203 const __m128i T2_19A
= _mm_sub_epi32(T1CA
, O12A
); //
1204 const __m128i T2_19B
= _mm_sub_epi32(T1CB
, O12B
);
1205 const __m128i T2_18A
= _mm_sub_epi32(T1DA
, O13A
); //
1206 const __m128i T2_18B
= _mm_sub_epi32(T1DB
, O13B
);
1207 const __m128i T2_17A
= _mm_sub_epi32(T1EA
, O14A
); //
1208 const __m128i T2_17B
= _mm_sub_epi32(T1EB
, O14B
);
1209 const __m128i T2_16A
= _mm_sub_epi32(T1FA
, O15A
); //
1210 const __m128i T2_16B
= _mm_sub_epi32(T1FB
, O15B
);
1212 const __m128i T3_00A
= _mm_srai_epi32(T2_00A
, nShift
); // [30 20 10 00]
1213 const __m128i T3_00B
= _mm_srai_epi32(T2_00B
, nShift
); // [70 60 50 40]
1214 const __m128i T3_01A
= _mm_srai_epi32(T2_01A
, nShift
); // [31 21 11 01]
1215 const __m128i T3_01B
= _mm_srai_epi32(T2_01B
, nShift
); // [71 61 51 41]
1216 const __m128i T3_02A
= _mm_srai_epi32(T2_02A
, nShift
); // [32 22 12 02]
1217 const __m128i T3_02B
= _mm_srai_epi32(T2_02B
, nShift
); // [72 62 52 42]
1218 const __m128i T3_03A
= _mm_srai_epi32(T2_03A
, nShift
); // [33 23 13 03]
1219 const __m128i T3_03B
= _mm_srai_epi32(T2_03B
, nShift
); // [73 63 53 43]
1220 const __m128i T3_04A
= _mm_srai_epi32(T2_04A
, nShift
); // [34 24 14 04]
1221 const __m128i T3_04B
= _mm_srai_epi32(T2_04B
, nShift
); // [74 64 54 44]
1222 const __m128i T3_05A
= _mm_srai_epi32(T2_05A
, nShift
); // [35 25 15 05]
1223 const __m128i T3_05B
= _mm_srai_epi32(T2_05B
, nShift
); // [75 65 55 45]
1224 const __m128i T3_06A
= _mm_srai_epi32(T2_06A
, nShift
); // [36 26 16 06]
1225 const __m128i T3_06B
= _mm_srai_epi32(T2_06B
, nShift
); // [76 66 56 46]
1226 const __m128i T3_07A
= _mm_srai_epi32(T2_07A
, nShift
); // [37 27 17 07]
1227 const __m128i T3_07B
= _mm_srai_epi32(T2_07B
, nShift
); // [77 67 57 47]
1228 const __m128i T3_08A
= _mm_srai_epi32(T2_08A
, nShift
); // [30 20 10 00] x8
1229 const __m128i T3_08B
= _mm_srai_epi32(T2_08B
, nShift
); // [70 60 50 40]
1230 const __m128i T3_09A
= _mm_srai_epi32(T2_09A
, nShift
); // [31 21 11 01] x9
1231 const __m128i T3_09B
= _mm_srai_epi32(T2_09B
, nShift
); // [71 61 51 41]
1232 const __m128i T3_10A
= _mm_srai_epi32(T2_10A
, nShift
); // [32 22 12 02] xA
1233 const __m128i T3_10B
= _mm_srai_epi32(T2_10B
, nShift
); // [72 62 52 42]
1234 const __m128i T3_11A
= _mm_srai_epi32(T2_11A
, nShift
); // [33 23 13 03] xB
1235 const __m128i T3_11B
= _mm_srai_epi32(T2_11B
, nShift
); // [73 63 53 43]
1236 const __m128i T3_12A
= _mm_srai_epi32(T2_12A
, nShift
); // [34 24 14 04] xC
1237 const __m128i T3_12B
= _mm_srai_epi32(T2_12B
, nShift
); // [74 64 54 44]
1238 const __m128i T3_13A
= _mm_srai_epi32(T2_13A
, nShift
); // [35 25 15 05] xD
1239 const __m128i T3_13B
= _mm_srai_epi32(T2_13B
, nShift
); // [75 65 55 45]
1240 const __m128i T3_14A
= _mm_srai_epi32(T2_14A
, nShift
); // [36 26 16 06] xE
1241 const __m128i T3_14B
= _mm_srai_epi32(T2_14B
, nShift
); // [76 66 56 46]
1242 const __m128i T3_15A
= _mm_srai_epi32(T2_15A
, nShift
); // [37 27 17 07] xF
1243 const __m128i T3_15B
= _mm_srai_epi32(T2_15B
, nShift
); // [77 67 57 47]
1245 const __m128i T3_16A
= _mm_srai_epi32(T2_16A
, nShift
); // [30 20 10 00]
1246 const __m128i T3_16B
= _mm_srai_epi32(T2_16B
, nShift
); // [70 60 50 40]
1247 const __m128i T3_17A
= _mm_srai_epi32(T2_17A
, nShift
); // [31 21 11 01]
1248 const __m128i T3_17B
= _mm_srai_epi32(T2_17B
, nShift
); // [71 61 51 41]
1249 const __m128i T3_18A
= _mm_srai_epi32(T2_18A
, nShift
); // [32 22 12 02]
1250 const __m128i T3_18B
= _mm_srai_epi32(T2_18B
, nShift
); // [72 62 52 42]
1251 const __m128i T3_19A
= _mm_srai_epi32(T2_19A
, nShift
); // [33 23 13 03]
1252 const __m128i T3_19B
= _mm_srai_epi32(T2_19B
, nShift
); // [73 63 53 43]
1253 const __m128i T3_20A
= _mm_srai_epi32(T2_20A
, nShift
); // [34 24 14 04]
1254 const __m128i T3_20B
= _mm_srai_epi32(T2_20B
, nShift
); // [74 64 54 44]
1255 const __m128i T3_21A
= _mm_srai_epi32(T2_21A
, nShift
); // [35 25 15 05]
1256 const __m128i T3_21B
= _mm_srai_epi32(T2_21B
, nShift
); // [75 65 55 45]
1257 const __m128i T3_22A
= _mm_srai_epi32(T2_22A
, nShift
); // [36 26 16 06]
1258 const __m128i T3_22B
= _mm_srai_epi32(T2_22B
, nShift
); // [76 66 56 46]
1259 const __m128i T3_23A
= _mm_srai_epi32(T2_23A
, nShift
); // [37 27 17 07]
1260 const __m128i T3_23B
= _mm_srai_epi32(T2_23B
, nShift
); // [77 67 57 47]
1261 const __m128i T3_24A
= _mm_srai_epi32(T2_24A
, nShift
); // [30 20 10 00] x8
1262 const __m128i T3_24B
= _mm_srai_epi32(T2_24B
, nShift
); // [70 60 50 40]
1263 const __m128i T3_25A
= _mm_srai_epi32(T2_25A
, nShift
); // [31 21 11 01] x9
1264 const __m128i T3_25B
= _mm_srai_epi32(T2_25B
, nShift
); // [71 61 51 41]
1265 const __m128i T3_26A
= _mm_srai_epi32(T2_26A
, nShift
); // [32 22 12 02] xA
1266 const __m128i T3_26B
= _mm_srai_epi32(T2_26B
, nShift
); // [72 62 52 42]
1267 const __m128i T3_27A
= _mm_srai_epi32(T2_27A
, nShift
); // [33 23 13 03] xB
1268 const __m128i T3_27B
= _mm_srai_epi32(T2_27B
, nShift
); // [73 63 53 43]
1269 const __m128i T3_28A
= _mm_srai_epi32(T2_28A
, nShift
); // [34 24 14 04] xC
1270 const __m128i T3_28B
= _mm_srai_epi32(T2_28B
, nShift
); // [74 64 54 44]
1271 const __m128i T3_29A
= _mm_srai_epi32(T2_29A
, nShift
); // [35 25 15 05] xD
1272 const __m128i T3_29B
= _mm_srai_epi32(T2_29B
, nShift
); // [75 65 55 45]
1273 const __m128i T3_30A
= _mm_srai_epi32(T2_30A
, nShift
); // [36 26 16 06] xE
1274 const __m128i T3_30B
= _mm_srai_epi32(T2_30B
, nShift
); // [76 66 56 46]
1275 const __m128i T3_31A
= _mm_srai_epi32(T2_31A
, nShift
); // [37 27 17 07] xF
1276 const __m128i T3_31B
= _mm_srai_epi32(T2_31B
, nShift
); // [77 67 57 47]
1278 res00
[part
] = _mm_packs_epi32(T3_00A
, T3_00B
); // [70 60 50 40 30 20 10 00]
1279 res01
[part
] = _mm_packs_epi32(T3_01A
, T3_01B
); // [71 61 51 41 31 21 11 01]
1280 res02
[part
] = _mm_packs_epi32(T3_02A
, T3_02B
); // [72 62 52 42 32 22 12 02]
1281 res03
[part
] = _mm_packs_epi32(T3_03A
, T3_03B
); // [73 63 53 43 33 23 13 03]
1282 res04
[part
] = _mm_packs_epi32(T3_04A
, T3_04B
); // [74 64 54 44 34 24 14 04]
1283 res05
[part
] = _mm_packs_epi32(T3_05A
, T3_05B
); // [75 65 55 45 35 25 15 05]
1284 res06
[part
] = _mm_packs_epi32(T3_06A
, T3_06B
); // [76 66 56 46 36 26 16 06]
1285 res07
[part
] = _mm_packs_epi32(T3_07A
, T3_07B
); // [77 67 57 47 37 27 17 07]
1286 res08
[part
] = _mm_packs_epi32(T3_08A
, T3_08B
); // [A0 ... 80]
1287 res09
[part
] = _mm_packs_epi32(T3_09A
, T3_09B
); // [A1 ... 81]
1288 res10
[part
] = _mm_packs_epi32(T3_10A
, T3_10B
); // [A2 ... 82]
1289 res11
[part
] = _mm_packs_epi32(T3_11A
, T3_11B
); // [A3 ... 83]
1290 res12
[part
] = _mm_packs_epi32(T3_12A
, T3_12B
); // [A4 ... 84]
1291 res13
[part
] = _mm_packs_epi32(T3_13A
, T3_13B
); // [A5 ... 85]
1292 res14
[part
] = _mm_packs_epi32(T3_14A
, T3_14B
); // [A6 ... 86]
1293 res15
[part
] = _mm_packs_epi32(T3_15A
, T3_15B
); // [A7 ... 87]
1294 res16
[part
] = _mm_packs_epi32(T3_16A
, T3_16B
);
1295 res17
[part
] = _mm_packs_epi32(T3_17A
, T3_17B
);
1296 res18
[part
] = _mm_packs_epi32(T3_18A
, T3_18B
);
1297 res19
[part
] = _mm_packs_epi32(T3_19A
, T3_19B
);
1298 res20
[part
] = _mm_packs_epi32(T3_20A
, T3_20B
);
1299 res21
[part
] = _mm_packs_epi32(T3_21A
, T3_21B
);
1300 res22
[part
] = _mm_packs_epi32(T3_22A
, T3_22B
);
1301 res23
[part
] = _mm_packs_epi32(T3_23A
, T3_23B
);
1302 res24
[part
] = _mm_packs_epi32(T3_24A
, T3_24B
);
1303 res25
[part
] = _mm_packs_epi32(T3_25A
, T3_25B
);
1304 res26
[part
] = _mm_packs_epi32(T3_26A
, T3_26B
);
1305 res27
[part
] = _mm_packs_epi32(T3_27A
, T3_27B
);
1306 res28
[part
] = _mm_packs_epi32(T3_28A
, T3_28B
);
1307 res29
[part
] = _mm_packs_epi32(T3_29A
, T3_29B
);
1308 res30
[part
] = _mm_packs_epi32(T3_30A
, T3_30B
);
1309 res31
[part
] = _mm_packs_epi32(T3_31A
, T3_31B
);
1311 //transpose matrix 8x8 16bit.
1313 __m128i tr0_0
, tr0_1
, tr0_2
, tr0_3
, tr0_4
, tr0_5
, tr0_6
, tr0_7
;
1314 __m128i tr1_0
, tr1_1
, tr1_2
, tr1_3
, tr1_4
, tr1_5
, tr1_6
, tr1_7
;
1315 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
1316 tr0_0 = _mm_unpacklo_epi16(I0, I1); \
1317 tr0_1 = _mm_unpacklo_epi16(I2, I3); \
1318 tr0_2 = _mm_unpackhi_epi16(I0, I1); \
1319 tr0_3 = _mm_unpackhi_epi16(I2, I3); \
1320 tr0_4 = _mm_unpacklo_epi16(I4, I5); \
1321 tr0_5 = _mm_unpacklo_epi16(I6, I7); \
1322 tr0_6 = _mm_unpackhi_epi16(I4, I5); \
1323 tr0_7 = _mm_unpackhi_epi16(I6, I7); \
1324 tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
1325 tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
1326 tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
1327 tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
1328 tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
1329 tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
1330 tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
1331 tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
1332 O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
1333 O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
1334 O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
1335 O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
1336 O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
1337 O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
1338 O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
1339 O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
1341 TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
1342 TRANSPOSE_8x8_16BIT(res00
[1], res01
[1], res02
[1], res03
[1], res04
[1], res05
[1], res06
[1], res07
[1], in08
[0], in09
[0], in10
[0], in11
[0], in12
[0], in13
[0], in14
[0], in15
[0])
1343 TRANSPOSE_8x8_16BIT(res00
[2], res01
[2], res02
[2], res03
[2], res04
[2], res05
[2], res06
[2], res07
[2], in16
[0], in17
[0], in18
[0], in19
[0], in20
[0], in21
[0], in22
[0], in23
[0])
1344 TRANSPOSE_8x8_16BIT(res00
[3], res01
[3], res02
[3], res03
[3], res04
[3], res05
[3], res06
[3], res07
[3], in24
[0], in25
[0], in26
[0], in27
[0], in28
[0], in29
[0], in30
[0], in31
[0])
1346 TRANSPOSE_8x8_16BIT(res08
[0], res09
[0], res10
[0], res11
[0], res12
[0], res13
[0], res14
[0], res15
[0], in00
[1], in01
[1], in02
[1], in03
[1], in04
[1], in05
[1], in06
[1], in07
[1])
1347 TRANSPOSE_8x8_16BIT(res08
[1], res09
[1], res10
[1], res11
[1], res12
[1], res13
[1], res14
[1], res15
[1], in08
[1], in09
[1], in10
[1], in11
[1], in12
[1], in13
[1], in14
[1], in15
[1])
1348 TRANSPOSE_8x8_16BIT(res08
[2], res09
[2], res10
[2], res11
[2], res12
[2], res13
[2], res14
[2], res15
[2], in16
[1], in17
[1], in18
[1], in19
[1], in20
[1], in21
[1], in22
[1], in23
[1])
1349 TRANSPOSE_8x8_16BIT(res08
[3], res09
[3], res10
[3], res11
[3], res12
[3], res13
[3], res14
[3], res15
[3], in24
[1], in25
[1], in26
[1], in27
[1], in28
[1], in29
[1], in30
[1], in31
[1])
1351 TRANSPOSE_8x8_16BIT(res16
[0], res17
[0], res18
[0], res19
[0], res20
[0], res21
[0], res22
[0], res23
[0], in00
[2], in01
[2], in02
[2], in03
[2], in04
[2], in05
[2], in06
[2], in07
[2])
1352 TRANSPOSE_8x8_16BIT(res16
[1], res17
[1], res18
[1], res19
[1], res20
[1], res21
[1], res22
[1], res23
[1], in08
[2], in09
[2], in10
[2], in11
[2], in12
[2], in13
[2], in14
[2], in15
[2])
1353 TRANSPOSE_8x8_16BIT(res16
[2], res17
[2], res18
[2], res19
[2], res20
[2], res21
[2], res22
[2], res23
[2], in16
[2], in17
[2], in18
[2], in19
[2], in20
[2], in21
[2], in22
[2], in23
[2])
1354 TRANSPOSE_8x8_16BIT(res16
[3], res17
[3], res18
[3], res19
[3], res20
[3], res21
[3], res22
[3], res23
[3], in24
[2], in25
[2], in26
[2], in27
[2], in28
[2], in29
[2], in30
[2], in31
[2])
1356 TRANSPOSE_8x8_16BIT(res24
[0], res25
[0], res26
[0], res27
[0], res28
[0], res29
[0], res30
[0], res31
[0], in00
[3], in01
[3], in02
[3], in03
[3], in04
[3], in05
[3], in06
[3], in07
[3])
1357 TRANSPOSE_8x8_16BIT(res24
[1], res25
[1], res26
[1], res27
[1], res28
[1], res29
[1], res30
[1], res31
[1], in08
[3], in09
[3], in10
[3], in11
[3], in12
[3], in13
[3], in14
[3], in15
[3])
1358 TRANSPOSE_8x8_16BIT(res24
[2], res25
[2], res26
[2], res27
[2], res28
[2], res29
[2], res30
[2], res31
[2], in16
[3], in17
[3], in18
[3], in19
[3], in20
[3], in21
[3], in22
[3], in23
[3])
1359 TRANSPOSE_8x8_16BIT(res24
[3], res25
[3], res26
[3], res27
[3], res28
[3], res29
[3], res30
[3], res31
[3], in24
[3], in25
[3], in26
[3], in27
[3], in28
[3], in29
[3], in30
[3], in31
[3])
1361 #undef TRANSPOSE_8x8_16BIT
1366 for (int i
= 0; i
< 2; i
++)
1368 #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
1369 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
1370 _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
1371 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
1372 _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
1373 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
1374 _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
1375 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
1376 _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
1377 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
1378 _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
1379 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
1380 _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
1381 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
1382 _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
1383 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
1384 _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
1386 const int k
= i
* 2;
1387 STORE_LINE(in00
[k
], in01
[k
], in02
[k
], in03
[k
], in04
[k
], in05
[k
], in06
[k
], in07
[k
], in00
[k
+ 1], in01
[k
+ 1], in02
[k
+ 1], in03
[k
+ 1], in04
[k
+ 1], in05
[k
+ 1], in06
[k
+ 1], in07
[k
+ 1], 0, i
* 16)
1388 STORE_LINE(in08
[k
], in09
[k
], in10
[k
], in11
[k
], in12
[k
], in13
[k
], in14
[k
], in15
[k
], in08
[k
+ 1], in09
[k
+ 1], in10
[k
+ 1], in11
[k
+ 1], in12
[k
+ 1], in13
[k
+ 1], in14
[k
+ 1], in15
[k
+ 1], 8, i
* 16)
1389 STORE_LINE(in16
[k
], in17
[k
], in18
[k
], in19
[k
], in20
[k
], in21
[k
], in22
[k
], in23
[k
], in16
[k
+ 1], in17
[k
+ 1], in18
[k
+ 1], in19
[k
+ 1], in20
[k
+ 1], in21
[k
+ 1], in22
[k
+ 1], in23
[k
+ 1], 16, i
* 16)
1390 STORE_LINE(in24
[k
], in25
[k
], in26
[k
], in27
[k
], in28
[k
], in29
[k
], in30
[k
], in31
[k
], in24
[k
+ 1], in25
[k
+ 1], in26
[k
+ 1], in27
[k
+ 1], in28
[k
+ 1], in29
[k
+ 1], in30
[k
+ 1], in31
[k
+ 1], 24, i
* 16)
1395 #endif // if !HIGH_BIT_DEPTH
1399 void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives
&p
)
1401 /* Note: We have AVX2 assembly for these two functions, but since AVX2 is
1402 * still somewhat rare on end-user PCs we still compile and link these SSE3
1403 * intrinsic SIMD functions */
1405 p
.idct
[IDCT_8x8
] = idct8
;
1406 p
.idct
[IDCT_16x16
] = idct16
;
1407 p
.idct
[IDCT_32x32
] = idct32
;