| 1 | /***************************************************************************** |
| 2 | * Copyright (C) 2013 x265 project |
| 3 | * |
| 4 | * Authors: Steve Borho <steve@borho.org> |
| 5 | * Mandar Gurav <mandar@multicorewareinc.com> |
| 6 | * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com> |
| 7 | * Mahesh Pittala <mahesh@multicorewareinc.com> |
| 8 | * Rajesh Paulraj <rajesh@multicorewareinc.com> |
| 9 | * Min Chen <min.chen@multicorewareinc.com> |
| 10 | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> |
| 11 | * Nabajit Deka <nabajit@multicorewareinc.com> |
| 12 | * |
| 13 | * This program is free software; you can redistribute it and/or modify |
| 14 | * it under the terms of the GNU General Public License as published by |
| 15 | * the Free Software Foundation; either version 2 of the License, or |
| 16 | * (at your option) any later version. |
| 17 | * |
| 18 | * This program is distributed in the hope that it will be useful, |
| 19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 21 | * GNU General Public License for more details. |
| 22 | * |
| 23 | * You should have received a copy of the GNU General Public License |
| 24 | * along with this program; if not, write to the Free Software |
| 25 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
| 26 | * |
| 27 | * This program is also available under a commercial proprietary license. |
| 28 | * For more information, contact us at license @ x265.com. |
| 29 | *****************************************************************************/ |
| 30 | |
| 31 | #include "common.h" |
| 32 | #include "primitives.h" |
| 33 | #include <xmmintrin.h> // SSE |
| 34 | #include <pmmintrin.h> // SSE3 |
| 35 | #include <tmmintrin.h> // SSSE3 |
| 36 | |
| 37 | using namespace x265; |
| 38 | |
| 39 | #if !HIGH_BIT_DEPTH |
| 40 | namespace { |
// Coefficient/shuffle rows for the 8-point partial butterflies, consumed by
// the DCT kernels below via _mm_load_si128 (32-byte aligned for safety).
// Row 0 is in pshufb control-byte form (each int16 packs two byte indices);
// it is not referenced in this chunk — presumably used by the 8x8 DCT
// elsewhere in the file (TODO confirm).  Rows 1..8 are int16 coefficient
// pairs fed to _mm_madd_epi16 in the first pass of dct16 (rows 1-4 produce
// the even outputs, 5-8 the odd-of-even outputs); rows 9-14 are duplicated/
// negated variants of the same 4-point coefficients laid out for the
// second-pass lane order.
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
{
    { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },  // byte-shuffle control: word order 0,7,3,4,1,6,2,5

    { 64, 64, 64, 64, 64, 64, 64, 64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83, 36, 83, 36, 83, 36, 83, 36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 },
    { 89, 18, 75, 50, 89, 18, 75, 50 },
    { 75, -50, -18, -89, 75, -50, -18, -89 },
    { 50, 75, -89, 18, 50, 75, -89, 18 },
    { 18, -89, -50, 75, 18, -89, -50, 75 },

    { 83, 83, -83, -83, 36, 36, -36, -36 },
    { 36, 36, -36, -36, -83, -83, 83, 83 },
    { 89, -89, 18, -18, 75, -75, 50, -50 },
    { 75, -75, -50, 50, -18, 18, -89, 89 },
    { 50, -50, 75, -75, -89, 89, 18, -18 },
    { 18, -18, -89, 89, -50, 50, 75, -75 },
};
| 61 | |
// pshufb (_mm_shuffle_epi8) control masks used by dct16.  Each int16 packs
// two byte indices, so every entry selects one 16-bit word of the source
// vector.  Row 0 reverses the word order (words 7..0) so that lane k of the
// upper half lines up with lane 15-k of a row for the add/sub butterfly;
// rows 1-3 permute words into the orders required before the horizontal
// add/sub and unpack steps of passes 1 and 2.
ALIGN_VAR_32(static const int16_t, tab_dct_16_0[][8]) =
{
    { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 },  // 0: full word reversal (7,6,5,4,3,2,1,0)
    { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },  // 1: word order 0,7,3,4,1,6,2,5
    { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A },  // 2: word order 0,3,1,2,7,4,6,5
    { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 },  // 3: word order 7,4,6,5,0,3,1,2
};
| 69 | |
// 16-point DCT coefficient rows for dct16, consumed by _mm_madd_epi16.
// Rows 0-7: odd-frequency coefficients used by pass 1 (MAKE_ODD in DCT1).
// Rows 8-13: duplicated 4-point even coefficients for pass 2's paired-lane
// layout.  Rows 14-29 (generated by MAKE_COEF) are the same odd coefficients
// as rows 0-7 but emitted in +/- pairs and permuted to match the word order
// produced by the pass-2 shuffle/unpack (two table rows per source row).
ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) =
{
    { 90, 87, 80, 70, 57, 43, 25, 9 },  //  0
    { 87, 57, 9, -43, -80, -90, -70, -25 },  //  1
    { 80, 9, -70, -87, -25, 57, 90, 43 },  //  2
    { 70, -43, -87, 9, 90, 25, -80, -57 },  //  3
    { 57, -80, -25, 90, -9, -87, 43, 70 },  //  4
    { 43, -90, 57, 25, -87, 70, 9, -80 },  //  5
    { 25, -70, 90, -80, 43, 9, -57, 87 },  //  6
    { 9, -25, 43, -57, 70, -80, 87, -90 },  //  7
    { 83, 83, -83, -83, 36, 36, -36, -36 },  //  8
    { 36, 36, -36, -36, -83, -83, 83, 83 },  //  9
    { 89, 89, 18, 18, 75, 75, 50, 50 },  // 10
    { 75, 75, -50, -50, -18, -18, -89, -89 },  // 11
    { 50, 50, 75, 75, -89, -89, 18, 18 },  // 12
    { 18, 18, -89, -89, -50, -50, 75, 75 },  // 13

// Expands one row of 8 odd coefficients into two table rows of signed pairs,
// word-permuted (0,3,1,2 / 7,4,6,5) for the pass-2 unpacked lane order.
#define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \
    { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) \
    }, \
    { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) },

    MAKE_COEF(90, 87, 80, 70, 57, 43, 25, 9)       // 14, 15
    MAKE_COEF(87, 57, 9, -43, -80, -90, -70, -25)  // 16, 17
    MAKE_COEF(80, 9, -70, -87, -25, 57, 90, 43)    // 18, 19
    MAKE_COEF(70, -43, -87, 9, 90, 25, -80, -57)   // 20, 21
    MAKE_COEF(57, -80, -25, 90, -9, -87, 43, 70)   // 22, 23
    MAKE_COEF(43, -90, 57, 25, -87, 70, 9, -80)    // 24, 25
    MAKE_COEF(25, -70, 90, -80, 43, 9, -57, 87)    // 26, 27
    MAKE_COEF(9, -25, 43, -57, 70, -80, 87, -90)   // 28, 29
#undef MAKE_COEF
};
| 102 | |
// Forward 16x16 DCT (SSSE3, 8-bit path — this file compiles only when
// !HIGH_BIT_DEPTH).
//
// src    : 16x16 block of residual samples, row pitch 'stride' (in int16_t).
// dst    : 16x16 output coefficient block, densely packed (pitch 16).
// stride : row pitch of 'src'.
//
// Two separable 1-D passes:
//   DCT1 transforms the rows, storing intermediates into 'tmp' laid out as
//   tmp[freq * 16 + srcRow] (i.e. transposed), with rounding shift of 3
//   (bias c_4 = 1 << 2).
//   DCT2 transforms the rows of 'tmp' (the original columns) and writes
//   dst[freq * 16 + col] with rounding shift of 10 (bias c_512 = 1 << 9).
// The (3, 10) shift pair matches the HEVC forward 16-point transform
// scaling at 8-bit depth.
void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
    // Rounding biases for the two passes: add 2^(shift-1) before >> shift.
    __m128i c_4 = _mm_set1_epi32(4);
    __m128i c_512 = _mm_set1_epi32(512);

    int i;

    // Transposed intermediate of pass 1 (16-byte alignment required by
    // the aligned loads/stores below).
    ALIGN_VAR_32(int16_t, tmp[16 * 16]);

    __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
    __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
    __m128i T10, T11, T12, T13, T14, T15, T16, T17;
    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
    __m128i T30, T31, T32, T33, T34, T35, T36, T37;
    __m128i T40, T41, T42, T43, T44, T45, T46, T47;
    __m128i T50, T51, T52, T53;
    __m128i T60, T61, T62, T63, T64, T65, T66, T67;
    __m128i T70;

    // DCT1: row transform, 8 source rows per iteration.
    for (i = 0; i < 16; i += 8)
    {
        // Load each 16-sample row as two 8-lane halves (A = cols 0-7, B = cols 8-15).
        T00A = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
        T01A = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 0]);    // [17 16 15 14 13 12 11 10]
        T01B = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
        T02A = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 0]);    // [27 26 25 24 23 22 21 20]
        T02B = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
        T03A = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 0]);    // [37 36 35 34 33 32 31 30]
        T03B = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
        T04A = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 0]);    // [47 46 45 44 43 42 41 40]
        T04B = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 8]);    // [4F 4E 4D 4C 4B 4A 49 48]
        T05A = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 0]);    // [57 56 55 54 53 52 51 50]
        T05B = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 8]);    // [5F 5E 5D 5C 5B 5A 59 58]
        T06A = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 0]);    // [67 66 65 64 63 62 61 60]
        T06B = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 8]);    // [6F 6E 6D 6C 6B 6A 69 68]
        T07A = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 0]);    // [77 76 75 74 73 72 71 70]
        T07B = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 8]);    // [7F 7E 7D 7C 7B 7A 79 78]

        // Reverse the word order of the upper halves so lane k holds
        // sample 15-k, lining up with sample k in the lower halves.
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        // Even part: E[k] = s[k] + s[15-k]  (feeds outputs 0,2,4,...,14).
        T10 = _mm_add_epi16(T00A, T00B);
        T11 = _mm_add_epi16(T01A, T01B);
        T12 = _mm_add_epi16(T02A, T02B);
        T13 = _mm_add_epi16(T03A, T03B);
        T14 = _mm_add_epi16(T04A, T04B);
        T15 = _mm_add_epi16(T05A, T05B);
        T16 = _mm_add_epi16(T06A, T06B);
        T17 = _mm_add_epi16(T07A, T07B);

        // Odd part: O[k] = s[k] - s[15-k]  (feeds outputs 1,3,5,...,15).
        T20 = _mm_sub_epi16(T00A, T00B);
        T21 = _mm_sub_epi16(T01A, T01B);
        T22 = _mm_sub_epi16(T02A, T02B);
        T23 = _mm_sub_epi16(T03A, T03B);
        T24 = _mm_sub_epi16(T04A, T04B);
        T25 = _mm_sub_epi16(T05A, T05B);
        T26 = _mm_sub_epi16(T06A, T06B);
        T27 = _mm_sub_epi16(T07A, T07B);

        // Permute the even part (word order 0,7,3,4,1,6,2,5) so the
        // following hadd/hsub pairs form the next butterfly level.
        T30 = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T31 = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T32 = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T33 = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T34 = _mm_shuffle_epi8(T14, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T35 = _mm_shuffle_epi8(T15, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T36 = _mm_shuffle_epi8(T16, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T37 = _mm_shuffle_epi8(T17, _mm_load_si128((__m128i*)tab_dct_16_0[1]));

        // EE[k] = E[k] + E[7-k] in T40..T43, EO[k] = E[k] - E[7-k] in T44..T47.
        T40 = _mm_hadd_epi16(T30, T31);
        T41 = _mm_hadd_epi16(T32, T33);
        T42 = _mm_hadd_epi16(T34, T35);
        T43 = _mm_hadd_epi16(T36, T37);
        T44 = _mm_hsub_epi16(T30, T31);
        T45 = _mm_hsub_epi16(T32, T33);
        T46 = _mm_hsub_epi16(T34, T35);
        T47 = _mm_hsub_epi16(T36, T37);

        // EEE in T50/T51, EEO in T52/T53 (final 4-point butterfly level).
        T50 = _mm_hadd_epi16(T40, T41);
        T51 = _mm_hadd_epi16(T42, T43);
        T52 = _mm_hsub_epi16(T40, T41);
        T53 = _mm_hsub_epi16(T42, T43);

        // Output row 0: 64*(EEE0+EEE1), rounded with +4, >> 3.
        T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);

        // Output row 8: 64*(EEE0-EEE1).
        T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
        T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);

        // Output row 4: {83,36} . EEO.
        T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
        T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);

        // Output row 12: {36,-83} . EEO.
        T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
        T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);

        // Output rows 2, 6, 10, 14: 4-tap dot products against EO.
        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);

        T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
        T60 = _mm_hadd_epi32(T60, T61);
        T61 = _mm_hadd_epi32(T62, T63);
        T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
        T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
        T70 = _mm_packs_epi32(T60, T61);
        _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);

// Odd output rows (1,3,...,15): 8-tap dot product of the odd part T20..T27
// against tab_dct_16_1[tab], reduced by three hadd levels, then round >> 3.
#define MAKE_ODD(tab, dstPos) \
    T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T64 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T65 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T66 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T67 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T62 = _mm_hadd_epi32(T64, T65); \
    T63 = _mm_hadd_epi32(T66, T67); \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \
    T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \
    T70 = _mm_packs_epi32(T60, T61); \
    _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);

        MAKE_ODD(0, 1);
        MAKE_ODD(1, 3);
        MAKE_ODD(2, 5);
        MAKE_ODD(3, 7);
        MAKE_ODD(4, 9);
        MAKE_ODD(5, 11);
        MAKE_ODD(6, 13);
        MAKE_ODD(7, 15);
#undef MAKE_ODD
    }

    // DCT2: column transform (rows of 'tmp'), 4 output columns per iteration.
    for (i = 0; i < 16; i += 4)
    {
        T00A = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
        T01A = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 0]);    // [17 16 15 14 13 12 11 10]
        T01B = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
        T02A = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 0]);    // [27 26 25 24 23 22 21 20]
        T02B = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
        T03A = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 0]);    // [37 36 35 34 33 32 31 30]
        T03B = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]

        // Permute halves (masks 2 and 3) so unpacklo/unpackhi interleave
        // element k with element 15-k for the paired-lane dot products.
        T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
        T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));

        T10 = _mm_unpacklo_epi16(T00A, T00B);
        T11 = _mm_unpackhi_epi16(T00A, T00B);
        T12 = _mm_unpacklo_epi16(T01A, T01B);
        T13 = _mm_unpackhi_epi16(T01A, T01B);
        T14 = _mm_unpacklo_epi16(T02A, T02B);
        T15 = _mm_unpackhi_epi16(T02A, T02B);
        T16 = _mm_unpacklo_epi16(T03A, T03B);
        T17 = _mm_unpackhi_epi16(T03A, T03B);

        // Output rows 0 and 8: 64 * (sum / alternating sum), round +512, >> 10.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_8[1]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hsub_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        T41 = _mm_packs_epi32(T41, T41);
        _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
        _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);

        // Output row 4.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[8]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);

        // Output row 12.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[9]));

        T30 = _mm_add_epi32(T20, T21);
        T31 = _mm_add_epi32(T22, T23);
        T32 = _mm_add_epi32(T24, T25);
        T33 = _mm_add_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);

        // Output rows 2, 6, 10, 14: note the sub (not add) of the madd
        // pairs — the duplicated coefficients rows 10-13 carry no signs,
        // so the difference forms the 4-point odd butterfly.
        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[10]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);

        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[11]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);

        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[12]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);

        T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
        T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[13]));

        T30 = _mm_sub_epi32(T20, T21);
        T31 = _mm_sub_epi32(T22, T23);
        T32 = _mm_sub_epi32(T24, T25);
        T33 = _mm_sub_epi32(T26, T27);

        T30 = _mm_hadd_epi32(T30, T31);
        T31 = _mm_hadd_epi32(T32, T33);

        T40 = _mm_hadd_epi32(T30, T31);
        T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
        T40 = _mm_packs_epi32(T40, T40);
        _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);

// Odd output rows (1,3,...,15): each uses a signed-pair coefficient row pair
// (tab, tab+1) from tab_dct_16_1 against the low/high unpacked halves.
#define MAKE_ODD(tab, dstPos) \
    T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
    T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \
    T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
    T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
    T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
    T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
    \
    T30 = _mm_add_epi32(T20, T21); \
    T31 = _mm_add_epi32(T22, T23); \
    T32 = _mm_add_epi32(T24, T25); \
    T33 = _mm_add_epi32(T26, T27); \
    \
    T30 = _mm_hadd_epi32(T30, T31); \
    T31 = _mm_hadd_epi32(T32, T33); \
    \
    T40 = _mm_hadd_epi32(T30, T31); \
    T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
    T40 = _mm_packs_epi32(T40, T40); \
    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);

        MAKE_ODD(14, 1);
        MAKE_ODD(16, 3);
        MAKE_ODD(18, 5);
        MAKE_ODD(20, 7);
        MAKE_ODD(22, 9);
        MAKE_ODD(24, 11);
        MAKE_ODD(26, 13);
        MAKE_ODD(28, 15);
#undef MAKE_ODD
    }
}
| 518 | |
// pshufb control mask for the 32-point DCT (word order 7,0,4,3,6,1,5,2).
// Not referenced in this chunk — presumably consumed by dct32 further down
// in the file (TODO confirm).
ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
{
    { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 },  // 0
};
| 523 | |
| 524 | ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) = |
| 525 | { |
| 526 | { 89, -89, 18, -18, 75, -75, 50, -50 }, // 0 |
| 527 | { 75, -75, -50, 50, -18, 18, -89, 89 }, // 1 |
| 528 | { 50, -50, 75, -75, -89, 89, 18, -18 }, // 2 |
| 529 | { 18, -18, -89, 89, -50, 50, 75, -75 }, // 3 |
| 530 | |
| 531 | #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ |
| 532 | { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) \ |
| 533 | }, \ |
| 534 | |
| 535 | MAKE_COEF8(90, 87, 80, 70, 57, 43, 25, 9) // 4 |
| 536 | MAKE_COEF8(87, 57, 9, -43, -80, -90, -70, -25) // 5 |
| 537 | MAKE_COEF8(80, 9, -70, -87, -25, 57, 90, 43) // 6 |
| 538 | MAKE_COEF8(70, -43, -87, 9, 90, 25, -80, -57) // 7 |
| 539 | MAKE_COEF8(57, -80, -25, 90, -9, -87, 43, 70) // 8 |
| 540 | MAKE_COEF8(43, -90, 57, 25, -87, 70, 9, -80) // 9 |
| 541 | MAKE_COEF8(25, -70, 90, -80, 43, 9, -57, 87) // 10 |
| 542 | MAKE_COEF8(9, -25, 43, -57, 70, -80, 87, -90) // 11 |
| 543 | #undef MAKE_COEF8 |
| 544 | |
| 545 | #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ |
| 546 | { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \ |
| 547 | { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) }, |
| 548 | |
| 549 | MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 12 |
| 550 | MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) // 14 |
| 551 | MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) // 16 |
| 552 | MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31) // 18 |
| 553 | MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38) // 20 |
| 554 | MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) // 22 |
| 555 | MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) // 24 |
| 556 | MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61) // 26 |
| 557 | MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) // 28 |
| 558 | MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) // 30 |
| 559 | MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78) // 32 |
| 560 | MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82) // 34 |
| 561 | MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) // 36 |
| 562 | MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) // 38 |
| 563 | MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90) // 40 |
| 564 | MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90) // 42 |
| 565 | #undef MAKE_COEF16 |
| 566 | |
| 567 | { |
| 568 | 64, 64, 64, 64, 64, 64, 64, 64 |
| 569 | }, // 44 |
| 570 | |
| 571 | { 64, 64, -64, -64, -64, -64, 64, 64 }, // 45 |
| 572 | |
| 573 | { 83, 83, 36, 36, -36, -36, -83, -83 }, // 46 |
| 574 | { -83, -83, -36, -36, 36, 36, 83, 83 }, // 47 |
| 575 | |
| 576 | { 36, 36, -83, -83, 83, 83, -36, -36 }, // 48 |
| 577 | { -36, -36, 83, 83, -83, -83, 36, 36 }, // 49 |
| 578 | |
| 579 | #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ |
| 580 | { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \ |
| 581 | { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \ |
| 582 | { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \ |
| 583 | { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) }, |
| 584 | |
| 585 | MAKE_COEF16(89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89) // 50 |
| 586 | MAKE_COEF16(75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75) // 54 |
| 587 | |
| 588 | // TODO: convert below table here |
| 589 | #undef MAKE_COEF16 |
| 590 | |
| 591 | { |
| 592 | 50, 50, -89, -89, 18, 18, 75, 75 |
| 593 | }, // 58 |
| 594 | { -75, -75, -18, -18, 89, 89, -50, -50 }, // 59 |
| 595 | { -50, -50, 89, 89, -18, -18, -75, -75 }, // 60 |
| 596 | { 75, 75, 18, 18, -89, -89, 50, 50 }, // 61 |
| 597 | |
| 598 | { 18, 18, -50, -50, 75, 75, -89, -89 }, // 62 |
| 599 | { 89, 89, -75, -75, 50, 50, -18, -18 }, // 63 |
| 600 | { -18, -18, 50, 50, -75, -75, 89, 89 }, // 64 |
| 601 | { -89, -89, 75, 75, -50, -50, 18, 18 }, // 65 |
| 602 | |
| 603 | { 90, 90, 87, 87, 80, 80, 70, 70 }, // 66 |
| 604 | { 57, 57, 43, 43, 25, 25, 9, 9 }, // 67 |
| 605 | { -9, -9, -25, -25, -43, -43, -57, -57 }, // 68 |
| 606 | { -70, -70, -80, -80, -87, -87, -90, -90 }, // 69 |
| 607 | |
| 608 | { 87, 87, 57, 57, 9, 9, -43, -43 }, // 70 |
| 609 | { -80, -80, -90, -90, -70, -70, -25, -25 }, // 71 |
| 610 | { 25, 25, 70, 70, 90, 90, 80, 80 }, // 72 |
| 611 | { 43, 43, -9, -9, -57, -57, -87, -87 }, // 73 |
| 612 | |
| 613 | { 80, 80, 9, 9, -70, -70, -87, -87 }, // 74 |
| 614 | { -25, -25, 57, 57, 90, 90, 43, 43 }, // 75 |
| 615 | { -43, -43, -90, -90, -57, -57, 25, 25 }, // 76 |
| 616 | { 87, 87, 70, 70, -9, -9, -80, -80 }, // 77 |
| 617 | |
| 618 | { 70, 70, -43, -43, -87, -87, 9, 9 }, // 78 |
| 619 | { 90, 90, 25, 25, -80, -80, -57, -57 }, // 79 |
| 620 | { 57, 57, 80, 80, -25, -25, -90, -90 }, // 80 |
| 621 | { -9, -9, 87, 87, 43, 43, -70, -70 }, // 81 |
| 622 | |
| 623 | { 57, 57, -80, -80, -25, -25, 90, 90 }, // 82 |
| 624 | { -9, -9, -87, -87, 43, 43, 70, 70 }, // 83 |
| 625 | { -70, -70, -43, -43, 87, 87, 9, 9 }, // 84 |
| 626 | { -90, -90, 25, 25, 80, 80, -57, -57 }, // 85 |
| 627 | |
| 628 | { 43, 43, -90, -90, 57, 57, 25, 25 }, // 86 |
| 629 | { -87, -87, 70, 70, 9, 9, -80, -80 }, // 87 |
| 630 | { 80, 80, -9, -9, -70, -70, 87, 87 }, // 88 |
| 631 | { -25, -25, -57, -57, 90, 90, -43, -43 }, // 89 |
| 632 | |
| 633 | { 25, 25, -70, -70, 90, 90, -80, -80 }, // 90 |
| 634 | { 43, 43, 9, 9, -57, -57, 87, 87 }, // 91 |
| 635 | { -87, -87, 57, 57, -9, -9, -43, -43 }, // 92 |
| 636 | { 80, 80, -90, -90, 70, 70, -25, -25 }, // 93 |
| 637 | |
| 638 | { 9, 9, -25, -25, 43, 43, -57, -57 }, // 94 |
| 639 | { 70, 70, -80, -80, 87, 87, -90, -90 }, // 95 |
| 640 | { 90, 90, -87, -87, 80, 80, -70, -70 }, // 96 |
| 641 | { 57, 57, -43, -43, 25, 25, -9, -9 }, // 97 |
| 642 | |
| 643 | #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ |
| 644 | { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \ |
| 645 | { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \ |
| 646 | { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \ |
| 647 | { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) }, |
| 648 | |
| 649 | MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 98 |
| 650 | MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) //102 |
| 651 | MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) //106 |
| 652 | MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, +82, 88, 54, -4, -61, -90, -78, -31) //110 |
| 653 | MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, +31, -46, -90, -67, 4, 73, 88, 38) //114 |
| 654 | MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) //118 |
| 655 | MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) //122 |
| 656 | MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, +90, 13, -88, -31, 82, 46, -73, -61) //126 |
| 657 | MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) //130 |
| 658 | MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) //134 |
| 659 | MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, +22, 67, -85, 13, 73, -82, 4, 78) //138 |
| 660 | MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, +85, -78, 13, 61, -90, 54, 22, -82) //142 |
| 661 | MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) //146 |
| 662 | MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) //150 |
| 663 | MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, +54, -31, 4, 22, -46, 67, -82, 90) //154 |
| 664 | MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, +67, -73, 78, -82, 85, -88, 90, -90) //158 |
| 665 | |
| 666 | #undef MAKE_COEF16 |
| 667 | }; |
| 668 | |
/* 32x32 forward DCT (SSSE3 intrinsics).
 *
 * src    - input 16-bit residual block, row-major, 'stride' int16_t elements
 *          per row (rows must be 16-byte aligned: _mm_load_si128 is used)
 * dst    - output 32x32 coefficient block, row-major, packed (stride 32)
 * stride - input row stride in int16_t units
 *
 * Two-pass separable transform:
 *   Pass 1 (DCT1) transforms each of the 32 rows, rounding with (+8)>>4,
 *   and stores the 16-bit results in im[k][chunk]: im[k] collects output
 *   coefficient k for all 32 rows, in four 8-row chunks.
 *   Pass 2 (DCT2) transforms the columns via _mm_madd_epi16 matrix
 *   multiplies against tab_dct_32_1, rounding with (+1024)>>11, and writes
 *   the final coefficients to dst.
 *
 * NOTE(review): rounding constants/shifts match 8-bit input depth; this
 * path is only compiled when !HIGH_BIT_DEPTH.
 */
void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
    // Const — rounding offsets for the two passes: (x + 8) >> 4 and (x + 1024) >> 11
    __m128i c_8 = _mm_set1_epi32(8);
    __m128i c_1024 = _mm_set1_epi32(1024);

    int i;

    __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
    __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
    __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C;
    __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D;
    __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A;
    __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B;
    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
    __m128i T30, T31, T32, T33, T34, T35, T36, T37;
    __m128i T40, T41, T42, T43, T44, T45, T46, T47;
    __m128i T50, T51, T52, T53;
    __m128i T60, T61, T62, T63, T64, T65, T66, T67;
    __m128i im[32][4];   // pass-1 results: im[coeff][8-row chunk]

    // DCT1 — row transform, 8 rows per iteration (4 iterations cover 32 rows)
    for (i = 0; i < 32 / 8; i++)
    {
        // Load one full 32-sample row into four registers (A/B/C/D = cols 0-7, 8-15, 16-23, 24-31)
        T00A = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
        T00B = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 8]);    // [15 14 13 12 11 10 09 08]
        T00C = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 16]);   // [23 22 21 20 19 18 17 16]
        T00D = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 24]);   // [31 30 29 28 27 26 25 24]
        T01A = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 0]);
        T01B = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 8]);
        T01C = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 16]);
        T01D = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 24]);
        T02A = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 0]);
        T02B = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 8]);
        T02C = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 16]);
        T02D = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 24]);
        T03A = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 0]);
        T03B = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 8]);
        T03C = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 16]);
        T03D = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 24]);
        T04A = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 0]);
        T04B = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 8]);
        T04C = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 16]);
        T04D = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 24]);
        T05A = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 0]);
        T05B = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 8]);
        T05C = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 16]);
        T05D = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 24]);
        T06A = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 0]);
        T06B = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 8]);
        T06C = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 16]);
        T06D = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 24]);
        T07A = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 0]);
        T07B = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 8]);
        T07C = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 16]);
        T07D = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 24]);

        // Byte-shuffle so that A+D and B+C pair samples n with 31-n for the
        // butterfly below (B/D are additionally reversed by tab_dct_32_0[0])
        T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [05 02 06 01 04 03 07 00]
        T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [10 13 09 14 11 12 08 15]
        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [21 18 22 17 20 19 23 16]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [26 29 25 30 27 28 24 31]
        T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
        T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
        T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));

        // Stage-1 butterfly: E[n] = x[n] + x[31-n] (even half, 16 values per row)
        T10A = _mm_add_epi16(T00A, T00D);   // [E05 E02 E06 E01 E04 E03 E07 E00]
        T10B = _mm_add_epi16(T00B, T00C);   // [E10 E13 E09 E14 E11 E12 E08 E15]
        T11A = _mm_add_epi16(T01A, T01D);
        T11B = _mm_add_epi16(T01B, T01C);
        T12A = _mm_add_epi16(T02A, T02D);
        T12B = _mm_add_epi16(T02B, T02C);
        T13A = _mm_add_epi16(T03A, T03D);
        T13B = _mm_add_epi16(T03B, T03C);
        T14A = _mm_add_epi16(T04A, T04D);
        T14B = _mm_add_epi16(T04B, T04C);
        T15A = _mm_add_epi16(T05A, T05D);
        T15B = _mm_add_epi16(T05B, T05C);
        T16A = _mm_add_epi16(T06A, T06D);
        T16B = _mm_add_epi16(T06B, T06C);
        T17A = _mm_add_epi16(T07A, T07D);
        T17B = _mm_add_epi16(T07B, T07C);

        // O[n] = x[n] - x[31-n] (odd half, drives output coefficients 1,3,5,...)
        T00A = _mm_sub_epi16(T00A, T00D);   // [O05 O02 O06 O01 O04 O03 O07 O00]
        T00B = _mm_sub_epi16(T00B, T00C);   // [O10 O13 O09 O14 O11 O12 O08 O15]
        T01A = _mm_sub_epi16(T01A, T01D);
        T01B = _mm_sub_epi16(T01B, T01C);
        T02A = _mm_sub_epi16(T02A, T02D);
        T02B = _mm_sub_epi16(T02B, T02C);
        T03A = _mm_sub_epi16(T03A, T03D);
        T03B = _mm_sub_epi16(T03B, T03C);
        T04A = _mm_sub_epi16(T04A, T04D);
        T04B = _mm_sub_epi16(T04B, T04C);
        T05A = _mm_sub_epi16(T05A, T05D);
        T05B = _mm_sub_epi16(T05B, T05C);
        T06A = _mm_sub_epi16(T06A, T06D);
        T06B = _mm_sub_epi16(T06B, T06C);
        T07A = _mm_sub_epi16(T07A, T07D);
        T07B = _mm_sub_epi16(T07B, T07C);

        // Stage-2 butterfly on the even half: EE[n] = E[n] + E[15-n]
        T20 = _mm_add_epi16(T10A, T10B);   // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0]
        T21 = _mm_add_epi16(T11A, T11B);
        T22 = _mm_add_epi16(T12A, T12B);
        T23 = _mm_add_epi16(T13A, T13B);
        T24 = _mm_add_epi16(T14A, T14B);
        T25 = _mm_add_epi16(T15A, T15B);
        T26 = _mm_add_epi16(T16A, T16B);
        T27 = _mm_add_epi16(T17A, T17B);

        // Coefficient 0: EE values weighted by tab_dct_8[1], then horizontal sums
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);   // pass-1 rounding: (x+8)>>4
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);                    // saturate back to int16
        im[0][i] = T60;

        // Coefficient 16: same weights, horizontal differences instead of sums
        T50 = _mm_hsub_epi32(T40, T41);
        T51 = _mm_hsub_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[16][i] = T60;

        // Coefficient 8 via tab_dct_16_1[8]
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[8][i] = T60;

        // Coefficient 24 via tab_dct_16_1[9]
        T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
        T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[9]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
        T60 = _mm_packs_epi32(T50, T51);
        im[24][i] = T60;

        // Multiply current T2x values by tab_dct_32_1[tab], reduce horizontally,
        // round, pack to int16 and store as pass-1 coefficient 'dstPos'.
#define MAKE_ODD(tab, dstPos) \
    T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
        \
    T40 = _mm_hadd_epi32(T30, T31); \
    T41 = _mm_hadd_epi32(T32, T33); \
    T42 = _mm_hadd_epi32(T34, T35); \
    T43 = _mm_hadd_epi32(T36, T37); \
        \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        // Remaining EE-derived coefficients (4, 12, 20, 28)
        MAKE_ODD(0, 4);
        MAKE_ODD(1, 12);
        MAKE_ODD(2, 20);
        MAKE_ODD(3, 28);

        // EO[n] = E[n] - E[15-n]; feeds coefficients 2, 6, 10, ..., 30
        T20 = _mm_sub_epi16(T10A, T10B);   // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0]
        T21 = _mm_sub_epi16(T11A, T11B);
        T22 = _mm_sub_epi16(T12A, T12B);
        T23 = _mm_sub_epi16(T13A, T13B);
        T24 = _mm_sub_epi16(T14A, T14B);
        T25 = _mm_sub_epi16(T15A, T15B);
        T26 = _mm_sub_epi16(T16A, T16B);
        T27 = _mm_sub_epi16(T17A, T17B);

        MAKE_ODD(4, 2);
        MAKE_ODD(5, 6);
        MAKE_ODD(6, 10);
        MAKE_ODD(7, 14);
        MAKE_ODD(8, 18);
        MAKE_ODD(9, 22);
        MAKE_ODD(10, 26);
        MAKE_ODD(11, 30);
#undef MAKE_ODD

        // Odd coefficients (1, 3, ..., 31): full 16-tap dot product of the O[]
        // half against consecutive table rows (tab) and (tab + 1).
#define MAKE_ODD(tab, dstPos) \
    T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T32 = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
    T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)]));       \
    T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1]));   \
        \
    T40 = _mm_hadd_epi32(T20, T21); \
    T41 = _mm_hadd_epi32(T22, T23); \
    T42 = _mm_hadd_epi32(T24, T25); \
    T43 = _mm_hadd_epi32(T26, T27); \
    T44 = _mm_hadd_epi32(T30, T31); \
    T45 = _mm_hadd_epi32(T32, T33); \
    T46 = _mm_hadd_epi32(T34, T35); \
    T47 = _mm_hadd_epi32(T36, T37); \
        \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T52 = _mm_hadd_epi32(T44, T45); \
    T53 = _mm_hadd_epi32(T46, T47); \
        \
    T50 = _mm_hadd_epi32(T50, T51); \
    T51 = _mm_hadd_epi32(T52, T53); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        MAKE_ODD(12, 1);
        MAKE_ODD(14, 3);
        MAKE_ODD(16, 5);
        MAKE_ODD(18, 7);
        MAKE_ODD(20, 9);
        MAKE_ODD(22, 11);
        MAKE_ODD(24, 13);
        MAKE_ODD(26, 15);
        MAKE_ODD(28, 17);
        MAKE_ODD(30, 19);
        MAKE_ODD(32, 21);
        MAKE_ODD(34, 23);
        MAKE_ODD(36, 25);
        MAKE_ODD(38, 27);
        MAKE_ODD(40, 29);
        MAKE_ODD(42, 31);

#undef MAKE_ODD
    }

    // DCT2 — column transform, 4 pass-1 coefficient rows per iteration
    for (i = 0; i < 32 / 4; i++)
    {
        // OPT_ME: to avoid register spill, I use matrix multiply, have other way?
        T00A = im[i * 4 + 0][0];    // [07 06 05 04 03 02 01 00]
        T00B = im[i * 4 + 0][1];    // [15 14 13 12 11 10 09 08]
        T00C = im[i * 4 + 0][2];    // [23 22 21 20 19 18 17 16]
        T00D = im[i * 4 + 0][3];    // [31 30 29 28 27 26 25 24]
        T01A = im[i * 4 + 1][0];
        T01B = im[i * 4 + 1][1];
        T01C = im[i * 4 + 1][2];
        T01D = im[i * 4 + 1][3];
        T02A = im[i * 4 + 2][0];
        T02B = im[i * 4 + 2][1];
        T02C = im[i * 4 + 2][2];
        T02D = im[i * 4 + 2][3];
        T03A = im[i * 4 + 3][0];
        T03B = im[i * 4 + 3][1];
        T03C = im[i * 4 + 3][2];
        T03D = im[i * 4 + 3][3];

        // Reverse the upper halves so unpack pairs element n with 31-n
        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [16 17 18 19 20 21 22 23]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [24 25 26 27 28 29 30 31]
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        // Interleave so each register holds (n, 31-n) pairs ready for madd
        T10A = _mm_unpacklo_epi16(T00A, T00D);   // [28 03 29 02 30 01 31 00]
        T10B = _mm_unpackhi_epi16(T00A, T00D);   // [24 07 25 06 26 05 27 04]
        T00A = _mm_unpacklo_epi16(T00B, T00C);   // [20 11 21 10 22 09 23 08]
        T00B = _mm_unpackhi_epi16(T00B, T00C);   // [16 15 17 14 18 13 19 12]
        T11A = _mm_unpacklo_epi16(T01A, T01D);
        T11B = _mm_unpackhi_epi16(T01A, T01D);
        T01A = _mm_unpacklo_epi16(T01B, T01C);
        T01B = _mm_unpackhi_epi16(T01B, T01C);
        T12A = _mm_unpacklo_epi16(T02A, T02D);
        T12B = _mm_unpackhi_epi16(T02A, T02D);
        T02A = _mm_unpacklo_epi16(T02B, T02C);
        T02B = _mm_unpackhi_epi16(T02B, T02C);
        T13A = _mm_unpacklo_epi16(T03A, T03D);
        T13B = _mm_unpackhi_epi16(T03A, T03D);
        T03A = _mm_unpacklo_epi16(T03B, T03C);
        T03B = _mm_unpackhi_epi16(T03B, T03C);

        // Full 32-tap dot product per output row: four table rows (tab0..tab3)
        // against the four interleaved register pairs, reduced to one int32 per
        // input column, then rounded with (+1024)>>11 and stored to dst row 'dstPos'.
#define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
    T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));    \
    T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));    \
    T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));    \
    T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));    \
    T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));    \
    T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));    \
    T26 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));    \
    T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));    \
    T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));    \
    T31 = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));    \
    T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));    \
    T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));    \
    T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)]));    \
    T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)]));    \
    T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)]));    \
    T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)]));    \
        \
    T60 = _mm_hadd_epi32(T20, T21); \
    T61 = _mm_hadd_epi32(T22, T23); \
    T62 = _mm_hadd_epi32(T24, T25); \
    T63 = _mm_hadd_epi32(T26, T27); \
    T64 = _mm_hadd_epi32(T30, T31); \
    T65 = _mm_hadd_epi32(T32, T33); \
    T66 = _mm_hadd_epi32(T34, T35); \
    T67 = _mm_hadd_epi32(T36, T37); \
        \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
    T62 = _mm_hadd_epi32(T64, T65); \
    T63 = _mm_hadd_epi32(T66, T67); \
        \
    T60 = _mm_hadd_epi32(T60, T61); \
    T61 = _mm_hadd_epi32(T62, T63); \
        \
    T60 = _mm_hadd_epi32(T60, T61); \
        \
    T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
    T60 = _mm_packs_epi32(T60, T60); \
    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \

        MAKE_ODD(44, 44, 44, 44, 0);
        MAKE_ODD(45, 45, 45, 45, 16);
        MAKE_ODD(46, 47, 46, 47, 8);
        MAKE_ODD(48, 49, 48, 49, 24);

        MAKE_ODD(50, 51, 52, 53, 4);
        MAKE_ODD(54, 55, 56, 57, 12);
        MAKE_ODD(58, 59, 60, 61, 20);
        MAKE_ODD(62, 63, 64, 65, 28);

        MAKE_ODD(66, 67, 68, 69, 2);
        MAKE_ODD(70, 71, 72, 73, 6);
        MAKE_ODD(74, 75, 76, 77, 10);
        MAKE_ODD(78, 79, 80, 81, 14);

        MAKE_ODD(82, 83, 84, 85, 18);
        MAKE_ODD(86, 87, 88, 89, 22);
        MAKE_ODD(90, 91, 92, 93, 26);
        MAKE_ODD(94, 95, 96, 97, 30);

        MAKE_ODD(98, 99, 100, 101, 1);
        MAKE_ODD(102, 103, 104, 105, 3);
        MAKE_ODD(106, 107, 108, 109, 5);
        MAKE_ODD(110, 111, 112, 113, 7);
        MAKE_ODD(114, 115, 116, 117, 9);
        MAKE_ODD(118, 119, 120, 121, 11);
        MAKE_ODD(122, 123, 124, 125, 13);
        MAKE_ODD(126, 127, 128, 129, 15);
        MAKE_ODD(130, 131, 132, 133, 17);
        MAKE_ODD(134, 135, 136, 137, 19);
        MAKE_ODD(138, 139, 140, 141, 21);
        MAKE_ODD(142, 143, 144, 145, 23);
        MAKE_ODD(146, 147, 148, 149, 25);
        MAKE_ODD(150, 151, 152, 153, 27);
        MAKE_ODD(154, 155, 156, 157, 29);
        MAKE_ODD(158, 159, 160, 161, 31);
#undef MAKE_ODD
    }
}
| 1104 | } |
| 1105 | #endif // if !HIGH_BIT_DEPTH |
| 1106 | |
| 1107 | namespace x265 { |
| 1108 | void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives &p) |
| 1109 | { |
| 1110 | /* Note: We have AVX2 assembly for these two functions, but since AVX2 is |
| 1111 | * still somewhat rare on end-user PCs we still compile and link these SSSE3 |
| 1112 | * intrinsic SIMD functions */ |
| 1113 | #if !HIGH_BIT_DEPTH |
| 1114 | p.dct[DCT_16x16] = dct16; |
| 1115 | p.dct[DCT_32x32] = dct32; |
| 1116 | #endif |
| 1117 | } |
| 1118 | } |