| 1 | /***************************************************************************** |
| 2 | * Copyright (C) 2013 x265 project |
| 3 | * |
| 4 | * Authors: Steve Borho <steve@borho.org> |
| 5 | * Mandar Gurav <mandar@multicorewareinc.com> |
| 6 | * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com> |
| 7 | * Mahesh Pittala <mahesh@multicorewareinc.com> |
| 8 | * Rajesh Paulraj <rajesh@multicorewareinc.com> |
| 9 | * Min Chen <min.chen@multicorewareinc.com> |
| 10 | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> |
| 11 | * Nabajit Deka <nabajit@multicorewareinc.com> |
| 12 | * |
| 13 | * This program is free software; you can redistribute it and/or modify |
| 14 | * it under the terms of the GNU General Public License as published by |
| 15 | * the Free Software Foundation; either version 2 of the License, or |
| 16 | * (at your option) any later version. |
| 17 | * |
| 18 | * This program is distributed in the hope that it will be useful, |
| 19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 21 | * GNU General Public License for more details. |
| 22 | * |
| 23 | * You should have received a copy of the GNU General Public License |
| 24 | * along with this program; if not, write to the Free Software |
| 25 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
| 26 | * |
| 27 | * This program is also available under a commercial proprietary license. |
| 28 | * For more information, contact us at license @ x265.com. |
| 29 | *****************************************************************************/ |
| 30 | |
| 31 | #include "common.h" |
| 32 | #include "primitives.h" |
| 33 | #include <xmmintrin.h> // SSE |
| 34 | #include <smmintrin.h> // SSE4.1 |
| 35 | |
| 36 | using namespace x265; |
| 37 | |
| 38 | namespace { |
| 39 | void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift) |
| 40 | { |
| 41 | X265_CHECK(num <= 32 * 32, "dequant num too large\n"); |
| 42 | |
| 43 | int valueToAdd; |
| 44 | |
| 45 | shift += 4; |
| 46 | |
| 47 | if (shift > per) |
| 48 | { |
| 49 | valueToAdd = 1 << (shift - per - 1); |
| 50 | __m128i IAdd = _mm_set1_epi32(valueToAdd); |
| 51 | |
| 52 | for (int n = 0; n < num; n = n + 8) |
| 53 | { |
| 54 | __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign; |
| 55 | |
| 56 | quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n)); |
| 57 | |
| 58 | deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n)); |
| 59 | deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4)); |
| 60 | |
| 61 | sign = _mm_srai_epi16(quantCoef12, 15); |
| 62 | quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); |
| 63 | quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); |
| 64 | |
| 65 | quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, deQuantCoef1), IAdd), _mm_cvtsi32_si128(shift - per)); |
| 66 | quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per)); |
| 67 | |
| 68 | quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); |
| 69 | sign = _mm_srai_epi16(quantCoef12, 15); |
| 70 | quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); |
| 71 | _mm_storeu_si128((__m128i*)(coef + n), quantCoef1); |
| 72 | quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); |
| 73 | _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2); |
| 74 | } |
| 75 | } |
| 76 | else |
| 77 | { |
| 78 | for (int n = 0; n < num; n = n + 8) |
| 79 | { |
| 80 | __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign; |
| 81 | |
| 82 | quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n)); |
| 83 | |
| 84 | deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n)); |
| 85 | deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4)); |
| 86 | |
| 87 | sign = _mm_srai_epi16(quantCoef12, 15); |
| 88 | quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); |
| 89 | quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); |
| 90 | |
| 91 | quantCoef1 = _mm_mullo_epi32(quantCoef1, deQuantCoef1); |
| 92 | quantCoef2 = _mm_mullo_epi32(quantCoef2, deQuantCoef2); |
| 93 | |
| 94 | quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); |
| 95 | sign = _mm_srai_epi16(quantCoef12, 15); |
| 96 | quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); |
| 97 | quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); |
| 98 | |
| 99 | quantCoef1 = _mm_sll_epi32(quantCoef1, _mm_cvtsi32_si128(per - shift)); |
| 100 | quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift)); |
| 101 | |
| 102 | quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); |
| 103 | sign = _mm_srai_epi16(quantCoef12, 15); |
| 104 | quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); |
| 105 | _mm_storeu_si128((__m128i*)(coef + n), quantCoef1); |
| 106 | quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); |
| 107 | _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2); |
| 108 | } |
| 109 | } |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | namespace x265 { |
| 114 | void Setup_Vec_DCTPrimitives_sse41(EncoderPrimitives &p) |
| 115 | { |
| 116 | p.dequant_scaling = dequant_scaling; |
| 117 | } |
| 118 | } |