Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * Mandar Gurav <mandar@multicorewareinc.com> | |
6 | * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com> | |
7 | * Mahesh Pittala <mahesh@multicorewareinc.com> | |
8 | * Rajesh Paulraj <rajesh@multicorewareinc.com> | |
9 | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
10 | * Min Chen <chenm003@163.com> | |
11 | * | |
12 | * This program is free software; you can redistribute it and/or modify | |
13 | * it under the terms of the GNU General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or | |
15 | * (at your option) any later version. | |
16 | * | |
17 | * This program is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | * GNU General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU General Public License | |
23 | * along with this program; if not, write to the Free Software | |
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
25 | * | |
26 | * This program is also available under a commercial proprietary license. | |
27 | * For more information, contact us at license @ x265.com. | |
28 | *****************************************************************************/ | |
29 | ||
30 | #ifndef X265_PRIMITIVES_H | |
31 | #define X265_PRIMITIVES_H | |
32 | ||
33 | #include "common.h" | |
34 | #include "cpu.h" | |
35 | ||
36 | namespace x265 { | |
37 | // x265 private namespace | |
38 | ||
// Luma block sizes, named LUMA_<width>x<height>. The enumerator order fixes the
// index of each size; NUM_LUMA_PARTITIONS is the total count.
enum LumaPartitions
{
    // Square
    LUMA_4x4 = 0,
    LUMA_8x8,
    LUMA_16x16,
    LUMA_32x32,
    LUMA_64x64,

    // Rectangular
    LUMA_8x4,
    LUMA_4x8,
    LUMA_16x8,
    LUMA_8x16,
    LUMA_32x16,
    LUMA_16x32,
    LUMA_64x32,
    LUMA_32x64,

    // Asymmetrical (0.75, 0.25)
    LUMA_16x12,
    LUMA_12x16,
    LUMA_16x4,
    LUMA_4x16,
    LUMA_32x24,
    LUMA_24x32,
    LUMA_32x8,
    LUMA_8x32,
    LUMA_64x48,
    LUMA_48x64,
    LUMA_64x16,
    LUMA_16x64,

    NUM_LUMA_PARTITIONS
};
54 | ||
// Chroma block sizes for 4:2:0. These names exist only as a convenience for
// indexing the chroma primitive arrays while instantiating templates; the
// function tables themselves should always be indexed by the luma partition enum.
enum Chroma420Partitions
{
    CHROMA_2x2 = 0,
    CHROMA_4x4,
    CHROMA_8x8,
    CHROMA_16x16,
    CHROMA_32x32,

    CHROMA_4x2,
    CHROMA_2x4,
    CHROMA_8x4,
    CHROMA_4x8,
    CHROMA_16x8,
    CHROMA_8x16,
    CHROMA_32x16,
    CHROMA_16x32,

    CHROMA_8x6,
    CHROMA_6x8,
    CHROMA_8x2,
    CHROMA_2x8,
    CHROMA_16x12,
    CHROMA_12x16,
    CHROMA_16x4,
    CHROMA_4x16,
    CHROMA_32x24,
    CHROMA_24x32,
    CHROMA_32x8,
    CHROMA_8x32,

    NUM_CHROMA_PARTITIONS
};
70 | ||
// Chroma block sizes named CHROMA422_<width>x<height>. The enumerators are
// listed in the same positional layout as the other partition enums in this
// header, ending with a count.
enum Chroma422Partitions
{
    CHROMA422_2x4 = 0,
    CHROMA422_4x8,
    CHROMA422_8x16,
    CHROMA422_16x32,
    CHROMA422_32x64,

    CHROMA422_4x4,
    CHROMA422_2x8,
    CHROMA422_8x8,
    CHROMA422_4x16,
    CHROMA422_16x16,
    CHROMA422_8x32,
    CHROMA422_32x32,
    CHROMA422_16x64,

    CHROMA422_8x12,
    CHROMA422_6x16,
    CHROMA422_8x4,
    CHROMA422_2x16,
    CHROMA422_16x24,
    CHROMA422_12x32,
    CHROMA422_16x8,
    CHROMA422_4x32,
    CHROMA422_32x48,
    CHROMA422_24x64,
    CHROMA422_32x16,
    CHROMA422_8x64,

    NUM_CHROMA_PARTITIONS422
};
83 | ||
// Square block sizes. Routines can be indexed using log2n(width)-2,
// so BLOCK_4x4 is 0 and BLOCK_64x64 is 4.
enum SquareBlocks
{
    BLOCK_4x4 = 0,
    BLOCK_8x8,
    BLOCK_16x16,
    BLOCK_32x32,
    BLOCK_64x64,

    NUM_SQUARE_BLOCKS
};
93 | ||
// Compile-time constant used in this header to dimension the intra prediction
// function tables.
enum
{
    NUM_TR_SIZE = 4
};
95 | ||
// Indices for the forward transform function table.
// NOTE: Not all DCT functions support dest stride
enum Dcts
{
    DST_4x4 = 0,
    DCT_4x4,
    DCT_8x8,
    DCT_16x16,
    DCT_32x32,

    NUM_DCTS
};
106 | ||
// Indices for the inverse transform function table; the entries are listed in
// the same order as the Dcts enum.
enum IDcts
{
    IDST_4x4 = 0,
    IDCT_4x4,
    IDCT_8x8,
    IDCT_16x16,
    IDCT_32x32,

    NUM_IDCTS
};
116 | ||
117 | // Returns a LumaPartitions enum for the given size, always expected to return a valid enum | |
118 | inline int partitionFromSizes(int width, int height) | |
119 | { | |
120 | X265_CHECK(((width | height) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n"); | |
121 | extern const uint8_t lumaPartitionMapTable[]; | |
122 | int w = (width >> 2) - 1; | |
123 | int h = (height >> 2) - 1; | |
124 | int part = (int)lumaPartitionMapTable[(w << 4) + h]; | |
125 | X265_CHECK(part != 255, "Invalid block width %d height %d\n", width, height); | |
126 | return part; | |
127 | } | |
128 | ||
129 | inline int partitionFromLog2Size(int log2Size) | |
130 | { | |
131 | X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n"); | |
132 | return log2Size - 2; | |
133 | } | |
134 | ||
b53f7c52 JB |
135 | typedef int (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned |
136 | typedef int (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride); | |
137 | typedef int (*pixelcmp_sp_t)(const int16_t* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); | |
138 | typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride); | |
139 | typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res); | |
140 | typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res); | |
141 | typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight); | |
142 | typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val); | |
143 | ||
144 | typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter); | |
145 | typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma); | |
146 | ||
147 | typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); | |
148 | typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); | |
149 | typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); | |
150 | typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); | |
151 | typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride); | |
152 | ||
153 | typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride); | |
154 | typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride); | |
155 | typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff); | |
156 | ||
157 | typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); | |
158 | typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride); | |
159 | typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff); | |
160 | typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); | |
161 | typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift); | |
162 | typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); | |
163 | typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff); | |
164 | ||
165 | typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); | |
166 | typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); | |
167 | typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride); | |
168 | typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc, | |
72b9787e JB |
169 | intptr_t src_stride, intptr_t dst_stride, int width, int height); |
170 | typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX); | |
b53f7c52 | 171 | typedef void (*ssim_4x4x2_core_t)(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]); |
72b9787e | 172 | typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width); |
b53f7c52 JB |
173 | typedef uint64_t (*var_t)(const pixel* pix, intptr_t stride); |
174 | typedef void (*plane_copy_deinterleave_t)(pixel* dstu, intptr_t dstuStride, pixel* dstv, intptr_t dstvStride, const pixel* src, intptr_t srcStride, int w, int h); | |
72b9787e | 175 | |
b53f7c52 JB |
176 | typedef void (*filter_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); |
177 | typedef void (*filter_hps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); | |
178 | typedef void (*filter_ps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); | |
179 | typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); | |
180 | typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); | |
181 | typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); | |
182 | typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); | |
72b9787e | 183 | |
b53f7c52 JB |
184 | typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned |
185 | typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); | |
186 | typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); | |
187 | typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); | |
72b9787e | 188 | |
b53f7c52 JB |
189 | typedef void (*pixel_sub_ps_t)(int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1); |
190 | typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); | |
191 | typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride); | |
72b9787e | 192 | |
b53f7c52 JB |
193 | typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft); |
194 | typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); | |
195 | typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); | |
72b9787e | 196 | |
b53f7c52 | 197 | typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len); |
72b9787e JB |
198 | |
199 | /* Define a structure containing function pointers to optimized encoder | |
200 | * primitives. Each pointer can reference either an assembly routine, | |
201 | * a vectorized primitive, or a C function. */ | |
202 | struct EncoderPrimitives | |
203 | { | |
b53f7c52 JB |
204 | pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size |
205 | pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size | |
206 | pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size | |
207 | pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed | |
208 | pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed | |
209 | pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed | |
210 | pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed | |
211 | pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD) | |
212 | pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions | |
213 | pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks | |
214 | pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks | |
215 | pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS]; | |
216 | ||
217 | dct_t dct[NUM_DCTS]; | |
218 | idct_t idct[NUM_IDCTS]; | |
219 | quant_t quant; | |
220 | nquant_t nquant; | |
221 | dequant_scaling_t dequant_scaling; | |
222 | dequant_normal_t dequant_normal; | |
223 | count_nonzero_t count_nonzero; | |
224 | denoiseDct_t denoiseDct; | |
225 | calcresidual_t calcresidual[NUM_SQUARE_BLOCKS]; | |
226 | blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value | |
227 | cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1]; | |
228 | cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1]; | |
229 | cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1]; | |
230 | cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1]; | |
231 | copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1]; | |
232 | ||
233 | intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE]; | |
234 | intra_allangs_t intra_pred_allangs[NUM_TR_SIZE]; | |
235 | transpose_t transpose[NUM_SQUARE_BLOCKS]; | |
236 | scale_t scale1D_128to64; | |
237 | scale_t scale2D_64to32; | |
238 | ||
239 | var_t var[NUM_SQUARE_BLOCKS]; | |
240 | ssim_4x4x2_core_t ssim_4x4x2_core; | |
241 | ssim_end4_t ssim_end_4; | |
242 | ||
243 | saoCuOrgE0_t saoCuOrgE0; | |
244 | ||
245 | downscale_t frameInitLowres; | |
246 | cutree_propagate_cost propagateCost; | |
247 | ||
248 | extendCURowBorder_t extendRowBorder; | |
249 | planecopy_cp_t planecopy_cp; | |
250 | planecopy_sp_t planecopy_sp; | |
251 | ||
252 | weightp_sp_t weight_sp; | |
253 | weightp_pp_t weight_pp; | |
254 | pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS]; | |
255 | addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS]; | |
256 | ||
257 | filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS]; | |
258 | filter_hps_t luma_hps[NUM_LUMA_PARTITIONS]; | |
259 | filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS]; | |
260 | filter_ps_t luma_vps[NUM_LUMA_PARTITIONS]; | |
261 | filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS]; | |
262 | filter_ss_t luma_vss[NUM_LUMA_PARTITIONS]; | |
263 | filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS]; | |
264 | filter_p2s_t luma_p2s; | |
265 | ||
266 | copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS]; | |
267 | copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS]; | |
268 | copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS]; | |
269 | copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS]; | |
270 | pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS]; | |
271 | pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS]; | |
72b9787e JB |
272 | |
273 | struct | |
274 | { | |
b53f7c52 | 275 | pixelcmp_t satd[NUM_LUMA_PARTITIONS]; |
72b9787e JB |
276 | filter_pp_t filter_vpp[NUM_LUMA_PARTITIONS]; |
277 | filter_ps_t filter_vps[NUM_LUMA_PARTITIONS]; | |
278 | filter_sp_t filter_vsp[NUM_LUMA_PARTITIONS]; | |
279 | filter_ss_t filter_vss[NUM_LUMA_PARTITIONS]; | |
280 | filter_pp_t filter_hpp[NUM_LUMA_PARTITIONS]; | |
281 | filter_hps_t filter_hps[NUM_LUMA_PARTITIONS]; | |
282 | addAvg_t addAvg[NUM_LUMA_PARTITIONS]; | |
283 | copy_pp_t copy_pp[NUM_LUMA_PARTITIONS]; | |
284 | copy_sp_t copy_sp[NUM_LUMA_PARTITIONS]; | |
285 | copy_ps_t copy_ps[NUM_LUMA_PARTITIONS]; | |
286 | copy_ss_t copy_ss[NUM_LUMA_PARTITIONS]; | |
287 | pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS]; | |
288 | pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS]; | |
b53f7c52 JB |
289 | filter_p2s_t p2s; |
290 | } chroma[X265_CSP_COUNT]; | |
72b9787e JB |
291 | }; |
292 | ||
293 | void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY); | |
294 | ||
295 | /* This copy of the table is what gets used by the encoder. | |
296 | * It must be initialized before the encoder begins. */ | |
297 | extern EncoderPrimitives primitives; | |
298 | ||
299 | void Setup_C_Primitives(EncoderPrimitives &p); | |
300 | void Setup_Instrinsic_Primitives(EncoderPrimitives &p, int cpuMask); | |
301 | void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask); | |
302 | void Setup_Alias_Primitives(EncoderPrimitives &p); | |
303 | } | |
304 | ||
305 | #endif // ifndef X265_PRIMITIVES_H |