Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * Mandar Gurav <mandar@multicorewareinc.com> | |
6 | * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com> | |
7 | * Mahesh Pittala <mahesh@multicorewareinc.com> | |
8 | * Rajesh Paulraj <rajesh@multicorewareinc.com> | |
9 | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
10 | * Min Chen <chenm003@163.com> | |
11 | * | |
12 | * This program is free software; you can redistribute it and/or modify | |
13 | * it under the terms of the GNU General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or | |
15 | * (at your option) any later version. | |
16 | * | |
17 | * This program is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | * GNU General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU General Public License | |
23 | * along with this program; if not, write to the Free Software | |
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
25 | * | |
26 | * This program is also available under a commercial proprietary license. | |
27 | * For more information, contact us at license @ x265.com. | |
28 | *****************************************************************************/ | |
29 | ||
30 | #ifndef X265_PRIMITIVES_H | |
31 | #define X265_PRIMITIVES_H | |
32 | ||
33 | #include "common.h" | |
34 | #include "cpu.h" | |
35 | ||
36 | namespace x265 { | |
37 | // x265 private namespace | |
38 | ||
// Luma block sizes (width x height) that have per-partition primitives.
// Each enumerator's value is its index into any table declared with
// NUM_LUMA_PARTITIONS entries. The first five entries are the square sizes
// in ascending order.
enum LumaPartitions
{
    // Square
    LUMA_4x4,
    LUMA_8x8,
    LUMA_16x16,
    LUMA_32x32,
    LUMA_64x64,

    // Rectangular
    LUMA_8x4,
    LUMA_4x8,
    LUMA_16x8,
    LUMA_8x16,
    LUMA_32x16,
    LUMA_16x32,
    LUMA_64x32,
    LUMA_32x64,

    // Asymmetrical (0.75, 0.25)
    LUMA_16x12,
    LUMA_12x16,
    LUMA_16x4,
    LUMA_4x16,
    LUMA_32x24,
    LUMA_24x32,
    LUMA_32x8,
    LUMA_8x32,
    LUMA_64x48,
    LUMA_48x64,
    LUMA_64x16,
    LUMA_16x64,

    // Number of entries above; not a partition itself
    NUM_LUMA_PARTITIONS
};
54 | ||
// 4:2:0 chroma partition sizes. Entry i is the LumaPartitions entry i with
// both width and height halved. These names are only a convenience for
// indexing the chroma primitive arrays when instantiating templates; the
// function tables themselves should always be indexed by the luma partition
// enum.
enum Chroma420Partitions
{
    // Counterparts of the square luma sizes
    CHROMA_2x2,
    CHROMA_4x4,
    CHROMA_8x8,
    CHROMA_16x16,
    CHROMA_32x32,

    // Counterparts of the rectangular luma sizes
    CHROMA_4x2,
    CHROMA_2x4,
    CHROMA_8x4,
    CHROMA_4x8,
    CHROMA_16x8,
    CHROMA_8x16,
    CHROMA_32x16,
    CHROMA_16x32,

    // Counterparts of the asymmetrical luma sizes
    CHROMA_8x6,
    CHROMA_6x8,
    CHROMA_8x2,
    CHROMA_2x8,
    CHROMA_16x12,
    CHROMA_12x16,
    CHROMA_16x4,
    CHROMA_4x16,
    CHROMA_32x24,
    CHROMA_24x32,
    CHROMA_32x8,
    CHROMA_8x32,

    // Number of entries above; not a partition itself
    NUM_CHROMA_PARTITIONS
};
70 | ||
// Chroma partition sizes for the 422 layout. Entry i is the LumaPartitions
// entry i with the width halved and the height unchanged.
enum Chroma422Partitions
{
    // Counterparts of the square luma sizes
    CHROMA422_2x4,
    CHROMA422_4x8,
    CHROMA422_8x16,
    CHROMA422_16x32,
    CHROMA422_32x64,

    // Counterparts of the rectangular luma sizes
    CHROMA422_4x4,
    CHROMA422_2x8,
    CHROMA422_8x8,
    CHROMA422_4x16,
    CHROMA422_16x16,
    CHROMA422_8x32,
    CHROMA422_32x32,
    CHROMA422_16x64,

    // Counterparts of the asymmetrical luma sizes
    CHROMA422_8x12,
    CHROMA422_6x16,
    CHROMA422_8x4,
    CHROMA422_2x16,
    CHROMA422_16x24,
    CHROMA422_12x32,
    CHROMA422_16x8,
    CHROMA422_4x32,
    CHROMA422_32x48,
    CHROMA422_24x64,
    CHROMA422_32x16,
    CHROMA422_8x64,

    // Number of entries above; not a partition itself
    NUM_CHROMA_PARTITIONS422
};
83 | ||
// Square block sizes in ascending order. An entry's value equals
// log2(width) - 2, so routines can be indexed using log2n(width)-2.
enum SquareBlocks
{
    BLOCK_4x4 = 0, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32, BLOCK_64x64,
    NUM_SQUARE_BLOCKS // number of entries above
};
93 | ||
// Size dimension of the intra prediction tables in EncoderPrimitives
// (intra_pred and intra_pred_allangs).
enum
{
    NUM_TR_SIZE = 4
};
95 | ||
// Indices into the forward transform table (EncoderPrimitives::dct).
// NOTE: Not all DCT functions support dest stride
enum Dcts
{
    DST_4x4 = 0, // the only DST entry; the rest are DCTs of increasing size
    DCT_4x4, DCT_8x8, DCT_16x16, DCT_32x32,
    NUM_DCTS     // number of entries above
};
106 | ||
// Indices into the inverse transform table (EncoderPrimitives::idct).
// The entries are listed in the same order as the Dcts enum.
enum IDcts
{
    IDST_4x4 = 0, // the only inverse DST entry
    IDCT_4x4, IDCT_8x8, IDCT_16x16, IDCT_32x32,
    NUM_IDCTS     // number of entries above
};
116 | ||
117 | // Returns a LumaPartitions enum for the given size, always expected to return a valid enum | |
118 | inline int partitionFromSizes(int width, int height) | |
119 | { | |
120 | X265_CHECK(((width | height) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n"); | |
121 | extern const uint8_t lumaPartitionMapTable[]; | |
122 | int w = (width >> 2) - 1; | |
123 | int h = (height >> 2) - 1; | |
124 | int part = (int)lumaPartitionMapTable[(w << 4) + h]; | |
125 | X265_CHECK(part != 255, "Invalid block width %d height %d\n", width, height); | |
126 | return part; | |
127 | } | |
128 | ||
129 | inline int partitionFromLog2Size(int log2Size) | |
130 | { | |
131 | X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n"); | |
132 | return log2Size - 2; | |
133 | } | |
134 | ||
135 | typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned | |
136 | typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride); | |
137 | typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); | |
138 | typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride); | |
139 | typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res); | |
140 | typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res); | |
141 | typedef void (*blockcpy_sp_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned | |
142 | typedef void (*blockcpy_sc_t)(int bx, int by, int16_t *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned | |
143 | typedef void (*pixelsub_ps_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); | |
144 | typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight); | |
145 | typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val); | |
146 | ||
147 | typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter); | |
148 | typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); | |
149 | ||
150 | typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int); | |
151 | typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int); | |
152 | typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int); | |
153 | typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int); | |
154 | typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride); | |
155 | typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size); | |
156 | typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift); | |
157 | ||
158 | typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride); | |
159 | typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride); | |
160 | typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff); | |
161 | ||
162 | typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); | |
163 | typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); | |
164 | typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride); | |
165 | typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); | |
166 | typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); | |
167 | typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift); | |
168 | typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); | |
169 | typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff); | |
170 | ||
171 | typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); | |
172 | typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); | |
173 | typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride); | |
174 | typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc, | |
175 | intptr_t src_stride, intptr_t dst_stride, int width, int height); | |
176 | typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX); | |
177 | typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]); | |
178 | typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width); | |
179 | typedef uint64_t (*var_t)(pixel *pix, intptr_t stride); | |
180 | typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h); | |
181 | ||
182 | typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); | |
183 | typedef void (*filter_hps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); | |
184 | typedef void (*filter_ps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); | |
185 | typedef void (*filter_sp_t) (int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); | |
186 | typedef void (*filter_ss_t) (int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); | |
187 | typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY); | |
188 | typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); | |
189 | ||
190 | typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned | |
191 | typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride); | |
192 | typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride); | |
193 | typedef void (*copy_ss_t)(int16_t *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride); | |
194 | ||
195 | typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); | |
196 | typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1); | |
197 | typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride); | |
198 | ||
199 | typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft); | |
200 | typedef void (*planecopy_cp_t) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift); | |
201 | typedef void (*planecopy_sp_t) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); | |
202 | ||
203 | typedef void (*cutree_propagate_cost) (int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len); | |
204 | ||
205 | /* Define a structure containing function pointers to optimized encoder | |
206 | * primitives. Each pointer can reference either an assembly routine, | |
207 | * a vectorized primitive, or a C function. */ | |
208 | struct EncoderPrimitives | |
209 | { | |
210 | pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size | |
211 | pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size | |
212 | pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size | |
213 | pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed | |
214 | pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed | |
215 | pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed | |
216 | pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed | |
217 | pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD) | |
218 | pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions | |
219 | pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks | |
220 | pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks | |
221 | pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS]; | |
222 | ||
223 | blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value | |
224 | cvt16to32_shl_t cvt16to32_shl; | |
225 | cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1]; | |
226 | cvt32to16_shr_t cvt32to16_shr; | |
227 | cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1]; | |
228 | copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1]; | |
229 | copy_shr_t copy_shr; | |
230 | copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1]; | |
231 | ||
232 | copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS]; | |
233 | copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS]; | |
234 | copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS]; | |
235 | copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS]; | |
236 | pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS]; | |
237 | pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS]; | |
238 | copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS]; | |
239 | copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS]; | |
240 | copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS]; | |
241 | copy_ss_t square_copy_ss[NUM_SQUARE_BLOCKS]; | |
242 | ||
243 | filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS]; | |
244 | filter_hps_t luma_hps[NUM_LUMA_PARTITIONS]; | |
245 | filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS]; | |
246 | filter_ps_t luma_vps[NUM_LUMA_PARTITIONS]; | |
247 | filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS]; | |
248 | filter_ss_t luma_vss[NUM_LUMA_PARTITIONS]; | |
249 | filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS]; | |
250 | filter_p2s_t luma_p2s; | |
251 | filter_p2s_t chroma_p2s[X265_CSP_COUNT]; | |
252 | ||
253 | weightp_sp_t weight_sp; | |
254 | weightp_pp_t weight_pp; | |
255 | pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS]; | |
256 | addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS]; | |
257 | ||
258 | intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE]; | |
259 | intra_allangs_t intra_pred_allangs[NUM_TR_SIZE]; | |
260 | scale_t scale1D_128to64; | |
261 | scale_t scale2D_64to32; | |
262 | ||
263 | dct_t dct[NUM_DCTS]; | |
264 | idct_t idct[NUM_IDCTS]; | |
265 | quant_t quant; | |
266 | nquant_t nquant; | |
267 | dequant_scaling_t dequant_scaling; | |
268 | dequant_normal_t dequant_normal; | |
269 | count_nonzero_t count_nonzero; | |
270 | denoiseDct_t denoiseDct; | |
271 | ||
272 | calcresidual_t calcresidual[NUM_SQUARE_BLOCKS]; | |
273 | transpose_t transpose[NUM_SQUARE_BLOCKS]; | |
274 | ||
275 | var_t var[NUM_SQUARE_BLOCKS]; | |
276 | ssim_4x4x2_core_t ssim_4x4x2_core; | |
277 | ssim_end4_t ssim_end_4; | |
278 | ||
279 | downscale_t frame_init_lowres_core; | |
280 | plane_copy_deinterleave_t plane_copy_deinterleave_c; | |
281 | extendCURowBorder_t extendRowBorder; | |
282 | // sao primitives | |
283 | saoCuOrgE0_t saoCuOrgE0; | |
284 | planecopy_cp_t planecopy_cp; | |
285 | planecopy_sp_t planecopy_sp; | |
286 | ||
287 | cutree_propagate_cost propagateCost; | |
288 | ||
289 | struct | |
290 | { | |
291 | filter_pp_t filter_vpp[NUM_LUMA_PARTITIONS]; | |
292 | filter_ps_t filter_vps[NUM_LUMA_PARTITIONS]; | |
293 | filter_sp_t filter_vsp[NUM_LUMA_PARTITIONS]; | |
294 | filter_ss_t filter_vss[NUM_LUMA_PARTITIONS]; | |
295 | filter_pp_t filter_hpp[NUM_LUMA_PARTITIONS]; | |
296 | filter_hps_t filter_hps[NUM_LUMA_PARTITIONS]; | |
297 | addAvg_t addAvg[NUM_LUMA_PARTITIONS]; | |
298 | copy_pp_t copy_pp[NUM_LUMA_PARTITIONS]; | |
299 | copy_sp_t copy_sp[NUM_LUMA_PARTITIONS]; | |
300 | copy_ps_t copy_ps[NUM_LUMA_PARTITIONS]; | |
301 | copy_ss_t copy_ss[NUM_LUMA_PARTITIONS]; | |
302 | pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS]; | |
303 | pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS]; | |
304 | } chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here | |
305 | }; | |
306 | ||
307 | void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY); | |
308 | ||
309 | /* This copy of the table is what gets used by the encoder. | |
310 | * It must be initialized before the encoder begins. */ | |
311 | extern EncoderPrimitives primitives; | |
312 | ||
313 | void Setup_C_Primitives(EncoderPrimitives &p); | |
314 | void Setup_Instrinsic_Primitives(EncoderPrimitives &p, int cpuMask); | |
315 | void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask); | |
316 | void Setup_Alias_Primitives(EncoderPrimitives &p); | |
317 | } | |
318 | ||
319 | #endif // ifndef X265_PRIMITIVES_H |