1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10 * Min Chen <chenm003@163.com>
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 * This program is also available under a commercial proprietary license.
27 * For more information, contact us at license @ x265.com.
28 *****************************************************************************/
30 #ifndef X265_PRIMITIVES_H
31 #define X265_PRIMITIVES_H
37 // x265 private namespace
42 LUMA_4x4
, LUMA_8x8
, LUMA_16x16
, LUMA_32x32
, LUMA_64x64
,
46 LUMA_32x16
, LUMA_16x32
,
47 LUMA_64x32
, LUMA_32x64
,
48 // Asymmetrical (0.75, 0.25)
49 LUMA_16x12
, LUMA_12x16
, LUMA_16x4
, LUMA_4x16
,
50 LUMA_32x24
, LUMA_24x32
, LUMA_32x8
, LUMA_8x32
,
51 LUMA_64x48
, LUMA_48x64
, LUMA_64x16
, LUMA_16x64
,
// 4:2:0 chroma partition sizes. These enums are just a convenience for indexing into the
// chroma primitive arrays when instantiating templates. The function tables should always
// be indexed by the luma partition enum
enum Chroma420Partitions
{
    // Square
    CHROMA_2x2,   CHROMA_4x4,   CHROMA_8x8,  CHROMA_16x16, CHROMA_32x32,
    // Rectangular (2:1 and 1:2)
    CHROMA_4x2,   CHROMA_2x4,
    CHROMA_8x4,   CHROMA_4x8,
    CHROMA_16x8,  CHROMA_8x16,
    CHROMA_32x16, CHROMA_16x32,
    // Asymmetrical
    CHROMA_8x6,   CHROMA_6x8,   CHROMA_8x2,  CHROMA_2x8,
    CHROMA_16x12, CHROMA_12x16, CHROMA_16x4, CHROMA_4x16,
    CHROMA_32x24, CHROMA_24x32, CHROMA_32x8, CHROMA_8x32,
    // Count of the entries above; mirrors NUM_CHROMA_PARTITIONS422 in the 4:2:2 enum.
    // NOTE(review): the original list ended on a dangling comma - confirm this name.
    NUM_CHROMA_PARTITIONS
};
// 4:2:2 chroma partition sizes, listed in the same order as Chroma420Partitions
// (square, rectangular, asymmetrical) so both enums share index positions.
enum Chroma422Partitions
{
    // Entries in the square group of the 4:2:0 enum
    CHROMA422_2x4,   CHROMA422_4x8,   CHROMA422_8x16,  CHROMA422_16x32, CHROMA422_32x64,
    // Entries in the rectangular group of the 4:2:0 enum
    CHROMA422_4x4,   CHROMA422_2x8,
    CHROMA422_8x8,   CHROMA422_4x16,
    CHROMA422_16x16, CHROMA422_8x32,
    CHROMA422_32x32, CHROMA422_16x64,
    // Entries in the asymmetrical group of the 4:2:0 enum
    CHROMA422_8x12,  CHROMA422_6x16,  CHROMA422_8x4,   CHROMA422_2x16,
    CHROMA422_16x24, CHROMA422_12x32, CHROMA422_16x8,  CHROMA422_4x32,
    CHROMA422_32x48, CHROMA422_24x64, CHROMA422_32x16, CHROMA422_8x64,
    // Count of the entries above
    NUM_CHROMA_PARTITIONS422
};
84 enum SquareBlocks
// Routines can be indexed using log2n(width)-2
// Extent of the block-size dimension of the intra_pred[][NUM_TR_SIZE] and
// intra_pred_allangs[NUM_TR_SIZE] tables in EncoderPrimitives.
enum { NUM_TR_SIZE = 4 };
96 // NOTE: Not all DCT functions support dest stride
117 // Returns a LumaPartitions enum for the given size, always expected to return a valid enum
118 inline int partitionFromSizes(int width
, int height
)
120 X265_CHECK(((width
| height
) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n");
121 extern const uint8_t lumaPartitionMapTable
[];
122 int w
= (width
>> 2) - 1;
123 int h
= (height
>> 2) - 1;
124 int part
= (int)lumaPartitionMapTable
[(w
<< 4) + h
];
125 X265_CHECK(part
!= 255, "Invalid block width %d height %d\n", width
, height
);
129 inline int partitionFromLog2Size(int log2Size
)
131 X265_CHECK(2 <= log2Size
&& log2Size
<= 6, "Invalid block size\n");
135 typedef int (*pixelcmp_t
)(pixel
*fenc
, intptr_t fencstride
, pixel
*fref
, intptr_t frefstride
); // fenc is aligned
136 typedef int (*pixelcmp_ss_t
)(int16_t *fenc
, intptr_t fencstride
, int16_t *fref
, intptr_t frefstride
);
137 typedef int (*pixelcmp_sp_t
)(int16_t *fenc
, intptr_t fencstride
, pixel
*fref
, intptr_t frefstride
);
138 typedef int (*pixel_ssd_s_t
)(int16_t *fenc
, intptr_t fencstride
);
139 typedef void (*pixelcmp_x4_t
)(pixel
*fenc
, pixel
*fref0
, pixel
*fref1
, pixel
*fref2
, pixel
*fref3
, intptr_t frefstride
, int32_t *res
);
140 typedef void (*pixelcmp_x3_t
)(pixel
*fenc
, pixel
*fref0
, pixel
*fref1
, pixel
*fref2
, intptr_t frefstride
, int32_t *res
);
141 typedef void (*blockcpy_sp_t
)(int bx
, int by
, int16_t *dst
, intptr_t dstride
, pixel
*src
, intptr_t sstride
); // dst is aligned
142 typedef void (*blockcpy_sc_t
)(int bx
, int by
, int16_t *dst
, intptr_t dstride
, uint8_t *src
, intptr_t sstride
); // dst is aligned
143 typedef void (*pixelsub_ps_t
)(int bx
, int by
, int16_t *dst
, intptr_t dstride
, pixel
*src0
, pixel
*src1
, intptr_t sstride0
, intptr_t sstride1
);
144 typedef void (*pixelavg_pp_t
)(pixel
*dst
, intptr_t dstride
, pixel
*src0
, intptr_t sstride0
, pixel
*src1
, intptr_t sstride1
, int weight
);
145 typedef void (*blockfill_s_t
)(int16_t *dst
, intptr_t dstride
, int16_t val
);
147 typedef void (*intra_pred_t
)(pixel
* dst
, intptr_t dstStride
, pixel
*refLeft
, pixel
*refAbove
, int dirMode
, int bFilter
);
148 typedef void (*intra_allangs_t
)(pixel
*dst
, pixel
*above0
, pixel
*left0
, pixel
*above1
, pixel
*left1
, int bLuma
);
// Width-conversion and coefficient-copy primitive signatures.
// The trailing unnamed intptr_t/int parameters are unnamed in the original
// declarations; their meaning is defined by the implementations.

// int16_t -> int32_t conversions
typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
// int32_t -> int16_t conversions (note: the shl variant takes one fewer int)
typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
// Takes coeff and residual buffers plus a stride; returns a uint32_t.
typedef uint32_t (*copy_cnt_t)(int16_t *coeff, int16_t *residual, intptr_t stride);
// int16_t -> int16_t copies with a shift argument; the shr variant also takes a size.
typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
// Transform primitive signatures.

// Forward transform: int16_t input, int32_t output, one stride argument.
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
// Inverse transform: int32_t input, int16_t output, one stride argument.
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
// Operates on numCoeff entries of dctCoef together with resSum and offset arrays.
typedef void (*denoiseDct_t)(int32_t *dctCoef, uint32_t *resSum, uint16_t *offset, int numCoeff);
162 typedef void (*calcresidual_t
)(pixel
*fenc
, pixel
*pred
, int16_t *residual
, intptr_t stride
);
163 typedef void (*calcrecon_t
)(pixel
* pred
, int16_t* residual
, int16_t* reconqt
, pixel
*reconipred
, int stride
, int strideqt
, int strideipred
);
164 typedef void (*transpose_t
)(pixel
* dst
, pixel
* src
, intptr_t stride
);
165 typedef uint32_t (*quant_t
)(int32_t *coef
, int32_t *quantCoeff
, int32_t *deltaU
, int16_t *qCoef
, int qBits
, int add
, int numCoeff
);
166 typedef uint32_t (*nquant_t
)(int32_t *coef
, int32_t *quantCoeff
, int16_t *qCoef
, int qBits
, int add
, int numCoeff
);
167 typedef void (*dequant_scaling_t
)(const int16_t* src
, const int32_t *dequantCoef
, int32_t* dst
, int num
, int mcqp_miper
, int shift
);
168 typedef void (*dequant_normal_t
)(const int16_t* quantCoef
, int32_t* coef
, int num
, int scale
, int shift
);
169 typedef int (*count_nonzero_t
)(const int16_t *quantCoeff
, int numCoeff
);
171 typedef void (*weightp_pp_t
)(pixel
*src
, pixel
*dst
, intptr_t stride
, int width
, int height
, int w0
, int round
, int shift
, int offset
);
172 typedef void (*weightp_sp_t
)(int16_t *src
, pixel
*dst
, intptr_t srcStride
, intptr_t dstStride
, int width
, int height
, int w0
, int round
, int shift
, int offset
);
173 typedef void (*scale_t
)(pixel
*dst
, pixel
*src
, intptr_t stride
);
174 typedef void (*downscale_t
)(pixel
*src0
, pixel
*dstf
, pixel
*dsth
, pixel
*dstv
, pixel
*dstc
,
175 intptr_t src_stride
, intptr_t dst_stride
, int width
, int height
);
176 typedef void (*extendCURowBorder_t
)(pixel
* txt
, intptr_t stride
, int width
, int height
, int marginX
);
177 typedef void (*ssim_4x4x2_core_t
)(const pixel
*pix1
, intptr_t stride1
, const pixel
*pix2
, intptr_t stride2
, int sums
[2][4]);
178 typedef float (*ssim_end4_t
)(int sum0
[5][4], int sum1
[5][4], int width
);
179 typedef uint64_t (*var_t
)(pixel
*pix
, intptr_t stride
);
180 typedef void (*plane_copy_deinterleave_t
)(pixel
*dstu
, intptr_t dstuStride
, pixel
*dstv
, intptr_t dstvStride
, pixel
*src
, intptr_t srcStride
, int w
, int h
);
182 typedef void (*filter_pp_t
) (pixel
*src
, intptr_t srcStride
, pixel
*dst
, intptr_t dstStride
, int coeffIdx
);
183 typedef void (*filter_hps_t
) (pixel
*src
, intptr_t srcStride
, int16_t *dst
, intptr_t dstStride
, int coeffIdx
, int isRowExt
);
184 typedef void (*filter_ps_t
) (pixel
*src
, intptr_t srcStride
, int16_t *dst
, intptr_t dstStride
, int coeffIdx
);
185 typedef void (*filter_sp_t
) (int16_t *src
, intptr_t srcStride
, pixel
*dst
, intptr_t dstStride
, int coeffIdx
);
186 typedef void (*filter_ss_t
) (int16_t *src
, intptr_t srcStride
, int16_t *dst
, intptr_t dstStride
, int coeffIdx
);
187 typedef void (*filter_hv_pp_t
) (pixel
*src
, intptr_t srcStride
, pixel
*dst
, intptr_t dstStride
, int idxX
, int idxY
);
188 typedef void (*filter_p2s_t
)(pixel
*src
, intptr_t srcStride
, int16_t *dst
, int width
, int height
);
190 typedef void (*copy_pp_t
)(pixel
*dst
, intptr_t dstride
, pixel
*src
, intptr_t sstride
); // dst is aligned
191 typedef void (*copy_sp_t
)(pixel
*dst
, intptr_t dstStride
, int16_t *src
, intptr_t srcStride
);
192 typedef void (*copy_ps_t
)(int16_t *dst
, intptr_t dstStride
, pixel
*src
, intptr_t srcStride
);
193 typedef void (*copy_ss_t
)(int16_t *dst
, intptr_t dstStride
, int16_t *src
, intptr_t srcStride
);
195 typedef void (*pixel_sub_ps_t
)(int16_t *dst
, intptr_t dstride
, pixel
*src0
, pixel
*src1
, intptr_t sstride0
, intptr_t sstride1
);
196 typedef void (*pixel_add_ps_t
)(pixel
*a
, intptr_t dstride
, pixel
*b0
, int16_t *b1
, intptr_t sstride0
, intptr_t sstride1
);
197 typedef void (*addAvg_t
)(int16_t* src0
, int16_t* src1
, pixel
* dst
, intptr_t src0Stride
, intptr_t src1Stride
, intptr_t dstStride
);
199 typedef void (*saoCuOrgE0_t
)(pixel
* rec
, int8_t * offsetEo
, int width
, int8_t signLeft
);
200 typedef void (*planecopy_cp_t
) (uint8_t *src
, intptr_t srcStride
, pixel
*dst
, intptr_t dstStride
, int width
, int height
, int shift
);
201 typedef void (*planecopy_sp_t
) (uint16_t *src
, intptr_t srcStride
, pixel
*dst
, intptr_t dstStride
, int width
, int height
, int shift
, uint16_t mask
);
// Signature of the propagateCost primitive: writes len-sized output to dst from
// the propagateIn / intraCosts / interCosts / invQscales arrays and *fpsFactor.
// (len is presumably the element count of each array - confirm with implementations.)
// Note: unlike the other primitive types in this header the name carries no _t suffix.
typedef void (*cutree_propagate_cost)(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len);
205 /* Define a structure containing function pointers to optimized encoder
206 * primitives. Each pointer can reference either an assembly routine,
207 * a vectorized primitive, or a C function. */
208 struct EncoderPrimitives
210 pixelcmp_t sad
[NUM_LUMA_PARTITIONS
]; // Sum of Differences for each size
211 pixelcmp_x3_t sad_x3
[NUM_LUMA_PARTITIONS
]; // Sum of Differences 3x for each size
212 pixelcmp_x4_t sad_x4
[NUM_LUMA_PARTITIONS
]; // Sum of Differences 4x for each size
213 pixelcmp_t sse_pp
[NUM_LUMA_PARTITIONS
]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
214 pixelcmp_ss_t sse_ss
[NUM_LUMA_PARTITIONS
]; // Sum of Square Error (short, short) fenc alignment not assumed
215 pixelcmp_sp_t sse_sp
[NUM_LUMA_PARTITIONS
]; // Sum of Square Error (short, pixel) fenc alignment not assumed
216 pixel_ssd_s_t ssd_s
[NUM_SQUARE_BLOCKS
- 1]; // Sum of Square Error (short) fenc alignment not assumed
217 pixelcmp_t satd
[NUM_LUMA_PARTITIONS
]; // Sum of Transformed differences (HADAMARD)
218 pixelcmp_t sa8d_inter
[NUM_LUMA_PARTITIONS
]; // sa8d primitives for motion search partitions
219 pixelcmp_t sa8d
[NUM_SQUARE_BLOCKS
]; // sa8d primitives for square intra blocks
220 pixelcmp_t psy_cost_pp
[NUM_SQUARE_BLOCKS
]; // difference in AC energy between two blocks
221 pixelcmp_ss_t psy_cost_ss
[NUM_SQUARE_BLOCKS
];
223 blockfill_s_t blockfill_s
[NUM_SQUARE_BLOCKS
]; // block fill with value
224 cvt16to32_shl_t cvt16to32_shl
;
225 cvt16to32_shr_t cvt16to32_shr
[NUM_SQUARE_BLOCKS
- 1];
226 cvt32to16_shr_t cvt32to16_shr
;
227 cvt32to16_shl_t cvt32to16_shl
[NUM_SQUARE_BLOCKS
- 1];
228 copy_cnt_t copy_cnt
[NUM_SQUARE_BLOCKS
- 1];
230 copy_shl_t copy_shl
[NUM_SQUARE_BLOCKS
- 1];
232 copy_pp_t luma_copy_pp
[NUM_LUMA_PARTITIONS
];
233 copy_sp_t luma_copy_sp
[NUM_LUMA_PARTITIONS
];
234 copy_ps_t luma_copy_ps
[NUM_LUMA_PARTITIONS
];
235 copy_ss_t luma_copy_ss
[NUM_LUMA_PARTITIONS
];
236 pixel_sub_ps_t luma_sub_ps
[NUM_SQUARE_BLOCKS
];
237 pixel_add_ps_t luma_add_ps
[NUM_SQUARE_BLOCKS
];
238 copy_pp_t square_copy_pp
[NUM_SQUARE_BLOCKS
];
239 copy_sp_t square_copy_sp
[NUM_SQUARE_BLOCKS
];
240 copy_ps_t square_copy_ps
[NUM_SQUARE_BLOCKS
];
241 copy_ss_t square_copy_ss
[NUM_SQUARE_BLOCKS
];
243 filter_pp_t luma_hpp
[NUM_LUMA_PARTITIONS
];
244 filter_hps_t luma_hps
[NUM_LUMA_PARTITIONS
];
245 filter_pp_t luma_vpp
[NUM_LUMA_PARTITIONS
];
246 filter_ps_t luma_vps
[NUM_LUMA_PARTITIONS
];
247 filter_sp_t luma_vsp
[NUM_LUMA_PARTITIONS
];
248 filter_ss_t luma_vss
[NUM_LUMA_PARTITIONS
];
249 filter_hv_pp_t luma_hvpp
[NUM_LUMA_PARTITIONS
];
250 filter_p2s_t luma_p2s
;
251 filter_p2s_t chroma_p2s
[X265_CSP_COUNT
];
253 weightp_sp_t weight_sp
;
254 weightp_pp_t weight_pp
;
255 pixelavg_pp_t pixelavg_pp
[NUM_LUMA_PARTITIONS
];
256 addAvg_t luma_addAvg
[NUM_LUMA_PARTITIONS
];
258 intra_pred_t intra_pred
[NUM_INTRA_MODE
][NUM_TR_SIZE
];
259 intra_allangs_t intra_pred_allangs
[NUM_TR_SIZE
];
260 scale_t scale1D_128to64
;
261 scale_t scale2D_64to32
;
264 idct_t idct
[NUM_IDCTS
];
267 dequant_scaling_t dequant_scaling
;
268 dequant_normal_t dequant_normal
;
269 count_nonzero_t count_nonzero
;
270 denoiseDct_t denoiseDct
;
272 calcresidual_t calcresidual
[NUM_SQUARE_BLOCKS
];
273 transpose_t transpose
[NUM_SQUARE_BLOCKS
];
275 var_t var
[NUM_SQUARE_BLOCKS
];
276 ssim_4x4x2_core_t ssim_4x4x2_core
;
277 ssim_end4_t ssim_end_4
;
279 downscale_t frame_init_lowres_core
;
280 plane_copy_deinterleave_t plane_copy_deinterleave_c
;
281 extendCURowBorder_t extendRowBorder
;
283 saoCuOrgE0_t saoCuOrgE0
;
284 planecopy_cp_t planecopy_cp
;
285 planecopy_sp_t planecopy_sp
;
287 cutree_propagate_cost propagateCost
;
291 filter_pp_t filter_vpp
[NUM_LUMA_PARTITIONS
];
292 filter_ps_t filter_vps
[NUM_LUMA_PARTITIONS
];
293 filter_sp_t filter_vsp
[NUM_LUMA_PARTITIONS
];
294 filter_ss_t filter_vss
[NUM_LUMA_PARTITIONS
];
295 filter_pp_t filter_hpp
[NUM_LUMA_PARTITIONS
];
296 filter_hps_t filter_hps
[NUM_LUMA_PARTITIONS
];
297 addAvg_t addAvg
[NUM_LUMA_PARTITIONS
];
298 copy_pp_t copy_pp
[NUM_LUMA_PARTITIONS
];
299 copy_sp_t copy_sp
[NUM_LUMA_PARTITIONS
];
300 copy_ps_t copy_ps
[NUM_LUMA_PARTITIONS
];
301 copy_ss_t copy_ss
[NUM_LUMA_PARTITIONS
];
302 pixel_sub_ps_t sub_ps
[NUM_SQUARE_BLOCKS
];
303 pixel_add_ps_t add_ps
[NUM_SQUARE_BLOCKS
];
304 } chroma
[4]; // X265_CSP_COUNT - do not want to include x265.h here
307 void extendPicBorder(pixel
* recon
, intptr_t stride
, int width
, int height
, int marginX
, int marginY
);
309 /* This copy of the table is what gets used by the encoder.
310 * It must be initialized before the encoder begins. */
311 extern EncoderPrimitives primitives
;
313 void Setup_C_Primitives(EncoderPrimitives
&p
);
314 void Setup_Instrinsic_Primitives(EncoderPrimitives
&p
, int cpuMask
);
315 void Setup_Assembly_Primitives(EncoderPrimitives
&p
, int cpuMask
);
316 void Setup_Alias_Primitives(EncoderPrimitives
&p
);
319 #endif // ifndef X265_PRIMITIVES_H