1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7 * Mahesh Pittala <mahesh@multicorewareinc.com>
8 * Rajesh Paulraj <rajesh@multicorewareinc.com>
9 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10 * Min Chen <chenm003@163.com>
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 * This program is also available under a commercial proprietary license.
27 * For more information, contact us at license @ x265.com.
28 *****************************************************************************/
30 #ifndef X265_PRIMITIVES_H
31 #define X265_PRIMITIVES_H
37 // x265 private namespace
42 LUMA_4x4
, LUMA_8x8
, LUMA_16x16
, LUMA_32x32
, LUMA_64x64
,
46 LUMA_32x16
, LUMA_16x32
,
47 LUMA_64x32
, LUMA_32x64
,
48 // Asymmetrical (0.75, 0.25)
49 LUMA_16x12
, LUMA_12x16
, LUMA_16x4
, LUMA_4x16
,
50 LUMA_32x24
, LUMA_24x32
, LUMA_32x8
, LUMA_8x32
,
51 LUMA_64x48
, LUMA_48x64
, LUMA_64x16
, LUMA_16x64
,
55 // 4:2:0 chroma partition sizes. These enums are just a convenience for indexing into the
56 // chroma primitive arrays when instantiating templates. The function tables should always
57 // be indexed by the luma partition enum
58 enum Chroma420Partitions
// Enumerator names encode the chroma block size as WIDTHxHEIGHT.
// NOTE(review): the enum's braces and its terminating count entry are not
// visible in this listing.
// Square blocks, 2x2 up to 32x32.
60 CHROMA_2x2
, CHROMA_4x4
, CHROMA_8x8
, CHROMA_16x16
, CHROMA_32x32
,
// Rectangular blocks, listed as a wide/tall pair (2:1 then 1:2).
61 CHROMA_4x2
, CHROMA_2x4
,
62 CHROMA_8x4
, CHROMA_4x8
,
63 CHROMA_16x8
, CHROMA_8x16
,
64 CHROMA_32x16
, CHROMA_16x32
,
// Asymmetric blocks: one dimension is 3/4 or 1/4 of the square size
// (8 -> 6 and 2, 16 -> 12 and 4, 32 -> 24 and 8).
65 CHROMA_8x6
, CHROMA_6x8
, CHROMA_8x2
, CHROMA_2x8
,
66 CHROMA_16x12
, CHROMA_12x16
, CHROMA_16x4
, CHROMA_4x16
,
67 CHROMA_32x24
, CHROMA_24x32
, CHROMA_32x8
, CHROMA_8x32
,
71 enum Chroma422Partitions
// Enumerator names encode the chroma block size as WIDTHxHEIGHT. Entry by entry,
// each size has the same width as the Chroma420Partitions entry in the same
// position and twice its height (CHROMA_2x2 -> CHROMA422_2x4, and so on).
// NOTE(review): the enum's braces are not visible in this listing.
// Counterparts of the square 4:2:0 blocks.
73 CHROMA422_2x4
, CHROMA422_4x8
, CHROMA422_8x16
, CHROMA422_16x32
, CHROMA422_32x64
,
// Counterparts of the rectangular 4:2:0 blocks.
74 CHROMA422_4x4
, CHROMA422_2x8
,
75 CHROMA422_8x8
, CHROMA422_4x16
,
76 CHROMA422_16x16
, CHROMA422_8x32
,
77 CHROMA422_32x32
, CHROMA422_16x64
,
// Counterparts of the asymmetric 4:2:0 blocks.
78 CHROMA422_8x12
, CHROMA422_6x16
, CHROMA422_8x4
, CHROMA422_2x16
,
79 CHROMA422_16x24
, CHROMA422_12x32
, CHROMA422_16x8
, CHROMA422_4x32
,
80 CHROMA422_32x48
, CHROMA422_24x64
, CHROMA422_32x16
, CHROMA422_8x64
,
// Last entry: its value equals the number of enumerators listed before it.
81 NUM_CHROMA_PARTITIONS422
84 enum SquareBlocks
// Routines can be indexed using log2n(width)-2
// Number of transform sizes. Used as an array bound for the intra_pred and
// intra_pred_allangs tables in EncoderPrimitives.
enum { NUM_TR_SIZE = 4 };
96 // NOTE: Not all DCT functions support dest stride
117 // Returns a LumaPartitions enum for the given size, always expected to return a valid enum
// Parameters: width, height - block dimensions.
// NOTE(review): the function's braces and return statement are not visible in
// this listing; presumably `part` is the value returned - confirm.
118 inline int partitionFromSizes(int width
, int height
)
// Neither dimension may have a bit set outside 4|8|16|32|64. Sums of those bits
// (e.g. 12 or 24) pass this check as well, not only single powers of two.
120 X265_CHECK(((width
| height
) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n");
// The lookup table is only declared here; its definition lives elsewhere.
121 extern const uint8_t lumaPartitionMapTable
[];
// Turn each dimension into a zero-based count of 4-unit steps (4 -> 0, 8 -> 1, ...).
122 int w
= (width
>> 2) - 1;
123 int h
= (height
>> 2) - 1;
// The table is addressed as 16 entries per width step, then by height step.
124 int part
= (int)lumaPartitionMapTable
[(w
<< 4) + h
];
// A table entry of 255 is treated as "no partition for this size".
125 X265_CHECK(part
!= 255, "Invalid block width %d height %d\n", width
, height
);
// Partition lookup keyed by the base-2 logarithm of the block size.
// NOTE(review): only the range check is visible in this listing; the braces and
// the return statement are missing, so the mapping itself cannot be confirmed here.
129 inline int partitionFromLog2Size(int log2Size
)
// log2Size must lie in [2, 6], i.e. block sizes 4 through 64.
131 X265_CHECK(2 <= log2Size
&& log2Size
<= 6, "Invalid block size\n");
135 typedef int (*pixelcmp_t
)(const pixel
* fenc
, intptr_t fencstride
, const pixel
* fref
, intptr_t frefstride
); // fenc is aligned
136 typedef int (*pixelcmp_ss_t
)(const int16_t* fenc
, intptr_t fencstride
, const int16_t* fref
, intptr_t frefstride
);
137 typedef int (*pixelcmp_sp_t
)(const int16_t* fenc
, intptr_t fencstride
, const pixel
* fref
, intptr_t frefstride
);
138 typedef int (*pixel_ssd_s_t
)(const int16_t* fenc
, intptr_t fencstride
);
139 typedef void (*pixelcmp_x4_t
)(const pixel
* fenc
, const pixel
* fref0
, const pixel
* fref1
, const pixel
* fref2
, const pixel
* fref3
, intptr_t frefstride
, int32_t* res
);
140 typedef void (*pixelcmp_x3_t
)(const pixel
* fenc
, const pixel
* fref0
, const pixel
* fref1
, const pixel
* fref2
, intptr_t frefstride
, int32_t* res
);
141 typedef void (*pixelavg_pp_t
)(pixel
* dst
, intptr_t dstride
, const pixel
* src0
, intptr_t sstride0
, const pixel
* src1
, intptr_t sstride1
, int weight
);
142 typedef void (*blockfill_s_t
)(int16_t* dst
, intptr_t dstride
, int16_t val
);
144 typedef void (*intra_pred_t
)(pixel
* dst
, intptr_t dstStride
, pixel
* refLeft
, pixel
* refAbove
, int dirMode
, int bFilter
);
145 typedef void (*intra_allangs_t
)(pixel
* dst
, pixel
* above0
, pixel
* left0
, pixel
* above1
, pixel
* left1
, int bLuma
);
/* int16_t block copies between a strided (2D) buffer and a stride-less (1D) one.
 * The 2Dto1D forms take the stride of src, the 1Dto2D forms the stride of dst.
 * The _shl / _shr suffix pairs with the shift argument (presumably left / right
 * shift of each sample - confirm). */
typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);

// Writes into coeff from a strided residual block and returns a uint32_t
// (presumably a count, going by the name - confirm).
typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
/* Transform primitives operating on int16_t data. */

// Forward transform: only the source side carries a stride.
typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);

// Inverse transform: only the destination side carries a stride.
typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);

// Processes numCoeff coefficients in place in dctCoef, with a writable resSum
// array and a read-only offset array.
typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
157 typedef void (*calcresidual_t
)(const pixel
* fenc
, const pixel
* pred
, int16_t* residual
, intptr_t stride
);
158 typedef void (*transpose_t
)(pixel
* dst
, const pixel
* src
, intptr_t stride
);
/* Quantization and dequantization primitives. */

// Quantizes numCoeff coefficients from coef into qCoef using quantCoeff, qBits
// and add; deltaU is an additional int32_t output array. Returns a uint32_t.
typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef,
                            int qBits, int add, int numCoeff);

// Same as quant_t without the deltaU output.
typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add,
                             int numCoeff);

// Dequantizes num values from src into dst using a per-coefficient dequantCoef table.
typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num,
                                  int mcqp_miper, int shift);

// Dequantizes num values from quantCoef into coef using one scale and shift.
typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);

// Inspects numCoeff quantized coefficients and returns an int
// (presumably the number that are non-zero, going by the name - confirm).
typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
165 typedef void (*weightp_pp_t
)(const pixel
* src
, pixel
* dst
, intptr_t stride
, int width
, int height
, int w0
, int round
, int shift
, int offset
);
166 typedef void (*weightp_sp_t
)(const int16_t* src
, pixel
* dst
, intptr_t srcStride
, intptr_t dstStride
, int width
, int height
, int w0
, int round
, int shift
, int offset
);
167 typedef void (*scale_t
)(pixel
* dst
, const pixel
* src
, intptr_t stride
);
168 typedef void (*downscale_t
)(const pixel
* src0
, pixel
* dstf
, pixel
* dsth
, pixel
* dstv
, pixel
* dstc
,
169 intptr_t src_stride
, intptr_t dst_stride
, int width
, int height
);
170 typedef void (*extendCURowBorder_t
)(pixel
* txt
, intptr_t stride
, int width
, int height
, int marginX
);
171 typedef void (*ssim_4x4x2_core_t
)(const pixel
* pix1
, intptr_t stride1
, const pixel
* pix2
, intptr_t stride2
, int sums
[2][4]);
172 typedef float (*ssim_end4_t
)(int sum0
[5][4], int sum1
[5][4], int width
);
173 typedef uint64_t (*var_t
)(const pixel
* pix
, intptr_t stride
);
174 typedef void (*plane_copy_deinterleave_t
)(pixel
* dstu
, intptr_t dstuStride
, pixel
* dstv
, intptr_t dstvStride
, const pixel
* src
, intptr_t srcStride
, int w
, int h
);
176 typedef void (*filter_pp_t
) (const pixel
* src
, intptr_t srcStride
, pixel
* dst
, intptr_t dstStride
, int coeffIdx
);
177 typedef void (*filter_hps_t
) (const pixel
* src
, intptr_t srcStride
, int16_t* dst
, intptr_t dstStride
, int coeffIdx
, int isRowExt
);
178 typedef void (*filter_ps_t
) (const pixel
* src
, intptr_t srcStride
, int16_t* dst
, intptr_t dstStride
, int coeffIdx
);
179 typedef void (*filter_sp_t
) (const int16_t* src
, intptr_t srcStride
, pixel
* dst
, intptr_t dstStride
, int coeffIdx
);
180 typedef void (*filter_ss_t
) (const int16_t* src
, intptr_t srcStride
, int16_t* dst
, intptr_t dstStride
, int coeffIdx
);
181 typedef void (*filter_hv_pp_t
) (const pixel
* src
, intptr_t srcStride
, pixel
* dst
, intptr_t dstStride
, int idxX
, int idxY
);
182 typedef void (*filter_p2s_t
)(const pixel
* src
, intptr_t srcStride
, int16_t* dst
, int width
, int height
);
184 typedef void (*copy_pp_t
)(pixel
* dst
, intptr_t dstStride
, const pixel
* src
, intptr_t srcStride
); // dst is aligned
185 typedef void (*copy_sp_t
)(pixel
* dst
, intptr_t dstStride
, const int16_t* src
, intptr_t srcStride
);
186 typedef void (*copy_ps_t
)(int16_t* dst
, intptr_t dstStride
, const pixel
* src
, intptr_t srcStride
);
187 typedef void (*copy_ss_t
)(int16_t* dst
, intptr_t dstStride
, const int16_t* src
, intptr_t srcStride
);
189 typedef void (*pixel_sub_ps_t
)(int16_t* dst
, intptr_t dstride
, const pixel
* src0
, const pixel
* src1
, intptr_t sstride0
, intptr_t sstride1
);
190 typedef void (*pixel_add_ps_t
)(pixel
* a
, intptr_t dstride
, const pixel
* b0
, const int16_t* b1
, intptr_t sstride0
, intptr_t sstride1
);
191 typedef void (*addAvg_t
)(const int16_t* src0
, const int16_t* src1
, pixel
* dst
, intptr_t src0Stride
, intptr_t src1Stride
, intptr_t dstStride
);
193 typedef void (*saoCuOrgE0_t
)(pixel
* rec
, int8_t* offsetEo
, int width
, int8_t signLeft
);
194 typedef void (*planecopy_cp_t
) (const uint8_t* src
, intptr_t srcStride
, pixel
* dst
, intptr_t dstStride
, int width
, int height
, int shift
);
195 typedef void (*planecopy_sp_t
) (const uint16_t* src
, intptr_t srcStride
, pixel
* dst
, intptr_t dstStride
, int width
, int height
, int shift
, uint16_t mask
);
// Writes len results to dst from five parallel read-only input arrays
// (propagateIn, intraCosts, interCosts, invQscales) and a pointed-to fpsFactor.
// Used for the propagateCost slot of EncoderPrimitives.
typedef void (*cutree_propagate_cost)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts,
                                      const uint16_t* interCosts, const int32_t* invQscales,
                                      const double* fpsFactor, int len);
199 /* Define a structure containing function pointers to optimized encoder
200 * primitives. Each pointer can reference either an assembly routine,
201 * a vectorized primitive, or a C function. */
202 struct EncoderPrimitives
// One function-pointer slot per primitive. Array members hold one implementation
// per block size; the array bound names the enum used to index them.
// NOTE(review): the braces of this struct are not visible in this listing, and
// some members may be missing as well (e.g. dct_t, quant_t and nquant_t are
// declared as types but no slot of those types appears here) - confirm against
// the real header before relying on the layout.

// --- Distortion metrics ---
204 pixelcmp_t sad
[NUM_LUMA_PARTITIONS
]; // Sum of Absolute Differences (SAD) for each size
205 pixelcmp_x3_t sad_x3
[NUM_LUMA_PARTITIONS
]; // SAD against 3 reference blocks in one call, for each size
206 pixelcmp_x4_t sad_x4
[NUM_LUMA_PARTITIONS
]; // SAD against 4 reference blocks in one call, for each size
207 pixelcmp_t sse_pp
[NUM_LUMA_PARTITIONS
]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
208 pixelcmp_ss_t sse_ss
[NUM_LUMA_PARTITIONS
]; // Sum of Square Error (short, short) fenc alignment not assumed
209 pixelcmp_sp_t sse_sp
[NUM_LUMA_PARTITIONS
]; // Sum of Square Error (short, pixel) fenc alignment not assumed
210 pixel_ssd_s_t ssd_s
[NUM_SQUARE_BLOCKS
- 1]; // Sum of Square Error (short) fenc alignment not assumed; one slot fewer than NUM_SQUARE_BLOCKS
211 pixelcmp_t satd
[NUM_LUMA_PARTITIONS
]; // Sum of Transformed Differences (Hadamard)
212 pixelcmp_t sa8d_inter
[NUM_LUMA_PARTITIONS
]; // sa8d primitives for motion search partitions
213 pixelcmp_t sa8d
[NUM_SQUARE_BLOCKS
]; // sa8d primitives for square intra blocks
214 pixelcmp_t psy_cost_pp
[NUM_SQUARE_BLOCKS
]; // difference in AC energy between two blocks
215 pixelcmp_ss_t psy_cost_ss
[NUM_SQUARE_BLOCKS
];
// --- Transform, quantization and residual handling ---
218 idct_t idct
[NUM_IDCTS
];
221 dequant_scaling_t dequant_scaling
;
222 dequant_normal_t dequant_normal
;
223 count_nonzero_t count_nonzero
;
224 denoiseDct_t denoiseDct
;
225 calcresidual_t calcresidual
[NUM_SQUARE_BLOCKS
];
226 blockfill_s_t blockfill_s
[NUM_SQUARE_BLOCKS
]; // block fill with value
// The five copy tables below each have NUM_SQUARE_BLOCKS - 1 slots.
227 cpy2Dto1D_shl_t cpy2Dto1D_shl
[NUM_SQUARE_BLOCKS
- 1];
228 cpy2Dto1D_shr_t cpy2Dto1D_shr
[NUM_SQUARE_BLOCKS
- 1];
229 cpy1Dto2D_shl_t cpy1Dto2D_shl
[NUM_SQUARE_BLOCKS
- 1];
230 cpy1Dto2D_shr_t cpy1Dto2D_shr
[NUM_SQUARE_BLOCKS
- 1];
231 copy_cnt_t copy_cnt
[NUM_SQUARE_BLOCKS
- 1];
// --- Intra prediction, transpose and scaling ---
// intra_pred is a 2D table: first index is the intra mode, second the transform size.
233 intra_pred_t intra_pred
[NUM_INTRA_MODE
][NUM_TR_SIZE
];
234 intra_allangs_t intra_pred_allangs
[NUM_TR_SIZE
];
235 transpose_t transpose
[NUM_SQUARE_BLOCKS
];
236 scale_t scale1D_128to64
;
237 scale_t scale2D_64to32
;
// --- Variance and SSIM ---
239 var_t var
[NUM_SQUARE_BLOCKS
];
240 ssim_4x4x2_core_t ssim_4x4x2_core
;
241 ssim_end4_t ssim_end_4
;
// --- Single-slot frame-level helpers ---
243 saoCuOrgE0_t saoCuOrgE0
;
245 downscale_t frameInitLowres
;
246 cutree_propagate_cost propagateCost
;
248 extendCURowBorder_t extendRowBorder
;
249 planecopy_cp_t planecopy_cp
;
250 planecopy_sp_t planecopy_sp
;
// --- Weighted prediction and averaging ---
252 weightp_sp_t weight_sp
;
253 weightp_pp_t weight_pp
;
254 pixelavg_pp_t pixelavg_pp
[NUM_LUMA_PARTITIONS
];
255 addAvg_t luma_addAvg
[NUM_LUMA_PARTITIONS
];
// --- Luma interpolation filters (h = horizontal, v = vertical in the member names) ---
257 filter_pp_t luma_hpp
[NUM_LUMA_PARTITIONS
];
258 filter_hps_t luma_hps
[NUM_LUMA_PARTITIONS
];
259 filter_pp_t luma_vpp
[NUM_LUMA_PARTITIONS
];
260 filter_ps_t luma_vps
[NUM_LUMA_PARTITIONS
];
261 filter_sp_t luma_vsp
[NUM_LUMA_PARTITIONS
];
262 filter_ss_t luma_vss
[NUM_LUMA_PARTITIONS
];
263 filter_hv_pp_t luma_hvpp
[NUM_LUMA_PARTITIONS
];
264 filter_p2s_t luma_p2s
;
// --- Luma block copies and add/subtract ---
266 copy_pp_t luma_copy_pp
[NUM_LUMA_PARTITIONS
];
267 copy_sp_t luma_copy_sp
[NUM_LUMA_PARTITIONS
];
268 copy_ps_t luma_copy_ps
[NUM_LUMA_PARTITIONS
];
269 copy_ss_t luma_copy_ss
[NUM_LUMA_PARTITIONS
];
270 pixel_sub_ps_t luma_sub_ps
[NUM_SQUARE_BLOCKS
];
271 pixel_add_ps_t luma_add_ps
[NUM_SQUARE_BLOCKS
];
// NOTE(review): `satd` is declared a second time here, and the members from this
// point on repeat the luma_* tables without the prefix. Together with the closing
// `} chroma[X265_CSP_COUNT];` below, this presumably means these members belong
// to a nested per-colorspace struct whose opening lines are not visible in this
// listing - confirm.
275 pixelcmp_t satd
[NUM_LUMA_PARTITIONS
];
276 filter_pp_t filter_vpp
[NUM_LUMA_PARTITIONS
];
277 filter_ps_t filter_vps
[NUM_LUMA_PARTITIONS
];
278 filter_sp_t filter_vsp
[NUM_LUMA_PARTITIONS
];
279 filter_ss_t filter_vss
[NUM_LUMA_PARTITIONS
];
280 filter_pp_t filter_hpp
[NUM_LUMA_PARTITIONS
];
281 filter_hps_t filter_hps
[NUM_LUMA_PARTITIONS
];
282 addAvg_t addAvg
[NUM_LUMA_PARTITIONS
];
283 copy_pp_t copy_pp
[NUM_LUMA_PARTITIONS
];
284 copy_sp_t copy_sp
[NUM_LUMA_PARTITIONS
];
285 copy_ps_t copy_ps
[NUM_LUMA_PARTITIONS
];
286 copy_ss_t copy_ss
[NUM_LUMA_PARTITIONS
];
287 pixel_sub_ps_t sub_ps
[NUM_SQUARE_BLOCKS
];
288 pixel_add_ps_t add_ps
[NUM_SQUARE_BLOCKS
];
// One instance of the preceding group per colorspace (X265_CSP_COUNT entries).
290 } chroma
[X265_CSP_COUNT
];
293 void extendPicBorder(pixel
* recon
, intptr_t stride
, int width
, int height
, int marginX
, int marginY
);
295 /* This copy of the table is what gets used by the encoder.
296 * It must be initialized before the encoder begins. */
297 extern EncoderPrimitives primitives
;
299 void Setup_C_Primitives(EncoderPrimitives
&p
);
300 void Setup_Instrinsic_Primitives(EncoderPrimitives
&p
, int cpuMask
);
301 void Setup_Assembly_Primitives(EncoderPrimitives
&p
, int cpuMask
);
302 void Setup_Alias_Primitives(EncoderPrimitives
&p
);
305 #endif // ifndef X265_PRIMITIVES_H