Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * H.26L/H.264/AVC/JVT/14496-10/... decoder | |
3 | * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | /** | |
23 | * @file | |
24 | * H.264 / AVC / MPEG4 part10 macroblock decoding | |
25 | */ | |
26 | ||
27 | #include <stdint.h> | |
28 | ||
29 | #include "config.h" | |
30 | ||
31 | #include "libavutil/common.h" | |
32 | #include "libavutil/intreadwrite.h" | |
33 | #include "avcodec.h" | |
34 | #include "h264.h" | |
35 | #include "qpeldsp.h" | |
36 | #include "svq3.h" | |
37 | #include "thread.h" | |
38 | ||
39 | static inline int get_lowest_part_list_y(H264Context *h, H264Picture *pic, int n, | |
40 | int height, int y_offset, int list) | |
41 | { | |
42 | int raw_my = h->mv_cache[list][scan8[n]][1]; | |
43 | int filter_height_down = (raw_my & 3) ? 3 : 0; | |
44 | int full_my = (raw_my >> 2) + y_offset; | |
45 | int bottom = full_my + filter_height_down + height; | |
46 | ||
47 | av_assert2(height >= 0); | |
48 | ||
49 | return FFMAX(0, bottom); | |
50 | } | |
51 | ||
f6fa7814 | 52 | static inline void get_lowest_part_y(H264Context *h, int16_t refs[2][48], int n, |
2ba45a60 DM |
53 | int height, int y_offset, int list0, |
54 | int list1, int *nrefs) | |
55 | { | |
56 | int my; | |
57 | ||
58 | y_offset += 16 * (h->mb_y >> MB_FIELD(h)); | |
59 | ||
60 | if (list0) { | |
61 | int ref_n = h->ref_cache[0][scan8[n]]; | |
62 | H264Picture *ref = &h->ref_list[0][ref_n]; | |
63 | ||
64 | // Error resilience puts the current picture in the ref list. | |
65 | // Don't try to wait on these as it will cause a deadlock. | |
66 | // Fields can wait on each other, though. | |
67 | if (ref->tf.progress->data != h->cur_pic.tf.progress->data || | |
68 | (ref->reference & 3) != h->picture_structure) { | |
69 | my = get_lowest_part_list_y(h, ref, n, height, y_offset, 0); | |
70 | if (refs[0][ref_n] < 0) | |
71 | nrefs[0] += 1; | |
72 | refs[0][ref_n] = FFMAX(refs[0][ref_n], my); | |
73 | } | |
74 | } | |
75 | ||
76 | if (list1) { | |
77 | int ref_n = h->ref_cache[1][scan8[n]]; | |
78 | H264Picture *ref = &h->ref_list[1][ref_n]; | |
79 | ||
80 | if (ref->tf.progress->data != h->cur_pic.tf.progress->data || | |
81 | (ref->reference & 3) != h->picture_structure) { | |
82 | my = get_lowest_part_list_y(h, ref, n, height, y_offset, 1); | |
83 | if (refs[1][ref_n] < 0) | |
84 | nrefs[1] += 1; | |
85 | refs[1][ref_n] = FFMAX(refs[1][ref_n], my); | |
86 | } | |
87 | } | |
88 | } | |
89 | ||
90 | /** | |
91 | * Wait until all reference frames are available for MC operations. | |
92 | * | |
93 | * @param h the H264 context | |
94 | */ | |
95 | static void await_references(H264Context *h) | |
96 | { | |
97 | const int mb_xy = h->mb_xy; | |
98 | const int mb_type = h->cur_pic.mb_type[mb_xy]; | |
f6fa7814 | 99 | int16_t refs[2][48]; |
2ba45a60 DM |
100 | int nrefs[2] = { 0 }; |
101 | int ref, list; | |
102 | ||
103 | memset(refs, -1, sizeof(refs)); | |
104 | ||
105 | if (IS_16X16(mb_type)) { | |
106 | get_lowest_part_y(h, refs, 0, 16, 0, | |
107 | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); | |
108 | } else if (IS_16X8(mb_type)) { | |
109 | get_lowest_part_y(h, refs, 0, 8, 0, | |
110 | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); | |
111 | get_lowest_part_y(h, refs, 8, 8, 8, | |
112 | IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); | |
113 | } else if (IS_8X16(mb_type)) { | |
114 | get_lowest_part_y(h, refs, 0, 16, 0, | |
115 | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); | |
116 | get_lowest_part_y(h, refs, 4, 16, 0, | |
117 | IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); | |
118 | } else { | |
119 | int i; | |
120 | ||
121 | av_assert2(IS_8X8(mb_type)); | |
122 | ||
123 | for (i = 0; i < 4; i++) { | |
124 | const int sub_mb_type = h->sub_mb_type[i]; | |
125 | const int n = 4 * i; | |
126 | int y_offset = (i & 2) << 2; | |
127 | ||
128 | if (IS_SUB_8X8(sub_mb_type)) { | |
129 | get_lowest_part_y(h, refs, n, 8, y_offset, | |
130 | IS_DIR(sub_mb_type, 0, 0), | |
131 | IS_DIR(sub_mb_type, 0, 1), | |
132 | nrefs); | |
133 | } else if (IS_SUB_8X4(sub_mb_type)) { | |
134 | get_lowest_part_y(h, refs, n, 4, y_offset, | |
135 | IS_DIR(sub_mb_type, 0, 0), | |
136 | IS_DIR(sub_mb_type, 0, 1), | |
137 | nrefs); | |
138 | get_lowest_part_y(h, refs, n + 2, 4, y_offset + 4, | |
139 | IS_DIR(sub_mb_type, 0, 0), | |
140 | IS_DIR(sub_mb_type, 0, 1), | |
141 | nrefs); | |
142 | } else if (IS_SUB_4X8(sub_mb_type)) { | |
143 | get_lowest_part_y(h, refs, n, 8, y_offset, | |
144 | IS_DIR(sub_mb_type, 0, 0), | |
145 | IS_DIR(sub_mb_type, 0, 1), | |
146 | nrefs); | |
147 | get_lowest_part_y(h, refs, n + 1, 8, y_offset, | |
148 | IS_DIR(sub_mb_type, 0, 0), | |
149 | IS_DIR(sub_mb_type, 0, 1), | |
150 | nrefs); | |
151 | } else { | |
152 | int j; | |
153 | av_assert2(IS_SUB_4X4(sub_mb_type)); | |
154 | for (j = 0; j < 4; j++) { | |
155 | int sub_y_offset = y_offset + 2 * (j & 2); | |
156 | get_lowest_part_y(h, refs, n + j, 4, sub_y_offset, | |
157 | IS_DIR(sub_mb_type, 0, 0), | |
158 | IS_DIR(sub_mb_type, 0, 1), | |
159 | nrefs); | |
160 | } | |
161 | } | |
162 | } | |
163 | } | |
164 | ||
165 | for (list = h->list_count - 1; list >= 0; list--) | |
166 | for (ref = 0; ref < 48 && nrefs[list]; ref++) { | |
167 | int row = refs[list][ref]; | |
168 | if (row >= 0) { | |
169 | H264Picture *ref_pic = &h->ref_list[list][ref]; | |
170 | int ref_field = ref_pic->reference - 1; | |
171 | int ref_field_picture = ref_pic->field_picture; | |
172 | int pic_height = 16 * h->mb_height >> ref_field_picture; | |
173 | ||
174 | row <<= MB_MBAFF(h); | |
175 | nrefs[list]--; | |
176 | ||
177 | if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields | |
178 | ff_thread_await_progress(&ref_pic->tf, | |
179 | FFMIN((row >> 1) - !(row & 1), | |
180 | pic_height - 1), | |
181 | 1); | |
182 | ff_thread_await_progress(&ref_pic->tf, | |
183 | FFMIN((row >> 1), pic_height - 1), | |
184 | 0); | |
185 | } else if (FIELD_PICTURE(h) && !ref_field_picture) { // field referencing one field of a frame | |
186 | ff_thread_await_progress(&ref_pic->tf, | |
187 | FFMIN(row * 2 + ref_field, | |
188 | pic_height - 1), | |
189 | 0); | |
190 | } else if (FIELD_PICTURE(h)) { | |
191 | ff_thread_await_progress(&ref_pic->tf, | |
192 | FFMIN(row, pic_height - 1), | |
193 | ref_field); | |
194 | } else { | |
195 | ff_thread_await_progress(&ref_pic->tf, | |
196 | FFMIN(row, pic_height - 1), | |
197 | 0); | |
198 | } | |
199 | } | |
200 | } | |
201 | } | |
202 | ||
/**
 * Motion-compensate one partition from a single reference picture.
 *
 * The motion vector is read from h->mv_cache[list][scan8[n]] and combined
 * with src_x_offset / src_y_offset (each multiplied by 8). Luma is produced
 * with qpix_op[]; when square is zero a second call handles the area at
 * +delta bytes. Chroma is produced with chroma_op, except for
 * chroma_idc == 3 where both chroma planes go through qpix_op[] exactly like
 * luma. Whenever the source area reaches outside the picture, the samples
 * are first built in h->edge_emu_buffer with emulated_edge_mc().
 *
 * @param h            the H264 context
 * @param pic          reference picture to read from
 * @param n            block index of the partition (used through scan8[])
 * @param square       non-zero if one qpix_op call covers the partition
 * @param height       partition height passed on to chroma_op
 * @param delta        byte offset of the second half when !square
 * @param list         prediction list the motion vector is taken from
 * @param dest_y       luma destination
 * @param dest_cb      Cb destination
 * @param dest_cr      Cr destination
 * @param src_x_offset horizontal position of the partition (scaled by 8)
 * @param src_y_offset vertical position of the partition (scaled by 8)
 * @param qpix_op      table of luma functions indexed by luma_xy
 * @param chroma_op    chroma function
 * @param pixel_shift  shift applied to horizontal byte offsets
 * @param chroma_idc   chroma format: 1 = yuv420, 2 = yuv422, 3 = yuv444
 */
203 | static av_always_inline void mc_dir_part(H264Context *h, H264Picture *pic, | |
204 | int n, int square, int height, | |
205 | int delta, int list, | |
206 | uint8_t *dest_y, uint8_t *dest_cb, | |
207 | uint8_t *dest_cr, | |
208 | int src_x_offset, int src_y_offset, | |
209 | qpel_mc_func *qpix_op, | |
210 | h264_chroma_mc_func chroma_op, | |
211 | int pixel_shift, int chroma_idc) | |
212 | { | |
213 | const int mx = h->mv_cache[list][scan8[n]][0] + src_x_offset * 8; | |
214 | int my = h->mv_cache[list][scan8[n]][1] + src_y_offset * 8; | |
/* the two low bits of mx and my together select the luma function */
215 | const int luma_xy = (mx & 3) + ((my & 3) << 2); | |
216 | ptrdiff_t offset = ((mx >> 2) << pixel_shift) + (my >> 2) * h->mb_linesize; | |
217 | uint8_t *src_y = pic->f.data[0] + offset; | |
218 | uint8_t *src_cb, *src_cr; | |
219 | int extra_width = 0; | |
220 | int extra_height = 0; | |
221 | int emu = 0; | |
222 | const int full_mx = mx >> 2; | |
223 | const int full_my = my >> 2; | |
224 | const int pic_width = 16 * h->mb_width; | |
225 | const int pic_height = 16 * h->mb_height >> MB_FIELD(h); | |
226 | int ysh; | |
227 | ||
/* tighten the allowed area by 3 samples when any of the low three bits
 * of the respective vector component is set */
228 | if (mx & 7) | |
229 | extra_width -= 3; | |
230 | if (my & 7) | |
231 | extra_height -= 3; | |
232 | ||
/* source area leaves the picture: build a 21x21 block (2 samples before the
 * block position in each direction) in the edge emulation buffer */
233 | if (full_mx < 0 - extra_width || | |
234 | full_my < 0 - extra_height || | |
235 | full_mx + 16 /*FIXME*/ > pic_width + extra_width || | |
236 | full_my + 16 /*FIXME*/ > pic_height + extra_height) { | |
237 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, | |
238 | src_y - (2 << pixel_shift) - 2 * h->mb_linesize, | |
239 | h->mb_linesize, h->mb_linesize, | |
240 | 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2, | |
241 | full_my - 2, pic_width, pic_height); | |
242 | src_y = h->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; | |
243 | emu = 1; | |
244 | } | |
245 | ||
246 | qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); // FIXME try variable height perhaps? | |
247 | if (!square) | |
248 | qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize); | |
249 | ||
/* luma only when gray decoding was requested and is compiled in */
250 | if (CONFIG_GRAY && h->flags & CODEC_FLAG_GRAY) | |
251 | return; | |
252 | ||
/* 4:4:4: chroma planes use the luma offset and the luma functions; the edge
 * emulation buffer is refilled per plane and consumed before the next use */
253 | if (chroma_idc == 3 /* yuv444 */) { | |
254 | src_cb = pic->f.data[1] + offset; | |
255 | if (emu) { | |
256 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, | |
257 | src_cb - (2 << pixel_shift) - 2 * h->mb_linesize, | |
258 | h->mb_linesize, h->mb_linesize, | |
259 | 16 + 5, 16 + 5 /*FIXME*/, | |
260 | full_mx - 2, full_my - 2, | |
261 | pic_width, pic_height); | |
262 | src_cb = h->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; | |
263 | } | |
264 | qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); // FIXME try variable height perhaps? | |
265 | if (!square) | |
266 | qpix_op[luma_xy](dest_cb + delta, src_cb + delta, h->mb_linesize); | |
267 | ||
268 | src_cr = pic->f.data[2] + offset; | |
269 | if (emu) { | |
270 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, | |
271 | src_cr - (2 << pixel_shift) - 2 * h->mb_linesize, | |
272 | h->mb_linesize, h->mb_linesize, | |
273 | 16 + 5, 16 + 5 /*FIXME*/, | |
274 | full_mx - 2, full_my - 2, | |
275 | pic_width, pic_height); | |
276 | src_cr = h->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; | |
277 | } | |
278 | qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); // FIXME try variable height perhaps? | |
279 | if (!square) | |
280 | qpix_op[luma_xy](dest_cr + delta, src_cr + delta, h->mb_linesize); | |
281 | return; | |
282 | } | |
283 | ||
/* vertical shift from my to a chroma row: 2 for yuv422, 3 otherwise */
284 | ysh = 3 - (chroma_idc == 2 /* yuv422 */); | |
285 | if (chroma_idc == 1 /* yuv420 */ && MB_FIELD(h)) { | |
286 | // chroma offset when predicting from a field of opposite parity | |
287 | my += 2 * ((h->mb_y & 1) - (pic->reference - 1)); | |
288 | emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1); | |
289 | } | |
290 | ||
291 | src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + | |
292 | (my >> ysh) * h->mb_uvlinesize; | |
293 | src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + | |
294 | (my >> ysh) * h->mb_uvlinesize; | |
295 | ||
/* emulated chroma block is 9 wide and 8 * chroma_idc + 1 high
 * (only chroma_idc 1 and 2 reach this point) */
296 | if (emu) { | |
297 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cb, | |
298 | h->mb_uvlinesize, h->mb_uvlinesize, | |
299 | 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), | |
300 | pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); | |
301 | src_cb = h->edge_emu_buffer; | |
302 | } | |
303 | chroma_op(dest_cb, src_cb, h->mb_uvlinesize, | |
304 | height >> (chroma_idc == 1 /* yuv420 */), | |
305 | mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7); | |
306 | ||
/* Cb has been consumed above, so the same buffer can now hold Cr */
307 | if (emu) { | |
308 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cr, | |
309 | h->mb_uvlinesize, h->mb_uvlinesize, | |
310 | 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), | |
311 | pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); | |
312 | src_cr = h->edge_emu_buffer; | |
313 | } | |
314 | chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), | |
315 | mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7); | |
316 | } | |
317 | ||
318 | static av_always_inline void mc_part_std(H264Context *h, int n, int square, | |
319 | int height, int delta, | |
320 | uint8_t *dest_y, uint8_t *dest_cb, | |
321 | uint8_t *dest_cr, | |
322 | int x_offset, int y_offset, | |
323 | qpel_mc_func *qpix_put, | |
324 | h264_chroma_mc_func chroma_put, | |
325 | qpel_mc_func *qpix_avg, | |
326 | h264_chroma_mc_func chroma_avg, | |
327 | int list0, int list1, | |
328 | int pixel_shift, int chroma_idc) | |
329 | { | |
330 | qpel_mc_func *qpix_op = qpix_put; | |
331 | h264_chroma_mc_func chroma_op = chroma_put; | |
332 | ||
333 | dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; | |
334 | if (chroma_idc == 3 /* yuv444 */) { | |
335 | dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; | |
336 | dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; | |
337 | } else if (chroma_idc == 2 /* yuv422 */) { | |
338 | dest_cb += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; | |
339 | dest_cr += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; | |
340 | } else { /* yuv420 */ | |
341 | dest_cb += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; | |
342 | dest_cr += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; | |
343 | } | |
344 | x_offset += 8 * h->mb_x; | |
345 | y_offset += 8 * (h->mb_y >> MB_FIELD(h)); | |
346 | ||
347 | if (list0) { | |
348 | H264Picture *ref = &h->ref_list[0][h->ref_cache[0][scan8[n]]]; | |
349 | mc_dir_part(h, ref, n, square, height, delta, 0, | |
350 | dest_y, dest_cb, dest_cr, x_offset, y_offset, | |
351 | qpix_op, chroma_op, pixel_shift, chroma_idc); | |
352 | ||
353 | qpix_op = qpix_avg; | |
354 | chroma_op = chroma_avg; | |
355 | } | |
356 | ||
357 | if (list1) { | |
358 | H264Picture *ref = &h->ref_list[1][h->ref_cache[1][scan8[n]]]; | |
359 | mc_dir_part(h, ref, n, square, height, delta, 1, | |
360 | dest_y, dest_cb, dest_cr, x_offset, y_offset, | |
361 | qpix_op, chroma_op, pixel_shift, chroma_idc); | |
362 | } | |
363 | } | |
364 | ||
365 | static av_always_inline void mc_part_weighted(H264Context *h, int n, int square, | |
366 | int height, int delta, | |
367 | uint8_t *dest_y, uint8_t *dest_cb, | |
368 | uint8_t *dest_cr, | |
369 | int x_offset, int y_offset, | |
370 | qpel_mc_func *qpix_put, | |
371 | h264_chroma_mc_func chroma_put, | |
372 | h264_weight_func luma_weight_op, | |
373 | h264_weight_func chroma_weight_op, | |
374 | h264_biweight_func luma_weight_avg, | |
375 | h264_biweight_func chroma_weight_avg, | |
376 | int list0, int list1, | |
377 | int pixel_shift, int chroma_idc) | |
378 | { | |
379 | int chroma_height; | |
380 | ||
381 | dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; | |
382 | if (chroma_idc == 3 /* yuv444 */) { | |
383 | chroma_height = height; | |
384 | chroma_weight_avg = luma_weight_avg; | |
385 | chroma_weight_op = luma_weight_op; | |
386 | dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; | |
387 | dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; | |
388 | } else if (chroma_idc == 2 /* yuv422 */) { | |
389 | chroma_height = height; | |
390 | dest_cb += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; | |
391 | dest_cr += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; | |
392 | } else { /* yuv420 */ | |
393 | chroma_height = height >> 1; | |
394 | dest_cb += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; | |
395 | dest_cr += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; | |
396 | } | |
397 | x_offset += 8 * h->mb_x; | |
398 | y_offset += 8 * (h->mb_y >> MB_FIELD(h)); | |
399 | ||
400 | if (list0 && list1) { | |
401 | /* don't optimize for luma-only case, since B-frames usually | |
402 | * use implicit weights => chroma too. */ | |
403 | uint8_t *tmp_cb = h->bipred_scratchpad; | |
404 | uint8_t *tmp_cr = h->bipred_scratchpad + (16 << pixel_shift); | |
405 | uint8_t *tmp_y = h->bipred_scratchpad + 16 * h->mb_uvlinesize; | |
406 | int refn0 = h->ref_cache[0][scan8[n]]; | |
407 | int refn1 = h->ref_cache[1][scan8[n]]; | |
408 | ||
409 | mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, | |
410 | dest_y, dest_cb, dest_cr, | |
411 | x_offset, y_offset, qpix_put, chroma_put, | |
412 | pixel_shift, chroma_idc); | |
413 | mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, | |
414 | tmp_y, tmp_cb, tmp_cr, | |
415 | x_offset, y_offset, qpix_put, chroma_put, | |
416 | pixel_shift, chroma_idc); | |
417 | ||
418 | if (h->use_weight == 2) { | |
419 | int weight0 = h->implicit_weight[refn0][refn1][h->mb_y & 1]; | |
420 | int weight1 = 64 - weight0; | |
421 | luma_weight_avg(dest_y, tmp_y, h->mb_linesize, | |
422 | height, 5, weight0, weight1, 0); | |
423 | if (!CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { | |
424 | chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, | |
425 | chroma_height, 5, weight0, weight1, 0); | |
426 | chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, | |
427 | chroma_height, 5, weight0, weight1, 0); | |
428 | } | |
429 | } else { | |
430 | luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, | |
431 | h->luma_log2_weight_denom, | |
432 | h->luma_weight[refn0][0][0], | |
433 | h->luma_weight[refn1][1][0], | |
434 | h->luma_weight[refn0][0][1] + | |
435 | h->luma_weight[refn1][1][1]); | |
436 | if (!CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { | |
437 | chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, | |
438 | h->chroma_log2_weight_denom, | |
439 | h->chroma_weight[refn0][0][0][0], | |
440 | h->chroma_weight[refn1][1][0][0], | |
441 | h->chroma_weight[refn0][0][0][1] + | |
442 | h->chroma_weight[refn1][1][0][1]); | |
443 | chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, | |
444 | h->chroma_log2_weight_denom, | |
445 | h->chroma_weight[refn0][0][1][0], | |
446 | h->chroma_weight[refn1][1][1][0], | |
447 | h->chroma_weight[refn0][0][1][1] + | |
448 | h->chroma_weight[refn1][1][1][1]); | |
449 | } | |
450 | } | |
451 | } else { | |
452 | int list = list1 ? 1 : 0; | |
453 | int refn = h->ref_cache[list][scan8[n]]; | |
454 | H264Picture *ref = &h->ref_list[list][refn]; | |
455 | mc_dir_part(h, ref, n, square, height, delta, list, | |
456 | dest_y, dest_cb, dest_cr, x_offset, y_offset, | |
457 | qpix_put, chroma_put, pixel_shift, chroma_idc); | |
458 | ||
459 | luma_weight_op(dest_y, h->mb_linesize, height, | |
460 | h->luma_log2_weight_denom, | |
461 | h->luma_weight[refn][list][0], | |
462 | h->luma_weight[refn][list][1]); | |
463 | if (!CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { | |
464 | if (h->use_weight_chroma) { | |
465 | chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, | |
466 | h->chroma_log2_weight_denom, | |
467 | h->chroma_weight[refn][list][0][0], | |
468 | h->chroma_weight[refn][list][0][1]); | |
469 | chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, | |
470 | h->chroma_log2_weight_denom, | |
471 | h->chroma_weight[refn][list][1][0], | |
472 | h->chroma_weight[refn][list][1][1]); | |
473 | } | |
474 | } | |
475 | } | |
476 | } | |
477 | ||
478 | static av_always_inline void prefetch_motion(H264Context *h, int list, | |
479 | int pixel_shift, int chroma_idc) | |
480 | { | |
481 | /* fetch pixels for estimated mv 4 macroblocks ahead | |
482 | * optimized for 64byte cache lines */ | |
483 | const int refn = h->ref_cache[list][scan8[0]]; | |
484 | if (refn >= 0) { | |
485 | const int mx = (h->mv_cache[list][scan8[0]][0] >> 2) + 16 * h->mb_x + 8; | |
486 | const int my = (h->mv_cache[list][scan8[0]][1] >> 2) + 16 * h->mb_y; | |
487 | uint8_t **src = h->ref_list[list][refn].f.data; | |
488 | int off = (mx << pixel_shift) + | |
489 | (my + (h->mb_x & 3) * 4) * h->mb_linesize + | |
490 | (64 << pixel_shift); | |
491 | h->vdsp.prefetch(src[0] + off, h->linesize, 4); | |
492 | if (chroma_idc == 3 /* yuv444 */) { | |
493 | h->vdsp.prefetch(src[1] + off, h->linesize, 4); | |
494 | h->vdsp.prefetch(src[2] + off, h->linesize, 4); | |
495 | } else { | |
496 | off= (((mx>>1)+64)<<pixel_shift) + ((my>>1) + (h->mb_x&7))*h->uvlinesize; | |
497 | h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2); | |
498 | } | |
499 | } | |
500 | } | |
501 | ||
502 | static av_always_inline void xchg_mb_border(H264Context *h, uint8_t *src_y, | |
503 | uint8_t *src_cb, uint8_t *src_cr, | |
504 | int linesize, int uvlinesize, | |
505 | int xchg, int chroma444, | |
506 | int simple, int pixel_shift) | |
507 | { | |
508 | int deblock_topleft; | |
509 | int deblock_top; | |
510 | int top_idx = 1; | |
511 | uint8_t *top_border_m1; | |
512 | uint8_t *top_border; | |
513 | ||
514 | if (!simple && FRAME_MBAFF(h)) { | |
515 | if (h->mb_y & 1) { | |
516 | if (!MB_MBAFF(h)) | |
517 | return; | |
518 | } else { | |
519 | top_idx = MB_MBAFF(h) ? 0 : 1; | |
520 | } | |
521 | } | |
522 | ||
523 | if (h->deblocking_filter == 2) { | |
524 | deblock_topleft = h->slice_table[h->mb_xy - 1 - h->mb_stride] == h->slice_num; | |
525 | deblock_top = h->top_type; | |
526 | } else { | |
527 | deblock_topleft = (h->mb_x > 0); | |
528 | deblock_top = (h->mb_y > !!MB_FIELD(h)); | |
529 | } | |
530 | ||
531 | src_y -= linesize + 1 + pixel_shift; | |
532 | src_cb -= uvlinesize + 1 + pixel_shift; | |
533 | src_cr -= uvlinesize + 1 + pixel_shift; | |
534 | ||
535 | top_border_m1 = h->top_borders[top_idx][h->mb_x - 1]; | |
536 | top_border = h->top_borders[top_idx][h->mb_x]; | |
537 | ||
538 | #define XCHG(a, b, xchg) \ | |
539 | if (pixel_shift) { \ | |
540 | if (xchg) { \ | |
541 | AV_SWAP64(b + 0, a + 0); \ | |
542 | AV_SWAP64(b + 8, a + 8); \ | |
543 | } else { \ | |
544 | AV_COPY128(b, a); \ | |
545 | } \ | |
546 | } else if (xchg) \ | |
547 | AV_SWAP64(b, a); \ | |
548 | else \ | |
549 | AV_COPY64(b, a); | |
550 | ||
551 | if (deblock_top) { | |
552 | if (deblock_topleft) { | |
553 | XCHG(top_border_m1 + (8 << pixel_shift), | |
554 | src_y - (7 << pixel_shift), 1); | |
555 | } | |
556 | XCHG(top_border + (0 << pixel_shift), src_y + (1 << pixel_shift), xchg); | |
557 | XCHG(top_border + (8 << pixel_shift), src_y + (9 << pixel_shift), 1); | |
558 | if (h->mb_x + 1 < h->mb_width) { | |
559 | XCHG(h->top_borders[top_idx][h->mb_x + 1], | |
560 | src_y + (17 << pixel_shift), 1); | |
561 | } | |
562 | if (simple || !CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { | |
563 | if (chroma444) { | |
564 | if (deblock_topleft) { | |
565 | XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1); | |
566 | XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1); | |
567 | } | |
568 | XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg); | |
569 | XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1); | |
570 | XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg); | |
571 | XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1); | |
572 | if (h->mb_x + 1 < h->mb_width) { | |
573 | XCHG(h->top_borders[top_idx][h->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1); | |
574 | XCHG(h->top_borders[top_idx][h->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1); | |
575 | } | |
576 | } else { | |
577 | if (deblock_topleft) { | |
578 | XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1); | |
579 | XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1); | |
580 | } | |
581 | XCHG(top_border + (16 << pixel_shift), src_cb + 1 + pixel_shift, 1); | |
582 | XCHG(top_border + (24 << pixel_shift), src_cr + 1 + pixel_shift, 1); | |
583 | } | |
584 | } | |
585 | } | |
586 | } | |
587 | ||
588 | static av_always_inline int dctcoef_get(int16_t *mb, int high_bit_depth, | |
589 | int index) | |
590 | { | |
591 | if (high_bit_depth) { | |
592 | return AV_RN32A(((int32_t *)mb) + index); | |
593 | } else | |
594 | return AV_RN16A(mb + index); | |
595 | } | |
596 | ||
597 | static av_always_inline void dctcoef_set(int16_t *mb, int high_bit_depth, | |
598 | int index, int value) | |
599 | { | |
600 | if (high_bit_depth) { | |
601 | AV_WN32A(((int32_t *)mb) + index, value); | |
602 | } else | |
603 | AV_WN16A(mb + index, value); | |
604 | } | |
605 | ||
/**
 * Intra-predict plane p of the current macroblock.
 *
 * For intra 4x4 macroblocks (with or without the 8x8 transform) every block
 * is predicted and its residual is added right away. For all other types
 * the 16x16 predictor is run and only the luma DC coefficients are
 * prepared: dequantised and transformed, or copied into h->mb when
 * transform_bypass is set.
 *
 * @param h                the H264 context
 * @param mb_type          macroblock type flags
 * @param is_h264          non-zero for H.264, zero selects the SVQ3 functions
 * @param simple           not read by this function
 * @param transform_bypass non-zero to add coefficients without a transform
 * @param pixel_shift      coefficient/pixel size selector (0 or 1)
 * @param block_offset     per-block destination offsets, 16 entries per plane
 * @param linesize         destination stride in bytes
 * @param dest_y           destination of plane p
 * @param p                plane index; 0 uses h->qscale, others
 *                         h->chroma_qp[p - 1]
 */
606 | static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, | |
607 | int mb_type, int is_h264, | |
608 | int simple, | |
609 | int transform_bypass, | |
610 | int pixel_shift, | |
611 | int *block_offset, | |
612 | int linesize, | |
613 | uint8_t *dest_y, int p) | |
614 | { | |
615 | void (*idct_add)(uint8_t *dst, int16_t *block, int stride); | |
616 | void (*idct_dc_add)(uint8_t *dst, int16_t *block, int stride); | |
617 | int i; | |
618 | int qscale = p == 0 ? h->qscale : h->chroma_qp[p - 1]; | |
/* switch to the block offsets belonging to plane p */
619 | block_offset += 16 * p; | |
620 | if (IS_INTRA4x4(mb_type)) { | |
621 | if (IS_8x8DCT(mb_type)) { | |
/* with transform bypass both pointers use the plain add-and-clear function */
622 | if (transform_bypass) { | |
623 | idct_dc_add = | |
624 | idct_add = h->h264dsp.h264_add_pixels8_clear; | |
625 | } else { | |
626 | idct_dc_add = h->h264dsp.h264_idct8_dc_add; | |
627 | idct_add = h->h264dsp.h264_idct8_add; | |
628 | } | |
/* four 8x8 blocks, addressed by the index of their first 4x4 block */
629 | for (i = 0; i < 16; i += 4) { | |
630 | uint8_t *const ptr = dest_y + block_offset[i]; | |
631 | const int dir = h->intra4x4_pred_mode_cache[scan8[i]]; | |
/* bypass + profile 244 + prediction mode 0 or 1: the pred*_add functions
 * take the coefficient block together with the destination */
632 | if (transform_bypass && h->sps.profile_idc == 244 && dir <= 1) { | |
633 | if (h->x264_build != -1) { | |
634 | h->hpc.pred8x8l_add[dir](ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); | |
635 | } else | |
636 | h->hpc.pred8x8l_filter_add[dir](ptr, h->mb + (i * 16 + p * 256 << pixel_shift), | |
637 | (h-> topleft_samples_available << i) & 0x8000, | |
638 | (h->topright_samples_available << i) & 0x4000, linesize); | |
639 | } else { | |
640 | const int nnz = h->non_zero_count_cache[scan8[i + p * 16]]; | |
641 | h->hpc.pred8x8l[dir](ptr, (h->topleft_samples_available << i) & 0x8000, | |
642 | (h->topright_samples_available << i) & 0x4000, linesize); | |
/* a single coefficient that is the first one of the block takes the DC path */
643 | if (nnz) { | |
644 | if (nnz == 1 && dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | |
645 | idct_dc_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); | |
646 | else | |
647 | idct_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); | |
648 | } | |
649 | } | |
650 | } | |
651 | } else { | |
652 | if (transform_bypass) { | |
653 | idct_dc_add = | |
654 | idct_add = h->h264dsp.h264_add_pixels4_clear; | |
655 | } else { | |
656 | idct_dc_add = h->h264dsp.h264_idct_dc_add; | |
657 | idct_add = h->h264dsp.h264_idct_add; | |
658 | } | |
659 | for (i = 0; i < 16; i++) { | |
660 | uint8_t *const ptr = dest_y + block_offset[i]; | |
661 | const int dir = h->intra4x4_pred_mode_cache[scan8[i]]; | |
662 | ||
663 | if (transform_bypass && h->sps.profile_idc == 244 && dir <= 1) { | |
664 | h->hpc.pred4x4_add[dir](ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); | |
665 | } else { | |
666 | uint8_t *topright; | |
667 | int nnz, tr; | |
668 | uint64_t tr_high; | |
/* only these two modes are given a top-right pointer; when the top-right
 * samples are unavailable, the last pixel of the row above is replicated
 * four times into a local and that local is used instead */
669 | if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) { | |
670 | const int topright_avail = (h->topright_samples_available << i) & 0x8000; | |
671 | av_assert2(h->mb_y || linesize <= block_offset[i]); | |
672 | if (!topright_avail) { | |
673 | if (pixel_shift) { | |
674 | tr_high = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL; | |
675 | topright = (uint8_t *)&tr_high; | |
676 | } else { | |
677 | tr = ptr[3 - linesize] * 0x01010101u; | |
678 | topright = (uint8_t *)&tr; | |
679 | } | |
680 | } else | |
681 | topright = ptr + (4 << pixel_shift) - linesize; | |
682 | } else | |
683 | topright = NULL; | |
684 | ||
685 | h->hpc.pred4x4[dir](ptr, topright, linesize); | |
686 | nnz = h->non_zero_count_cache[scan8[i + p * 16]]; | |
687 | if (nnz) { | |
688 | if (is_h264) { | |
689 | if (nnz == 1 && dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | |
690 | idct_dc_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); | |
691 | else | |
692 | idct_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); | |
693 | } else if (CONFIG_SVQ3_DECODER) | |
694 | ff_svq3_add_idct_c(ptr, h->mb + i * 16 + p * 256, linesize, qscale, 0); | |
695 | } | |
696 | } | |
697 | } | |
698 | } | |
699 | } else { | |
/* not intra 4x4: predict the whole 16x16 area, then prepare the DC values */
700 | h->hpc.pred16x16[h->intra16x16_pred_mode](dest_y, linesize); | |
701 | if (is_h264) { | |
702 | if (h->non_zero_count_cache[scan8[LUMA_DC_BLOCK_INDEX + p]]) { | |
703 | if (!transform_bypass) | |
704 | h->h264dsp.h264_luma_dc_dequant_idct(h->mb + (p * 256 << pixel_shift), | |
705 | h->mb_luma_dc[p], | |
706 | h->dequant4_coeff[p][qscale][0]); | |
707 | else { | |
/* bypass: copy DC value i to coefficient index dc_mapping[i] of plane p */
708 | static const uint8_t dc_mapping[16] = { | |
709 | 0 * 16, 1 * 16, 4 * 16, 5 * 16, | |
710 | 2 * 16, 3 * 16, 6 * 16, 7 * 16, | |
711 | 8 * 16, 9 * 16, 12 * 16, 13 * 16, | |
712 | 10 * 16, 11 * 16, 14 * 16, 15 * 16 | |
713 | }; | |
714 | for (i = 0; i < 16; i++) | |
715 | dctcoef_set(h->mb + (p * 256 << pixel_shift), | |
716 | pixel_shift, dc_mapping[i], | |
717 | dctcoef_get(h->mb_luma_dc[p], | |
718 | pixel_shift, i)); | |
719 | } | |
720 | } | |
721 | } else if (CONFIG_SVQ3_DECODER) | |
722 | ff_svq3_luma_dc_dequant_idct_c(h->mb + p * 256, | |
723 | h->mb_luma_dc[p], qscale); | |
724 | } | |
725 | } | |
726 | ||
/**
 * Add the residual of plane p to the destination for every macroblock type
 * except intra 4x4, for which this function does nothing.
 *
 * For H.264 the work is split by type: intra 16x16 macroblocks go through
 * the intra add functions (or plain pixel adds with transform_bypass), all
 * other types are processed only when the low four bits of h->cbp are not
 * all zero. When is_h264 is zero the SVQ3 add function is used per 4x4
 * block.
 *
 * @param h                the H264 context
 * @param mb_type          macroblock type flags
 * @param is_h264          non-zero for H.264, zero selects the SVQ3 function
 * @param simple           not read by this function
 * @param transform_bypass non-zero to add coefficients without a transform
 * @param pixel_shift      coefficient/pixel size selector (0 or 1)
 * @param block_offset     per-block destination offsets, 16 entries per plane
 * @param linesize         destination stride in bytes
 * @param dest_y           destination of plane p
 * @param p                plane index
 */
727 | static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, | |
728 | int is_h264, int simple, | |
729 | int transform_bypass, | |
730 | int pixel_shift, | |
731 | int *block_offset, | |
732 | int linesize, | |
733 | uint8_t *dest_y, int p) | |
734 | { | |
735 | void (*idct_add)(uint8_t *dst, int16_t *block, int stride); | |
736 | int i; | |
/* switch to the block offsets belonging to plane p */
737 | block_offset += 16 * p; | |
738 | if (!IS_INTRA4x4(mb_type)) { | |
739 | if (is_h264) { | |
740 | if (IS_INTRA16x16(mb_type)) { | |
741 | if (transform_bypass) { | |
/* bypass + profile 244 + vertical/horizontal mode: one combined call */
742 | if (h->sps.profile_idc == 244 && | |
743 | (h->intra16x16_pred_mode == VERT_PRED8x8 || | |
744 | h->intra16x16_pred_mode == HOR_PRED8x8)) { | |
745 | h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, | |
746 | h->mb + (p * 256 << pixel_shift), | |
747 | linesize); | |
748 | } else { | |
/* a block is added if it has coded coefficients or a non-zero first one */
749 | for (i = 0; i < 16; i++) | |
750 | if (h->non_zero_count_cache[scan8[i + p * 16]] || | |
751 | dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | |
752 | h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i], | |
753 | h->mb + (i * 16 + p * 256 << pixel_shift), | |
754 | linesize); | |
755 | } | |
756 | } else { | |
757 | h->h264dsp.h264_idct_add16intra(dest_y, block_offset, | |
758 | h->mb + (p * 256 << pixel_shift), | |
759 | linesize, | |
760 | h->non_zero_count_cache + p * 5 * 8); | |
761 | } | |
762 | } else if (h->cbp & 15) { | |
763 | if (transform_bypass) { | |
/* step through 4x4 blocks one by one, or four at a time for 8x8 blocks */
764 | const int di = IS_8x8DCT(mb_type) ? 4 : 1; | |
765 | idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8_clear | |
766 | : h->h264dsp.h264_add_pixels4_clear; | |
767 | for (i = 0; i < 16; i += di) | |
768 | if (h->non_zero_count_cache[scan8[i + p * 16]]) | |
769 | idct_add(dest_y + block_offset[i], | |
770 | h->mb + (i * 16 + p * 256 << pixel_shift), | |
771 | linesize); | |
772 | } else { | |
773 | if (IS_8x8DCT(mb_type)) | |
774 | h->h264dsp.h264_idct8_add4(dest_y, block_offset, | |
775 | h->mb + (p * 256 << pixel_shift), | |
776 | linesize, | |
777 | h->non_zero_count_cache + p * 5 * 8); | |
778 | else | |
779 | h->h264dsp.h264_idct_add16(dest_y, block_offset, | |
780 | h->mb + (p * 256 << pixel_shift), | |
781 | linesize, | |
782 | h->non_zero_count_cache + p * 5 * 8); | |
783 | } | |
784 | } | |
785 | } else if (CONFIG_SVQ3_DECODER) { | |
/* SVQ3 path: coefficients are indexed without applying pixel_shift */
786 | for (i = 0; i < 16; i++) | |
787 | if (h->non_zero_count_cache[scan8[i + p * 16]] || h->mb[i * 16 + p * 256]) { | |
788 | // FIXME benchmark weird rule, & below | |
789 | uint8_t *const ptr = dest_y + block_offset[i]; | |
790 | ff_svq3_add_idct_c(ptr, h->mb + i * 16 + p * 256, linesize, | |
791 | h->qscale, IS_INTRA(mb_type) ? 1 : 0); | |
792 | } | |
793 | } | |
794 | } | |
795 | } | |
796 | ||
796 | ||
/* h264_mb_template.c is included three times below, each time with a
 * different BITS / SIMPLE combination: (8, 1), (16, 1) and (16, 0).
 * NOTE(review): the template is not visible here; presumably these
 * inclusions generate the hl_decode_mb_* variants that
 * ff_h264_hl_decode_mb() dispatches to -- confirm in h264_mb_template.c. */
797 | #define BITS 8 | |
798 | #define SIMPLE 1 | |
799 | #include "h264_mb_template.c" | |
800 | ||
/* second inclusion: BITS becomes 16, SIMPLE is still 1 */
801 | #undef BITS | |
802 | #define BITS 16 | |
803 | #include "h264_mb_template.c" | |
804 | ||
/* third inclusion: SIMPLE becomes 0, BITS is still 16 */
805 | #undef SIMPLE | |
806 | #define SIMPLE 0 | |
807 | #include "h264_mb_template.c" | |
808 | ||
809 | void ff_h264_hl_decode_mb(H264Context *h) | |
810 | { | |
811 | const int mb_xy = h->mb_xy; | |
812 | const int mb_type = h->cur_pic.mb_type[mb_xy]; | |
813 | int is_complex = CONFIG_SMALL || h->is_complex || | |
814 | IS_INTRA_PCM(mb_type) || h->qscale == 0; | |
815 | ||
816 | if (CHROMA444(h)) { | |
817 | if (is_complex || h->pixel_shift) | |
818 | hl_decode_mb_444_complex(h); | |
819 | else | |
820 | hl_decode_mb_444_simple_8(h); | |
821 | } else if (is_complex) { | |
822 | hl_decode_mb_complex(h); | |
823 | } else if (h->pixel_shift) { | |
824 | hl_decode_mb_simple_16(h); | |
825 | } else | |
826 | hl_decode_mb_simple_8(h); | |
827 | } |