| 1 | /* |
| 2 | * H.26L/H.264/AVC/JVT/14496-10/... decoder |
| 3 | * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | /** |
| 23 | * @file |
| 24 | * H.264 / AVC / MPEG4 part10 macroblock decoding |
| 25 | */ |
| 26 | |
| 27 | #include <stdint.h> |
| 28 | |
| 29 | #include "config.h" |
| 30 | |
| 31 | #include "libavutil/common.h" |
| 32 | #include "libavutil/intreadwrite.h" |
| 33 | #include "avcodec.h" |
| 34 | #include "h264.h" |
| 35 | #include "qpeldsp.h" |
| 36 | #include "svq3.h" |
| 37 | #include "thread.h" |
| 38 | |
| 39 | static inline int get_lowest_part_list_y(H264Context *h, H264Picture *pic, int n, |
| 40 | int height, int y_offset, int list) |
| 41 | { |
| 42 | int raw_my = h->mv_cache[list][scan8[n]][1]; |
| 43 | int filter_height_down = (raw_my & 3) ? 3 : 0; |
| 44 | int full_my = (raw_my >> 2) + y_offset; |
| 45 | int bottom = full_my + filter_height_down + height; |
| 46 | |
| 47 | av_assert2(height >= 0); |
| 48 | |
| 49 | return FFMAX(0, bottom); |
| 50 | } |
| 51 | |
| 52 | static inline void get_lowest_part_y(H264Context *h, int16_t refs[2][48], int n, |
| 53 | int height, int y_offset, int list0, |
| 54 | int list1, int *nrefs) |
| 55 | { |
| 56 | int my; |
| 57 | |
| 58 | y_offset += 16 * (h->mb_y >> MB_FIELD(h)); |
| 59 | |
| 60 | if (list0) { |
| 61 | int ref_n = h->ref_cache[0][scan8[n]]; |
| 62 | H264Picture *ref = &h->ref_list[0][ref_n]; |
| 63 | |
| 64 | // Error resilience puts the current picture in the ref list. |
| 65 | // Don't try to wait on these as it will cause a deadlock. |
| 66 | // Fields can wait on each other, though. |
| 67 | if (ref->tf.progress->data != h->cur_pic.tf.progress->data || |
| 68 | (ref->reference & 3) != h->picture_structure) { |
| 69 | my = get_lowest_part_list_y(h, ref, n, height, y_offset, 0); |
| 70 | if (refs[0][ref_n] < 0) |
| 71 | nrefs[0] += 1; |
| 72 | refs[0][ref_n] = FFMAX(refs[0][ref_n], my); |
| 73 | } |
| 74 | } |
| 75 | |
| 76 | if (list1) { |
| 77 | int ref_n = h->ref_cache[1][scan8[n]]; |
| 78 | H264Picture *ref = &h->ref_list[1][ref_n]; |
| 79 | |
| 80 | if (ref->tf.progress->data != h->cur_pic.tf.progress->data || |
| 81 | (ref->reference & 3) != h->picture_structure) { |
| 82 | my = get_lowest_part_list_y(h, ref, n, height, y_offset, 1); |
| 83 | if (refs[1][ref_n] < 0) |
| 84 | nrefs[1] += 1; |
| 85 | refs[1][ref_n] = FFMAX(refs[1][ref_n], my); |
| 86 | } |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | /** |
| 91 | * Wait until all reference frames are available for MC operations. |
| 92 | * |
| 93 | * @param h the H264 context |
| 94 | */ |
| 95 | static void await_references(H264Context *h) |
| 96 | { |
| 97 | const int mb_xy = h->mb_xy; |
| 98 | const int mb_type = h->cur_pic.mb_type[mb_xy]; |
| 99 | int16_t refs[2][48]; |
| 100 | int nrefs[2] = { 0 }; |
| 101 | int ref, list; |
| 102 | |
| 103 | memset(refs, -1, sizeof(refs)); |
| 104 | |
| 105 | if (IS_16X16(mb_type)) { |
| 106 | get_lowest_part_y(h, refs, 0, 16, 0, |
| 107 | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); |
| 108 | } else if (IS_16X8(mb_type)) { |
| 109 | get_lowest_part_y(h, refs, 0, 8, 0, |
| 110 | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); |
| 111 | get_lowest_part_y(h, refs, 8, 8, 8, |
| 112 | IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); |
| 113 | } else if (IS_8X16(mb_type)) { |
| 114 | get_lowest_part_y(h, refs, 0, 16, 0, |
| 115 | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); |
| 116 | get_lowest_part_y(h, refs, 4, 16, 0, |
| 117 | IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); |
| 118 | } else { |
| 119 | int i; |
| 120 | |
| 121 | av_assert2(IS_8X8(mb_type)); |
| 122 | |
| 123 | for (i = 0; i < 4; i++) { |
| 124 | const int sub_mb_type = h->sub_mb_type[i]; |
| 125 | const int n = 4 * i; |
| 126 | int y_offset = (i & 2) << 2; |
| 127 | |
| 128 | if (IS_SUB_8X8(sub_mb_type)) { |
| 129 | get_lowest_part_y(h, refs, n, 8, y_offset, |
| 130 | IS_DIR(sub_mb_type, 0, 0), |
| 131 | IS_DIR(sub_mb_type, 0, 1), |
| 132 | nrefs); |
| 133 | } else if (IS_SUB_8X4(sub_mb_type)) { |
| 134 | get_lowest_part_y(h, refs, n, 4, y_offset, |
| 135 | IS_DIR(sub_mb_type, 0, 0), |
| 136 | IS_DIR(sub_mb_type, 0, 1), |
| 137 | nrefs); |
| 138 | get_lowest_part_y(h, refs, n + 2, 4, y_offset + 4, |
| 139 | IS_DIR(sub_mb_type, 0, 0), |
| 140 | IS_DIR(sub_mb_type, 0, 1), |
| 141 | nrefs); |
| 142 | } else if (IS_SUB_4X8(sub_mb_type)) { |
| 143 | get_lowest_part_y(h, refs, n, 8, y_offset, |
| 144 | IS_DIR(sub_mb_type, 0, 0), |
| 145 | IS_DIR(sub_mb_type, 0, 1), |
| 146 | nrefs); |
| 147 | get_lowest_part_y(h, refs, n + 1, 8, y_offset, |
| 148 | IS_DIR(sub_mb_type, 0, 0), |
| 149 | IS_DIR(sub_mb_type, 0, 1), |
| 150 | nrefs); |
| 151 | } else { |
| 152 | int j; |
| 153 | av_assert2(IS_SUB_4X4(sub_mb_type)); |
| 154 | for (j = 0; j < 4; j++) { |
| 155 | int sub_y_offset = y_offset + 2 * (j & 2); |
| 156 | get_lowest_part_y(h, refs, n + j, 4, sub_y_offset, |
| 157 | IS_DIR(sub_mb_type, 0, 0), |
| 158 | IS_DIR(sub_mb_type, 0, 1), |
| 159 | nrefs); |
| 160 | } |
| 161 | } |
| 162 | } |
| 163 | } |
| 164 | |
| 165 | for (list = h->list_count - 1; list >= 0; list--) |
| 166 | for (ref = 0; ref < 48 && nrefs[list]; ref++) { |
| 167 | int row = refs[list][ref]; |
| 168 | if (row >= 0) { |
| 169 | H264Picture *ref_pic = &h->ref_list[list][ref]; |
| 170 | int ref_field = ref_pic->reference - 1; |
| 171 | int ref_field_picture = ref_pic->field_picture; |
| 172 | int pic_height = 16 * h->mb_height >> ref_field_picture; |
| 173 | |
| 174 | row <<= MB_MBAFF(h); |
| 175 | nrefs[list]--; |
| 176 | |
| 177 | if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields |
| 178 | ff_thread_await_progress(&ref_pic->tf, |
| 179 | FFMIN((row >> 1) - !(row & 1), |
| 180 | pic_height - 1), |
| 181 | 1); |
| 182 | ff_thread_await_progress(&ref_pic->tf, |
| 183 | FFMIN((row >> 1), pic_height - 1), |
| 184 | 0); |
| 185 | } else if (FIELD_PICTURE(h) && !ref_field_picture) { // field referencing one field of a frame |
| 186 | ff_thread_await_progress(&ref_pic->tf, |
| 187 | FFMIN(row * 2 + ref_field, |
| 188 | pic_height - 1), |
| 189 | 0); |
| 190 | } else if (FIELD_PICTURE(h)) { |
| 191 | ff_thread_await_progress(&ref_pic->tf, |
| 192 | FFMIN(row, pic_height - 1), |
| 193 | ref_field); |
| 194 | } else { |
| 195 | ff_thread_await_progress(&ref_pic->tf, |
| 196 | FFMIN(row, pic_height - 1), |
| 197 | 0); |
| 198 | } |
| 199 | } |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | static av_always_inline void mc_dir_part(H264Context *h, H264Picture *pic, |
| 204 | int n, int square, int height, |
| 205 | int delta, int list, |
| 206 | uint8_t *dest_y, uint8_t *dest_cb, |
| 207 | uint8_t *dest_cr, |
| 208 | int src_x_offset, int src_y_offset, |
| 209 | qpel_mc_func *qpix_op, |
| 210 | h264_chroma_mc_func chroma_op, |
| 211 | int pixel_shift, int chroma_idc) |
| 212 | { |
| 213 | const int mx = h->mv_cache[list][scan8[n]][0] + src_x_offset * 8; |
| 214 | int my = h->mv_cache[list][scan8[n]][1] + src_y_offset * 8; |
| 215 | const int luma_xy = (mx & 3) + ((my & 3) << 2); |
| 216 | ptrdiff_t offset = ((mx >> 2) << pixel_shift) + (my >> 2) * h->mb_linesize; |
| 217 | uint8_t *src_y = pic->f.data[0] + offset; |
| 218 | uint8_t *src_cb, *src_cr; |
| 219 | int extra_width = 0; |
| 220 | int extra_height = 0; |
| 221 | int emu = 0; |
| 222 | const int full_mx = mx >> 2; |
| 223 | const int full_my = my >> 2; |
| 224 | const int pic_width = 16 * h->mb_width; |
| 225 | const int pic_height = 16 * h->mb_height >> MB_FIELD(h); |
| 226 | int ysh; |
| 227 | |
| 228 | if (mx & 7) |
| 229 | extra_width -= 3; |
| 230 | if (my & 7) |
| 231 | extra_height -= 3; |
| 232 | |
| 233 | if (full_mx < 0 - extra_width || |
| 234 | full_my < 0 - extra_height || |
| 235 | full_mx + 16 /*FIXME*/ > pic_width + extra_width || |
| 236 | full_my + 16 /*FIXME*/ > pic_height + extra_height) { |
| 237 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, |
| 238 | src_y - (2 << pixel_shift) - 2 * h->mb_linesize, |
| 239 | h->mb_linesize, h->mb_linesize, |
| 240 | 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2, |
| 241 | full_my - 2, pic_width, pic_height); |
| 242 | src_y = h->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; |
| 243 | emu = 1; |
| 244 | } |
| 245 | |
| 246 | qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); // FIXME try variable height perhaps? |
| 247 | if (!square) |
| 248 | qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize); |
| 249 | |
| 250 | if (CONFIG_GRAY && h->flags & CODEC_FLAG_GRAY) |
| 251 | return; |
| 252 | |
| 253 | if (chroma_idc == 3 /* yuv444 */) { |
| 254 | src_cb = pic->f.data[1] + offset; |
| 255 | if (emu) { |
| 256 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, |
| 257 | src_cb - (2 << pixel_shift) - 2 * h->mb_linesize, |
| 258 | h->mb_linesize, h->mb_linesize, |
| 259 | 16 + 5, 16 + 5 /*FIXME*/, |
| 260 | full_mx - 2, full_my - 2, |
| 261 | pic_width, pic_height); |
| 262 | src_cb = h->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; |
| 263 | } |
| 264 | qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); // FIXME try variable height perhaps? |
| 265 | if (!square) |
| 266 | qpix_op[luma_xy](dest_cb + delta, src_cb + delta, h->mb_linesize); |
| 267 | |
| 268 | src_cr = pic->f.data[2] + offset; |
| 269 | if (emu) { |
| 270 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, |
| 271 | src_cr - (2 << pixel_shift) - 2 * h->mb_linesize, |
| 272 | h->mb_linesize, h->mb_linesize, |
| 273 | 16 + 5, 16 + 5 /*FIXME*/, |
| 274 | full_mx - 2, full_my - 2, |
| 275 | pic_width, pic_height); |
| 276 | src_cr = h->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; |
| 277 | } |
| 278 | qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); // FIXME try variable height perhaps? |
| 279 | if (!square) |
| 280 | qpix_op[luma_xy](dest_cr + delta, src_cr + delta, h->mb_linesize); |
| 281 | return; |
| 282 | } |
| 283 | |
| 284 | ysh = 3 - (chroma_idc == 2 /* yuv422 */); |
| 285 | if (chroma_idc == 1 /* yuv420 */ && MB_FIELD(h)) { |
| 286 | // chroma offset when predicting from a field of opposite parity |
| 287 | my += 2 * ((h->mb_y & 1) - (pic->reference - 1)); |
| 288 | emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1); |
| 289 | } |
| 290 | |
| 291 | src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + |
| 292 | (my >> ysh) * h->mb_uvlinesize; |
| 293 | src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + |
| 294 | (my >> ysh) * h->mb_uvlinesize; |
| 295 | |
| 296 | if (emu) { |
| 297 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cb, |
| 298 | h->mb_uvlinesize, h->mb_uvlinesize, |
| 299 | 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), |
| 300 | pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); |
| 301 | src_cb = h->edge_emu_buffer; |
| 302 | } |
| 303 | chroma_op(dest_cb, src_cb, h->mb_uvlinesize, |
| 304 | height >> (chroma_idc == 1 /* yuv420 */), |
| 305 | mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7); |
| 306 | |
| 307 | if (emu) { |
| 308 | h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cr, |
| 309 | h->mb_uvlinesize, h->mb_uvlinesize, |
| 310 | 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), |
| 311 | pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); |
| 312 | src_cr = h->edge_emu_buffer; |
| 313 | } |
| 314 | chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), |
| 315 | mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7); |
| 316 | } |
| 317 | |
| 318 | static av_always_inline void mc_part_std(H264Context *h, int n, int square, |
| 319 | int height, int delta, |
| 320 | uint8_t *dest_y, uint8_t *dest_cb, |
| 321 | uint8_t *dest_cr, |
| 322 | int x_offset, int y_offset, |
| 323 | qpel_mc_func *qpix_put, |
| 324 | h264_chroma_mc_func chroma_put, |
| 325 | qpel_mc_func *qpix_avg, |
| 326 | h264_chroma_mc_func chroma_avg, |
| 327 | int list0, int list1, |
| 328 | int pixel_shift, int chroma_idc) |
| 329 | { |
| 330 | qpel_mc_func *qpix_op = qpix_put; |
| 331 | h264_chroma_mc_func chroma_op = chroma_put; |
| 332 | |
| 333 | dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; |
| 334 | if (chroma_idc == 3 /* yuv444 */) { |
| 335 | dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; |
| 336 | dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; |
| 337 | } else if (chroma_idc == 2 /* yuv422 */) { |
| 338 | dest_cb += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; |
| 339 | dest_cr += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; |
| 340 | } else { /* yuv420 */ |
| 341 | dest_cb += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; |
| 342 | dest_cr += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; |
| 343 | } |
| 344 | x_offset += 8 * h->mb_x; |
| 345 | y_offset += 8 * (h->mb_y >> MB_FIELD(h)); |
| 346 | |
| 347 | if (list0) { |
| 348 | H264Picture *ref = &h->ref_list[0][h->ref_cache[0][scan8[n]]]; |
| 349 | mc_dir_part(h, ref, n, square, height, delta, 0, |
| 350 | dest_y, dest_cb, dest_cr, x_offset, y_offset, |
| 351 | qpix_op, chroma_op, pixel_shift, chroma_idc); |
| 352 | |
| 353 | qpix_op = qpix_avg; |
| 354 | chroma_op = chroma_avg; |
| 355 | } |
| 356 | |
| 357 | if (list1) { |
| 358 | H264Picture *ref = &h->ref_list[1][h->ref_cache[1][scan8[n]]]; |
| 359 | mc_dir_part(h, ref, n, square, height, delta, 1, |
| 360 | dest_y, dest_cb, dest_cr, x_offset, y_offset, |
| 361 | qpix_op, chroma_op, pixel_shift, chroma_idc); |
| 362 | } |
| 363 | } |
| 364 | |
| 365 | static av_always_inline void mc_part_weighted(H264Context *h, int n, int square, |
| 366 | int height, int delta, |
| 367 | uint8_t *dest_y, uint8_t *dest_cb, |
| 368 | uint8_t *dest_cr, |
| 369 | int x_offset, int y_offset, |
| 370 | qpel_mc_func *qpix_put, |
| 371 | h264_chroma_mc_func chroma_put, |
| 372 | h264_weight_func luma_weight_op, |
| 373 | h264_weight_func chroma_weight_op, |
| 374 | h264_biweight_func luma_weight_avg, |
| 375 | h264_biweight_func chroma_weight_avg, |
| 376 | int list0, int list1, |
| 377 | int pixel_shift, int chroma_idc) |
| 378 | { |
| 379 | int chroma_height; |
| 380 | |
| 381 | dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; |
| 382 | if (chroma_idc == 3 /* yuv444 */) { |
| 383 | chroma_height = height; |
| 384 | chroma_weight_avg = luma_weight_avg; |
| 385 | chroma_weight_op = luma_weight_op; |
| 386 | dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; |
| 387 | dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * h->mb_linesize; |
| 388 | } else if (chroma_idc == 2 /* yuv422 */) { |
| 389 | chroma_height = height; |
| 390 | dest_cb += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; |
| 391 | dest_cr += (x_offset << pixel_shift) + 2 * y_offset * h->mb_uvlinesize; |
| 392 | } else { /* yuv420 */ |
| 393 | chroma_height = height >> 1; |
| 394 | dest_cb += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; |
| 395 | dest_cr += (x_offset << pixel_shift) + y_offset * h->mb_uvlinesize; |
| 396 | } |
| 397 | x_offset += 8 * h->mb_x; |
| 398 | y_offset += 8 * (h->mb_y >> MB_FIELD(h)); |
| 399 | |
| 400 | if (list0 && list1) { |
| 401 | /* don't optimize for luma-only case, since B-frames usually |
| 402 | * use implicit weights => chroma too. */ |
| 403 | uint8_t *tmp_cb = h->bipred_scratchpad; |
| 404 | uint8_t *tmp_cr = h->bipred_scratchpad + (16 << pixel_shift); |
| 405 | uint8_t *tmp_y = h->bipred_scratchpad + 16 * h->mb_uvlinesize; |
| 406 | int refn0 = h->ref_cache[0][scan8[n]]; |
| 407 | int refn1 = h->ref_cache[1][scan8[n]]; |
| 408 | |
| 409 | mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, |
| 410 | dest_y, dest_cb, dest_cr, |
| 411 | x_offset, y_offset, qpix_put, chroma_put, |
| 412 | pixel_shift, chroma_idc); |
| 413 | mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, |
| 414 | tmp_y, tmp_cb, tmp_cr, |
| 415 | x_offset, y_offset, qpix_put, chroma_put, |
| 416 | pixel_shift, chroma_idc); |
| 417 | |
| 418 | if (h->use_weight == 2) { |
| 419 | int weight0 = h->implicit_weight[refn0][refn1][h->mb_y & 1]; |
| 420 | int weight1 = 64 - weight0; |
| 421 | luma_weight_avg(dest_y, tmp_y, h->mb_linesize, |
| 422 | height, 5, weight0, weight1, 0); |
| 423 | if (!CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { |
| 424 | chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, |
| 425 | chroma_height, 5, weight0, weight1, 0); |
| 426 | chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, |
| 427 | chroma_height, 5, weight0, weight1, 0); |
| 428 | } |
| 429 | } else { |
| 430 | luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, |
| 431 | h->luma_log2_weight_denom, |
| 432 | h->luma_weight[refn0][0][0], |
| 433 | h->luma_weight[refn1][1][0], |
| 434 | h->luma_weight[refn0][0][1] + |
| 435 | h->luma_weight[refn1][1][1]); |
| 436 | if (!CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { |
| 437 | chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, |
| 438 | h->chroma_log2_weight_denom, |
| 439 | h->chroma_weight[refn0][0][0][0], |
| 440 | h->chroma_weight[refn1][1][0][0], |
| 441 | h->chroma_weight[refn0][0][0][1] + |
| 442 | h->chroma_weight[refn1][1][0][1]); |
| 443 | chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, |
| 444 | h->chroma_log2_weight_denom, |
| 445 | h->chroma_weight[refn0][0][1][0], |
| 446 | h->chroma_weight[refn1][1][1][0], |
| 447 | h->chroma_weight[refn0][0][1][1] + |
| 448 | h->chroma_weight[refn1][1][1][1]); |
| 449 | } |
| 450 | } |
| 451 | } else { |
| 452 | int list = list1 ? 1 : 0; |
| 453 | int refn = h->ref_cache[list][scan8[n]]; |
| 454 | H264Picture *ref = &h->ref_list[list][refn]; |
| 455 | mc_dir_part(h, ref, n, square, height, delta, list, |
| 456 | dest_y, dest_cb, dest_cr, x_offset, y_offset, |
| 457 | qpix_put, chroma_put, pixel_shift, chroma_idc); |
| 458 | |
| 459 | luma_weight_op(dest_y, h->mb_linesize, height, |
| 460 | h->luma_log2_weight_denom, |
| 461 | h->luma_weight[refn][list][0], |
| 462 | h->luma_weight[refn][list][1]); |
| 463 | if (!CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { |
| 464 | if (h->use_weight_chroma) { |
| 465 | chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, |
| 466 | h->chroma_log2_weight_denom, |
| 467 | h->chroma_weight[refn][list][0][0], |
| 468 | h->chroma_weight[refn][list][0][1]); |
| 469 | chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, |
| 470 | h->chroma_log2_weight_denom, |
| 471 | h->chroma_weight[refn][list][1][0], |
| 472 | h->chroma_weight[refn][list][1][1]); |
| 473 | } |
| 474 | } |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | static av_always_inline void prefetch_motion(H264Context *h, int list, |
| 479 | int pixel_shift, int chroma_idc) |
| 480 | { |
| 481 | /* fetch pixels for estimated mv 4 macroblocks ahead |
| 482 | * optimized for 64byte cache lines */ |
| 483 | const int refn = h->ref_cache[list][scan8[0]]; |
| 484 | if (refn >= 0) { |
| 485 | const int mx = (h->mv_cache[list][scan8[0]][0] >> 2) + 16 * h->mb_x + 8; |
| 486 | const int my = (h->mv_cache[list][scan8[0]][1] >> 2) + 16 * h->mb_y; |
| 487 | uint8_t **src = h->ref_list[list][refn].f.data; |
| 488 | int off = (mx << pixel_shift) + |
| 489 | (my + (h->mb_x & 3) * 4) * h->mb_linesize + |
| 490 | (64 << pixel_shift); |
| 491 | h->vdsp.prefetch(src[0] + off, h->linesize, 4); |
| 492 | if (chroma_idc == 3 /* yuv444 */) { |
| 493 | h->vdsp.prefetch(src[1] + off, h->linesize, 4); |
| 494 | h->vdsp.prefetch(src[2] + off, h->linesize, 4); |
| 495 | } else { |
| 496 | off= (((mx>>1)+64)<<pixel_shift) + ((my>>1) + (h->mb_x&7))*h->uvlinesize; |
| 497 | h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2); |
| 498 | } |
| 499 | } |
| 500 | } |
| 501 | |
| 502 | static av_always_inline void xchg_mb_border(H264Context *h, uint8_t *src_y, |
| 503 | uint8_t *src_cb, uint8_t *src_cr, |
| 504 | int linesize, int uvlinesize, |
| 505 | int xchg, int chroma444, |
| 506 | int simple, int pixel_shift) |
| 507 | { |
| 508 | int deblock_topleft; |
| 509 | int deblock_top; |
| 510 | int top_idx = 1; |
| 511 | uint8_t *top_border_m1; |
| 512 | uint8_t *top_border; |
| 513 | |
| 514 | if (!simple && FRAME_MBAFF(h)) { |
| 515 | if (h->mb_y & 1) { |
| 516 | if (!MB_MBAFF(h)) |
| 517 | return; |
| 518 | } else { |
| 519 | top_idx = MB_MBAFF(h) ? 0 : 1; |
| 520 | } |
| 521 | } |
| 522 | |
| 523 | if (h->deblocking_filter == 2) { |
| 524 | deblock_topleft = h->slice_table[h->mb_xy - 1 - h->mb_stride] == h->slice_num; |
| 525 | deblock_top = h->top_type; |
| 526 | } else { |
| 527 | deblock_topleft = (h->mb_x > 0); |
| 528 | deblock_top = (h->mb_y > !!MB_FIELD(h)); |
| 529 | } |
| 530 | |
| 531 | src_y -= linesize + 1 + pixel_shift; |
| 532 | src_cb -= uvlinesize + 1 + pixel_shift; |
| 533 | src_cr -= uvlinesize + 1 + pixel_shift; |
| 534 | |
| 535 | top_border_m1 = h->top_borders[top_idx][h->mb_x - 1]; |
| 536 | top_border = h->top_borders[top_idx][h->mb_x]; |
| 537 | |
| 538 | #define XCHG(a, b, xchg) \ |
| 539 | if (pixel_shift) { \ |
| 540 | if (xchg) { \ |
| 541 | AV_SWAP64(b + 0, a + 0); \ |
| 542 | AV_SWAP64(b + 8, a + 8); \ |
| 543 | } else { \ |
| 544 | AV_COPY128(b, a); \ |
| 545 | } \ |
| 546 | } else if (xchg) \ |
| 547 | AV_SWAP64(b, a); \ |
| 548 | else \ |
| 549 | AV_COPY64(b, a); |
| 550 | |
| 551 | if (deblock_top) { |
| 552 | if (deblock_topleft) { |
| 553 | XCHG(top_border_m1 + (8 << pixel_shift), |
| 554 | src_y - (7 << pixel_shift), 1); |
| 555 | } |
| 556 | XCHG(top_border + (0 << pixel_shift), src_y + (1 << pixel_shift), xchg); |
| 557 | XCHG(top_border + (8 << pixel_shift), src_y + (9 << pixel_shift), 1); |
| 558 | if (h->mb_x + 1 < h->mb_width) { |
| 559 | XCHG(h->top_borders[top_idx][h->mb_x + 1], |
| 560 | src_y + (17 << pixel_shift), 1); |
| 561 | } |
| 562 | if (simple || !CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) { |
| 563 | if (chroma444) { |
| 564 | if (deblock_topleft) { |
| 565 | XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1); |
| 566 | XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1); |
| 567 | } |
| 568 | XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg); |
| 569 | XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1); |
| 570 | XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg); |
| 571 | XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1); |
| 572 | if (h->mb_x + 1 < h->mb_width) { |
| 573 | XCHG(h->top_borders[top_idx][h->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1); |
| 574 | XCHG(h->top_borders[top_idx][h->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1); |
| 575 | } |
| 576 | } else { |
| 577 | if (deblock_topleft) { |
| 578 | XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1); |
| 579 | XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1); |
| 580 | } |
| 581 | XCHG(top_border + (16 << pixel_shift), src_cb + 1 + pixel_shift, 1); |
| 582 | XCHG(top_border + (24 << pixel_shift), src_cr + 1 + pixel_shift, 1); |
| 583 | } |
| 584 | } |
| 585 | } |
| 586 | } |
| 587 | |
| 588 | static av_always_inline int dctcoef_get(int16_t *mb, int high_bit_depth, |
| 589 | int index) |
| 590 | { |
| 591 | if (high_bit_depth) { |
| 592 | return AV_RN32A(((int32_t *)mb) + index); |
| 593 | } else |
| 594 | return AV_RN16A(mb + index); |
| 595 | } |
| 596 | |
| 597 | static av_always_inline void dctcoef_set(int16_t *mb, int high_bit_depth, |
| 598 | int index, int value) |
| 599 | { |
| 600 | if (high_bit_depth) { |
| 601 | AV_WN32A(((int32_t *)mb) + index, value); |
| 602 | } else |
| 603 | AV_WN16A(mb + index, value); |
| 604 | } |
| 605 | |
| 606 | static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, |
| 607 | int mb_type, int is_h264, |
| 608 | int simple, |
| 609 | int transform_bypass, |
| 610 | int pixel_shift, |
| 611 | int *block_offset, |
| 612 | int linesize, |
| 613 | uint8_t *dest_y, int p) |
| 614 | { |
| 615 | void (*idct_add)(uint8_t *dst, int16_t *block, int stride); |
| 616 | void (*idct_dc_add)(uint8_t *dst, int16_t *block, int stride); |
| 617 | int i; |
| 618 | int qscale = p == 0 ? h->qscale : h->chroma_qp[p - 1]; |
| 619 | block_offset += 16 * p; |
| 620 | if (IS_INTRA4x4(mb_type)) { |
| 621 | if (IS_8x8DCT(mb_type)) { |
| 622 | if (transform_bypass) { |
| 623 | idct_dc_add = |
| 624 | idct_add = h->h264dsp.h264_add_pixels8_clear; |
| 625 | } else { |
| 626 | idct_dc_add = h->h264dsp.h264_idct8_dc_add; |
| 627 | idct_add = h->h264dsp.h264_idct8_add; |
| 628 | } |
| 629 | for (i = 0; i < 16; i += 4) { |
| 630 | uint8_t *const ptr = dest_y + block_offset[i]; |
| 631 | const int dir = h->intra4x4_pred_mode_cache[scan8[i]]; |
| 632 | if (transform_bypass && h->sps.profile_idc == 244 && dir <= 1) { |
| 633 | if (h->x264_build != -1) { |
| 634 | h->hpc.pred8x8l_add[dir](ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); |
| 635 | } else |
| 636 | h->hpc.pred8x8l_filter_add[dir](ptr, h->mb + (i * 16 + p * 256 << pixel_shift), |
| 637 | (h-> topleft_samples_available << i) & 0x8000, |
| 638 | (h->topright_samples_available << i) & 0x4000, linesize); |
| 639 | } else { |
| 640 | const int nnz = h->non_zero_count_cache[scan8[i + p * 16]]; |
| 641 | h->hpc.pred8x8l[dir](ptr, (h->topleft_samples_available << i) & 0x8000, |
| 642 | (h->topright_samples_available << i) & 0x4000, linesize); |
| 643 | if (nnz) { |
| 644 | if (nnz == 1 && dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) |
| 645 | idct_dc_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); |
| 646 | else |
| 647 | idct_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); |
| 648 | } |
| 649 | } |
| 650 | } |
| 651 | } else { |
| 652 | if (transform_bypass) { |
| 653 | idct_dc_add = |
| 654 | idct_add = h->h264dsp.h264_add_pixels4_clear; |
| 655 | } else { |
| 656 | idct_dc_add = h->h264dsp.h264_idct_dc_add; |
| 657 | idct_add = h->h264dsp.h264_idct_add; |
| 658 | } |
| 659 | for (i = 0; i < 16; i++) { |
| 660 | uint8_t *const ptr = dest_y + block_offset[i]; |
| 661 | const int dir = h->intra4x4_pred_mode_cache[scan8[i]]; |
| 662 | |
| 663 | if (transform_bypass && h->sps.profile_idc == 244 && dir <= 1) { |
| 664 | h->hpc.pred4x4_add[dir](ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); |
| 665 | } else { |
| 666 | uint8_t *topright; |
| 667 | int nnz, tr; |
| 668 | uint64_t tr_high; |
| 669 | if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) { |
| 670 | const int topright_avail = (h->topright_samples_available << i) & 0x8000; |
| 671 | av_assert2(h->mb_y || linesize <= block_offset[i]); |
| 672 | if (!topright_avail) { |
| 673 | if (pixel_shift) { |
| 674 | tr_high = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL; |
| 675 | topright = (uint8_t *)&tr_high; |
| 676 | } else { |
| 677 | tr = ptr[3 - linesize] * 0x01010101u; |
| 678 | topright = (uint8_t *)&tr; |
| 679 | } |
| 680 | } else |
| 681 | topright = ptr + (4 << pixel_shift) - linesize; |
| 682 | } else |
| 683 | topright = NULL; |
| 684 | |
| 685 | h->hpc.pred4x4[dir](ptr, topright, linesize); |
| 686 | nnz = h->non_zero_count_cache[scan8[i + p * 16]]; |
| 687 | if (nnz) { |
| 688 | if (is_h264) { |
| 689 | if (nnz == 1 && dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) |
| 690 | idct_dc_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); |
| 691 | else |
| 692 | idct_add(ptr, h->mb + (i * 16 + p * 256 << pixel_shift), linesize); |
| 693 | } else if (CONFIG_SVQ3_DECODER) |
| 694 | ff_svq3_add_idct_c(ptr, h->mb + i * 16 + p * 256, linesize, qscale, 0); |
| 695 | } |
| 696 | } |
| 697 | } |
| 698 | } |
| 699 | } else { |
| 700 | h->hpc.pred16x16[h->intra16x16_pred_mode](dest_y, linesize); |
| 701 | if (is_h264) { |
| 702 | if (h->non_zero_count_cache[scan8[LUMA_DC_BLOCK_INDEX + p]]) { |
| 703 | if (!transform_bypass) |
| 704 | h->h264dsp.h264_luma_dc_dequant_idct(h->mb + (p * 256 << pixel_shift), |
| 705 | h->mb_luma_dc[p], |
| 706 | h->dequant4_coeff[p][qscale][0]); |
| 707 | else { |
| 708 | static const uint8_t dc_mapping[16] = { |
| 709 | 0 * 16, 1 * 16, 4 * 16, 5 * 16, |
| 710 | 2 * 16, 3 * 16, 6 * 16, 7 * 16, |
| 711 | 8 * 16, 9 * 16, 12 * 16, 13 * 16, |
| 712 | 10 * 16, 11 * 16, 14 * 16, 15 * 16 |
| 713 | }; |
| 714 | for (i = 0; i < 16; i++) |
| 715 | dctcoef_set(h->mb + (p * 256 << pixel_shift), |
| 716 | pixel_shift, dc_mapping[i], |
| 717 | dctcoef_get(h->mb_luma_dc[p], |
| 718 | pixel_shift, i)); |
| 719 | } |
| 720 | } |
| 721 | } else if (CONFIG_SVQ3_DECODER) |
| 722 | ff_svq3_luma_dc_dequant_idct_c(h->mb + p * 256, |
| 723 | h->mb_luma_dc[p], qscale); |
| 724 | } |
| 725 | } |
| 726 | |
| 727 | static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, |
| 728 | int is_h264, int simple, |
| 729 | int transform_bypass, |
| 730 | int pixel_shift, |
| 731 | int *block_offset, |
| 732 | int linesize, |
| 733 | uint8_t *dest_y, int p) |
| 734 | { |
| 735 | void (*idct_add)(uint8_t *dst, int16_t *block, int stride); |
| 736 | int i; |
| 737 | block_offset += 16 * p; |
| 738 | if (!IS_INTRA4x4(mb_type)) { |
| 739 | if (is_h264) { |
| 740 | if (IS_INTRA16x16(mb_type)) { |
| 741 | if (transform_bypass) { |
| 742 | if (h->sps.profile_idc == 244 && |
| 743 | (h->intra16x16_pred_mode == VERT_PRED8x8 || |
| 744 | h->intra16x16_pred_mode == HOR_PRED8x8)) { |
| 745 | h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, |
| 746 | h->mb + (p * 256 << pixel_shift), |
| 747 | linesize); |
| 748 | } else { |
| 749 | for (i = 0; i < 16; i++) |
| 750 | if (h->non_zero_count_cache[scan8[i + p * 16]] || |
| 751 | dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) |
| 752 | h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i], |
| 753 | h->mb + (i * 16 + p * 256 << pixel_shift), |
| 754 | linesize); |
| 755 | } |
| 756 | } else { |
| 757 | h->h264dsp.h264_idct_add16intra(dest_y, block_offset, |
| 758 | h->mb + (p * 256 << pixel_shift), |
| 759 | linesize, |
| 760 | h->non_zero_count_cache + p * 5 * 8); |
| 761 | } |
| 762 | } else if (h->cbp & 15) { |
| 763 | if (transform_bypass) { |
| 764 | const int di = IS_8x8DCT(mb_type) ? 4 : 1; |
| 765 | idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8_clear |
| 766 | : h->h264dsp.h264_add_pixels4_clear; |
| 767 | for (i = 0; i < 16; i += di) |
| 768 | if (h->non_zero_count_cache[scan8[i + p * 16]]) |
| 769 | idct_add(dest_y + block_offset[i], |
| 770 | h->mb + (i * 16 + p * 256 << pixel_shift), |
| 771 | linesize); |
| 772 | } else { |
| 773 | if (IS_8x8DCT(mb_type)) |
| 774 | h->h264dsp.h264_idct8_add4(dest_y, block_offset, |
| 775 | h->mb + (p * 256 << pixel_shift), |
| 776 | linesize, |
| 777 | h->non_zero_count_cache + p * 5 * 8); |
| 778 | else |
| 779 | h->h264dsp.h264_idct_add16(dest_y, block_offset, |
| 780 | h->mb + (p * 256 << pixel_shift), |
| 781 | linesize, |
| 782 | h->non_zero_count_cache + p * 5 * 8); |
| 783 | } |
| 784 | } |
| 785 | } else if (CONFIG_SVQ3_DECODER) { |
| 786 | for (i = 0; i < 16; i++) |
| 787 | if (h->non_zero_count_cache[scan8[i + p * 16]] || h->mb[i * 16 + p * 256]) { |
| 788 | // FIXME benchmark weird rule, & below |
| 789 | uint8_t *const ptr = dest_y + block_offset[i]; |
| 790 | ff_svq3_add_idct_c(ptr, h->mb + i * 16 + p * 256, linesize, |
| 791 | h->qscale, IS_INTRA(mb_type) ? 1 : 0); |
| 792 | } |
| 793 | } |
| 794 | } |
| 795 | } |
| 796 | |
/*
 * h264_mb_template.c is included three times, each time with a different
 * BITS / SIMPLE combination: (8, 1), (16, 1) and (16, 0).
 * NOTE(review): presumably each inclusion emits the hl_decode_mb_*()
 * variants called from ff_h264_hl_decode_mb() below -- confirm in the
 * template. The order of the directives matters: BITS is still 16 when
 * the SIMPLE == 0 variant is instantiated.
 */

/* BITS = 8, SIMPLE = 1 */
#define BITS 8
#define SIMPLE 1
#include "h264_mb_template.c"

/* BITS = 16, SIMPLE = 1 (SIMPLE carried over from above) */
#undef BITS
#define BITS 16
#include "h264_mb_template.c"

/* BITS = 16, SIMPLE = 0 (BITS carried over from above) */
#undef SIMPLE
#define SIMPLE 0
#include "h264_mb_template.c"
| 808 | |
| 809 | void ff_h264_hl_decode_mb(H264Context *h) |
| 810 | { |
| 811 | const int mb_xy = h->mb_xy; |
| 812 | const int mb_type = h->cur_pic.mb_type[mb_xy]; |
| 813 | int is_complex = CONFIG_SMALL || h->is_complex || |
| 814 | IS_INTRA_PCM(mb_type) || h->qscale == 0; |
| 815 | |
| 816 | if (CHROMA444(h)) { |
| 817 | if (is_complex || h->pixel_shift) |
| 818 | hl_decode_mb_444_complex(h); |
| 819 | else |
| 820 | hl_decode_mb_444_simple_8(h); |
| 821 | } else if (is_complex) { |
| 822 | hl_decode_mb_complex(h); |
| 823 | } else if (h->pixel_shift) { |
| 824 | hl_decode_mb_simple_16(h); |
| 825 | } else |
| 826 | hl_decode_mb_simple_8(h); |
| 827 | } |