Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / encoder / framefilter.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 *
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
24
25 #include "common.h"
26 #include "frame.h"
27 #include "framedata.h"
28 #include "encoder.h"
29 #include "framefilter.h"
30 #include "frameencoder.h"
31 #include "wavefront.h"
32
33 using namespace x265;
34
35 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
36 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
37
38 FrameFilter::FrameFilter()
39 : m_param(NULL)
40 , m_frame(NULL)
41 , m_frameEncoder(NULL)
42 , m_ssimBuf(NULL)
43 {
44 }
45
46 void FrameFilter::destroy()
47 {
48 if (m_param->bEnableSAO)
49 m_sao.destroy();
50
51 X265_FREE(m_ssimBuf);
52 }
53
54 void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
55 {
56 m_param = top->m_param;
57 m_frameEncoder = frame;
58 m_numRows = numRows;
59 m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
60 m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
61 m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
62 m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
63 m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
64 m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
65
66 m_deblock.init();
67
68 if (m_param->bEnableSAO)
69 if (!m_sao.create(m_param))
70 m_param->bEnableSAO = 0;
71
72 if (m_param->bEnableSsim)
73 m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
74 }
75
76 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
77 {
78 m_frame = frame;
79
80 if (m_param->bEnableSAO)
81 m_sao.startSlice(frame, initState, qp);
82 }
83
84 void FrameFilter::processRow(int row)
85 {
86 ProfileScopeEvent(filterCTURow);
87
88 if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
89 {
90 processRowPost(row);
91 return;
92 }
93 FrameData& encData = *m_frame->m_encData;
94 const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
95 const uint32_t lineStartCUAddr = row * numCols;
96
97 if (m_param->bEnableLoopFilter)
98 {
99 for (uint32_t col = 0; col < numCols; col++)
100 {
101 uint32_t cuAddr = lineStartCUAddr + col;
102 const CUData* ctu = encData.getPicCTU(cuAddr);
103
104 m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);
105
106 if (col > 0)
107 {
108 const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
109 m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
110 }
111 }
112
113 const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
114 m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
115 }
116
117 // SAO
118 SAOParam* saoParam = encData.m_saoParam;
119 if (m_param->bEnableSAO)
120 {
121 m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
122 m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
123 m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
124
125 m_sao.rdoSaoUnitRow(saoParam, row);
126
127 // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
128 if (row >= m_saoRowDelay)
129 processSao(row - m_saoRowDelay);
130 }
131
132 // this row of CTUs has been encoded
133
134 if (row > 0)
135 processRowPost(row - 1);
136
137 if (row == m_numRows - 1)
138 {
139 if (m_param->bEnableSAO)
140 {
141 m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
142
143 for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
144 processSao(i);
145 }
146
147 processRowPost(row);
148 }
149 }
150
151 uint32_t FrameFilter::getCUHeight(int rowNum) const
152 {
153 return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize;
154 }
155
156 void FrameFilter::processRowPost(int row)
157 {
158 PicYuv *reconPic = m_frame->m_reconPic;
159 const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
160 const uint32_t lineStartCUAddr = row * numCols;
161 const int realH = getCUHeight(row);
162
163 // Border extend Left and Right
164 primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
165 primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
166 primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
167
168 // Border extend Top
169 if (!row)
170 {
171 const intptr_t stride = reconPic->m_stride;
172 const intptr_t strideC = reconPic->m_strideC;
173 pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
174 pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
175 pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
176
177 for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
178 memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
179
180 for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
181 {
182 memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
183 memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
184 }
185 }
186
187 // Border extend Bottom
188 if (row == m_numRows - 1)
189 {
190 const intptr_t stride = reconPic->m_stride;
191 const intptr_t strideC = reconPic->m_strideC;
192 pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride;
193 pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
194 pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
195 for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
196 memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
197
198 for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
199 {
200 memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
201 memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
202 }
203 }
204
205 // Notify other FrameEncoders that this row of reconstructed pixels is available
206 m_frame->m_reconRowCount.incr();
207
208 uint32_t cuAddr = lineStartCUAddr;
209 if (m_param->bEnablePsnr)
210 {
211 PicYuv* fencPic = m_frame->m_fencPic;
212
213 intptr_t stride = reconPic->m_stride;
214 uint32_t width = reconPic->m_picWidth - m_pad[0];
215 uint32_t height = getCUHeight(row);
216
217 uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
218 height >>= m_vChromaShift;
219 width >>= m_hChromaShift;
220 stride = reconPic->m_strideC;
221
222 uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
223 uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
224
225 m_frameEncoder->m_SSDY += ssdY;
226 m_frameEncoder->m_SSDU += ssdU;
227 m_frameEncoder->m_SSDV += ssdV;
228 }
229 if (m_param->bEnableSsim && m_ssimBuf)
230 {
231 pixel *rec = m_frame->m_reconPic->m_picOrg[0];
232 pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
233 intptr_t stride1 = m_frame->m_fencPic->m_stride;
234 intptr_t stride2 = m_frame->m_reconPic->m_stride;
235 uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
236 uint32_t bStart = (row == 0);
237 uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
238 uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
239 uint32_t ssim_cnt;
240 x265_emms();
241
242 /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
243 * to avoid alignment of ssim blocks with DCT blocks. */
244 minPixY += bStart ? 2 : -6;
245 m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
246 m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
247 m_frameEncoder->m_ssimCnt += ssim_cnt;
248 }
249 if (m_param->decodedPictureHashSEI == 1)
250 {
251 uint32_t height = getCUHeight(row);
252 uint32_t width = reconPic->m_picWidth;
253 intptr_t stride = reconPic->m_stride;
254
255 if (!row)
256 {
257 for (int i = 0; i < 3; i++)
258 MD5Init(&m_frameEncoder->m_state[i]);
259 }
260
261 updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
262 width >>= m_hChromaShift;
263 height >>= m_vChromaShift;
264 stride = reconPic->m_strideC;
265
266 updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
267 updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
268 }
269 else if (m_param->decodedPictureHashSEI == 2)
270 {
271 uint32_t height = getCUHeight(row);
272 uint32_t width = reconPic->m_picWidth;
273 intptr_t stride = reconPic->m_stride;
274 if (!row)
275 m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
276 updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
277 width >>= m_hChromaShift;
278 height >>= m_vChromaShift;
279 stride = reconPic->m_strideC;
280
281 updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
282 updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
283 }
284 else if (m_param->decodedPictureHashSEI == 3)
285 {
286 uint32_t width = reconPic->m_picWidth;
287 uint32_t height = getCUHeight(row);
288 intptr_t stride = reconPic->m_stride;
289 uint32_t cuHeight = g_maxCUSize;
290 if (!row)
291 m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
292 updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
293 width >>= m_hChromaShift;
294 height >>= m_vChromaShift;
295 stride = reconPic->m_strideC;
296 cuHeight >>= m_vChromaShift;
297
298 updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
299 updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
300 }
301 }
302
303 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height)
304 {
305 uint64_t ssd = 0;
306
307 if ((width | height) & 3)
308 {
309 /* Slow Path */
310 for (uint32_t y = 0; y < height; y++)
311 {
312 for (uint32_t x = 0; x < width; x++)
313 {
314 int diff = (int)(fenc[x] - rec[x]);
315 ssd += diff * diff;
316 }
317
318 fenc += stride;
319 rec += stride;
320 }
321
322 return ssd;
323 }
324
325 uint32_t y = 0;
326 /* Consume Y in chunks of 64 */
327 for (; y + 64 <= height; y += 64)
328 {
329 uint32_t x = 0;
330
331 if (!(stride & 31))
332 for (; x + 64 <= width; x += 64)
333 ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride);
334
335 if (!(stride & 15))
336 for (; x + 16 <= width; x += 16)
337 ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride);
338
339 for (; x + 4 <= width; x += 4)
340 {
341 ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
342 ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride);
343 ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride);
344 ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride);
345 }
346
347 fenc += stride * 64;
348 rec += stride * 64;
349 }
350
351 /* Consume Y in chunks of 16 */
352 for (; y + 16 <= height; y += 16)
353 {
354 uint32_t x = 0;
355
356 if (!(stride & 31))
357 for (; x + 64 <= width; x += 64)
358 ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride);
359
360 if (!(stride & 15))
361 for (; x + 16 <= width; x += 16)
362 ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride);
363
364 for (; x + 4 <= width; x += 4)
365 ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
366
367 fenc += stride * 16;
368 rec += stride * 16;
369 }
370
371 /* Consume Y in chunks of 4 */
372 for (; y + 4 <= height; y += 4)
373 {
374 uint32_t x = 0;
375
376 if (!(stride & 15))
377 for (; x + 16 <= width; x += 16)
378 ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride);
379
380 for (; x + 4 <= width; x += 4)
381 ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride);
382
383 fenc += stride * 4;
384 rec += stride * 4;
385 }
386
387 return ssd;
388 }
389
390 /* Function to calculate SSIM for each row */
391 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt)
392 {
393 uint32_t z = 0;
394 float ssim = 0.0;
395
396 int(*sum0)[4] = (int(*)[4])buf;
397 int(*sum1)[4] = sum0 + (width >> 2) + 3;
398 width >>= 2;
399 height >>= 2;
400
401 for (uint32_t y = 1; y < height; y++)
402 {
403 for (; z <= y; z++)
404 {
405 std::swap(sum0, sum1);
406 for (uint32_t x = 0; x < width; x += 2)
407 primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
408 }
409
410 for (uint32_t x = 0; x < width - 1; x += 4)
411 ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1));
412 }
413
414 cnt = (height - 1) * (width - 1);
415 return ssim;
416 }
417
418 /* restore original YUV samples to recon after SAO (if lossless) */
419 static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
420 {
421 uint32_t size = g_maxCUSize >> depth;
422 int part = partitionFromSizes(size, size);
423
424 PicYuv* reconPic = frame.m_reconPic;
425 PicYuv* fencPic = frame.m_fencPic;
426
427 pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
428 pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
429
430 primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride);
431
432 pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
433 pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
434
435 pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
436 pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
437
438 int csp = fencPic->m_picCsp;
439 primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
440 primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
441 }
442
443 /* Original YUV restoration for CU in lossless coding */
444 static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
445 {
446 if (cu->m_cuDepth[absPartIdx] > depth)
447 {
448 /* TODO: this could use cuGeom.numPartition and flags */
449 uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
450 uint32_t qNumParts = curNumParts >> 2;
451 uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples - cu->m_cuPelX;
452 uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
453
454 /* process four split sub-cu at next depth */
455 for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
456 {
457 if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
458 origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
459 }
460
461 return;
462 }
463
464 // restore original YUV samples
465 if (cu->m_tqBypass[absPartIdx])
466 restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
467 }
468
469 void FrameFilter::processSao(int row)
470 {
471 SAOParam* saoParam = m_frame->m_encData->m_saoParam;
472
473 if (saoParam->bSaoFlag[0])
474 m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
475
476 if (saoParam->bSaoFlag[1])
477 {
478 m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
479 m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
480 }
481
482 if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
483 {
484 uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
485 uint32_t lineStartCUAddr = row * numCols;
486
487 for (uint32_t col = 0; col < numCols; col++)
488 origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
489 }
490 }