Imported Upstream version 1.4
[deb_x265.git] / source / encoder / framefilter.cpp
CommitLineData
72b9787e
JB
1/*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 *
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
24
25#include "common.h"
26#include "frame.h"
27#include "framedata.h"
28#include "encoder.h"
29#include "framefilter.h"
30#include "frameencoder.h"
31#include "wavefront.h"
32#include "PPA/ppa.h"
33
34using namespace x265;
35
36static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
37static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
38
39FrameFilter::FrameFilter()
40 : m_param(NULL)
41 , m_frame(NULL)
42 , m_frameEncoder(NULL)
43 , m_ssimBuf(NULL)
44{
45}
46
47void FrameFilter::destroy()
48{
49 if (m_param->bEnableSAO)
50 m_sao.destroy();
51
52 X265_FREE(m_ssimBuf);
53}
54
55void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
56{
57 m_param = top->m_param;
58 m_frameEncoder = frame;
59 m_numRows = numRows;
60 m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
61 m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
62 m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
63 m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
64 m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
65 m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
66
67 m_deblock.init();
68
69 if (m_param->bEnableSAO)
70 if (!m_sao.create(m_param))
71 m_param->bEnableSAO = 0;
72
73 if (m_param->bEnableSsim)
74 m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
75}
76
77void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
78{
79 m_frame = frame;
80
81 if (m_param->bEnableSAO)
82 m_sao.startSlice(frame, initState, qp);
83}
84
85void FrameFilter::processRow(int row)
86{
87 PPAScopeEvent(Thread_filterCU);
88
89 if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
90 {
91 processRowPost(row);
92 return;
93 }
94 FrameData& encData = *m_frame->m_encData;
95 const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
96 const uint32_t lineStartCUAddr = row * numCols;
97
98 if (m_param->bEnableLoopFilter)
99 {
100 for (uint32_t col = 0; col < numCols; col++)
101 {
102 uint32_t cuAddr = lineStartCUAddr + col;
103 CUData* cu = encData.getPicCTU(cuAddr);
104
105 m_deblock.deblockCTU(cu, Deblock::EDGE_VER);
106
107 if (col > 0)
108 {
109 CUData* cuPrev = encData.getPicCTU(cuAddr - 1);
110 m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
111 }
112 }
113
114 CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
115 m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
116 }
117
118 // SAO
119 SAOParam* saoParam = encData.m_saoParam;
120 if (m_param->bEnableSAO)
121 {
122 m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
123 m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
124 m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
125
126 m_sao.rdoSaoUnitRow(saoParam, row);
127
128 // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
129 if (row >= m_saoRowDelay)
130 processSao(row - m_saoRowDelay);
131 }
132
133 // this row of CTUs has been encoded
134
135 if (row > 0)
136 processRowPost(row - 1);
137
138 if (row == m_numRows - 1)
139 {
140 if (m_param->bEnableSAO)
141 {
142 m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
143
144 for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
145 processSao(i);
146 }
147
148 processRowPost(row);
149 }
150}
151
152uint32_t FrameFilter::getCUHeight(int rowNum) const
153{
154 return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize;
155}
156
157void FrameFilter::processRowPost(int row)
158{
159 PicYuv *reconPic = m_frame->m_reconPicYuv;
160 const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
161 const uint32_t lineStartCUAddr = row * numCols;
162 const int realH = getCUHeight(row);
163
164 // Border extend Left and Right
165 primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
166 primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
167 primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
168
169 // Border extend Top
170 if (!row)
171 {
172 const intptr_t stride = reconPic->m_stride;
173 const intptr_t strideC = reconPic->m_strideC;
174 pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
175 pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
176 pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
177
178 for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
179 memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
180
181 for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
182 {
183 memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
184 memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
185 }
186 }
187
188 // Border extend Bottom
189 if (row == m_numRows - 1)
190 {
191 const intptr_t stride = reconPic->m_stride;
192 const intptr_t strideC = reconPic->m_strideC;
193 pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride;
194 pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
195 pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
196 for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
197 memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
198
199 for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
200 {
201 memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
202 memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
203 }
204 }
205
206 // Notify other FrameEncoders that this row of reconstructed pixels is available
207 m_frame->m_reconRowCount.incr();
208
209 uint32_t cuAddr = lineStartCUAddr;
210 if (m_param->bEnablePsnr)
211 {
212 PicYuv* origPic = m_frame->m_origPicYuv;
213
214 intptr_t stride = reconPic->m_stride;
215 uint32_t width = reconPic->m_picWidth - m_pad[0];
216 uint32_t height = getCUHeight(row);
217
218 uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
219 height >>= m_vChromaShift;
220 width >>= m_hChromaShift;
221 stride = reconPic->m_strideC;
222
223 uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
224 uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
225
226 m_frameEncoder->m_SSDY += ssdY;
227 m_frameEncoder->m_SSDU += ssdU;
228 m_frameEncoder->m_SSDV += ssdV;
229 }
230 if (m_param->bEnableSsim && m_ssimBuf)
231 {
232 pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0];
233 pixel *org = m_frame->m_origPicYuv->m_picOrg[0];
234 intptr_t stride1 = m_frame->m_origPicYuv->m_stride;
235 intptr_t stride2 = m_frame->m_reconPicYuv->m_stride;
236 uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
237 uint32_t bStart = (row == 0);
238 uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
239 uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
240 uint32_t ssim_cnt;
241 x265_emms();
242
243 /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
244 * to avoid alignment of ssim blocks with DCT blocks. */
245 minPixY += bStart ? 2 : -6;
246 m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2,
247 m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
248 m_frameEncoder->m_ssimCnt += ssim_cnt;
249 }
250 if (m_param->decodedPictureHashSEI == 1)
251 {
252 uint32_t height = getCUHeight(row);
253 uint32_t width = reconPic->m_picWidth;
254 intptr_t stride = reconPic->m_stride;
255
256 if (!row)
257 {
258 for (int i = 0; i < 3; i++)
259 MD5Init(&m_frameEncoder->m_state[i]);
260 }
261
262 updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
263 width >>= m_hChromaShift;
264 height >>= m_vChromaShift;
265 stride = reconPic->m_strideC;
266
267 updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
268 updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
269 }
270 else if (m_param->decodedPictureHashSEI == 2)
271 {
272 uint32_t height = getCUHeight(row);
273 uint32_t width = reconPic->m_picWidth;
274 intptr_t stride = reconPic->m_stride;
275 if (!row)
276 m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
277 updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
278 width >>= m_hChromaShift;
279 height >>= m_vChromaShift;
280 stride = reconPic->m_strideC;
281
282 updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
283 updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
284 }
285 else if (m_param->decodedPictureHashSEI == 3)
286 {
287 uint32_t width = reconPic->m_picWidth;
288 uint32_t height = getCUHeight(row);
289 intptr_t stride = reconPic->m_stride;
290 uint32_t cuHeight = g_maxCUSize;
291 if (!row)
292 m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
293 updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
294 width >>= m_hChromaShift;
295 height >>= m_vChromaShift;
296 stride = reconPic->m_strideC;
297 cuHeight >>= m_vChromaShift;
298
299 updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
300 updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
301 }
302}
303
304static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height)
305{
306 uint64_t ssd = 0;
307
308 if ((width | height) & 3)
309 {
310 /* Slow Path */
311 for (uint32_t y = 0; y < height; y++)
312 {
313 for (uint32_t x = 0; x < width; x++)
314 {
315 int diff = (int)(fenc[x] - rec[x]);
316 ssd += diff * diff;
317 }
318
319 fenc += stride;
320 rec += stride;
321 }
322
323 return ssd;
324 }
325
326 uint32_t y = 0;
327 /* Consume Y in chunks of 64 */
328 for (; y + 64 <= height; y += 64)
329 {
330 uint32_t x = 0;
331
332 if (!(stride & 31))
333 for (; x + 64 <= width; x += 64)
334 ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride);
335
336 if (!(stride & 15))
337 for (; x + 16 <= width; x += 16)
338 ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride);
339
340 for (; x + 4 <= width; x += 4)
341 {
342 ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
343 ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride);
344 ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride);
345 ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride);
346 }
347
348 fenc += stride * 64;
349 rec += stride * 64;
350 }
351
352 /* Consume Y in chunks of 16 */
353 for (; y + 16 <= height; y += 16)
354 {
355 uint32_t x = 0;
356
357 if (!(stride & 31))
358 for (; x + 64 <= width; x += 64)
359 ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride);
360
361 if (!(stride & 15))
362 for (; x + 16 <= width; x += 16)
363 ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride);
364
365 for (; x + 4 <= width; x += 4)
366 ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
367
368 fenc += stride * 16;
369 rec += stride * 16;
370 }
371
372 /* Consume Y in chunks of 4 */
373 for (; y + 4 <= height; y += 4)
374 {
375 uint32_t x = 0;
376
377 if (!(stride & 15))
378 for (; x + 16 <= width; x += 16)
379 ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride);
380
381 for (; x + 4 <= width; x += 4)
382 ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride);
383
384 fenc += stride * 4;
385 rec += stride * 4;
386 }
387
388 return ssd;
389}
390
391/* Function to calculate SSIM for each row */
392static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt)
393{
394 uint32_t z = 0;
395 float ssim = 0.0;
396
397 int(*sum0)[4] = (int(*)[4])buf;
398 int(*sum1)[4] = sum0 + (width >> 2) + 3;
399 width >>= 2;
400 height >>= 2;
401
402 for (uint32_t y = 1; y < height; y++)
403 {
404 for (; z <= y; z++)
405 {
406 std::swap(sum0, sum1);
407 for (uint32_t x = 0; x < width; x += 2)
408 primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
409 }
410
411 for (uint32_t x = 0; x < width - 1; x += 4)
412 ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1));
413 }
414
415 cnt = (height - 1) * (width - 1);
416 return ssim;
417}
418
419/* restore original YUV samples to recon after SAO (if lossless) */
420static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
421{
422 uint32_t size = g_maxCUSize >> depth;
423 int part = partitionFromSizes(size, size);
424
425 PicYuv* reconPic = frame.m_reconPicYuv;
426 PicYuv* fencPic = frame.m_origPicYuv;
427
428 pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
429 pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
430
431 primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride);
432
433 pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
434 pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
435
436 pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
437 pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
438
439 int csp = fencPic->m_picCsp;
440 primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
441 primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
442}
443
444/* Original YUV restoration for CU in lossless coding */
445static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
446{
447 if (cu->m_cuDepth[absPartIdx] > depth)
448 {
449 /* TODO: this could use cuGeom.numPartition and flags */
450 uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
451 uint32_t qNumParts = curNumParts >> 2;
452 uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples - cu->m_cuPelX;
453 uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
454
455 /* process four split sub-cu at next depth */
456 for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
457 {
458 if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
459 origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
460 }
461
462 return;
463 }
464
465 // restore original YUV samples
466 if (cu->m_tqBypass[absPartIdx])
467 restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
468}
469
470void FrameFilter::processSao(int row)
471{
472 SAOParam* saoParam = m_frame->m_encData->m_saoParam;
473
474 if (saoParam->bSaoFlag[0])
475 m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
476
477 if (saoParam->bSaoFlag[1])
478 {
479 m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
480 m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
481 }
482
483 if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
484 {
485 uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
486 uint32_t lineStartCUAddr = row * numCols;
487
488 for (uint32_t col = 0; col < numCols; col++)
489 origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
490 }
491}