aee75c6531b0c8c52b0ab87012124db77d497c75
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
27 #include "framedata.h"
29 #include "framefilter.h"
30 #include "frameencoder.h"
31 #include "wavefront.h"
36 static uint64_t computeSSD(pixel
*fenc
, pixel
*rec
, intptr_t stride
, uint32_t width
, uint32_t height
);
37 static float calculateSSIM(pixel
*pix1
, intptr_t stride1
, pixel
*pix2
, intptr_t stride2
, uint32_t width
, uint32_t height
, void *buf
, uint32_t& cnt
);
// Constructor. The initializer visible here sets m_frameEncoder to NULL.
// NOTE(review): other member initializers, if any, are not visible in this listing.
39 FrameFilter::FrameFilter()
42 , m_frameEncoder(NULL
)
// Teardown counterpart of init().
// NOTE(review): only the SAO-enabled test is visible here; the statements it guards are not shown.
47 void FrameFilter::destroy()
// taken only when SAO is enabled in the encoder parameters
49 if (m_param
->bEnableSAO
)
// Binds the filter to its encoder / frame encoder and derives the constants used by the row filters.
//   top     - encoder whose m_param and m_sps are read
//   frame   - frame encoder, stored in m_frameEncoder
//   numRows - not referenced by the statements visible here
55 void FrameFilter::init(Encoder
*top
, FrameEncoder
*frame
, int numRows
)
// share the encoder's parameter block (a pointer copy, so writes below reach the encoder's copy too)
57 m_param
= top
->m_param
;
58 m_frameEncoder
= frame
;
// chroma subsampling shifts derived from the internal colour space
60 m_hChromaShift
= CHROMA_H_SHIFT(m_param
->internalCsp
);
61 m_vChromaShift
= CHROMA_V_SHIFT(m_param
->internalCsp
);
// m_pad[0] / m_pad[1]: right and bottom offsets of the SPS conformance window
62 m_pad
[0] = top
->m_sps
.conformanceWindow
.rightOffset
;
63 m_pad
[1] = top
->m_sps
.conformanceWindow
.bottomOffset
;
// SAO lags one row behind when the loop filter (deblocking) is enabled, otherwise no lag
64 m_saoRowDelay
= m_param
->bEnableLoopFilter
? 1 : 0;
// height of the last CTU row: the remainder of sourceHeight / g_maxCUSize, or a full g_maxCUSize when it divides evenly
65 m_lastHeight
= m_param
->sourceHeight
% g_maxCUSize
? m_param
->sourceHeight
% g_maxCUSize
: g_maxCUSize
;
// if SAO state cannot be created, SAO is switched off in the shared parameters instead of failing
69 if (m_param
->bEnableSAO
)
70 if (!m_sao
.create(m_param
))
71 m_param
->bEnableSAO
= 0;
// SSIM scratch buffer: 8 ints per (sourceWidth / 4 + 3) entries; calculateSSIM() views it as int[4] sums
73 if (m_param
->bEnableSsim
)
74 m_ssimBuf
= X265_MALLOC(int, 8 * (m_param
->sourceWidth
/ 4 + 3));
// Per-frame setup before rows are filtered.
//   frame     - frame about to be filtered
//   initState - entropy state forwarded to SAO
//   qp        - QP forwarded to SAO
// NOTE(review): any statement preceding the SAO test is not visible in this listing.
77 void FrameFilter::start(Frame
*frame
, Entropy
& initState
, int qp
)
// SAO needs its own per-slice start; nothing is forwarded when SAO is disabled
81 if (m_param
->bEnableSAO
)
82 m_sao
.startSlice(frame
, initState
, qp
);
// Filters one CTU row. Visible steps, in order: deblocking of the row's CTUs when
// bEnableLoopFilter is set, the SAO decision for this row plus delayed SAO application when
// bEnableSAO is set, processRowPost() for the previous row, and end-of-frame work once
// row == m_numRows - 1.
// NOTE(review): brace-only lines and some guards are absent from this listing, so the exact
// nesting of the statements below could not be confirmed.
85 void FrameFilter::processRow(int row
)
87 PPAScopeEvent(Thread_filterCU
);
// case where both deblocking and SAO are disabled (the statements taken in that case are not visible here)
89 if (!m_param
->bEnableLoopFilter
&& !m_param
->bEnableSAO
)
94 FrameData
& encData
= *m_frame
->m_encData
;
// CTUs are addressed in raster order: the first CTU of this row is row * numCols
95 const uint32_t numCols
= encData
.m_slice
->m_sps
->numCuInWidth
;
96 const uint32_t lineStartCUAddr
= row
* numCols
;
98 if (m_param
->bEnableLoopFilter
)
100 for (uint32_t col
= 0; col
< numCols
; col
++)
102 uint32_t cuAddr
= lineStartCUAddr
+ col
;
103 CUData
* cu
= encData
.getPicCTU(cuAddr
);
// vertical edges of the current CTU first
105 m_deblock
.deblockCTU(cu
, Deblock::EDGE_VER
);
// horizontal edges of the CTU to the left (cuAddr - 1), so they run one CTU behind the vertical pass
// NOTE(review): presumably guarded so the first column is skipped; the guard is not visible here
109 CUData
* cuPrev
= encData
.getPicCTU(cuAddr
- 1);
110 m_deblock
.deblockCTU(cuPrev
, Deblock::EDGE_HOR
);
// horizontal edges of the last CTU of the row, which the lagging pass above has not covered
114 CUData
* cuPrev
= encData
.getPicCTU(lineStartCUAddr
+ numCols
- 1);
115 m_deblock
.deblockCTU(cuPrev
, Deblock::EDGE_HOR
);
119 SAOParam
* saoParam
= encData
.m_saoParam
;
120 if (m_param
->bEnableSAO
)
// reload the SAO entropy coder and both RD contexts from the frame encoder's initial slice context
122 m_sao
.m_entropyCoder
.load(m_frameEncoder
->m_initSliceContext
);
123 m_sao
.m_rdContexts
.next
.load(m_frameEncoder
->m_initSliceContext
);
124 m_sao
.m_rdContexts
.cur
.load(m_frameEncoder
->m_initSliceContext
);
// SAO decision for this row
126 m_sao
.rdoSaoUnitRow(saoParam
, row
);
// SAO is applied m_saoRowDelay rows behind the decision (see init(): 1 with deblocking, else 0)
128 // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
129 if (row
>= m_saoRowDelay
)
130 processSao(row
- m_saoRowDelay
);
133 // this row of CTUs has been encoded
// post-processing runs on the previous row
// NOTE(review): presumably guarded by row > 0; the guard is not visible here
136 processRowPost(row
- 1);
// end of frame: finish the SAO decision and handle the rows still pending because of the delay
138 if (row
== m_numRows
- 1)
140 if (m_param
->bEnableSAO
)
142 m_sao
.rdoSaoUnitRowEnd(saoParam
, encData
.m_slice
->m_sps
->numCUsInFrame
);
// iterates over the last m_saoRowDelay rows (the loop body is not visible in this listing)
144 for (int i
= m_numRows
- m_saoRowDelay
; i
< m_numRows
; i
++)
152 uint32_t FrameFilter::getCUHeight(int rowNum
) const
154 return rowNum
== m_numRows
- 1 ? m_lastHeight
: g_maxCUSize
;
// Post-filter work for one finished CTU row of the reconstructed picture:
//   1. extend the row into the left/right margins, plus top/bottom margin fill at the picture edges
//   2. publish the row through m_frame->m_reconRowCount
//   3. optional PSNR (SSD) and SSIM accumulation into the frame encoder
//   4. optional decoded-picture-hash update (decodedPictureHashSEI 1 = MD5, 2 = CRC, 3 = checksum)
// NOTE(review): brace-only lines and some guards are absent from this listing, so nesting is inferred.
157 void FrameFilter::processRowPost(int row
)
159 PicYuv
*reconPic
= m_frame
->m_reconPicYuv
;
160 const uint32_t numCols
= m_frame
->m_encData
->m_slice
->m_sps
->numCuInWidth
;
// address of the first CTU of this row (raster order)
161 const uint32_t lineStartCUAddr
= row
* numCols
;
// actual height of this CTU row (shorter for the last row of the picture)
162 const int realH
= getCUHeight(row
);
164 // Border extend Left and Right
165 primitives
.extendRowBorder(reconPic
->getLumaAddr(lineStartCUAddr
), reconPic
->m_stride
, reconPic
->m_picWidth
, realH
, reconPic
->m_lumaMarginX
);
// chroma planes: width and height are scaled by the chroma shifts
166 primitives
.extendRowBorder(reconPic
->getCbAddr(lineStartCUAddr
), reconPic
->m_strideC
, reconPic
->m_picWidth
>> m_hChromaShift
, realH
>> m_vChromaShift
, reconPic
->m_chromaMarginX
);
167 primitives
.extendRowBorder(reconPic
->getCrAddr(lineStartCUAddr
), reconPic
->m_strideC
, reconPic
->m_picWidth
>> m_hChromaShift
, realH
>> m_vChromaShift
, reconPic
->m_chromaMarginX
);
// Top margin fill: copy the first line of this row (including the left margin) into the lines above it.
// NOTE(review): presumably only done for the first CTU row; the guard is not visible here
172 const intptr_t stride
= reconPic
->m_stride
;
173 const intptr_t strideC
= reconPic
->m_strideC
;
// start of the line, margin included
174 pixel
*pixY
= reconPic
->getLumaAddr(lineStartCUAddr
) - reconPic
->m_lumaMarginX
;
175 pixel
*pixU
= reconPic
->getCbAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
;
176 pixel
*pixV
= reconPic
->getCrAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
;
// one full stride is copied per margin line
178 for (uint32_t y
= 0; y
< reconPic
->m_lumaMarginY
; y
++)
179 memcpy(pixY
- (y
+ 1) * stride
, pixY
, stride
* sizeof(pixel
));
181 for (uint32_t y
= 0; y
< reconPic
->m_chromaMarginY
; y
++)
183 memcpy(pixU
- (y
+ 1) * strideC
, pixU
, strideC
* sizeof(pixel
));
184 memcpy(pixV
- (y
+ 1) * strideC
, pixV
, strideC
* sizeof(pixel
));
188 // Border extend Bottom
189 if (row
== m_numRows
- 1)
191 const intptr_t stride
= reconPic
->m_stride
;
192 const intptr_t strideC
= reconPic
->m_strideC
;
// last line of the row (realH - 1, or the chroma-scaled equivalent), margin included
193 pixel
*pixY
= reconPic
->getLumaAddr(lineStartCUAddr
) - reconPic
->m_lumaMarginX
+ (realH
- 1) * stride
;
194 pixel
*pixU
= reconPic
->getCbAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
+ ((realH
>> m_vChromaShift
) - 1) * strideC
;
195 pixel
*pixV
= reconPic
->getCrAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
+ ((realH
>> m_vChromaShift
) - 1) * strideC
;
// replicate that line downwards into the bottom margin
196 for (uint32_t y
= 0; y
< reconPic
->m_lumaMarginY
; y
++)
197 memcpy(pixY
+ (y
+ 1) * stride
, pixY
, stride
* sizeof(pixel
));
199 for (uint32_t y
= 0; y
< reconPic
->m_chromaMarginY
; y
++)
201 memcpy(pixU
+ (y
+ 1) * strideC
, pixU
, strideC
* sizeof(pixel
));
202 memcpy(pixV
+ (y
+ 1) * strideC
, pixV
, strideC
* sizeof(pixel
));
206 // Notify other FrameEncoders that this row of reconstructed pixels is available
207 m_frame
->m_reconRowCount
.incr();
209 uint32_t cuAddr
= lineStartCUAddr
;
// PSNR: accumulate per-plane SSD between the original and the reconstruction for this row
210 if (m_param
->bEnablePsnr
)
212 PicYuv
* origPic
= m_frame
->m_origPicYuv
;
// the recon stride is passed to computeSSD() for both planes
214 intptr_t stride
= reconPic
->m_stride
;
// the right conformance-window offset (m_pad[0]) is excluded from the measured width
215 uint32_t width
= reconPic
->m_picWidth
- m_pad
[0];
216 uint32_t height
= getCUHeight(row
);
218 uint64_t ssdY
= computeSSD(origPic
->getLumaAddr(cuAddr
), reconPic
->getLumaAddr(cuAddr
), stride
, width
, height
);
// switch to chroma dimensions and stride
219 height
>>= m_vChromaShift
;
220 width
>>= m_hChromaShift
;
221 stride
= reconPic
->m_strideC
;
223 uint64_t ssdU
= computeSSD(origPic
->getCbAddr(cuAddr
), reconPic
->getCbAddr(cuAddr
), stride
, width
, height
);
224 uint64_t ssdV
= computeSSD(origPic
->getCrAddr(cuAddr
), reconPic
->getCrAddr(cuAddr
), stride
, width
, height
);
226 m_frameEncoder
->m_SSDY
+= ssdY
;
227 m_frameEncoder
->m_SSDU
+= ssdU
;
228 m_frameEncoder
->m_SSDV
+= ssdV
;
// SSIM (luma only): requires the scratch buffer allocated in init()
230 if (m_param
->bEnableSsim
&& m_ssimBuf
)
232 pixel
*rec
= m_frame
->m_reconPicYuv
->m_picOrg
[0];
233 pixel
*org
= m_frame
->m_origPicYuv
->m_picOrg
[0];
// NOTE(review): stride1 is the ORIGINAL picture's stride and stride2 the RECON stride, yet below
// stride1 is paired with rec and stride2 with org. This is only harmless if both strides are equal - confirm.
234 intptr_t stride1
= m_frame
->m_origPicYuv
->m_stride
;
235 intptr_t stride2
= m_frame
->m_reconPicYuv
->m_stride
;
// NOTE(review): bEnd compares row + 1 with m_numRows - 1, i.e. it is set on the row before the last one - confirm intended
236 uint32_t bEnd
= ((row
+ 1) == (this->m_numRows
- 1));
237 uint32_t bStart
= (row
== 0);
// luma line range for this row; shifted up by 4 lines except at the start (min) / end (max)
238 uint32_t minPixY
= row
* g_maxCUSize
- 4 * !bStart
;
239 uint32_t maxPixY
= (row
+ 1) * g_maxCUSize
- 4 * !bEnd
;
243 /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
244 * to avoid alignment of ssim blocks with DCT blocks. */
// first row starts 2 lines down; later rows start 6 lines earlier (unsigned arithmetic)
245 minPixY
+= bStart
? 2 : -6;
// ssim_cnt is declared outside the lines visible here
246 m_frameEncoder
->m_ssim
+= calculateSSIM(rec
+ 2 + minPixY
* stride1
, stride1
, org
+ 2 + minPixY
* stride2
, stride2
,
247 m_param
->sourceWidth
- 2, maxPixY
- minPixY
, m_ssimBuf
, ssim_cnt
);
248 m_frameEncoder
->m_ssimCnt
+= ssim_cnt
;
// decoded picture hash, mode 1: MD5 over the three planes of this row
250 if (m_param
->decodedPictureHashSEI
== 1)
252 uint32_t height
= getCUHeight(row
);
253 uint32_t width
= reconPic
->m_picWidth
;
254 intptr_t stride
= reconPic
->m_stride
;
// (re)initialise the three MD5 states
// NOTE(review): presumably only on the first row; the guard is not visible here
258 for (int i
= 0; i
< 3; i
++)
259 MD5Init(&m_frameEncoder
->m_state
[i
]);
262 updateMD5Plane(m_frameEncoder
->m_state
[0], reconPic
->getLumaAddr(cuAddr
), width
, height
, stride
);
// chroma dimensions and stride for planes 1 and 2
263 width
>>= m_hChromaShift
;
264 height
>>= m_vChromaShift
;
265 stride
= reconPic
->m_strideC
;
267 updateMD5Plane(m_frameEncoder
->m_state
[1], reconPic
->getCbAddr(cuAddr
), width
, height
, stride
);
268 updateMD5Plane(m_frameEncoder
->m_state
[2], reconPic
->getCrAddr(cuAddr
), width
, height
, stride
);
// mode 2: CRC per plane (note updateCRC takes height before width, unlike updateMD5Plane)
270 else if (m_param
->decodedPictureHashSEI
== 2)
272 uint32_t height
= getCUHeight(row
);
273 uint32_t width
= reconPic
->m_picWidth
;
274 intptr_t stride
= reconPic
->m_stride
;
// seed all three CRCs with 0xffff
// NOTE(review): presumably only on the first row; the guard is not visible here
276 m_frameEncoder
->m_crc
[0] = m_frameEncoder
->m_crc
[1] = m_frameEncoder
->m_crc
[2] = 0xffff;
277 updateCRC(reconPic
->getLumaAddr(cuAddr
), m_frameEncoder
->m_crc
[0], height
, width
, stride
);
278 width
>>= m_hChromaShift
;
279 height
>>= m_vChromaShift
;
280 stride
= reconPic
->m_strideC
;
282 updateCRC(reconPic
->getCbAddr(cuAddr
), m_frameEncoder
->m_crc
[1], height
, width
, stride
);
283 updateCRC(reconPic
->getCrAddr(cuAddr
), m_frameEncoder
->m_crc
[2], height
, width
, stride
);
// mode 3: checksum per plane; updateChecksum receives the plane origin plus row and cuHeight
// rather than a row address
285 else if (m_param
->decodedPictureHashSEI
== 3)
287 uint32_t width
= reconPic
->m_picWidth
;
288 uint32_t height
= getCUHeight(row
);
289 intptr_t stride
= reconPic
->m_stride
;
// nominal (full) CTU height, passed alongside the actual row height
290 uint32_t cuHeight
= g_maxCUSize
;
// zero all three checksums
// NOTE(review): presumably only on the first row; the guard is not visible here
292 m_frameEncoder
->m_checksum
[0] = m_frameEncoder
->m_checksum
[1] = m_frameEncoder
->m_checksum
[2] = 0;
293 updateChecksum(reconPic
->m_picOrg
[0], m_frameEncoder
->m_checksum
[0], height
, width
, stride
, row
, cuHeight
);
294 width
>>= m_hChromaShift
;
295 height
>>= m_vChromaShift
;
296 stride
= reconPic
->m_strideC
;
297 cuHeight
>>= m_vChromaShift
;
299 updateChecksum(reconPic
->m_picOrg
[1], m_frameEncoder
->m_checksum
[1], height
, width
, stride
, row
, cuHeight
);
300 updateChecksum(reconPic
->m_picOrg
[2], m_frameEncoder
->m_checksum
[2], height
, width
, stride
, row
, cuHeight
);
// Accumulates squared pixel differences between fenc and rec over a width x height region.
// Both planes are addressed with the same stride.
// Sizes that are not multiples of 4 use a per-pixel loop; the remaining code tiles the region
// with sse_pp primitives in bands of 64, 16 and 4 rows.
// NOTE(review): the declarations of ssd / x / y, the per-band pointer advances and the return
// statements are not visible in this listing.
304 static uint64_t computeSSD(pixel
*fenc
, pixel
*rec
, intptr_t stride
, uint32_t width
, uint32_t height
)
// either dimension not a multiple of 4: the block primitives cannot tile it exactly
308 if ((width
| height
) & 3)
311 for (uint32_t y
= 0; y
< height
; y
++)
313 for (uint32_t x
= 0; x
< width
; x
++)
315 int diff
= (int)(fenc
[x
] - rec
[x
]);
327 /* Consume Y in chunks of 64 */
328 for (; y
+ 64 <= height
; y
+= 64)
// widest tiles first, then progressively narrower ones for the remainder of the band
333 for (; x
+ 64 <= width
; x
+= 64)
334 ssd
+= primitives
.sse_pp
[LUMA_64x64
](fenc
+ x
, stride
, rec
+ x
, stride
);
337 for (; x
+ 16 <= width
; x
+= 16)
338 ssd
+= primitives
.sse_pp
[LUMA_16x64
](fenc
+ x
, stride
, rec
+ x
, stride
);
// 4-wide remainder: the 64-row band is covered by four stacked 4x16 calls (row offsets 0, 16, 32, 48)
340 for (; x
+ 4 <= width
; x
+= 4)
342 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
343 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
+ 16 * stride
, stride
, rec
+ x
+ 16 * stride
, stride
);
344 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
+ 32 * stride
, stride
, rec
+ x
+ 32 * stride
, stride
);
345 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
+ 48 * stride
, stride
, rec
+ x
+ 48 * stride
, stride
);
352 /* Consume Y in chunks of 16 */
353 for (; y
+ 16 <= height
; y
+= 16)
358 for (; x
+ 64 <= width
; x
+= 64)
359 ssd
+= primitives
.sse_pp
[LUMA_64x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
362 for (; x
+ 16 <= width
; x
+= 16)
363 ssd
+= primitives
.sse_pp
[LUMA_16x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
365 for (; x
+ 4 <= width
; x
+= 4)
366 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
372 /* Consume Y in chunks of 4 */
373 for (; y
+ 4 <= height
; y
+= 4)
// this band uses only 16-wide and 4-wide tiles
378 for (; x
+ 16 <= width
; x
+= 16)
379 ssd
+= primitives
.sse_pp
[LUMA_16x4
](fenc
+ x
, stride
, rec
+ x
, stride
);
381 for (; x
+ 4 <= width
; x
+= 4)
382 ssd
+= primitives
.sse_pp
[LUMA_4x4
](fenc
+ x
, stride
, rec
+ x
, stride
);
// Parameters: pix1/stride1 and pix2/stride2 are the two planes being compared; width/height give
// the region size; buf is scratch memory viewed as rows of int[4] sums; cnt is written before returning.
// NOTE(review): the declarations of z and ssim, any rescaling of width/height and the return
// statement are not visible in this listing.
391 /* Function to calculate SSIM for each row */
392 static float calculateSSIM(pixel
*pix1
, intptr_t stride1
, pixel
*pix2
, intptr_t stride2
, uint32_t width
, uint32_t height
, void *buf
, uint32_t& cnt
)
// two rows of int[4] sums inside buf; sum1 starts (width >> 2) + 3 entries after sum0
397 int(*sum0
)[4] = (int(*)[4])buf
;
398 int(*sum1
)[4] = sum0
+ (width
>> 2) + 3;
402 for (uint32_t y
= 1; y
< height
; y
++)
// exchange the two sum rows, then fill sum0 for line z
406 std::swap(sum0
, sum1
);
// ssim_4x4x2_core is called once per two entries (x advances by 2; pixel offset is 4 * x)
407 for (uint32_t x
= 0; x
< width
; x
+= 2)
408 primitives
.ssim_4x4x2_core(&pix1
[(4 * x
+ (z
* stride1
))], stride1
, &pix2
[(4 * x
+ (z
* stride2
))], stride2
, &sum0
[x
]);
// combine the two sum rows, up to 4 entries per call, clamped at the right edge
411 for (uint32_t x
= 0; x
< width
- 1; x
+= 4)
412 ssim
+= primitives
.ssim_end_4(sum0
+ x
, sum1
+ x
, X265_MIN(4, width
- x
- 1));
// count reported to the caller alongside the accumulated SSIM
415 cnt
= (height
- 1) * (width
- 1);
419 /* restore original YUV samples to recon after SAO (if lossless) */
420 static void restoreOrigLosslessYuv(const CUData
* cu
, Frame
& frame
, uint32_t absPartIdx
, uint32_t depth
)
422 uint32_t size
= g_maxCUSize
>> depth
;
423 int part
= partitionFromSizes(size
, size
);
425 PicYuv
* reconPic
= frame
.m_reconPicYuv
;
426 PicYuv
* fencPic
= frame
.m_origPicYuv
;
428 pixel
* dst
= reconPic
->getLumaAddr(cu
->m_cuAddr
, absPartIdx
);
429 pixel
* src
= fencPic
->getLumaAddr(cu
->m_cuAddr
, absPartIdx
);
431 primitives
.luma_copy_pp
[part
](dst
, reconPic
->m_stride
, src
, fencPic
->m_stride
);
433 pixel
* dstCb
= reconPic
->getCbAddr(cu
->m_cuAddr
, absPartIdx
);
434 pixel
* srcCb
= fencPic
->getCbAddr(cu
->m_cuAddr
, absPartIdx
);
436 pixel
* dstCr
= reconPic
->getCrAddr(cu
->m_cuAddr
, absPartIdx
);
437 pixel
* srcCr
= fencPic
->getCrAddr(cu
->m_cuAddr
, absPartIdx
);
439 int csp
= fencPic
->m_picCsp
;
440 primitives
.chroma
[csp
].copy_pp
[part
](dstCb
, reconPic
->m_strideC
, srcCb
, fencPic
->m_strideC
);
441 primitives
.chroma
[csp
].copy_pp
[part
](dstCr
, reconPic
->m_strideC
, srcCr
, fencPic
->m_strideC
);
// Walks the CU quad-tree of one CTU starting at (absPartIdx, depth). Where the coded depth is
// deeper than `depth` it recurses into the four sub-CUs that lie inside the picture; at a leaf
// it restores the original samples if the CU was coded with transquant bypass.
// NOTE(review): the statement separating the recursive case from the leaf case (a return or an
// else) is not visible in this listing.
444 /* Original YUV restoration for CU in lossless coding */
445 static void origCUSampleRestoration(const CUData
* cu
, Frame
& frame
, uint32_t absPartIdx
, uint32_t depth
)
// the CU at this position is split further than the current depth
447 if (cu
->m_cuDepth
[absPartIdx
] > depth
)
449 /* TODO: this could use cuGeom.numPartition and flags */
// partitions covered at this depth, and by each of the four children
450 uint32_t curNumParts
= NUM_CU_PARTITIONS
>> (depth
<< 1);
451 uint32_t qNumParts
= curNumParts
>> 2;
// distance from the CTU origin to the right/bottom picture edge, in luma samples
452 uint32_t xmax
= cu
->m_slice
->m_sps
->picWidthInLumaSamples
- cu
->m_cuPelX
;
453 uint32_t ymax
= cu
->m_slice
->m_sps
->picHeightInLumaSamples
- cu
->m_cuPelY
;
455 /* process four split sub-cu at next depth */
// absPartIdx itself advances by qNumParts per child
456 for (int subPartIdx
= 0; subPartIdx
< 4; subPartIdx
++, absPartIdx
+= qNumParts
)
// skip children whose top-left corner lies outside the picture
458 if (g_zscanToPelX
[absPartIdx
] < xmax
&& g_zscanToPelY
[absPartIdx
] < ymax
)
459 origCUSampleRestoration(cu
, frame
, absPartIdx
, depth
+ 1);
465 // restore original YUV samples
466 if (cu
->m_tqBypass
[absPartIdx
])
467 restoreOrigLosslessYuv(cu
, frame
, absPartIdx
, depth
);
// Applies SAO to CTU row `row` using the parameters stored in the frame's encode data and then,
// when transquant bypass is enabled in the PPS, restores original samples for bypass-coded CUs.
470 void FrameFilter::processSao(int row
)
472 SAOParam
* saoParam
= m_frame
->m_encData
->m_saoParam
;
// component 0 has its own enable flag
474 if (saoParam
->bSaoFlag
[0])
475 m_sao
.processSaoUnitRow(saoParam
->ctuParam
[0], row
, 0);
// components 1 and 2 appear to be handled together under bSaoFlag[1]
477 if (saoParam
->bSaoFlag
[1])
479 m_sao
.processSaoUnitRow(saoParam
->ctuParam
[1], row
, 1);
480 m_sao
.processSaoUnitRow(saoParam
->ctuParam
[2], row
, 2);
// lossless CUs must not be altered by SAO: put their original samples back
483 if (m_frame
->m_encData
->m_slice
->m_pps
->bTransquantBypassEnabled
)
485 uint32_t numCols
= m_frame
->m_encData
->m_slice
->m_sps
->numCuInWidth
;
486 uint32_t lineStartCUAddr
= row
* numCols
;
// visit every CTU of the row, starting the quad-tree walk at partition 0, depth 0
488 for (uint32_t col
= 0; col
< numCols
; col
++)
489 origCUSampleRestoration(m_frame
->m_encData
->getPicCTU(lineStartCUAddr
+ col
), *m_frame
, 0, 0);