1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
27 #include "framedata.h"
29 #include "framefilter.h"
30 #include "frameencoder.h"
31 #include "wavefront.h"
35 static uint64_t computeSSD(pixel
*fenc
, pixel
*rec
, intptr_t stride
, uint32_t width
, uint32_t height
);
36 static float calculateSSIM(pixel
*pix1
, intptr_t stride1
, pixel
*pix2
, intptr_t stride2
, uint32_t width
, uint32_t height
, void *buf
, uint32_t& cnt
);
/* Constructor: the one initializer visible here sets m_frameEncoder to NULL.
 * NOTE(review): the rest of the initializer list and the body are not visible in this
 * listing - confirm the remaining members are initialised as well. */
38 FrameFilter::FrameFilter()
41 , m_frameEncoder(NULL
)
/* Teardown counterpart of init().
 * NOTE(review): only the bEnableSAO test is visible here; what is released under it
 * (and whether m_ssimBuf, allocated in init(), is freed) cannot be seen - confirm. */
46 void FrameFilter::destroy()
48 if (m_param
->bEnableSAO
)
/* Configure the filter from encoder-level settings.
 *   top     - encoder; its m_param pointer and the conformance window of its m_sps are read
 *   frame   - frame encoder, stored in m_frameEncoder
 *   numRows - NOTE(review): no use of numRows is visible in this listing; presumably it
 *             is stored in m_numRows (read by the other methods) - confirm */
54 void FrameFilter::init(Encoder
*top
, FrameEncoder
*frame
, int numRows
)
56 m_param
= top
->m_param
;
57 m_frameEncoder
= frame
;
// chroma subsampling shifts derived from the internal colour space
59 m_hChromaShift
= CHROMA_H_SHIFT(m_param
->internalCsp
);
60 m_vChromaShift
= CHROMA_V_SHIFT(m_param
->internalCsp
);
// conformance-window right / bottom offsets
61 m_pad
[0] = top
->m_sps
.conformanceWindow
.rightOffset
;
62 m_pad
[1] = top
->m_sps
.conformanceWindow
.bottomOffset
;
// SAO is applied one row late when the loop filter is enabled, otherwise with no delay
63 m_saoRowDelay
= m_param
->bEnableLoopFilter
? 1 : 0;
// height of the last CTU row: sourceHeight modulo g_maxCUSize, or a full g_maxCUSize when that remainder is 0
64 m_lastHeight
= m_param
->sourceHeight
% g_maxCUSize
? m_param
->sourceHeight
% g_maxCUSize
: g_maxCUSize
;
// when m_sao.create() returns a false value, SAO is switched off in the shared parameters
68 if (m_param
->bEnableSAO
)
69 if (!m_sao
.create(m_param
))
70 m_param
->bEnableSAO
= 0;
// SSIM scratch buffer of 8 * (sourceWidth / 4 + 3) ints; processRowPost() skips SSIM when it is NULL
72 if (m_param
->bEnableSsim
)
73 m_ssimBuf
= X265_MALLOC(int, 8 * (m_param
->sourceWidth
/ 4 + 3));
/* Per-frame setup: when SAO is enabled, frame, initState and qp are forwarded to
 * m_sao.startSlice().
 * NOTE(review): the other methods read m_frame, but no assignment of it is visible in
 * this listing - presumably it is set here from 'frame'; confirm. */
76 void FrameFilter::start(Frame
*frame
, Entropy
& initState
, int qp
)
80 if (m_param
->bEnableSAO
)
81 m_sao
.startSlice(frame
, initState
, qp
);
/* Run the in-loop filters for one CTU row.
 *
 * Visible steps: (1) deblocking of the row when bEnableLoopFilter is set, (2) SAO rate
 * decision for the row plus SAO application for row - m_saoRowDelay when bEnableSAO is
 * set, (3) processRowPost(row - 1), and (4) extra SAO work when row is the last one.
 *
 * NOTE(review): several guards and statements of this function are not visible in this
 * listing (what happens when both filters are disabled, the guard around the
 * "cuAddr - 1" access, the guard around processRowPost(row - 1), and the bodies at the
 * end of the last-row branch) - confirm against the complete file before relying on
 * the exact control flow. */
84 void FrameFilter::processRow(int row
)
86 ProfileScopeEvent(filterCTURow
);
// both in-loop filters disabled (the action taken in that case is not visible here)
88 if (!m_param
->bEnableLoopFilter
&& !m_param
->bEnableSAO
)
93 FrameData
& encData
= *m_frame
->m_encData
;
94 const uint32_t numCols
= encData
.m_slice
->m_sps
->numCuInWidth
;
// address of the first CTU of this row
95 const uint32_t lineStartCUAddr
= row
* numCols
;
97 if (m_param
->bEnableLoopFilter
)
99 for (uint32_t col
= 0; col
< numCols
; col
++)
101 uint32_t cuAddr
= lineStartCUAddr
+ col
;
102 const CUData
* ctu
= encData
.getPicCTU(cuAddr
);
// vertical edges of the current CTU
104 m_deblock
.deblockCTU(ctu
, Deblock::EDGE_VER
);
// horizontal edges of the CTU to the left, i.e. one CTU behind the vertical pass
108 const CUData
* ctuPrev
= encData
.getPicCTU(cuAddr
- 1);
109 m_deblock
.deblockCTU(ctuPrev
, Deblock::EDGE_HOR
);
// horizontal edges of the last CTU of the row
113 const CUData
* ctuPrev
= encData
.getPicCTU(lineStartCUAddr
+ numCols
- 1);
114 m_deblock
.deblockCTU(ctuPrev
, Deblock::EDGE_HOR
);
118 SAOParam
* saoParam
= encData
.m_saoParam
;
119 if (m_param
->bEnableSAO
)
// reload all three SAO entropy contexts from the frame encoder's initial slice context
121 m_sao
.m_entropyCoder
.load(m_frameEncoder
->m_initSliceContext
);
122 m_sao
.m_rdContexts
.next
.load(m_frameEncoder
->m_initSliceContext
);
123 m_sao
.m_rdContexts
.cur
.load(m_frameEncoder
->m_initSliceContext
);
125 m_sao
.rdoSaoUnitRow(saoParam
, row
);
127 // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
128 if (row
>= m_saoRowDelay
)
129 processSao(row
- m_saoRowDelay
);
132 // this row of CTUs has been encoded
// post-processing runs one row behind the filters
135 processRowPost(row
- 1);
// last row of the frame
137 if (row
== m_numRows
- 1)
139 if (m_param
->bEnableSAO
)
141 m_sao
.rdoSaoUnitRowEnd(saoParam
, encData
.m_slice
->m_sps
->numCUsInFrame
);
// iterates over the final m_saoRowDelay rows (loop body not visible in this listing)
143 for (int i
= m_numRows
- m_saoRowDelay
; i
< m_numRows
; i
++)
151 uint32_t FrameFilter::getCUHeight(int rowNum
) const
153 return rowNum
== m_numRows
- 1 ? m_lastHeight
: g_maxCUSize
;
/* Post-filter work for one finished CTU row of the reconstructed picture.
 *
 * Visible steps, in order:
 *   - extend the row's left/right borders for luma, Cb and Cr
 *   - replicate rows into the top margin, and for the last row into the bottom margin
 *   - increment m_frame->m_reconRowCount
 *   - bEnablePsnr: add this row's per-plane SSD to the frame encoder's m_SSDY/U/V
 *   - bEnableSsim (and m_ssimBuf allocated): add this row's SSIM sum and term count
 *   - decodedPictureHashSEI 1 / 2 / 3: update the MD5 / CRC / checksum state
 *
 * NOTE(review): some guards and declarations of this function are not visible in this
 * listing (e.g. the condition around the top-margin extension, the declaration of
 * ssim_cnt, and whatever restricts the hash-state resets to the start of a frame) -
 * confirm against the complete file. */
156 void FrameFilter::processRowPost(int row
)
158 PicYuv
*reconPic
= m_frame
->m_reconPic
;
159 const uint32_t numCols
= m_frame
->m_encData
->m_slice
->m_sps
->numCuInWidth
;
160 const uint32_t lineStartCUAddr
= row
* numCols
;
// actual pixel height of this CTU row (shorter for the last row, see getCUHeight)
161 const int realH
= getCUHeight(row
);
163 // Border extend Left and Right
164 primitives
.extendRowBorder(reconPic
->getLumaAddr(lineStartCUAddr
), reconPic
->m_stride
, reconPic
->m_picWidth
, realH
, reconPic
->m_lumaMarginX
);
165 primitives
.extendRowBorder(reconPic
->getCbAddr(lineStartCUAddr
), reconPic
->m_strideC
, reconPic
->m_picWidth
>> m_hChromaShift
, realH
>> m_vChromaShift
, reconPic
->m_chromaMarginX
);
166 primitives
.extendRowBorder(reconPic
->getCrAddr(lineStartCUAddr
), reconPic
->m_strideC
, reconPic
->m_picWidth
>> m_hChromaShift
, realH
>> m_vChromaShift
, reconPic
->m_chromaMarginX
);
// Top margin: copy the first line of this row (starting m_*MarginX to the left of the
// picture, one full stride long) into each of the m_*MarginY lines above it.
// NOTE(review): the condition guarding this section is not visible here - presumably first row only; confirm.
171 const intptr_t stride
= reconPic
->m_stride
;
172 const intptr_t strideC
= reconPic
->m_strideC
;
173 pixel
*pixY
= reconPic
->getLumaAddr(lineStartCUAddr
) - reconPic
->m_lumaMarginX
;
174 pixel
*pixU
= reconPic
->getCbAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
;
175 pixel
*pixV
= reconPic
->getCrAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
;
177 for (uint32_t y
= 0; y
< reconPic
->m_lumaMarginY
; y
++)
178 memcpy(pixY
- (y
+ 1) * stride
, pixY
, stride
* sizeof(pixel
));
180 for (uint32_t y
= 0; y
< reconPic
->m_chromaMarginY
; y
++)
182 memcpy(pixU
- (y
+ 1) * strideC
, pixU
, strideC
* sizeof(pixel
));
183 memcpy(pixV
- (y
+ 1) * strideC
, pixV
, strideC
* sizeof(pixel
));
187 // Border extend Bottom
188 if (row
== m_numRows
- 1)
190 const intptr_t stride
= reconPic
->m_stride
;
191 const intptr_t strideC
= reconPic
->m_strideC
;
// pointers to the last valid line of the row (realH - 1 lines down, scaled for chroma)
192 pixel
*pixY
= reconPic
->getLumaAddr(lineStartCUAddr
) - reconPic
->m_lumaMarginX
+ (realH
- 1) * stride
;
193 pixel
*pixU
= reconPic
->getCbAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
+ ((realH
>> m_vChromaShift
) - 1) * strideC
;
194 pixel
*pixV
= reconPic
->getCrAddr(lineStartCUAddr
) - reconPic
->m_chromaMarginX
+ ((realH
>> m_vChromaShift
) - 1) * strideC
;
195 for (uint32_t y
= 0; y
< reconPic
->m_lumaMarginY
; y
++)
196 memcpy(pixY
+ (y
+ 1) * stride
, pixY
, stride
* sizeof(pixel
));
198 for (uint32_t y
= 0; y
< reconPic
->m_chromaMarginY
; y
++)
200 memcpy(pixU
+ (y
+ 1) * strideC
, pixU
, strideC
* sizeof(pixel
));
201 memcpy(pixV
+ (y
+ 1) * strideC
, pixV
, strideC
* sizeof(pixel
));
205 // Notify other FrameEncoders that this row of reconstructed pixels is available
206 m_frame
->m_reconRowCount
.incr();
208 uint32_t cuAddr
= lineStartCUAddr
;
// PSNR: per-plane SSD of this row; the width excludes m_pad[0] (conformance-window right offset, see init())
209 if (m_param
->bEnablePsnr
)
211 PicYuv
* fencPic
= m_frame
->m_fencPic
;
213 intptr_t stride
= reconPic
->m_stride
;
214 uint32_t width
= reconPic
->m_picWidth
- m_pad
[0];
215 uint32_t height
= getCUHeight(row
);
217 uint64_t ssdY
= computeSSD(fencPic
->getLumaAddr(cuAddr
), reconPic
->getLumaAddr(cuAddr
), stride
, width
, height
);
// switch to chroma dimensions and stride for Cb / Cr
218 height
>>= m_vChromaShift
;
219 width
>>= m_hChromaShift
;
220 stride
= reconPic
->m_strideC
;
222 uint64_t ssdU
= computeSSD(fencPic
->getCbAddr(cuAddr
), reconPic
->getCbAddr(cuAddr
), stride
, width
, height
);
223 uint64_t ssdV
= computeSSD(fencPic
->getCrAddr(cuAddr
), reconPic
->getCrAddr(cuAddr
), stride
, width
, height
);
225 m_frameEncoder
->m_SSDY
+= ssdY
;
226 m_frameEncoder
->m_SSDU
+= ssdU
;
227 m_frameEncoder
->m_SSDV
+= ssdV
;
// SSIM over the luma plane, only when the scratch buffer from init() exists
229 if (m_param
->bEnableSsim
&& m_ssimBuf
)
231 pixel
*rec
= m_frame
->m_reconPic
->m_picOrg
[0];
232 pixel
*fenc
= m_frame
->m_fencPic
->m_picOrg
[0];
// NOTE(review): stride1 is the SOURCE picture's stride but is applied to 'rec' below, and
// stride2 is the RECON stride but is applied to 'fenc'. This only gives correct addresses
// if both pictures share the same luma stride - verify, or pair each pointer with its own stride.
233 intptr_t stride1
= m_frame
->m_fencPic
->m_stride
;
234 intptr_t stride2
= m_frame
->m_reconPic
->m_stride
;
// NOTE(review): bEnd compares row + 1 (not row) against m_numRows - 1 - confirm this is intended
235 uint32_t bEnd
= ((row
+ 1) == (this->m_numRows
- 1));
236 uint32_t bStart
= (row
== 0);
// vertical pixel range of this row, pulled in by 4 lines at every edge that is not a start/end edge
237 uint32_t minPixY
= row
* g_maxCUSize
- 4 * !bStart
;
238 uint32_t maxPixY
= (row
+ 1) * g_maxCUSize
- 4 * !bEnd
;
242 /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
243 * to avoid alignment of ssim blocks with DCT blocks. */
244 minPixY
+= bStart
? 2 : -6;
245 m_frameEncoder
->m_ssim
+= calculateSSIM(rec
+ 2 + minPixY
* stride1
, stride1
, fenc
+ 2 + minPixY
* stride2
, stride2
,
246 m_param
->sourceWidth
- 2, maxPixY
- minPixY
, m_ssimBuf
, ssim_cnt
);
247 m_frameEncoder
->m_ssimCnt
+= ssim_cnt
;
// Decoded picture hash, mode 1: MD5 state per plane
249 if (m_param
->decodedPictureHashSEI
== 1)
251 uint32_t height
= getCUHeight(row
);
252 uint32_t width
= reconPic
->m_picWidth
;
253 intptr_t stride
= reconPic
->m_stride
;
// (re)initialise the three MD5 states; NOTE(review): the guard for this reset is not visible here
257 for (int i
= 0; i
< 3; i
++)
258 MD5Init(&m_frameEncoder
->m_state
[i
]);
261 updateMD5Plane(m_frameEncoder
->m_state
[0], reconPic
->getLumaAddr(cuAddr
), width
, height
, stride
);
262 width
>>= m_hChromaShift
;
263 height
>>= m_vChromaShift
;
264 stride
= reconPic
->m_strideC
;
266 updateMD5Plane(m_frameEncoder
->m_state
[1], reconPic
->getCbAddr(cuAddr
), width
, height
, stride
);
267 updateMD5Plane(m_frameEncoder
->m_state
[2], reconPic
->getCrAddr(cuAddr
), width
, height
, stride
);
// mode 2: CRC per plane
269 else if (m_param
->decodedPictureHashSEI
== 2)
271 uint32_t height
= getCUHeight(row
);
272 uint32_t width
= reconPic
->m_picWidth
;
273 intptr_t stride
= reconPic
->m_stride
;
// CRC start value 0xffff; NOTE(review): the guard for this reset is not visible here
275 m_frameEncoder
->m_crc
[0] = m_frameEncoder
->m_crc
[1] = m_frameEncoder
->m_crc
[2] = 0xffff;
276 updateCRC(reconPic
->getLumaAddr(cuAddr
), m_frameEncoder
->m_crc
[0], height
, width
, stride
);
277 width
>>= m_hChromaShift
;
278 height
>>= m_vChromaShift
;
279 stride
= reconPic
->m_strideC
;
281 updateCRC(reconPic
->getCbAddr(cuAddr
), m_frameEncoder
->m_crc
[1], height
, width
, stride
);
282 updateCRC(reconPic
->getCrAddr(cuAddr
), m_frameEncoder
->m_crc
[2], height
, width
, stride
);
// mode 3: checksum per plane; unlike the other modes this passes the plane origin plus row and cuHeight
284 else if (m_param
->decodedPictureHashSEI
== 3)
286 uint32_t width
= reconPic
->m_picWidth
;
287 uint32_t height
= getCUHeight(row
);
288 intptr_t stride
= reconPic
->m_stride
;
289 uint32_t cuHeight
= g_maxCUSize
;
// checksum start value 0; NOTE(review): the guard for this reset is not visible here
291 m_frameEncoder
->m_checksum
[0] = m_frameEncoder
->m_checksum
[1] = m_frameEncoder
->m_checksum
[2] = 0;
292 updateChecksum(reconPic
->m_picOrg
[0], m_frameEncoder
->m_checksum
[0], height
, width
, stride
, row
, cuHeight
);
293 width
>>= m_hChromaShift
;
294 height
>>= m_vChromaShift
;
295 stride
= reconPic
->m_strideC
;
296 cuHeight
>>= m_vChromaShift
;
298 updateChecksum(reconPic
->m_picOrg
[1], m_frameEncoder
->m_checksum
[1], height
, width
, stride
, row
, cuHeight
);
299 updateChecksum(reconPic
->m_picOrg
[2], m_frameEncoder
->m_checksum
[2], height
, width
, stride
, row
, cuHeight
);
/* Squared-error sum between a source plane and its reconstruction.
 *   fenc, rec     - top-left samples of the two regions (both use 'stride')
 *   width, height - region size in samples
 *
 * When width or height is not a multiple of 4 the region is walked sample by sample.
 * Otherwise it is covered with sse_pp primitives: rows are consumed in bands of 64, then
 * 16, then 4 lines, and inside each band columns are consumed with the widest block that
 * still fits.
 *
 * NOTE(review): the declaration of the accumulator, the per-line / per-band pointer
 * advances and the return statements are not visible in this listing - confirm against
 * the complete file. */
303 static uint64_t computeSSD(pixel
*fenc
, pixel
*rec
, intptr_t stride
, uint32_t width
, uint32_t height
)
// true when either dimension has one of its two low bits set, i.e. is not a multiple of 4
307 if ((width
| height
) & 3)
310 for (uint32_t y
= 0; y
< height
; y
++)
312 for (uint32_t x
= 0; x
< width
; x
++)
314 int diff
= (int)(fenc
[x
] - rec
[x
]);
326 /* Consume Y in chunks of 64 */
327 for (; y
+ 64 <= height
; y
+= 64)
// 64-wide columns
332 for (; x
+ 64 <= width
; x
+= 64)
333 ssd
+= primitives
.sse_pp
[LUMA_64x64
](fenc
+ x
, stride
, rec
+ x
, stride
);
// 16-wide columns
336 for (; x
+ 16 <= width
; x
+= 16)
337 ssd
+= primitives
.sse_pp
[LUMA_16x64
](fenc
+ x
, stride
, rec
+ x
, stride
);
// 4-wide columns: four stacked 4x16 blocks at line offsets 0, 16, 32 and 48
339 for (; x
+ 4 <= width
; x
+= 4)
341 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
342 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
+ 16 * stride
, stride
, rec
+ x
+ 16 * stride
, stride
);
343 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
+ 32 * stride
, stride
, rec
+ x
+ 32 * stride
, stride
);
344 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
+ 48 * stride
, stride
, rec
+ x
+ 48 * stride
, stride
);
351 /* Consume Y in chunks of 16 */
352 for (; y
+ 16 <= height
; y
+= 16)
357 for (; x
+ 64 <= width
; x
+= 64)
358 ssd
+= primitives
.sse_pp
[LUMA_64x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
361 for (; x
+ 16 <= width
; x
+= 16)
362 ssd
+= primitives
.sse_pp
[LUMA_16x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
364 for (; x
+ 4 <= width
; x
+= 4)
365 ssd
+= primitives
.sse_pp
[LUMA_4x16
](fenc
+ x
, stride
, rec
+ x
, stride
);
371 /* Consume Y in chunks of 4 */
372 for (; y
+ 4 <= height
; y
+= 4)
// no 64-wide step in the 4-line band: only 16x4 and 4x4 blocks are used
377 for (; x
+ 16 <= width
; x
+= 16)
378 ssd
+= primitives
.sse_pp
[LUMA_16x4
](fenc
+ x
, stride
, rec
+ x
, stride
);
380 for (; x
+ 4 <= width
; x
+= 4)
381 ssd
+= primitives
.sse_pp
[LUMA_4x4
](fenc
+ x
, stride
, rec
+ x
, stride
);
390 /* Function to calculate SSIM for each row */
/*   pix1/stride1, pix2/stride2 - the two planes being compared
 *   width, height              - extent of the region handed in by the caller
 *   buf                        - scratch memory, used as two consecutive arrays of int[4]
 *   cnt                        - output: set to (height - 1) * (width - 1)
 *
 * Two sum arrays (sum0 / sum1) are swapped as rows advance; ssim_4x4x2_core fills sum0
 * two entries at a time and ssim_end_4 combines sum0 and sum1 up to four entries at a time.
 *
 * NOTE(review): the declarations of 'z' and 'ssim', any rescaling of width/height, the
 * header of the loop that advances 'z', and the return statement are not visible in this
 * listing - confirm against the complete file. */
391 static float calculateSSIM(pixel
*pix1
, intptr_t stride1
, pixel
*pix2
, intptr_t stride2
, uint32_t width
, uint32_t height
, void *buf
, uint32_t& cnt
)
396 int(*sum0
)[4] = (int(*)[4])buf
;
// second array starts (width >> 2) + 3 entries after the first
397 int(*sum1
)[4] = sum0
+ (width
>> 2) + 3;
401 for (uint32_t y
= 1; y
< height
; y
++)
// previous sums move to sum1, sum0 is refilled for line index z
405 std::swap(sum0
, sum1
);
406 for (uint32_t x
= 0; x
< width
; x
+= 2)
407 primitives
.ssim_4x4x2_core(&pix1
[(4 * x
+ (z
* stride1
))], stride1
, &pix2
[(4 * x
+ (z
* stride2
))], stride2
, &sum0
[x
]);
// last group may be shorter: X265_MIN(4, width - x - 1) entries
410 for (uint32_t x
= 0; x
< width
- 1; x
+= 4)
411 ssim
+= primitives
.ssim_end_4(sum0
+ x
, sum1
+ x
, X265_MIN(4, width
- x
- 1));
414 cnt
= (height
- 1) * (width
- 1);
418 /* restore original YUV samples to recon after SAO (if lossless) */
419 static void restoreOrigLosslessYuv(const CUData
* cu
, Frame
& frame
, uint32_t absPartIdx
, uint32_t depth
)
421 uint32_t size
= g_maxCUSize
>> depth
;
422 int part
= partitionFromSizes(size
, size
);
424 PicYuv
* reconPic
= frame
.m_reconPic
;
425 PicYuv
* fencPic
= frame
.m_fencPic
;
427 pixel
* dst
= reconPic
->getLumaAddr(cu
->m_cuAddr
, absPartIdx
);
428 pixel
* src
= fencPic
->getLumaAddr(cu
->m_cuAddr
, absPartIdx
);
430 primitives
.luma_copy_pp
[part
](dst
, reconPic
->m_stride
, src
, fencPic
->m_stride
);
432 pixel
* dstCb
= reconPic
->getCbAddr(cu
->m_cuAddr
, absPartIdx
);
433 pixel
* srcCb
= fencPic
->getCbAddr(cu
->m_cuAddr
, absPartIdx
);
435 pixel
* dstCr
= reconPic
->getCrAddr(cu
->m_cuAddr
, absPartIdx
);
436 pixel
* srcCr
= fencPic
->getCrAddr(cu
->m_cuAddr
, absPartIdx
);
438 int csp
= fencPic
->m_picCsp
;
439 primitives
.chroma
[csp
].copy_pp
[part
](dstCb
, reconPic
->m_strideC
, srcCb
, fencPic
->m_strideC
);
440 primitives
.chroma
[csp
].copy_pp
[part
](dstCr
, reconPic
->m_strideC
, srcCr
, fencPic
->m_strideC
);
443 /* Original YUV restoration for CU in lossless coding */
444 static void origCUSampleRestoration(const CUData
* cu
, Frame
& frame
, uint32_t absPartIdx
, uint32_t depth
)
446 if (cu
->m_cuDepth
[absPartIdx
] > depth
)
448 /* TODO: this could use cuGeom.numPartition and flags */
449 uint32_t curNumParts
= NUM_CU_PARTITIONS
>> (depth
<< 1);
450 uint32_t qNumParts
= curNumParts
>> 2;
451 uint32_t xmax
= cu
->m_slice
->m_sps
->picWidthInLumaSamples
- cu
->m_cuPelX
;
452 uint32_t ymax
= cu
->m_slice
->m_sps
->picHeightInLumaSamples
- cu
->m_cuPelY
;
454 /* process four split sub-cu at next depth */
455 for (int subPartIdx
= 0; subPartIdx
< 4; subPartIdx
++, absPartIdx
+= qNumParts
)
457 if (g_zscanToPelX
[absPartIdx
] < xmax
&& g_zscanToPelY
[absPartIdx
] < ymax
)
458 origCUSampleRestoration(cu
, frame
, absPartIdx
, depth
+ 1);
464 // restore original YUV samples
465 if (cu
->m_tqBypass
[absPartIdx
])
466 restoreOrigLosslessYuv(cu
, frame
, absPartIdx
, depth
);
469 void FrameFilter::processSao(int row
)
471 SAOParam
* saoParam
= m_frame
->m_encData
->m_saoParam
;
473 if (saoParam
->bSaoFlag
[0])
474 m_sao
.processSaoUnitRow(saoParam
->ctuParam
[0], row
, 0);
476 if (saoParam
->bSaoFlag
[1])
478 m_sao
.processSaoUnitRow(saoParam
->ctuParam
[1], row
, 1);
479 m_sao
.processSaoUnitRow(saoParam
->ctuParam
[2], row
, 2);
482 if (m_frame
->m_encData
->m_slice
->m_pps
->bTransquantBypassEnabled
)
484 uint32_t numCols
= m_frame
->m_encData
->m_slice
->m_sps
->numCuInWidth
;
485 uint32_t lineStartCUAddr
= row
* numCols
;
487 for (uint32_t col
= 0; col
< numCols
; col
++)
488 origCUSampleRestoration(m_frame
->m_encData
->getPicCTU(lineStartCUAddr
+ col
), *m_frame
, 0, 0);