From: Jérôme Benoit Date: Wed, 14 Jan 2015 16:51:29 +0000 (+0100) Subject: Imported Upstream version 1.4+222+hg5f9f7194267b X-Git-Tag: upstream/1.4+222+hg5f9f7194267b^0 X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=refs%2Fheads%2Fupstream;p=deb_x265.git Imported Upstream version 1.4+222+hg5f9f7194267b --- diff --git a/.gitignore b/.gitignore deleted file mode 100644 index a8da9d5..0000000 --- a/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -doc/uncrustify/uncrustify.exe -build/ -*.rej -*.orig -*.hevc -*.yuv -*.y4m -*.out -*.swp -.DS_Store -.pc diff --git a/.hg_archival.txt b/.hg_archival.txt deleted file mode 100644 index 39aab44..0000000 --- a/.hg_archival.txt +++ /dev/null @@ -1,4 +0,0 @@ -repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf -node: 5e604833c5aa605d0b6efbe5234492b5e7d8ac61 -branch: stable -tag: 1.4 diff --git a/.hgignore b/.hgignore deleted file mode 100644 index 87f1042..0000000 --- a/.hgignore +++ /dev/null @@ -1,11 +0,0 @@ -syntax: glob -doc/uncrustify/uncrustify.exe -build/ -**.rej -**.orig -**.hevc -**.yuv -**.y4m -**.out -**.swp -.DS_Store diff --git a/.hgtags b/.hgtags deleted file mode 100644 index 42d4ebd..0000000 --- a/.hgtags +++ /dev/null @@ -1,17 +0,0 @@ -681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD1 -3ec4837e6f6c7159f438e1f537dff117c93ee139 LASTKNOWNGOOD2 -9a6800e84295db446fdce2e7f27059ec8ae838a7 LASTKNOWNGOOD -99fab2ef92be051cd3b3b2d817064cead282b42c 0.1 -b3471d9009f5cd487b23c8c61a6bfff8980e54f2 0.2 -3767fbfa970ff4b2dc2e8647db0274168727147e 0.3 -2ba6ec553f218d2b06ad803b87d6ec751fd639f7 0.4 -93707bc4fccdaa89a1f2da11db8808ca912a691c 0.4.1 -69acb3cb777f977f5edde908069ac565915dd366 0.5 -b970ffbdd696e3ce45c93b315902eb6366ff085e 0.6 -d24e2a8c4326b0cd01bfa6c414c5378481af9018 0.7 -527d03c56d6860dc979ddea1196f7e94d13d3e82 0.8 -82bbd2bf3b49ba086be0f0922f91fe0084896351 0.9 -cea97c4d79456842e00ade6be6fd5ec34610e5f8 1.0 -ae9609aeebdc3271114168ece003679e9b1dca1b 1.1 -d6257335c5370ee54317a0426a12c1f0724b18b9 1.2 
-c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3 diff --git a/ChangeLog b/ChangeLog index 80323fb..8aa1413 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,1639 @@ +2014-12-23 Satoshi Nakagawa + + * source/encoder/rdcost.h, source/encoder/search.cpp: + rdcost: unify scaleChromaDist*() + [5f9f7194267b] [tip] + +2014-12-23 Gopu Govindaswamy + + * source/encoder/encoder.cpp: + encoder: allocate memory for inter and intra analysis data based on + slicetype + [9fdab427a191] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove redundant argument in compressIntraCU + [c4ec3f22846b] + +2014-12-20 Satoshi Nakagawa + + * source/encoder/search.cpp: + fix 4:4:4 rd<=1 + [8d2f418829c8] + +2014-12-18 David T Yuen + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: idct[8x8] sse2 12232 -> 3500 over c code 3550 -> 3500 over + intrinsic + [7b816fdb393d] + +2014-12-17 Steve Borho + + * source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp: + ppa: emit one event per CTU for more clarity, disable frame threads + events + + The frame threads are generally uninteresting when WPP is in use + [78ae7996a1ce] + + * source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/slicetype.cpp, + source/x265.cpp: + ppa: refine event names + + Drop the unused names, remove uninteresting events. Try to cover the + main thread pool tasks and the frame encoder times. 
+ [6cbd7d26b2a1] + + * source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h: + ppa: simplify interfaces, enforce coding style + [952a2a361fcb] + + * source/common/common.h, source/encoder/analysis.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/x265.cpp: + ppa: minimize code foot-print of profiling events + + This will allow us to add support for more profiling systems without + littering the code + [3315d6c0ced1] + + * doc/reST/cli.rst, source/x265.h: + doc: improve documentation for --stats and multi-pass in general + [42fb030a4c43] + +2014-12-16 Min Chen + + * source/encoder/nal.cpp: + fix: output wrong WppEntryOffset when emulating start code at end of + WPP row + [295d033cb091] + +2014-12-16 Aasaipriya Chandran + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_hpp[16x16] for colorspace i420 in avx2 improve + 1540c->969c + [775ebb4694ad] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_hpp[32x32] for colorspace i420 in avx2 improve + 6189c->3537c + [619c0e654f5b] + +2014-12-13 Steve Borho + + * source/encoder/api.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h: + encoder: combine create() and init() functions + + They were always called back-to-back() and their functionality was + non-distinct. 
It also now checks for abort errors at startup and + returns a NULL from the encoder open function (early aborts are + usually malloc failures) + [6ba7be7b1697] + + * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake: + cmake: eoln and white-space fixes, slight refactor + [ee36b6311aaf] + +2014-12-12 Steve Borho + + * source/encoder/analysis.h: + analysis: typo + [d00a5b93c07e] + + * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake: + cmake: allow position independent code to be generally configurable + (fixes #91) + + Allow the builder to over-ride the default + [afdcb68dace4] + +2014-12-11 Steve Borho + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: add methods to estimate CU mode decision costs + [e0374c37e745] + +2014-12-12 Steve Borho + + * source/common/pixel.cpp: + pixel: nits + [750839e8e0cf] + + * doc/reST/cli.rst, source/common/param.cpp, source/x265.h: + api: change default AQ mode to 1 + + We've received a lot of feedback that AQ mode 2 is often + problematic, but AQ mode 1 is generally safe and useful. + [cbf5cad2e12b] + +2014-12-12 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vps[4x4] in avx2: improve 337c->219c + [6f770a6b24f0] + +2014-12-11 Steve Borho + + * build/README.txt: + build: update README to not be so specific about yasm 1.2.0 + [b1c2ef980dfe] + +2014-12-10 Steve Borho + + * source/encoder/reference.cpp: + reference: avoid weighting pixels when plane is unweighted + + Just because the luma plane is weighted does not mean either of the + chroma planes are also weighted. If the weight parameters for a + given plane are not present, then just directly use the un-weighted + reference plane. 
+ [ae50be4c3a6e] + +2014-12-11 Aasaipriya Chandran + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_hpp[4x4] for colorspace i420 in avx2 improve 217c->192c + [667e4ea0899f] + +2014-12-10 Steve Borho + + * doc/reST/cli.rst: + doc: describe what happens when psy-rd is too high for bitrate + [9c3b478a60b2] + +2014-12-10 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_vpp[32x32] for colorspace i420 in avx2: improve + 3881c->1933c + [04d145864dd6] + +2014-12-10 Steve Borho + + * source/encoder/analysis.cpp: + analysis: avoid redundant MC work + [9e244ebe21d2] + + * source/encoder/analysis.cpp: + analysis: fix chroma predictions for 2Nx2N bidir at zero mv + + Valgrind discovered that the chroma predictions were not in fact + predicted + [0dc816f49c01] + + * source/x265.h: + api: add some blank lines + [ab1e1e0ca75c] + +2014-12-09 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + asm: chroma_vpp[4x4] for colorspace i422 in avx2: improve 228c->184c + [5f16dc82652a] + +2014-12-10 Steve Borho + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/reference.cpp, source/encoder/reference.h, + source/encoder/slicetype.cpp: + reference: weight chroma planes of reference pictures if using + chroma satd + [6c32c8d4e0a1] + +2014-12-08 Steve Borho + + * doc/reST/cli.rst, source/encoder/analysis.cpp, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/motion.h, source/encoder/search.cpp, + source/encoder/slicetype.cpp: + motion: chroma ME [CHANGES OUTPUTS] + + include chroma distortion in satd decisions when --subme > 2 and + chroma blocks are multiples of 4x4 + + This required making the MotionEstimate class more aware of PicYuv + and its indexing scheme so that it could find the correct chroma + pixels to interpolate. 
This allowed me to merge the setSourcePlane() + method into the lookahead's version of setSourcePU. + + This requires further work. The Reference class needs to generate + weighted chroma planes if subpel refine will use chroma residual + cost. Until this is fixed, the chroma subpel steps will use + unweighted reference pixels. + [afd5620c77a4] + +2014-12-09 Steve Borho + + * source/common/pixel.cpp, source/common/primitives.cpp: + primitives: use NULL chroma satd func pointers for blocks not + capable of satd + + If the block is not a multiple of 4x4, then chroma satd measurements + are not possible, so we will disable chroma residual measurements + for these block sizes (and thus only measure luma residual) + [4c97d85c8488] + + * source/common/primitives.cpp: + primitives: use luma satd functions for chroma, where applicable + + The commented lines should be considered TODO items for the assembly + team + [29489f2fc2c7] + + * source/common/pixel.cpp, source/common/primitives.h: + primitives: add a chroma satd table that is indexed by luma + partition + + There are a number of chroma partitions that have dimensions of 2 or + 6 and those cannot use satd (which is 4x4 based), so we degrade them + down to SAD which makes me unhappy. + [47c490836fd8] + +2014-12-08 Steve Borho + + * source/common/lowres.h, source/encoder/reference.cpp, + source/encoder/reference.h: + reference: move reconPic pointer to base class so it is available to + ME + [dd55fd39745c] + + * source/encoder/motion.cpp: + motion: sync argument names between the header and the cpp file + [e2b958539e6a] + + * source/common/yuv.cpp: + yuv: fix size check in copyFromYuv + + The target buffer needs to be as large as or larger than the source. + The fact that this check has never failed tells me all users of this + function have equal sized arguments. 
+ [15be837edb36] + + * source/encoder/search.cpp: + search: rename index variable to puIdx for consistency + [1cab6a4c0ab8] + + * source/common/yuv.cpp, source/common/yuv.h, + source/encoder/analysis.cpp, source/encoder/motion.cpp, + source/encoder/motion.h, source/encoder/search.cpp: + motion: add a version of setSourcePU which can accept fenc from + another Yuv + + The analysis code has already gone through the trouble of loading + the CU's fenc pixels from the source picture into a much smaller Yuv + buffer with small strides. This allows us to avoid accessing the + fenc PicYuv in a performance critical portion of the encoder. + + We utilize the Yuv class to copy the PU, since it already has logic + for calculating part offsets for luma and chroma + [1d1f803a3eec] + + * source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/search.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + motion: use Yuv instance to hold fenc PU pixels (preparing for + chroma ME) + + This required making an init function which accepts the encoder + color space. We use 4:0:0 for lookahead since it does not keep + chroma planes. Note that I explicitly renamed this Yuv instance + fencPUYuv to make sure people understand it is not a duplicate of + the fencYuv kept by the Analysis structure; it will often be a sub- + partition of the CU fenc yuv. 
+ [e640c8461495] + + * source/encoder/slicetype.cpp: + slicetype: cleanups - use bufSATD method where applicable + [b5b05c94ae7c] + + * source/common/yuv.cpp: + yuv: plumb in support for mono-chrome YUV buffers + + The need for this will be obvious in the next commit + [5a44d694ed9b] + +2014-12-09 Aasaipriya Chandran + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_hpp[8x8] for colorspace i420 in avx2 improve 530c->373c + [88498ec9b10b] + +2014-12-08 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: fix x86 link errors + [b376435b31c1] + +2014-12-09 Deepthi Nandakumar + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_vpp[16x16] for colorspace i420 in avx2: improve + 998c->978c + [d042d1ea2d69] + +2014-12-05 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: chroma_vpp[8x8] for colorspace i420 in avx2: improve 338c->269c + [fee9fb1f9762] + +2014-12-06 Satoshi Nakagawa + + * source/common/cudata.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/search.cpp, source/encoder/search.h: + refine tuDepth related + [53f7efef5ebd] + +2014-12-05 Steve Borho + + * source/cmake/version.cmake: + cmake: do not use a cache string for version found in hg_archive.txt + (refs #84) + + This was not passing the tagged version number to version.cpp + [35d086074bb5] + +2014-12-04 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc : fix bug in deciding qp for first frame in CRF + [1458ad34157c] + + * source/encoder/rdcost.h, source/encoder/sao.cpp: + rc: fix chroma qp and chroma lambda derivations. + + fix the chroma qp values for Main10 profile, derive chroma qp from + luma qp values according to the HEVC spec. improves quality at high + qps. 
+ [a1e76461c0d4] + +2014-12-05 Deepthi Nandakumar + + * source/encoder/analysis.cpp: + analysis: comments + [4ae9691c1a23] + +2014-12-05 Satoshi Nakagawa + + * source/encoder/analysis.cpp: + fix chroma distortion for 4:2:2 + [42df5c8bdb25] + +2014-12-04 Steve Borho + + * source/encoder/CMakeLists.txt: + cmake: disable idiotic uninitialized local variable warnings from VC + + If the compiler is not going to make any minimal attempt to figure + out if a variable was initialized, I am not going to make any + attempt to look at their stupid warnings. + [c9fd35f97e6d] + +2014-12-04 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h: + asm: chroma_vpp[4x4] for colorspace i420 in avx2: improve 228c->184c + [23e637065aec] + +2014-12-04 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: cache m_bChromaSa8d and reduce redundant work + + Renamed some 'part' variables to 'puIdx' to avoid variable shadow + warnings and for consistency with search.cpp + [cc327e846dac] + +2014-12-04 Deepthi Nandakumar + + * source/encoder/analysis.cpp: + analysis: add chroma distortion to rdLevels 3 and 4 + + At these rdLevels, inter/bidir and merge candidate decisions were + being taken based on luma sa8dCost only. This will increase bitrate + and lower ssim slightly, with better subjective quality. + + Also fixed some naming nits. + [1d2a11f6a33f] + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/frameencoder.cpp, source/encoder/search.cpp, + source/x265.cpp, source/x265.h: + noiseReduction: allow separate strengths to be specified for intra + and inter CUs + [ec06f5878e8b] + +2014-12-04 Aarthi Thirumalai + + * source/common/x86/asm-primitives.cpp: + primitives: fix build error in refactor of chroma p2s primitive. 
+ [511dde5ac1de] + +2014-12-03 Steve Borho + + * source/common/ipfilter.cpp, source/common/lowres.cpp, + source/common/pixel.cpp, source/common/predict.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/shortyuv.cpp, + source/common/x86/asm-primitives.cpp, source/common/yuv.cpp, + source/encoder/search.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp: + primitives: cleanup EncoderPrimitives, refactor chroma p2s primitive + + No behavior changes + [b1b5f06fe9ce] + + * source/common/pixel.cpp, source/common/primitives.h: + primitives: remove unused chroma lowres primitive + [bfeee4ac5463] + + * source/encoder/search.cpp: + search: avoid AMVP selection if both MVs are the same + + This is a simple work avoidance optimization, should have no effect + on outputs + [2f66c3284c35] + + * source/common/CMakeLists.txt, source/common/primitives.cpp: + cmake: remove buggy workarounds for partial SIMD support (fixes #92) + + In the past, there were a number of primitives written in SIMD + intrinsics that could work without compiling with YASM. Most of + those are now gone, and we generally require YASM for SIMD support. + This commit removes support for using the few remaining SIMD + intrinsics without having YASM to provide implementations of + x265_emms(), x265_cpu_cpuid(), etc. Fixing a bug in the process. 
+ [d7b5e73fc91a] + + * doc/reST/cli.rst: + doc: fix typo (closes #83) + [7192725cbb0a] + + * doc/reST/cli.rst, source/common/param.cpp, source/x265.cpp, + source/x265.h: + param: allow NR values from 1..99, clarify docs (closes #87) + [21b869f9f706] + + * doc/reST/Makefile, doc/reST/conf.py, doc/reST/x265.rst: + doc: add support for reST generated man-pages (closes #89) + + This patch was attached to issue #89 by djcj + [ff08fd9b294c] + + * source/common/constants.cpp: + constants: adjust lambda tables for 10bit encodes (fixes #55) + + Since samples are 10bits, where two bits of extra resolution has + been added to add more granularity, distortion also has two extra + bits. A typical resolution for this problem is to down-shift + distortion by 2 bits everywhere, before adding lambda * bits to + calculate RD cost. Instead, we multiply lambda by 4 (essentially + shift it up by two bits) so distortion and lambda * bits are both at + the higher scale. + + lambda2 uses the square of the up-shifted lambda, so it has the + doubled up-shift same as the squared distortion values used for RDO. 
+ + Example output change: ./x265 + /Volumes/video/sintel_trailer_2k_480p24.y4m o.bin --ssim --no-info + + Main: 195.67 kb/s, SSIM Mean Y: 0.9833338 (17.782 dB) Main10 before: + 363.49 kb/s, SSIM Mean Y: 0.9888182 (19.515 dB) Main10 after: 206.54 + kb/s, SSIM Mean Y: 0.9855121 (18.390 dB) + [014a1e0fb58b] + +2014-12-03 Gopu Govindaswamy + + * source/encoder/encoder.cpp: + encoder: fix binary mismatch for analysis load vs save with same + bitrate + [50d2b92ecc89] + +2014-12-02 Steve Borho + + * Merge + [de54cffaecf2] + +2014-11-27 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c + [2e055cbc9046] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[12x16] in avx2: improve 1977c->1418c + [ef4ca8474f5c] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[24x32] in avx2: improve 5637c->3695c + [8aeeaf6950f7] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[48x64] in avx2: improve 21298c->14696c + [d97b1c9f5106] + +2014-12-02 Deepthi Nandakumar + + * source/x265.cpp: + x265: add ratetol to command line help + [f636a0aadd68] + +2014-12-01 Deepthi Nandakumar + + * source/CMakeLists.txt, source/encoder/encoder.cpp, source/x265.h: + encoder: free csv file name + + Since strdup is used uniformly for filenames, csvfn cannot be const. 
+ [bde1753de250] + +2014-11-27 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c + [5ee693e4b5fa] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[12x16] in avx2: improve 1977c->1418c + [e280ce2e5076] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[24x32] in avx2: improve 5637c->3695c + [e1ca311bbb5b] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[48x64] in avx2: improve 21298c->14696c + [984271a3aae9] + +2014-11-30 Deepthi Nandakumar + + * source/x265.cpp: + x265: remove validateFanout + [d9f835ddd112] + +2014-11-27 Satoshi Nakagawa + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + primitives: refactor tskip related + [90401d77a05d] + +2014-11-28 Satoshi Nakagawa + + * source/common/dct.cpp, source/common/quant.h, + source/common/x86/dct8.asm, source/common/x86/dct8.h, + source/encoder/search.cpp: + nits + [e2db5f3c6df8] + +2014-11-28 Deepthi Nandakumar + + * source/common/param.cpp: + param: disable b-intra in B frames when tune grain is true. 
+ [d32249002258] + +2014-11-25 Gopu Govindaswamy + + * source/encoder/encoder.h: + encoder: make all member fields public + [af6b68f0feaa] + +2014-11-26 Steve Borho + + * doc/reST/cli.rst, doc/reST/presets.rst: + doc: restructure documentation with better grouping, improve cross- + refs + [dfe0803ae6be] + + * doc/reST/introduction.rst: + doc: fix a sphinx build warning + [f488b394693b] + + * doc/reST/presets.rst: + doc: improve readability of film grain section + [03bd64057e72] + + * doc/reST/cli.rst, doc/reST/presets.rst: + doc: add cbr to the list of tunings, add helpful cross-refs + [071dbe651364] + +2014-11-27 Aarthi Thirumalai + + * source/CMakeLists.txt, source/common/param.cpp, + source/encoder/ratecontrol.cpp, source/x265.cpp, source/x265.h: + rc: introduce cli option to tune for cbr. + [8e602ed5ca4c] + +2014-11-25 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: improve the frame size planning with respect to vbv buffer + occupancy and the lookahead window. + [2870269cdd60] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: adjust qp for B frames from ABR feedback in case of CBR. + + limits the bitrate fluctuation for CBR with respect to the target + bitrate set. + [576c675adf92] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: limit bit amortization in ABR to longer sequences + [11342c8376dd] + +2014-11-26 Steve Borho + + * source/encoder/ratecontrol.cpp: + rc: use c-style typecasts + [c67b4f3a5e3c] + +2014-11-19 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: tune midframe vbv logic for B frames + [8f5fa9538e13] + +2014-11-21 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: fix vbv lookahead data collection for all frames within + the lookahead window. 
+ [52246e09727d] + +2014-11-26 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_hpp[8x8, 8x16, 8x32] in avx2: improve 623c->523c, + 1384c->1083c, 2555c->2058c + [01d82aa06285] + +2014-11-26 Aasaipriya Chandran + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + Luma_hpp[48x64] avx2 asm code : improved 25053c->17882c + [bb7303bb00d1] + +2014-11-26 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_hpp[8x4] in avx2: improve 357c->261c + [a88ddc970748] + +2014-11-26 Aasaipriya Chandran + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + Luma_hpp[32x8 , 32x16 , 32x24 , 32x32 , 32x64] avx2 asm code: + improved 2032c->1556c, 4238c->3014c, 6696c->4801c, 8697c->6433c, + 16823c->12297c + [b0153f354186] + +2014-11-26 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[64x16] in avx2: improve 7245c->4910c + [5700875b428f] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[64x32, 64x48, 64x64] in avx2: improve 14150c->9810c, + 21132c->14684c, 28663c->19616c + [db518f7c8474] + +2014-11-25 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[32x8] in avx2: improve 2047c->1472c + [d57c28a3010b] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[32x24] in avx2: improve 5562c->3899c + [dedc5a8589a6] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[32x16] in avx2: improve 3808c->2491c + [3db00b06aea6] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[32x32, 32x64] in avx2: improve 7247c->4909c, + 14365c->9774c + [adf15e303c37] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: 
luma_vpp[16x32, 16x64] in avx2: improve 3875c->2463c, + 7499c->4894c + [45456cd145d8] + +2014-11-25 Aasaipriya Chandran + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: avx2 for Luma_hpp[16x4, 16x8, 16x12, 16x16 , 16x32, 16x64] + + 619c->458c, 1174c->812c, 1694c->1112c, 2291c->1535c, 4846c->3207c, + 9294c->6104c + [d11d3120361f] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: avx2 for luma_hpp[64x64, 64x48, 64x32, 64x16] + + 33137c->22606c , 24826c->17202c , 16726c->11560c , 7830c->5534c + [1e8a0f1e0889] + +2014-11-22 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: do not use bitmaps for framefilter if not WPP + + The non-WPP row loop wants to do frame filter work in between each + row, with a m_filterRowDelay lag. If we use the functions which + update the bitmap, it would allow a worker thread to process a + filter row before it was ready. In short, the non-WPP path was never + intended to work in the presence of a thread pool. This was causing + crashes when --no-wpp --pmode||--pme was used. 
+ [8011e2a68b88] + +2014-11-24 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: release row lock while waiting during VBV restarts + + This fixes what appears to have been an old deadlock bug that has + just recently become very reproducible + [82f6e4847d57] + +2014-11-21 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[16x4] in avx2: improve 734c->497c + [3c6f703f94ea] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[16x8] in avx2: improve 1195c->745c + [fc83cf5299ae] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[16x12] in avx2: improve 1644c->1018c + [65017182318c] + +2014-11-21 Praveen Tiwari + + * source/common/dct.cpp: + idct32_c: C code optimization + [346fccbba4de] + + * source/common/dct.cpp: + idct16_c: optimization + [388c893d3825] + + * source/common/dct.cpp: + idct8_c: optimization + [f7d7c480b85d] + + * source/common/dct.cpp: + idct4_c: optimization + [69a472a77b49] + + * source/common/dct.cpp: + dct32_c: optimization + [a60dfb900169] + + * source/common/dct.cpp: + dct16_c: optimization + [7e94ea285179] + + * source/common/dct.cpp: + dct8_c: optimization + [d426e93e240c] + + * source/common/dct.cpp: + dct4_c: C code optimization + [d4376e113855] + + * source/common/dct.cpp: + idst4_c: optimization + [8f373c20bc41] + + * source/common/dct.cpp: + dst4_c: optimization + [49b66c57972d] + +2014-11-21 Satoshi Nakagawa + + * source/common/pixel.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + fix copy16to16_shl + [5a8da9cb52e8] + +2014-11-20 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: explicit locking for pmode and pme parameters + + We've found a repro case involving --no-wpp --pmode --pme --preset + slower where time starved 
worker threads get stuck in the findJob() + routine and pushed off the CPU in the mean time the master thread + moves on to another CU. This caused very hard to reproduce crashes. + [2f8df4c972b9] + +2014-11-20 David T Yuen + + * source/common/vec/dct-sse3.cpp: + Updated intrinsic of idct8 sse3 for new input format + [2abf89f5c4f2] + +2014-11-20 Divya Manivannan + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[16x16] in avx2: improve 2141c->1284c + [2a2142982602] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[8x4] in avx2: improve 498c->257c + [c2fd1b7d5d99] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: luma_vpp[8x8] in avx2: improve 701c->387c + [562c43f738e4] + +2014-11-20 Steve Borho + + * source/encoder/encoder.cpp: + encoder: nits and alloc zero fix + + intraData needs to be zeroed on allocation else if one of the later + allocs failed some of the pointers will be uninitialized and passed + to X265_FREE() + [80dcd3dfb805] + +2014-11-20 Praveen Tiwari + + * source/common/dct.cpp: + Fix for C code mismatch + + This patch is to fix the binary mismatch in encoded output + introduced during refactorization of the transform/quant path. + Basically it is original version of code to make sure all valid + inputs are copied in input buffer, in other hand it is not fully + optimized code but this patch is quick fix for the problem and allow + us to optimize one function at a time. 
+ [1d17ec0cb954] + +2014-11-20 Satoshi Nakagawa + + * source/common/pixel.cpp: + fix for old gcc + [ed587d360b97] + +2014-11-20 Deepthi Nandakumar + + * build/icl32/build-all.bat, build/icl32/make-makefile.bat, + build/icl64/build-all.bat, build/icl64/make-makefile.bat: + build: remove icl32 and icl64 scripts + + Typical Windows ICL users link with Visual Studio + [3649fabf90d3] + +2014-11-20 Praveen Tiwari + + * source/common/x86/ipfilter8.asm: + luma_hpp[4x4]: AVX2 asm code bug fix + [4b637cb9b792] + +2014-11-20 Gopu Govindaswamy + + * source/encoder/encoder.cpp: + encoder: fix analysis file read + [0c25a6eac0ca] + +2014-11-20 Satoshi Nakagawa + + * source/encoder/analysis.cpp: + fix for rd=0 + [b33cbe130c63] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/frameencoder.cpp, + source/encoder/search.cpp: + replace char to int8_t, where it should be signed char + [14a8bb7bbcab] + +2014-11-19 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp: + disable denoiseDct asm code until fixed for Mac OS + [f236adb703f5] + +2014-11-16 Satoshi Nakagawa + + * source/common/dct.cpp, source/common/ipfilter.cpp, + source/common/picyuv.h, source/common/pixel.cpp, + source/common/predict.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/quant.h, + source/common/shortyuv.cpp, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-ssse3.cpp, source/common/x86/blockcopy8.h, + source/common/x86/dct8.h, source/common/x86/ipfilter8.h, + source/common/x86/mc.h, source/common/x86/pixel-util.h, + source/common/x86/pixel.h, source/common/yuv.cpp, + source/encoder/analysis.cpp, source/encoder/rdcost.h, + source/encoder/search.cpp: + primitives: clarify constness + [99b5cebf8193] + +2014-11-18 Steve Borho + + * source/common/dct.cpp: + dct: fix gcc warnings + [34cb58c53859] + +2014-11-18 Praveen Tiwari + + * source/common/dct.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/quant.cpp, 
+ source/common/quant.h, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-sse41.cpp, source/common/vec/dct-ssse3.cpp, + source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/common/x86/dct8.asm, source/common/x86/dct8.h, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/test/mbdstharness.cpp, source/test/mbdstharness.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + refactorization of the transform/quant path. + + This patch involves scaling down the DCT/IDCT coefficients from + int32_t to int16_t as they can be accommodated on int16_t without + any introduction of encode error, this allows us to clean up lots of + DCT/IDCT intermediate buffers, optimize encode efficiency for + different cli options including noise reduction by reducing data + movement operations, accommodating more number of coefficients in a + single register for SIMD operations. This patch includes all + necessary changes for the transform/quant path including unit test + code. + [8bee552a1964] + +2014-11-19 Satoshi Nakagawa + + * source/common/common.h: + fseeko for mingw32 + [cb9bb697fcaa] + +2014-11-19 Steve Borho + + * source/common/threading.h: + threading: fixes for VC11 Win32 includes, prune two unused functions + [2b830f08d948] + +2014-11-18 Steve Borho + + * source/common/wavefront.cpp: + wavefront: fix msvc warning + + warning C4800: 'unsigned long' : forcing value to bool 'true' or + 'false' (performance warning) + [e29c618cd9a7] + + * source/common/param.cpp, source/common/quant.cpp, + source/common/threading.h, source/common/threadpool.cpp, + source/common/wavefront.cpp, source/common/wavefront.h, + source/common/winxp.h, source/encoder/entropy.cpp, + source/encoder/slicetype.cpp: + threading: use 32bit atomic integer operations exclusively + + The 32bit operations have better portability and have less onerous + alignment restrictions. 
+ [814b687db30e] + + * source/common/constants.cpp, source/common/constants.h, + source/common/primitives.cpp, source/encoder/api.cpp, + source/test/intrapredharness.cpp: + constants: remove init/destroyROM functions + [d3389bb9efd0] + + * source/x265.h: + api: fix range limit docs for RQT limit params + [d059cfa88f1a] + + * source/encoder/frameencoder.cpp: + frameencoder: white-space nits + [29a374b62920] + + * source/encoder/analysis.cpp: + analysis: drop MATCH_NON_PMODE macro + + this was a debugging feature, it's not being tested which means it + will get broken and so it's best just to keep the code clean + [dc61091d5cc4] + + * source/common/threading.cpp: + threading: don't use this-> + + We don't do this anywhere else + [3731d9bc7b88] + + * source/common/threading.cpp, source/common/threading.h, + source/common/threadpool.cpp, source/common/threadpool.h: + threading: copyright comment format nits + + be consistent with our other files + [a7b9b90e1bdd] + + * source/common/param.cpp: + param: use strdup() on input strings uniformly + [ad532c30bc95] + +2014-11-18 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + encoder: init filename to NULL + [2f0062f0791b] + +2014-11-17 Gopu Govindaswamy + + * source/common/common.h, source/encoder/analysis.cpp, + source/encoder/search.cpp: + search: fix binary mismatch and inconsistent crash for share inter + information + [854fcbb50220] + + * source/encoder/encoder.cpp: + encoder: force slicetype using analysis file + [05d824463602] + +2014-11-17 Satoshi Nakagawa + + * source/common/cudata.cpp, source/common/lowres.h, + source/common/mv.h, source/encoder/bitcost.h, + source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/slicetype.cpp: + modify MV default constructor to do nothing + [7a1ec67bd004] + +2014-11-17 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + vbv: tune vbv predictors for better mapping of predicted bits to + encoded bits + [27d36c4b4a27] + +2014-11-16 Deepthi Nandakumar + + * 
source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: cleanups, init pointers, variable names are made self- + explanatory + [ed2ba7a90567] + +2014-11-12 Gopu Govindaswamy + + * source/encoder/analysis.cpp: + analysis: fix binary mismatch for share intra save and load mode + with same cli + [10b8d3fbe408] + +2014-11-14 Steve Borho + + * source/x265.cpp: + cli: fix analysis filename argument + + This showed up as a GCC warning about an unused variable, but having + the arg handled here prevented the org from being passed to + x265_param_parse() + [8191e0d02455] + + * source/encoder/encoder.cpp: + encoder: add prefix to FREAD and FWRITE macros to avoid MacOSX macro + conflict + + /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform + /Developer/SDKs/MacOSX10.10.sdk/usr/include/sys/fcntl.h:111:9: note: + previous definition is here #define FWRITE 0x0002 + [b617dca5ce12] + + * source/common/common.h, source/common/frame.h, + source/encoder/encoder.h: + common: move analysis reuse structs to common.h + + files in common/ shouldn't include encoder.h + [72f1222903a3] + +2014-11-14 Satoshi Nakagawa + + * source/encoder/analysis.cpp: + analysis: encodeResidue() directly write to reconPic + [c3096034934f] + +2014-11-14 Deepthi Nandakumar + + * source/CMakeLists.txt, source/common/common.h, + source/common/frame.h, source/common/param.cpp, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/api.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/x265.cpp, source/x265.def.in, + source/x265.h: + analysis save/load: refactor full implementation + + 1. Move analysis inter/intra data into encoder 2. Encoder allocates + and frees memory for x265 analysis, remove api calls 3. Inter and + intra data allocated based on sliceType only 4. 
frame record size is + now variable + [58c2e06c2e4a] + +2014-11-13 Satoshi Nakagawa + + * source/encoder/analysis.cpp: + analysis: don't add the cost of picture boundary CU to avgCost + [CHANGES OUTPUT] + [64314f8061f1] + +2014-11-13 Steve Borho + + * source/cmake/FindVLD.cmake: + cmake: hack to avoid escaping problems in cmake 3.1 parser + + Fix suggested by Mario *LigH* Rohkrämer + [17f2fb0996db] + +2014-11-13 Satoshi Nakagawa + + * source/common/cudata.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/sao.cpp: + nits + [03974d78f241] + +2014-11-12 Steve Borho + + * source/encoder/rdcost.h: + rdcost: lower the psy-rd scale factor for I slices to 96/256 + + Based on Santhoshini's testing, this is better at preventing + artifacts + [18aefbde72ab] + +2014-11-11 Satoshi Nakagawa + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + refine initializeGeoms() + [98fb658f3229] + +2014-11-11 Steve Borho + + * source/encoder/analysis.cpp: + analysis: fix bidir non-determinism in --pmode --rd 5 + [306ef9782a30] + + * source/common/param.cpp, source/encoder/analysis.cpp, + source/encoder/search.cpp, source/encoder/search.h: + Merge + [fa2fedd97ff2] + +2014-11-10 Steve Borho + + * source/common/quant.cpp: + quant: allow --nr in all slice types evenly + [38fa64a5c51c] + +2014-11-06 Deepthi Nandakumar + + * source/common/common.h, source/common/quant.cpp, + source/common/quant.h: + noiseReduction: apply only for I and P, move NoiseReduction to + quant.h + + This doubles the number of quant nr categories; intra blocks now use + the lower half. 
+ [ed89e58b44e8] + +2014-11-10 Steve Borho + + * doc/reST/cli.rst, source/common/param.cpp: + param: raise --nr limit to 2000 + [27f293dd9eee] + + * doc/reST/presets.rst, source/common/param.cpp: + param: remove --b-intra from --tune grain, document rdoq restriction + [64ccc616be33] + +2014-11-09 Steve Borho + + * source/encoder/rdcost.h: + rdcost: experimental slice-type based psy-rd scale factor + [4f3fd7ab8868] + +2014-11-08 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp: + analysis: RDO based BIDIR decisions + + At RD 0, 1, and 2, this changes 2Nx2N bidir from a SATD decision to + an SA8D decision. + + At RD 3 and 4, if the bidir SA8D cost is within 17/16 of the best + inter cost, then it makes an RDO decision between bestInter and + Bidir (allowing psy-rd to influence the decision, which is the whole + point) + + At RD 5 and 6, 2Nx2N BIDIR is yet another RD choice at the same + level as 2Nx2N inter and rect and amp. (psy) RDO picks the best mode + for each block. 
+ [4c6c28cc93d9] + +2014-11-11 Deepthi Nandakumar + + * source/x265.cpp: + x265: more meaningful error messages in analysis + [838e41fb256b] + + * source/encoder/api.cpp: + api: cleanup + [3c01e8881946] + + * source/encoder/api.cpp: + api: replace analysis data with pre defined constant + [b4effa4dd53b] + + * source/x265.cpp: + x265: create and initialise recon object if analysis mode is enabled + [47b290236ca3] + + * source/common/param.cpp: + param: add default value to analysis mode + [5c397e744cfd] + +2014-11-11 Gopu Govindaswamy + + * source/encoder/analysis.cpp, source/encoder/api.cpp, + source/x265.cpp, source/x265.h: + x265: remove redundant variables from intra and inter analysis + structure + [ad5177c86756] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h, source/x265.h: + analysis: Dump best MV statistics and re-use this for analysis load + mode + + This patch fixes a bug in inter slices in analysis=load|save mode. + Inter data for all partitions is now saved correctly. + [c8004323493e] + +2014-11-10 Satoshi Nakagawa + + * source/common/cudata.cpp, source/common/cudata.h, + source/common/deblock.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/search.cpp: + cleanup SIZE_NONE. empty CU has MODE_NONE. 
+ [32513a4c3bd4] + +2014-11-09 Steve Borho + + * source/encoder/search.cpp: + search: fixup + [1e04e178a349] + +2014-11-08 Steve Borho + + * source/encoder/reference.cpp, source/encoder/reference.h, + source/encoder/search.cpp: + reference: add methods for querying CU/PU pointers + [9687a9d1205a] + + * source/encoder/analysis.cpp: + analysis: delay initialization of prediction cu in RD 5 and 6 + [b9147e641ce6] + +2014-11-09 Satoshi Nakagawa + + * source/encoder/analysis.cpp: + fix typo + [3dc9857c59d3] + +2014-11-08 Steve Borho + + * source/encoder/analysis.cpp: + analysis: delay initialization of prediction cu until just before + use + + This avoids initializing CUs that may never be used because of + various early-outs + [3f2d68368554] + + * source/encoder/search.cpp, source/encoder/search.h: + search: keep AMVP candidates in mode structure + + This fixes some work replication in --pme and will also make + handling BIDIR as a seperate prediction easier. + [6124c837b3ab] + + * source/encoder/motion.h, source/encoder/search.cpp, + source/encoder/slicetype.h: + motion: remove trivial set methods; make some members public + [53c146f7eb9f] + +2014-11-07 Steve Borho + + * source/encoder/frameencoder.cpp: + nr: fix denoise offset memcopy size + [0912563c4ac1] + + * source/encoder/entropy.h: + entropy: pass context model (state) to bitsCodeBin as uint32_t + + Should be slightly more efficient + [a67b848d6c04] + + * source/encoder/entropy.cpp: + entropy: nit + [b55799a2f5ad] + + * source/encoder/entropy.cpp: + entropy: ensure X265_CHECK() has braces + [0fd8e0c5272a] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: inline methods which mapped to encodeBin() calls + [640d2936e699] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: inline bit counting functions + [ca7873cab172] + + * source/encoder/entropy.cpp: + entropy: use bitsCodeBin in intra mode bit estimate functions + [84fc74874406] + + * source/encoder/entropy.cpp, 
source/encoder/entropy.h: + entropy: rename encodeBinContext to bitsCodeBin, make const + + The function is not modifying the context, so there is no need to + pass as a reference, and the function can be const. Also, group the + bit counting RDO functions together + [a1ee9422183b] + + * source/encoder/entropy.cpp: + entropy: white-space nits + [429742055057] + +2014-11-07 Satoshi Nakagawa + + * source/encoder/search.cpp: + fix bug in 522baf03fbbd + [f2130a4dc876] + +2014-11-07 Deepthi Nandakumar + + * source/encoder/search.cpp: + search: fix warnings + [7338b1f1f43d] + +2014-11-07 Satoshi Nakagawa + + * source/encoder/analysis.cpp: + fix typo + [4f034e3adef8] + +2014-11-05 Ashok Kumar Mishra + + * source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/search.cpp, source/encoder/search.h: + [REVIEW PATCH/OUTPUT CHANGED]search: removed multiple encode + Coefficients from estimateResidualQT() + + Tried to remove multiple encode coefficients from + estimateResidualQT() function. Coefficients are encoded in three + stages: Once for calculation of distortion and twice for split and + unsplit block cost calculation. I have given comments where I have + changed the code. 
+ [eb5a9eb03dd6] + + * source/encoder/search.cpp, source/encoder/search.h: + search: made a function for null cost calculation in + xEstimateResidualQT() + [522baf03fbbd] + + * source/encoder/search.cpp, source/encoder/search.h: + search: made separate functions for encoding cbfs in + xEstimateResidualQT() + [0b7c709335b2] + +2014-11-07 Satoshi Nakagawa + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: remove default argument + [bc4f3dab51db] + +2014-11-06 Steve Borho + + * doc/reST/presets.rst: + doc: fix sub-title depth + + Single dash was already used by a higher section + [0ebd0b00bf9b] + + * doc/reST/cli.rst, doc/reST/presets.rst, source/common/param.cpp: + param: add --tune grain + [ec5588025568] + + * source/encoder/search.cpp: + search: ugly bias hack for bidir with psy-rd + [e33e09549c0c] + + * doc/reST/cli.rst: + docs: document RC params, at least minimally + [beac946dac85] + + * source/x265.h: + api: cleanup comments + [8ceaab303bfa] + + * source/x265.cpp: + cli: cleanup CLI help, add 'verbose' tier + + Remove a lot of uncommon features from the initial help output, + require + --log-level debug --help to see it all + [f599a4df57ac] + + * source/common/param.cpp, source/x265.cpp: + api: expose rate control params via x265_param_parse() and CLI + + Adds range checks for qCompress, which has documented limits. The + others have very minimal explanations; so I'm not adding them to the + CLI help. Users should not touch them unless they know what they are + doing. + + Note this commit doesn't bump X265_BUILD since no new params were + added. 
+ [b37cda5d3092] + +2014-11-05 Steve Borho + + * source/common/deblock.cpp: + deblock: fix type conversion warnings + [4a3997fd4fc1] + +2014-11-05 Satoshi Nakagawa + + * source/common/deblock.cpp, source/common/deblock.h, + source/common/quant.cpp, source/common/slice.h, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/framefilter.cpp, source/encoder/rdcost.h, + source/encoder/sao.cpp: + refine deblocking filter + [65e14d5a5728] + +2014-11-04 Gopu Govindaswamy + + * source/x265.cpp: + cli: bug fix for validatefanout param analysis-mode=save and load + [2a8f3d5820a6] + +2014-11-04 gopi jayaraman + + * source/encoder/encoder.cpp: + encoder: use 6 frameNumThreads for cpucount 32 and above + [0dcc6a1d8f02] + +2014-11-04 Steve Borho + + * source/x265.h: + api: add void to functions that take no parameters for -Wstrict- + prototypes + [0d44fcb269a6] + + * source/common/deblock.cpp, source/common/frame.cpp, + source/common/frame.h, source/common/framedata.h, + source/common/predict.cpp, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp, source/encoder/sao.cpp, + source/encoder/search.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp: + frame: rename m_reconPicYuv -> m_reconPic, m_origPicYuv -> m_fencPic + + the fooPicYuv names were potentially confusing, preferred names: + PicYuv* fooPic; Yuv* fooYuv; + [67bf055c13d5] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: remove unused sa8d pointer and bufSA8D method + [59a08101dfc6] + +2014-11-04 Gopu Govindaswamy + + * source/common/cudata.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/search.cpp, + source/encoder/search.h, source/x265.cpp, source/x265.h: + search: dump and share the best motion statistics for inter(P&B) + slices + [d5f6133b99d4] 
+ +2014-11-03 Steve Borho + + * doc/reST/cli.rst: + docs: fix reST parsing issues + [a8ec469d7fb1] + +2014-11-03 Min Chen + + * source/common/primitives.h, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm: + cleanup: remove unused asm calcrecon + [5637b495e2e1] + + * source/common/x86/ipfilter8.asm: + asm: fix typo error in interp_8tap_vert_pp_4x4_avx2 + [ee88b63aced0] + +2014-11-03 Satoshi Nakagawa + + * source/common/cudata.cpp, source/common/cudata.h, + source/common/quant.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/search.cpp: + cleanup CUData::m_skipFlag + [2e60f3b81981] + +2014-10-31 Steve Borho + + * source/encoder/encoder.cpp: + encoder: make it clear that --fast-cbf is innefective at lower rd + levels + + This begs the question of whether the feature should exist, or + whether it should be added to the lower RD levels + [eebb372eec89] + + * source/common/param.cpp: + param: show options using their CLI / param_parse names + [c32a733a819b] + +2014-10-30 Steve Borho + + * .hgtags: + remove dead non-release tags + + anyone interested in archeology can still find them; there's no + sense to keep them on the tip since we stopped tracking last known + good more than a year ago + [75cb2ab1ecec] + 2014-10-31 Steve Borho + * source/encoder/encoder.cpp: + Merge with stable + [ae8a661acdc4] + + * .hgtags: + Added tag 1.4 for changeset 5e604833c5aa + [d2db9c1ab44b] + * source/encoder/encoder.cpp: encoder: emit an Active Parameter Sets SEI in stream headers if interlaced diff --git a/build/README.txt b/build/README.txt index c087349..d131884 100644 --- a/build/README.txt +++ b/build/README.txt @@ -11,26 +11,27 @@ Note: MSVC12 requires cmake 2.8.11 or later 1. 
Yasm 1.2.0 or later, to compile assembly primitives (performance) - For Windows, download - http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win32.exe or - http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win64.exe - depending on your O/S and copy the EXE into C:\Windows or somewhere else - in your %PATH% that a 32-bit app (cmake) can find it. If it is not in the - path, you must manually tell cmake where to find it. + For Windows, download the latest yasm executable + http://yasm.tortall.net/Download.html and copy the EXE into + C:\Windows or somewhere else in your %PATH% that a 32-bit app (cmake) + can find it. If it is not in the path, you must manually tell cmake + where to find it. Note: you do not need the vsyasm packages, x265 + does not use them. You only need the yasm executable itself. - For Linux, yasm-1.2.0 is likely too new to be packaged for your system so you - will need get http://www.tortall.net/projects/yasm/releases/yasm-1.2.0.tar.gz - compile, and install it. + On Linux, the packaged yasm may be older than 1.2, in which case + you will need to get the latest source and build it yourself. Once YASM is properly installed, run cmake to regenerate projects. If you do not see the below line in the cmake output, YASM is not in the PATH. - -- Found Yasm 1.2.0 to build assembly primitives + -- Found Yasm 1.3.0 to build assembly primitives - Now build the encoder and run x265 -V. If you see "assembly" on this - line, you have YASM properly installed: + Now build the encoder and run x265 -V: - x265 [info]: performance primitives: intrinsic assembly + x265 [info]: using cpu capabilities: MMX, SSE2, ... + + If cpu capabilities line says 'none!', then the encoder was built + without yasm. 2. 
VisualLeakDetector (Windows Only) diff --git a/build/icl32/build-all.bat b/build/icl32/build-all.bat deleted file mode 100644 index cbe9a59..0000000 --- a/build/icl32/build-all.bat +++ /dev/null @@ -1,14 +0,0 @@ -@echo off -if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) -if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) -if "%ICL%" == "" ( - msg "%username%" "Intel C++ 2013 not detected" - exit 1 -) -if not exist Makefile ( - call make-makefile.bat -) -if exist Makefile ( - call "%ICL%\bin\compilervars.bat" ia32 - nmake -) diff --git a/build/icl32/make-makefile.bat b/build/icl32/make-makefile.bat deleted file mode 100644 index 799344e..0000000 --- a/build/icl32/make-makefile.bat +++ /dev/null @@ -1,15 +0,0 @@ -@echo off -:: -:: run this batch file to create an Intel C++ 2013 NMake makefile for this project. -:: See the cmake documentation for other generator targets -:: -if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) -if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) -if "%ICL%" == "" ( - msg "%username%" "Intel C++ 2013 not detected" - exit 1 -) -call "%ICL%\bin\compilervars.bat" ia32 -set CC=icl -set CXX=icl -cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source diff --git a/build/icl64/build-all.bat b/build/icl64/build-all.bat deleted file mode 100644 index d1d6b8d..0000000 --- a/build/icl64/build-all.bat +++ /dev/null @@ -1,14 +0,0 @@ -@echo off -if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) -if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) -if "%ICL%" == "" ( - msg "%username%" "Intel C++ 2013 not detected" - exit 1 -) -if not exist Makefile ( - call make-makefile.bat -) -if exist Makefile ( - call "%ICL%\bin\compilervars.bat" intel64 - nmake -) diff --git a/build/icl64/make-makefile.bat b/build/icl64/make-makefile.bat deleted file mode 100644 index 2d3f629..0000000 --- a/build/icl64/make-makefile.bat +++ /dev/null @@ -1,17 +0,0 @@ -@echo off -:: -:: run 
this batch file to create an Intel C++ 2013 NMake makefile for this project. -:: See the cmake documentation for other generator targets -:: -if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) -if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) -if "%ICL%" == "" ( - msg "%username%" "Intel C++ 2013 not detected" - pause - exit 1 -) -call "%ICL%\bin\compilervars.bat" intel64 -set CC=icl -set CXX=icl -cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source -pause diff --git a/debian/changelog b/debian/changelog deleted file mode 100644 index eaa41cc..0000000 --- a/debian/changelog +++ /dev/null @@ -1,131 +0,0 @@ -x265 (1.4-4~ubuntu1) trusty; urgency=medium - - * Support for ARMv7 with NEON extensions - - -- Jérôme Benoit Tue, 09 Dec 2014 22:19:18 +0100 - -x265 (1.4-3~trusty) trusty; urgency=low - - * Upstream bugfixes: - * cli: bug fix for validatefanout param analysis-mode=save and load - * docs: fix reST parsing issues - - -- Marshall Banana Wed, 05 Nov 2014 01:29:32 +0100 - -x265 (1.4-2~trusty2) trusty; urgency=low - - * Install documentation in x265-doc package to avoid package conflicts - - -- Marshall Banana Sat, 01 Nov 2014 02:49:51 +0100 - -x265 (1.4-1~trusty) trusty; urgency=low - - * New upstream release - - -- Marshall Banana Sat, 01 Nov 2014 00:20:42 +0100 - -x265 (1.3-4~trusty) trusty; urgency=low - - * Update manpage - - -- Marshall Banana Wed, 01 Oct 2014 18:09:33 +0200 - -x265 (1.3-3~trusty) trusty; urgency=low - - * Rename x265-10b to x265-10bit - * Provide x265-16bit via symbolic link - - -- Marshall Banana Wed, 01 Oct 2014 17:56:41 +0200 - -x265 (1.3-2~trusty) trusty; urgency=low - - * Add doc-base control file - - -- Marshall Banana Fri, 05 Sep 2014 04:07:20 +0200 - -x265 (1.3-1~trusty) trusty; urgency=low - - * New upstream release - - -- Marshall Banana Fri, 22 Aug 2014 20:30:50 +0200 - -x265 (1.2+510-hg2bdcfcc1bb33-1~trusty) trusty; urgency=low - - * Current snapshot - - -- Marshall Banana Mon, 11 Aug 2014 
12:51:05 +0200 - -x265 (1.2-3~trusty) trusty; urgency=low - - * Build static library from different object files - - -- Marshall Banana Mon, 21 Jul 2014 05:29:20 +0200 - -x265 (1.2-2~trusty1) trusty; urgency=low - - * Provide separate optimized shared libraries for i686 - - -- Marshall Banana Fri, 11 Jul 2014 20:36:37 +0200 - -x265 (1.2-1~trusty) trusty; urgency=low - - * New upstream version - * Update patch - * Update man page - * Install upstream changelog - - -- Marshall Banana Thu, 10 Jul 2014 19:40:33 +0200 - -x265 (1.1-4~trusty) trusty; urgency=low - - * Install 10bit binary to make usage of 10bit library possible. - - -- Marshall Banana Tue, 17 Jun 2014 10:53:50 +0200 - -x265 (1.1-3~trusty) trusty; urgency=low - - * Don't rename 10 bit library - - -- Marshall Banana Fri, 13 Jun 2014 16:43:34 +0200 - -x265 (1.1-2~trusty) trusty; urgency=low - - * Build less packages - - -- Marshall Banana Wed, 11 Jun 2014 03:47:39 +0200 - -x265 (1.1-1~trusty) trusty; urgency=low - - * New upstream version - - -- Marshall Banana Fri, 13 Jun 2014 16:42:16 +0200 - -x265 (1.1) unstable; urgency=low - - * New upstream version - - -- Marshall Banana Sat, 07 Jun 2014 17:44:06 +0200 - -x265 (1.0+5-dcf74ea39e31) unstable; urgency=low - - * New upstream version - - -- Marshall Banana Sun, 04 May 2014 19:07:30 +0100 - -x265 (0.9+114-c630b0b393ee) unstable; urgency=low - - * New upstream version - - -- Marshall Banana Fri, 04 Apr 2014 01:45:30 +0100 - -x265 (0.8+52-93861c42b879) unstable; urgency=low - - * New upstream version - - -- Marshall Banana Sat, 08 Mar 2014 10:08:00 +0100 - -x265 (0.7+216-591ca91f0501) unstable; urgency=low - - * Initial upload - - -- Marshall Banana Wed, 19 Feb 2014 21:30:00 +0100 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index ec63514..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/debian/confflags b/debian/confflags deleted file mode 100644 index 714d8c1..0000000 --- a/debian/confflags +++ /dev/null @@ -1,45 
+0,0 @@ -libdir := lib/$(DEB_HOST_MULTIARCH) - - -common_confflags := \ - -DCMAKE_INSTALL_PREFIX=/usr \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_VERBOSE_MAKEFILE=ON - -8bit_confflags := \ - $(common_confflags) \ - -DLIB_INSTALL_DIR=$(libdir) - -10bit_confflags := \ - $(common_confflags) \ - -DBIN_INSTALL_DIR=$(libdir)/x265-10bit \ - -DLIB_INSTALL_DIR=$(libdir)/x265-10bit \ - -DHIGH_BIT_DEPTH=ON - - -static_confflags := \ - -DCMAKE_INSTALL_PREFIX=/usr \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DENABLE_CLI=OFF \ - -DENABLE_SHARED=OFF - -8bit_static_confflags := \ - $(static_confflags) \ - -DLIB_INSTALL_DIR=$(libdir) - -10bit_static_confflags := \ - $(static_confflags) \ - -DLIB_INSTALL_DIR=$(libdir)/x265-10bit \ - -DHIGH_BIT_DEPTH=ON - - -# disable assembly on x86 and arm -ifneq (,$(filter i386 i486 i586 i686 pentium arm,$(DEB_HOST_GNU_CPU))) -noasm = -DENABLE_ASSEMBLY=OFF -DCMAKE_CXX_FLAGS='-DX86_64=0' -8bit_confflags += $(noasm) -10bit_confflags += $(noasm) -8bit_static_confflags += $(noasm) -10bit_static_confflags += $(noasm) -endif - diff --git a/debian/control b/debian/control deleted file mode 100644 index 35db2b2..0000000 --- a/debian/control +++ /dev/null @@ -1,73 +0,0 @@ -Source: x265 -Section: libs -Priority: optional -Maintainer: Marshall Banana -Homepage: https://bitbucket.org/multicoreware/x265/wiki/Home -Standards-Version: 3.9.5 -Build-Depends: - debhelper (>= 9), - cmake (>= 2.8.8), - python3-sphinx, - yasm (>= 1.2.0) [any-i386] -Vcs-Browser: https://bitbucket.org/multicoreware/x265/src - -Package: x265 -Architecture: any -Section: video -Depends: - ${misc:Depends}, - ${shlibs:Depends} -Description: H.265/HEVC video encoder - frontend binary - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. 
- -Package: libx265-35 -Architecture: any -Pre-Depends: - ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends} -Description: H.265/HEVC video encoder - runtime files - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - -Package: libx265-dev -Architecture: any -Section: libdevel -Depends: - ${misc:Depends}, - libx265-35 (= ${binary:Version}) -Description: H.265/HEVC video encoder - development files - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - . - This package contains the static library and - headers used to build programs that use libx265-35. - -Package: x265-doc -Architecture: all -Section: doc -Depends: - ${misc:Depends}, - libjs-jquery (>= 1.4), - libjs-underscore -Description: x265 documentation - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - . - This package contains the x265 documentation. - -Package: x265-dbg -Architecture: any -Section: debug -Priority: extra -Depends: - ${misc:Depends}, - x265 (= ${binary:Version}), - libx265-35 (= ${binary:Version}) -Description: debugging symbols for x265 and libx265 - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - . - This package contains the debugging symbols for x265. 
diff --git a/debian/control.in b/debian/control.in deleted file mode 100644 index 8703763..0000000 --- a/debian/control.in +++ /dev/null @@ -1,73 +0,0 @@ -Source: x265 -Section: libs -Priority: optional -Maintainer: Marshall Banana -Homepage: https://bitbucket.org/multicoreware/x265/wiki/Home -Standards-Version: 3.9.5 -Build-Depends: - debhelper (>= 9), - cmake (>= 2.8.8), - python3-sphinx, - yasm (>= 1.2.0) [any-i386] -Vcs-Browser: https://bitbucket.org/multicoreware/x265/src - -Package: x265 -Architecture: any -Section: video -Depends: - ${misc:Depends}, - ${shlibs:Depends} -Description: H.265/HEVC video encoder - frontend binary - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - -Package: libx265-@API@ -Architecture: any -Pre-Depends: - ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends} -Description: H.265/HEVC video encoder - runtime files - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - -Package: libx265-dev -Architecture: any -Section: libdevel -Depends: - ${misc:Depends}, - libx265-@API@ (= ${binary:Version}) -Description: H.265/HEVC video encoder - development files - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - . - This package contains the static library and - headers used to build programs that use libx265-@API@. - -Package: x265-doc -Architecture: all -Section: doc -Depends: - ${misc:Depends}, - libjs-jquery (>= 1.4), - libjs-underscore -Description: x265 documentation - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - . - This package contains the x265 documentation. 
- -Package: x265-dbg -Architecture: any -Section: debug -Priority: extra -Depends: - ${misc:Depends}, - x265 (= ${binary:Version}), - libx265-@API@ (= ${binary:Version}) -Description: debugging symbols for x265 and libx265 - library for encoding video using the High - Efficiency Video Coding (HEVC/H.265) standard. - . - This package contains the debugging symbols for x265. diff --git a/debian/copyright b/debian/copyright deleted file mode 100644 index bd5f670..0000000 --- a/debian/copyright +++ /dev/null @@ -1,115 +0,0 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ -Upstream-Name: x265 -Upstream-Contact: Steve Borho -Source: https://bitbucket.org/multicoreware/x265/wiki/Home - - -Files: * -Copyright: 2013-2014 x265 project -License: GPL-2.0+ - -Files: source/common/const-a.asm - source/common/cpu-a.asm - source/common/intrapred.h - source/common/mc-a*.asm - source/common/pixel.h - source/common/pixel-32.asm - source/common/pixel-a.asm - source/common/sad*.asm - source/common/ssd.asm - source/common/x86inc.asm - source/test/checkasm-a.asm -Copyright: 2003-2014 x264 project -License: GPL-2.0+ - -Files: source/common/x86util.asm -Copyright: 2008-2013 x264 project -License: ISC - -Files: source/compat/getopt/* -Copyright: 1987-2001 Free Software Foundation, Inc. -License: LGPL-2.1+ - -Files: source/Lib/* -Copyright: 2010-2013 ITU/ISO/IEC -License: BSD-3-clause - -Files: debian/* -Copyright: 2014 djcj -License: ISC - - -License: GPL-2.0+ - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - . - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - . 
- You should have received a copy of the GNU General Public License - along with this program. If not, see . - . - On Debian GNU/Linux systems, the complete text of the GNU General Public - License version 2 can be found in '/usr/share/common-licenses/GPL-2'. - - -License: ISC - Permission to use, copy, modify, and/or distribute this software for any - purpose with or without fee is hereby granted, provided that the above - copyright notice and this permission notice appear in all copies. - . - THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - -License: LGPL-2.1+ - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - . - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - . - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . - . - On Debian systems, the complete text of the GNU Lesser General - Public License version 3 can be found in '/usr/share/common-licenses/LGPL-2.1'. - - -License: BSD-3-clause - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - . 
- * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of the ITU/ISO/IEC nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - . - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS - BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/debian/getapi.sh b/debian/getapi.sh deleted file mode 100755 index c23fd26..0000000 --- a/debian/getapi.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -egrep 'set\(X265_BUILD ' source/CMakeLists.txt | sed -e 's/set(X265_BUILD //; s/)//g' diff --git a/debian/libx265-dev.install b/debian/libx265-dev.install deleted file mode 100644 index f58613d..0000000 --- a/debian/libx265-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/include -usr/lib/*/*.a -usr/lib/*/*.so -usr/lib/*/pkgconfig -usr/lib/*/x265-10bit/*.a -usr/lib/*/x265-10bit/*.so diff --git a/debian/libx265N.install b/debian/libx265N.install deleted file mode 100644 index 3d608f1..0000000 --- a/debian/libx265N.install +++ /dev/null @@ -1,2 +0,0 @@ -usr/lib/*/*.so.* -usr/lib/*/x265-10bit/*.so.* diff --git a/debian/patches/armv7l-support.patch b/debian/patches/armv7l-support.patch deleted file mode 100644 index b56a92e..0000000 --- a/debian/patches/armv7l-support.patch +++ /dev/null @@ -1,43 +0,0 @@ ---- a/source/CMakeLists.txt -+++ b/source/CMakeLists.txt -@@ -49,9 +49,13 @@ if("${SYSPROC}" STREQUAL "" OR X86MATCH - message(STATUS "Detected x86 system processor") - endif() - elseif(${SYSPROC} STREQUAL "armv6l") -- message(STATUS "Detected ARM target processor") -- set(ARM 1) -+ message(STATUS "Detected ARMV6 target processor") -+ set(ARMV6 1) - add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1) -+elseif(${SYSPROC} STREQUAL "armv7l") -+ message(STATUS "Detected ARMV7 target processor") -+ set(ARMV7 1) -+ add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=1) - else() - message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown") - message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}") -@@ -129,9 +133,12 @@ if(GCC) - if(X86 AND NOT X64) - add_definitions(-march=i686) - endif() -- if(ARM) -+ if(ARMV6) - add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) - endif() -+ if(ARMV7) -+ add_definitions(-fPIC -march=armv7 
-mfloat-abi=hard -mfpu=neon) -+ endif() - check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) - check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) - if (CC_HAS_NO_ARRAY_BOUNDS) ---- a/source/common/cpu.cpp -+++ b/source/common/cpu.cpp -@@ -356,7 +356,7 @@ uint32_t cpu_detect(void) - // which may result in incorrect detection and the counters stuck enabled. - // right now Apple does not seem to support performance counters for this test - #ifndef __MACH__ -- flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0; -+ //flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0; - #endif - // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc) - #endif // if HAVE_ARMV6 diff --git a/debian/patches/bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch b/debian/patches/bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch deleted file mode 100644 index 1048433..0000000 --- a/debian/patches/bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch +++ /dev/null @@ -1,16 +0,0 @@ ---- a/source/x265.cpp -+++ b/source/x265.cpp -@@ -772,12 +772,9 @@ - {\ - bErr = 0;\ - p = strstr(paramBuf, opt "=");\ -- char* q = strstr(paramBuf, "no-"opt);\ - if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\ - bErr = 1;\ -- else if (!param_val && !q)\ -- bErr = 1;\ -- else if (param_val && (q || !strstr(paramBuf, opt)))\ -+ else if (param_val && strstr(paramBuf, "no-"opt))\ - bErr = 1;\ - if (bErr)\ - {\ diff --git a/debian/patches/cpu-detection.patch b/debian/patches/cpu-detection.patch deleted file mode 100644 index 1a441ac..0000000 --- a/debian/patches/cpu-detection.patch +++ /dev/null @@ -1,18 +0,0 @@ ---- a/source/CMakeLists.txt -+++ b/source/CMakeLists.txt -@@ -39,12 +39,14 @@ - set(X86_ALIASES x86 i386 i686 x86_64 amd64) - list(FIND X86_ALIASES "${SYSPROC}" X86MATCH) - if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1") -- message(STATUS "Detected x86 target processor") - 
set(X86 1) - add_definitions(-DX265_ARCH_X86=1) - if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8) -+ message(STATUS "Detected x86_64 system processor") - set(X64 1) - add_definitions(-DX86_64=1) -+ else() -+ message(STATUS "Detected x86 system processor") - endif() - elseif(${SYSPROC} STREQUAL "armv6l") - message(STATUS "Detected ARM target processor") diff --git a/debian/patches/fix-reST-parsing-issues-in-docs.patch b/debian/patches/fix-reST-parsing-issues-in-docs.patch deleted file mode 100644 index 9658aff..0000000 --- a/debian/patches/fix-reST-parsing-issues-in-docs.patch +++ /dev/null @@ -1,31 +0,0 @@ ---- a/doc/reST/cli.rst -+++ b/doc/reST/cli.rst -@@ -63,7 +63,7 @@ - is used for WPP and for distributed analysis and motion search: - :option:`--wpp` :option:`--pmode` and :option:`--pme` respectively. - -- If :option:`--threads`=1 is specified, then no thread pool is -+ If :option:`--threads` 1 is specified, then no thread pool is - created. When no thread pool is created, all the thread pool - features are implicitly disabled. If all the pool features are - disabled by the user, then the pool is implicitly disabled. -@@ -904,8 +904,8 @@ - between 0 and 1, or in kbits. In other words these two option pairs - are equivalent:: - -- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900 -- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9 -+ --vbv-bufsize 1000 --vbv-init 900 -+ --vbv-bufsize 1000 --vbv-init 0.9 - - Default 0.9 - -@@ -1206,7 +1206,7 @@ - .. option:: --aud, --no-aud - - Emit an access unit delimiter NAL at the start of each slice access -- unit. If option:`--repeat-headers` is not enabled (indicating the -+ unit. If :option:`--repeat-headers` is not enabled (indicating the - user will be writing headers manually at the start of the stream) - the very first AUD will be skipped since it cannot be placed at the - start of the access unit, where it belongs. 
Default disabled diff --git a/debian/patches/make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch b/debian/patches/make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch deleted file mode 100644 index d1ee9cb..0000000 --- a/debian/patches/make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch +++ /dev/null @@ -1,15 +0,0 @@ ---- a/source/encoder/encoder.cpp -+++ b/source/encoder/encoder.cpp -@@ -1330,6 +1330,12 @@ - p->bBPyramid = 0; - - /* Disable features which are not supported by the current RD level */ -+ if (p->rdLevel < 5) -+ { -+ if (p->bEnableCbfFastMode) /* impossible */ -+ x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n"); -+ p->bEnableCbfFastMode = 0; -+ } - if (p->rdLevel < 4) - { - if (p->psyRdoq > 0) /* impossible */ diff --git a/debian/patches/series b/debian/patches/series deleted file mode 100644 index 1282f51..0000000 --- a/debian/patches/series +++ /dev/null @@ -1,7 +0,0 @@ -version.patch -cpu-detection.patch -show-options-using-their-param_parse-names.patch -make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch -bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch -fix-reST-parsing-issues-in-docs.patch -armv7l-support.patch diff --git a/debian/patches/show-options-using-their-param_parse-names.patch b/debian/patches/show-options-using-their-param_parse-names.patch deleted file mode 100644 index a6cba2d..0000000 --- a/debian/patches/show-options-using-their-param_parse-names.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- a/source/common/param.cpp -+++ b/source/common/param.cpp -@@ -1152,8 +1152,8 @@ - fprintf(stderr, "psy-rd=%.2lf ", param->psyRd); - if (param->psyRdoq > 0.) 
- fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq); -- TOOLOPT(param->bEnableEarlySkip, "esd"); -- TOOLOPT(param->bEnableCbfFastMode, "cfm"); -+ TOOLOPT(param->bEnableEarlySkip, "early-skip"); -+ TOOLOPT(param->bEnableCbfFastMode, "fast-cbf"); - if (param->noiseReduction) - fprintf(stderr, "nr=%d ", param->noiseReduction); - TOOLOPT(param->bEnableLoopFilter, "lft"); diff --git a/debian/patches/version.patch b/debian/patches/version.patch deleted file mode 100644 index 639310c..0000000 --- a/debian/patches/version.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- a/source/cmake/version.cmake -+++ b/source/cmake/version.cmake -@@ -6,8 +6,8 @@ - find_package(Git QUIET) # present in 2.8.8 - - # defaults, in case everything below fails --set(X265_VERSION "unknown") --set(X265_LATEST_TAG "0.0") -+set(X265_VERSION "1.4") -+set(X265_LATEST_TAG "1.4") - set(X265_TAG_DISTANCE "0") - - if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt) diff --git a/debian/rules b/debian/rules deleted file mode 100755 index 33f9531..0000000 --- a/debian/rules +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/make -f - -DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) -DEB_HOST_GNU_CPU ?= $(shell dpkg-architecture -qDEB_HOST_GNU_CPU) -DEB_HOST_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE) -API ?= $(shell debian/getapi.sh) -LIB = $(CURDIR)/debian/install/usr/lib/$(DEB_HOST_MULTIARCH) - -include debian/confflags - -builddir = compiling -x265-clean = rm -rf $(builddir) && mkdir $(builddir) -x265-install = $(MAKE) -C $(builddir) install DESTDIR=$(CURDIR)/debian/install -x265-build = dh_auto_build -D$(builddir) -test-build = \ - $(builddir)/x265 --pass 1 --bitrate 10 -o /dev/null debian/test.y4m && \ - $(builddir)/x265 --pass 2 --bitrate 10 -o test.hevc debian/test.y4m - - -%: - dh ${@} --parallel - -override_dh_auto_clean: - dh_auto_clean - rm -rf $(builddir) doc/reST/build debian/install debian/test.y4m - -override_dh_auto_build: - sed -e 's/@API@/$(API)/g' debian/control.in > 
debian/control - cp -f debian/libx265N.install debian/libx265-$(API).install - unxz -fk debian/test.y4m.xz -# build x265 8-bit - mkdir $(builddir) - cd $(builddir) && cmake $(8bit_confflags) ../source - $(x265-build) - $(x265-install) -# test x265 8-bit -# $(test-build) -# build x265 10-bit - $(x265-clean) - cd $(builddir) && cmake $(10bit_confflags) ../source - $(x265-build) - $(x265-install) - sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \ - debian/x265-10bit.in > $(builddir)/x265-10bit - install -c -D -m755 $(builddir)/x265-10bit $(CURDIR)/debian/install/usr/bin -# test x265 10-bit -# $(test-build) -# build x265 8-bit static - $(x265-clean) - rm -f $(LIB)/libx265.a - cd $(builddir) && cmake $(8bit_static_confflags) ../source - $(x265-build) - install -c -D -m644 $(builddir)/libx265.a $(LIB) -# build x265 10-bit static - $(x265-clean) - rm -f $(LIB)/x265-10bit/libx265.a - cd $(builddir) && cmake $(10bit_static_confflags) ../source - $(x265-build) - install -c -D -m644 $(builddir)/libx265.a $(LIB)/x265-10bit -# build documentation - $(MAKE) -C doc/reST pickle html - -override_dh_install: - dh_install --list-missing --sourcedir=$(CURDIR)/debian/install - -override_dh_installchangelogs: - dh_installchangelogs -px265-doc -plibx265-$(API) ChangeLog - -override_dh_installdocs: - dh_installdocs -plibx265-$(API) -px265-doc -X.buildinfo - dh_installdocs -px265 -px265-dbg -plibx265-dev --link-doc=libx265-$(API) - -override_dh_strip: - dh_strip --dbg-package=x265-dbg - -override_dh_builddeb: - dh_builddeb -- -Zxz -z9 - diff --git a/debian/source/format b/debian/source/format deleted file mode 100644 index 163aaf8..0000000 --- a/debian/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) diff --git a/debian/source/include-binaries b/debian/source/include-binaries deleted file mode 100644 index c489363..0000000 --- a/debian/source/include-binaries +++ /dev/null @@ -1 +0,0 @@ -debian/test.y4m.xz diff --git a/debian/source/options b/debian/source/options deleted 
file mode 100644 index 74a452b..0000000 --- a/debian/source/options +++ /dev/null @@ -1,2 +0,0 @@ -compression = "xz" -compression-level = 9 diff --git a/debian/test.y4m.xz b/debian/test.y4m.xz deleted file mode 100644 index 28d3c1c..0000000 Binary files a/debian/test.y4m.xz and /dev/null differ diff --git a/debian/upstream-changelog_help.txt b/debian/upstream-changelog_help.txt deleted file mode 100644 index 7539445..0000000 --- a/debian/upstream-changelog_help.txt +++ /dev/null @@ -1 +0,0 @@ -hg log --style changelog > ChangeLog diff --git a/debian/watch b/debian/watch deleted file mode 100644 index 4867948..0000000 --- a/debian/watch +++ /dev/null @@ -1,2 +0,0 @@ -version=3 -https://bitbucket.org/multicoreware/x265/get/([0-9.]+)\.tar\.(?:xz|bz2|gz) \ No newline at end of file diff --git a/debian/x265-10bit.in b/debian/x265-10bit.in deleted file mode 100644 index 18b1fb7..0000000 --- a/debian/x265-10bit.in +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -LIBX265_PATH=/usr/lib/@DEB_HOST_MULTIARCH@/x265-10bit -LD_LIBRARY_PATH="$LIBX265_PATH:$LD_LIBRARY_PATH" $LIBX265_PATH/x265 "$@" diff --git a/debian/x265-doc.doc-base b/debian/x265-doc.doc-base deleted file mode 100644 index 904a43c..0000000 --- a/debian/x265-doc.doc-base +++ /dev/null @@ -1,10 +0,0 @@ -Document: x265 -Title: x265 Documentation -Author: Steve Borho -Abstract: This is the official documentation of x265, - a free H.265/HEVC video encoder.
-Section: Video - -Format: HTML -Index: /usr/share/doc/x265-doc/html/index.html -Files: /usr/share/doc/x265-doc/html/*.html diff --git a/debian/x265-doc.docs b/debian/x265-doc.docs deleted file mode 100644 index b05b5c7..0000000 --- a/debian/x265-doc.docs +++ /dev/null @@ -1,2 +0,0 @@ -doc/intra -doc/reST/build/html diff --git a/debian/x265-doc.links b/debian/x265-doc.links deleted file mode 100644 index 29586c4..0000000 --- a/debian/x265-doc.links +++ /dev/null @@ -1,2 +0,0 @@ -/usr/share/javascript/jquery/jquery.js /usr/share/doc/x265-doc/html/_static/jquery.js -/usr/share/javascript/underscore/underscore.js /usr/share/doc/x265-doc/html/_static/underscore.js diff --git a/debian/x265-help.txt b/debian/x265-help.txt deleted file mode 100644 index dda07f7..0000000 --- a/debian/x265-help.txt +++ /dev/null @@ -1,174 +0,0 @@ - -Syntax: x265 [options] infile [-o] outfile - infile can be YUV or Y4M - outfile is raw HEVC bitstream - -Executable Options: --h/--help Show this help text and exit --V/--version Show version info and exit - -Output Options: --o/--output Bitstream output file name - --log-level Logging level: none error warning info debug full. Default info - --no-progress Disable CLI progress reports - --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default disabled - --csv Comma separated log file, log level >= 3 frame log, else one line per run - -Input Options: - --input Raw YUV or Y4M input file name. `-` for stdin - --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension - --fps Source frame rate (float or num/denom), auto-detected if Y4M - --input-res WxH Source picture size [w x h], auto-detected if Y4M - --input-depth Bit-depth of input file. Default 8 - --input-csp Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420 --f/--frames Maximum number of frames to encode. 
Default all - --seek First frame to encode - --[no-]interlace Indicate input pictures are interlace fields in temporal order. Default progressive - --dither Enable dither if downscaling to 8 bit pixels. Default disabled - -Quality reporting metrics: - --[no-]ssim Enable reporting SSIM metric scores. Default disabled - --[no-]psnr Enable reporting PSNR metric scores. Default disabled - -Profile, Level, Tier: - --profile Enforce an encode profile: main, main10, mainstillpicture - --level-idc Force a minimum required decoder level (as '5.0' or '50') - --[no-]high-tier If a decoder level is specified, this modifier selects High tier of that level - -Threading, performance: - --threads Number of threads for thread pool (0: detect CPU core count, default) --F/--frame-threads Number of concurrently encoded frames. 0: auto-determined by core count - --[no-]wpp Enable Wavefront Parallel Processing. Default enabled - --[no-]pmode Parallel mode analysis. Default disabled - --[no-]pme Parallel motion estimation. Default disabled - --[no-]asm Override CPU detection. Default: auto - -Presets: --p/--preset Trade off performance for compression efficiency. Default medium - ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo --t/--tune Tune the settings for a particular type of source or situation: - psnr, ssim, zerolatency, or fastdecode - -Quad-Tree size and depth: --s/--ctu <64|32|16> Maximum CU size (default: 64x64). Default 64 - --tu-intra-depth Max TU recursive depth for intra CUs. Default 1 - --tu-inter-depth Max TU recursive depth for inter CUs. Default 1 - --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default disabled - --[no-]amp Enable asymmetric motion partitions, requires --rect. Default disabled - -Analysis: - --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default 3 - --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable.
Default 0.000000 - --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default 0.000000 - --nr An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled - --[no-]tskip-fast Enable fast intra transform skipping. Default disabled - --[no-]early-skip Enable early SKIP detection. Default disabled - --[no-]fast-cbf Enable early outs based on whether residual is coded. Default disabled - -Coding tools: --w/--[no-]weightp Enable weighted prediction in P slices. Default enabled - --[no-]weightb Enable weighted prediction in B slices. Default disabled - --[no-]cu-lossless Consider lossless mode in CU RDO decisions. Default disabled - --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default enabled - --[no-]tskip Enable intra 4x4 transform skipping. Default disabled - -Temporal / motion search options: - --me Motion search method dia hex umh star full. Default 1 --m/--subme Amount of subpel refinement to perform (0:least .. 7:most). Default 2 - --merange Motion search range. Default 57 - --max-merge <1..5> Maximum number of merge candidates. Default 2 - --[no-]temporal-mvp Enable temporal MV predictors. Default enabled - -Spatial / intra options: - --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default enabled - --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default disabled - --[no-]b-intra Enable intra in B frames in veryslow presets. Default disabled - --[no-]fast-intra Enable faster search method for angular intra predictions. Default disabled - --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default 0 - -Slice decision options: - --[no-]open-gop Enable open-GOP, allows I slices to be non-IDR. Default enabled --I/--keyint Max IDR period in frames. -1 for infinite-gop. Default 250 --i/--min-keyint Scenecuts closer together than this are coded as I, not IDR. 
Default: auto - --no-scenecut Disable adaptive I-frame decision - --scenecut How aggressively to insert extra I-frames. Default 40 - --rc-lookahead Number of frames for frame-type lookahead (determines encoder latency) Default 20 - --bframes Maximum number of consecutive b-frames (now it only enables B GOP structure) Default 4 - --bframe-bias Bias towards B frame decisions. Default 0 - --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default 2 - --[no-]b-pyramid Use B-frames as references. Default enabled - --ref max number of L0 references to be allowed (1 .. 16) Default 3 - --qpfile Force frametypes and QPs for some or all frames - Format of each line: framenumber frametype QP - QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b. - QPs are restricted by qpmin/qpmax. - -Rate control, Quantization: - --bitrate Target bitrate (kbps) for ABR (implied). Default 0 --q/--qp QP for P slices in CQP mode (implied). --ipratio and --pbratio determine other slice QPs - --crf Quality-based VBR (0-51). Default 28.000000 - --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default disabled - --crf-max With CRF+VBV, limit RF to this value. Default 0.000000 - May cause VBV underflows! - --crf-min With CRF+VBV, limit RF to this value. Default 0.000000 - this specifies a minimum rate factor value for encode! - --vbv-maxrate Max local bitrate (kbit/s). Default 0 - --vbv-bufsize Set size of the VBV buffer (kbit). Default 0 - --vbv-init Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default 0.900000 - --aq-mode Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default 2 - --aq-strength Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default 1.000000 - --[no-]cutree Enable cutree for Adaptive Quantization. Default enabled - --ipratio QP factor between I and P. Default 1.400000 - --pbratio QP factor between P and B.
Default 1.300000 - --cbqpoffs Chroma Cb QP Offset. Default 0 - --crqpoffs Chroma Cr QP Offset. Default 0 - --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log - --pass Multi pass rate control. - - 1 : First pass, creates stats file - - 2 : Last pass, does not overwrite stats file - - 3 : Nth pass, overwrites stats file - --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default disabled - --analysis-mode save - Dump analysis info into file, load - Load analysis buffers from the file. Default 0 - --analysis-file Specify file name used for either dumping or reading analysis data. - --scaling-list Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off - --lambda-file Specify a file containing replacement values for the lambda tables - MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table - Blank lines and lines starting with hash(#) are ignored - Comma is considered to be white-space - -Loop filters (deblock and SAO): - --[no-]lft Enable Deblocking Loop Filter. Default enabled - --[no-]sao Enable Sample Adaptive Offset. Default enabled - --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default disabled - -VUI options: - --sar Sample Aspect Ratio, the ratio of width to height of an individual pixel. - Choose from 0=undef, 1=1:1("square"), 2=12:11, 3=10:11, 4=16:11, - 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11, - 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of . Default 0 - --crop-rect Add 'left,top,right,bottom' to the bitstream-level cropping rectangle - --overscan Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef - --videoformat Specify video format from undef, component, pal, ntsc, secam, mac. 
Default undef - --range Specify black level and range of luma and chroma signals as full or limited Default limited - --colorprim Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m, - smpte240m, film, bt2020. Default undef - --transfer Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m, - smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1, - bt2020-10, bt2020-12. Default undef - --colormatrix Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m, - smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef - --chromaloc Specify chroma sample location (0 to 5). Default of 0 - -Bitstream options: - --[no-]info Emit SEI identifying encoder and parameters. Default enabled - --[no-]aud Emit access unit delimiters at the start of each access unit. Default disabled - --[no-]hrd Enable HRD parameters signalling. Default disabled - --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default disabled - --hash Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default 0 - -Reconstructed video options (debugging): --r/--recon Reconstructed raw image YUV or Y4M output file name - --recon-depth Bit-depth of reconstructed raw image file. 
Defaults to input bit depth, or 8 if Y4M - - -Full documentation may be found at http://x265.readthedocs.org/en/default/cli.html diff --git a/debian/x265.1 b/debian/x265.1 deleted file mode 100644 index c388850..0000000 --- a/debian/x265.1 +++ /dev/null @@ -1,472 +0,0 @@ -.TH X265 "1" "AUGUST 2014" "v1.4" "User Commands" -.SH NAME -x265 \- H.265/HEVC video encoder - -.SH SYNOPSIS -.B x265 \fR[options] \fIinfile \fR[\-o] \fIoutfile -.br -Bit depth: 8 -.PP -.B x265\-10bit \fR[options] \fIinfile \fR[\-o] \fIoutfile -.br -Bit depth: 10 -.PP -infile can be YUV or Y4M -.br -outfile is raw HEVC bitstream - -.SH DESCRIPTION -Increasing demand for high definition and ultra\-high definition video, -along with an increasing desire for video on demand has led to -exponential growth in demand for bandwidth and storage requirements. -These challenges can be met by the new High Efficiency Video Coding -(HEVC) standard, also known as H.265. The x265 HEVC encoder project was -launched by MulticoreWare in 2013, aiming to provide the most efficient, -highest performance HEVC video encoder. -.SS About HEVC -The High Efficiency Video Coding (HEVC) was developed by the ISO/IEC -Moving Picture Experts Group (MPEG) and ITU\-T Video Coding Experts Group -(VCEG), through their Joint Collaborative Team on Video Coding (JCT\-VC). -HEVC is also known as ISO/IEC 23008\-2 MPEG\-H Part 2 and ITU\-T H.265. -HEVC provides superior video quality and up to twice the data -compression as the previous standard (H.264/MPEG\-4 AVC). HEVC can -support 8K Ultra High Definition video, with a picture size up to -8192x4320 pixels. -.SS About x265 -The primary objective of x265 is to become the best H.265/HEVC encoder -available anywhere, offering the highest compression efficiency and the -highest performance on a wide variety of hardware platforms. The x265 -encoder is available as an open source library, published under the -GPLv2 license. 
It is also available under a commercial license, enabling -commercial companies to utilize and distribute x265 in their solutions -without being subject to the restrictions of the GPL license. - -.SH OPTIONS -.TP -\fB\-h/\-\-help -Show this help text and exit -.TP -\fB\-V/\-\-version -Show version info and exit - -.SS "Output Options:" -.TP -\fB\-\-output -Bitstream output file name -.TP -\fB\-\-log\-level -Logging level: none error warning info debug full. Default info -.TP -\fB\-\-no\-progress -Disable CLI progress reports -.TP -\fB\-\-[no\-]cu\-stats -Enable logging stats about distribution of cu across all modes. Default disabled -.TP -\fB\-\-csv -Comma separated log file, log level >= 3 frame log, else one line per run - -.SS "Input Options:" -.TP -\fB\-\-input -Raw YUV or Y4M input file name. `\-` for stdin -.TP -\fB\-\-y4m -Force parsing of input stream as YUV4MPEG2 regardless of file extension -.TP -\fB\-\-fps -Source frame rate (float or num/denom), auto\-detected if Y4M -.TP -\fB\-\-input\-res WxH -Source picture size [w x h], auto\-detected if Y4M -.TP -\fB\-\-input\-depth -Bit\-depth of input file. Default 8 -.TP -\fB\-\-input\-csp -Source color space: i420, i444 or i422, auto\-detected if Y4M. Default: i420 -.TP -\fB\-f/\-\-frames -Maximum number of frames to encode. Default all -.TP -\fB\-\-seek -First frame to encode -.TP -\fB\-\-[no\-]interlace -Indicate input pictures are interlace fields in temporal order. Default progressive -.TP -\fB\-\-dither -Enable dither if downscaling to 8 bit pixels. Default disabled - -.SS "Quality reporting metrics:" -.TP -\fB\-\-[no\-]ssim -Enable reporting SSIM metric scores. Default disabled -.TP -\fB\-\-[no\-]psnr -Enable reporting PSNR metric scores. 
Default disabled - -.SS "Profile, Level, Tier:" -.TP -\fB\-\-profile -Enforce an encode profile: main, main10, mainstillpicture -.TP -\fB\-\-level\-idc -Force a minimum required decoder level (as '5.0' or '50') -.TP -\fB\-\-[no\-]high\-tier -If a decoder level is specified, this modifier selects High tier of that level - -.SS "Threading, performance:" -.TP -\fB\-\-threads -Number of threads for thread pool (0: detect CPU core count, default) -.TP -\fB\-F/\-\-frame\-threads -Number of concurrently encoded frames. 0: auto\-determined by core count -.TP -\fB\-\-[no\-]wpp -Enable Wavefront Parallel Processing. Default enabled -.TP -\fB\-\-[no\-]pmode -Parallel mode analysis. Default disabled -.TP -\fB\-\-[no\-]pme -Parallel motion estimation. Default disabled -.TP -\fB\-\-[no\-]asm -Override CPU detection. Default: auto - -.SS Presets: -.TP -\fB\-p/\-\-preset -Trade off performance for compression efficiency. Default medium -.br -ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo -.TP -\fB\-t/\-\-tune -Tune the settings for a particular type of source or situation: -.br -psnr, ssim, zerolatency, or fastdecode - -.SS "Quad-Tree size and depth:" -.TP -\fB\-s/\-\-ctu <64|32|16> -Maximum CU size (default: 64x64). Default 64 -.TP -\fB\-\-tu\-intra\-depth -Max TU recursive depth for intra CUs. Default 1 -.TP -\fB\-\-tu\-inter\-depth -Max TU recursive depth for inter CUs. Default 1 -.TP -\fB\-\-[no\-]rect -Enable rectangular motion partitions Nx2N and 2NxN. Default disabled -.TP -\fB\-\-[no\-]amp -Enable asymmetric motion partitions, requires \fB\-\-rect\fR. Default disabled -.TP -\fB\-\-rd <0..6> -Level of RD in mode decision 0:least....6:full RDO. Default 3 -.TP -\fB\-\-psy\-rd <0..2.0> -Strength of psycho\-visual rate distortion optimization, 0 to disable. Default 0.000000 -.TP -\fB\-\-psy\-rdoq <0..50.0> -Strength of psycho\-visual optimization in quantization, 0 to disable.
Default 0.000000 -.TP -\fB\-\-nr -An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled -.TP -\fB\-\-[no\-]tskip\-fast -Enable fast intra transform skipping. Default disabled -.TP -\fB\-\-[no\-]early\-skip -Enable early SKIP detection. Default disabled -.TP -\fB\-\-[no\-]fast\-cbf -Enable early outs based on whether residual is coded. Default disabled - -.SS "Coding tools:" -.TP -\fB\-w/\-\-[no\-]weightp -Enable weighted prediction in P slices. Default enabled -.TP -\fB\-\-[no\-]weightb -Enable weighted prediction in B slices. Default disabled -.TP -\fB\-\-[no\-]cu-lossless -Consider lossless mode in CU RDO decisions. Default disabled -.TP -\fB\-\-[no\-]signhide -Hide sign bit of one coeff per TU (rdo). Default enabled -.TP -\fB\-\-[no\-]tskip -Enable intra transform skipping. Default disabled - -.SS "Temporal / motion search options:" -.TP -\fB\-\-me -Motion search method dia hex umh star full. Default 1 -.TP -\fB\-m/\-\-subme -Amount of subpel refinement to perform (0:least .. 7:most). Default 2 -.TP -\fB\-\-merange -Motion search range. Default 57 -.TP -\fB\-\-max\-merge <1..5> -Maximum number of merge candidates. Default 2 -.TP -\fB\-\-[no\-]temporal\-mvp -Enable temporal MV predictors. Default enabled - -.SS "Spatial / intra options:" -.TP -\fB\-\-[no\-]strong\-intra\-smoothing -Enable strong intra smoothing for 32x32 blocks. Default enabled -.TP -\fB\-\-[no\-]constrained\-intra -Constrained intra prediction (use only intra coded reference pixels) Default disabled -.TP -\fB\-\-[no\-]b\-intra -Enable intra in B frames in veryslow presets. Default disabled -.TP -\fB\-\-[no\-]fast\-intra -Enable faster search method for angular intra predictions. Default disabled -.TP -\fB\-\-rdpenalty <0..2> -penalty for 32x32 intra TU in non\-I slices. 0:disabled 1:RD\-penalty 2:maximum. Default 0 - -.SS "Slice decision options:" -.TP -\fB\-\-[no\-]open\-gop -Enable open\-GOP, allows I slices to be non\-IDR. 
Default enabled -.TP -\fB\-I/\-\-keyint -Max IDR period in frames. \-1 for infinite\-gop. Default 250 -.TP -\fB\-i/\-\-min\-keyint -Scenecuts closer together than this are coded as I, not IDR. Default: auto -.TP -\fB\-\-no\-scenecut -Disable adaptive I\-frame decision -.TP -\fB\-\-scenecut -How aggressively to insert extra I\-frames. Default 40 -.TP -\fB\-\-rc\-lookahead -Number of frames for frame\-type lookahead (determines encoder latency) Default 20 -.TP -\fB\-\-bframes -Maximum number of consecutive b\-frames (now it only enables B GOP structure) Default 4 -.TP -\fB\-\-bframe\-bias -Bias towards B frame decisions. Default 0 -.TP -\fB\-\-b\-adapt <0..2> -0 \- none, 1 \- fast, 2 \- full (trellis) adaptive B frame scheduling. Default 2 -.TP -\fB\-\-[no\-]b\-pyramid -Use B\-frames as references. Default enabled -.TP -\fB\-\-ref -max number of L0 references to be allowed (1 .. 16) Default 3 -.TP -\fB\-\-qpfile -Force frametypes and QPs for some or all frames -.br -Format of each line: framenumber frametype QP -.br -QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b. -.br -QPs are restricted by qpmin/qpmax. -.PP - -.SS "Rate control, Quantization:" -.TP -\fB\-\-bitrate -Target bitrate (kbps) for ABR (implied). Default 0 -.TP -\fB\-\-crf -Quality\-based VBR (0\-51). Default 28.000000 -.TP -\fB\-\-[no\-]lossless -Enable lossless: bypass transform, quant and loop filters globally. Default disabled -.TP -\fB\-\-crf\-max -With CRF+VBV, limit RF to this value. Default 0.000000 -.br -May cause VBV underflows! -.TP -\fB\-\-crf\-min -With CRF+VBV, limit RF to this value. Default 0.000000 -.br -this specifies a minimum rate factor value for encode! -.TP -\fB\-\-vbv\-maxrate -Max local bitrate (kbit/s). Default 0 -.TP -\fB\-\-vbv\-bufsize -Set size of the VBV buffer (kbit). Default 0 -.TP -\fB\-\-vbv\-init -Initial VBV buffer occupancy (fraction of bufsize or in kbits). 
Default 0.900000 -.TP -\fB\-\-aq\-mode -Mode for Adaptive Quantization \- 0:none 1:uniform AQ 2:auto variance. Default 2 -.TP -\fB\-\-aq\-strength -Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default 1.000000 -.TP -\fB\-\-[no\-]cutree -Enable cutree for Adaptive Quantization. Default enabled -.TP -\fB\-\-ipratio -QP factor between I and P. Default 1.400000 -.TP -\fB\-\-pbratio -QP factor between P and B. Default 1.300000 -.TP -\fB\-\-cbqpoffs -Chroma Cb QP Offset. Default 0 -.TP -\fB\-\-crqpoffs -Chroma Cr QP Offset. Default 0 -.TP -\fB\-\-stats -Filename for stats file in multipass pass rate control. Default x265_2pass.log -.TP -\fB\-\-pass -Multi pass rate control. -.br - \- 1 : First pass, creates stats file -.br - \- 2 : Last pass, does not overwrite stats file -.br - \- 3 : Nth pass, overwrites stats file -.TP -\fB\-\-[no\-]slow\-firstpass -Enable a slow first pass in a multipass rate control mode. Default disabled -.TP -\fB\-\-analysis\-mode -save \- Dump analysis info into file, load \- Load analysis buffers from the file. Default 0 -.TP -\fB\-\-analysis-file -Specify file name used for either dumping or reading analysis data. -.TP -\fB\-\-scaling\-list -Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off -.TP -\fB\-\-lambda\-file -Specify a file containing replacement values for the lambda tables -.br -MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table -.br -Blank lines and lines starting with hash(#) are ignored -.br -Comma is considered to be white-space - -.SS "Loop filters (deblock and SAO):" -.TP -\fB\-\-[no\-]lft -Enable Deblocking Loop Filter. Default enabled -.TP -\fB\-\-[no\-]sao -Enable Sample Adaptive Offset. Default enabled -.TP -\fB\-\-[no\-]sao\-non\-deblock -Use non\-deblocked pixels, else right/bottom boundary areas skipped. Default disabled - -.SS "VUI options:" -.TP -\fB\-\-sar -Sample Aspect Ratio, the ratio of width to height of an individual pixel. 
-.br -Choose from 0=undef, 1=1:1("square"), 2=12:11, 3=10:11, 4=16:11, -5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11, -12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of . Default 0 -.TP -\fB\-\-crop\-rect -Add 'left,top,right,bottom' to the bitstream\-level cropping rectangle -.TP -\fB\-\-overscan -Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef -.TP -\fB\-\-videoformat -Specify video format from undef, component, pal, ntsc, secam, mac. Default undef -.TP -\fB\-\-range -Specify black level and range of luma and chroma signals as full or limited Default limited -.TP -\fB\-\-colorprim -Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m, -smpte240m, film, bt2020. Default undef -.TP -\fB\-\-transfer -Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m, -smpte240m, linear, log100, log316, iec61966\-2\-4, bt1361e, iec61966\-2\-1, -bt2020\-10, bt2020\-12. Default undef -.TP -\fB\-\-colormatrix -Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m, -smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef -.TP -\fB\-\-chromaloc -Specify chroma sample location (0 to 5). Default of 0 - -.SS "Bitstream options:" -.TP -\fB\-\-[no\-]info -Emit SEI identifying encoder and parameters. Default enabled -.TP -\fB\-\-[no\-]aud -Emit access unit delimiters at the start of each access unit. Default disabled -.TP -\fB\-\-[no\-]hrd -Enable HRD parameters signalling. Default disabled -.TP -\fB\-\-[no\-]repeat\-headers -Emit SPS and PPS headers at each keyframe. Default disabled -.TP -\fB\-\-hash -Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default 0 - -.SS "Reconstructed video options (debugging):" -.TP -\fB\-r/\-\-recon -Reconstructed raw image YUV or Y4M output file name -.TP -\fB\-\-recon\-depth -Bit\-depth of reconstructed raw image file. 
Defaults to input bit depth, or 8 if Y4M -.SH COPYRIGHT -Copyright \(co 2013\-2014 MulticoreWare, Inc. -.PP -The x265 software is owned and copyrighted by MulticoreWare, Inc. -MulticoreWare is committed to offering the x265 software under the GNU -GPL v2 license. Companies who do not wish to integrate the x265 -Software in their products under the terms of the GPL license can -contact MulticoreWare (\fIlicense@x265.com\fR) to obtain a commercial -license agreement. Companies who use x265 under the GPL may also wish -to work with MulticoreWare to accelerate the development of specific -features or optimized support for specific hardware or software -platforms, or to contract for support. -.PP -The GNU GPL v2 license or the x265 commercial license agreement govern -your rights to access the copyrighted x265 software source code, but do -not cover any patents that may be applicable to the function of binary -executable software created from the x265 source code. You are -responsible for understanding the laws in your country, and for -licensing all applicable patent rights needed for use or distribution of -software applications created from the x265 source code. A good place -to start is with the Motion Picture Experts Group \- Licensing Authority -\- HEVC Licensing Program. -.PP -x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is -a trademark of MulticoreWare, and may only be used with explicit written -permission. All rights reserved. 
- -.SH "SEE ALSO" -.TP -\fIhttp://x265.readthedocs.org/en/default/cli.html\fR -online documentation diff --git a/debian/x265.install b/debian/x265.install deleted file mode 100644 index 3500295..0000000 --- a/debian/x265.install +++ /dev/null @@ -1,2 +0,0 @@ -usr/bin -usr/lib/*/x265-10bit/x265 diff --git a/debian/x265.links b/debian/x265.links deleted file mode 100644 index 740a041..0000000 --- a/debian/x265.links +++ /dev/null @@ -1,3 +0,0 @@ -/usr/bin/x265-10bit /usr/bin/x265-16bit -/usr/share/man/man1/x265.1.gz /usr/share/man/man1/x265-10bit.1.gz -/usr/share/man/man1/x265.1.gz /usr/share/man/man1/x265-16bit.1.gz diff --git a/debian/x265.manpages b/debian/x265.manpages deleted file mode 100644 index 300fc8f..0000000 --- a/debian/x265.manpages +++ /dev/null @@ -1 +0,0 @@ -debian/x265.1 diff --git a/doc/reST/Makefile b/doc/reST/Makefile index 6b1d44c..b2d1c3d 100644 --- a/doc/reST/Makefile +++ b/doc/reST/Makefile @@ -13,7 +13,7 @@ PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -.PHONY: help clean html web pickle htmlhelp qthelp qhc latex changes linkcheck +.PHONY: help clean distclean html web pickle htmlhelp qthelp qhc latex changes linkcheck help: @echo "Please use \`make ' where is one of" @@ -24,12 +24,16 @@ help: @echo " qthelp to make HTML files and a qthelp project" @echo " qhc to make QHC file" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " man to make manpages" @echo " changes to make an overview over all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" clean: -rm -rf build/* +distclean: clean + -rmdir build/ + html: mkdir -p build/html build/doctrees $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html @@ -83,6 +87,14 @@ latex: @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ "run these through (pdf)latex." 
+man: + mkdir -p build/man build/doctrees + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) build/man + @echo + @echo "Build finished; the manpages are in build/man." + @echo "Run \`man -l build/man/x265.1' or \`man -l build/man/libx265.3'" \ + "to view them." + changes: mkdir -p build/changes build/doctrees $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes diff --git a/doc/reST/api.rst b/doc/reST/api.rst index 214881a..f15212d 100644 --- a/doc/reST/api.rst +++ b/doc/reST/api.rst @@ -32,6 +32,12 @@ library was compiled (it will contain a value of 8 or 10). Further, x265 which was compiled, and **x265_build_info_str** is a pointer to a string identifying the compiler and build options. +.. Note:: + + **x265_version_str** is only updated when **cmake** runs. If you are + making binaries for others to use, it is recommended to run + **cmake** prior to **make** in your build scripts. + x265 will accept input pixels of any depth between 8 and 16 bits regardless of the depth of its internal pixels (8 or 10). It will shift and mask input pixels as required to reach the internal depth. If diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 324b83a..8740c8e 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -30,8 +30,8 @@ Generally, when an option expects a string value from a list of strings the user may specify the integer ordinal of the value they desire. ie: :option:`--log-level` 3 is equivalent to :option:`--log-level` debug. -Standalone Executable Options -============================= +Executable Options +================== .. option:: --help, -h @@ -45,6 +45,59 @@ Standalone Executable Options **CLI ONLY** +Logging/Statistic Options +========================= + +.. option:: --log-level + + Logging level. Debug level enables per-frame QP, metric, and bitrate + logging. If a CSV file is being generated, debug level makes the log + be per-frame rather than per-encode. Full level enables hash and + weight logging. 
-1 disables all logging, except certain fatal + errors, and can be specified by the string "none". + + 0. error + 1. warning + 2. info **(default)** + 3. debug + 4. full + +.. option:: --no-progress + + Disable periodic progress reports from the CLI + + **CLI ONLY** + +.. option:: --csv + + Writes encoding results to a comma separated value log file. Creates + the file if it doesn't already exist, else adds one line per run. If + :option:`--log-level` is debug or above, it writes one line per + frame. Default none + +.. option:: --cu-stats, --no-cu-stats + + Records statistics on how each CU was coded (split depths and other + mode decisions) and reports those statistics at the end of the + encode. Default disabled + +.. option:: --ssim, --no-ssim + + Calculate and report Structural Similarity values. It is + recommended to use :option:`--tune` ssim if you are measuring ssim, + else the results should not be used for comparison purposes. + Default disabled + +.. option:: --psnr, --no-psnr + + Calculate and report Peak Signal to Noise Ratio. It is recommended + to use :option:`--tune` psnr if you are measuring PSNR, else the + results should not be used for comparison purposes. Default + disabled + +Performance Options +=================== + .. option:: --asm , --no-asm x265 will use all detected CPU SIMD architectures by default. You can @@ -57,13 +110,24 @@ Standalone Executable Options One may also directly supply the CPU capability bitmap as an integer. +.. option:: --frame-threads, -F + + Number of concurrently encoded frames. Using a single frame thread + gives a slight improvement in compression, since the entire reference + frames are always available for motion compensation, but it has + severe performance implications. Default is an autodetected count + based on the number of CPU cores and whether WPP is enabled or not. + + Over-allocation of frame threads will not improve performance, it + will generally just increase memory use. + ..
option:: --threads Number of threads to allocate for the worker thread pool This pool is used for WPP and for distributed analysis and motion search: :option:`--wpp` :option:`--pmode` and :option:`--pme` respectively. - If :option:`--threads`=1 is specified, then no thread pool is + If :option:`--threads` 1 is specified, then no thread pool is created. When no thread pool is created, all the thread pool features are implicitly disabled. If all the pool features are disabled by the user, then the pool is implicitly disabled. @@ -71,13 +135,24 @@ Standalone Executable Options Default 0, one thread is allocated per detected hardware thread (logical CPU cores) +.. option:: --wpp, --no-wpp + + Enable Wavefront Parallel Processing. The encoder may begin encoding + a row as soon as the row above it is at least two CTUs ahead in the + encode process. This gives a 3-5x gain in parallelism for about 1% + overhead in compression efficiency. + + This feature is implicitly disabled when no thread pool is present. + + Default: Enabled + .. option:: --pmode, --no-pmode Parallel mode decision, or distributed mode analysis. When enabled the encoder will distribute the analysis work of each CU (merge, inter, intra) across multiple worker threads. Only recommended if x265 is not already saturating the CPU cores. In RD levels 3 and 4 - it will be most effective if --rect was enabled. At RD levels 5 and + it will be most effective if --rect is enabled. At RD levels 5 and 6 there is generally always enough work to distribute to warrant the overhead, assuming your CPUs are not already saturated. @@ -85,7 +160,8 @@ Standalone Executable Options efficiency. In fact, since the modes are all measured in parallel it makes certain early-outs impractical and thus you usually get slightly better compression when it is enabled (at the expense of - not skipping improbable modes). + not skipping improbable modes). 
This bypassing of early-outs can + cause pmode to slow down encodes, especially at faster presets. This feature is implicitly disabled when no thread pool is present. @@ -113,7 +189,8 @@ Standalone Executable Options Sets parameters to preselected values, trading off compression efficiency against encoding speed. These parameters are applied before all other input parameters are - applied, and so you can override any parameters that these values control. + applied, and so you can override any parameters that these values control. See + :ref:`presets ` for more detail. 0. ultrafast 1. superfast @@ -129,84 +206,18 @@ Standalone Executable Options .. option:: --tune, -t Tune the settings for a particular type of source or situation. The changes will - be applied after :option:`--preset` but before all other parameters. Default none - - **Values:** psnr, ssim, zero-latency, fast-decode. - -.. option:: --frame-threads, -F - - Number of concurrently encoded frames. Using a single frame thread - gives a slight improvement in compression, since the entire reference - frames are always available for motion compensation, but it has - severe performance implications. Default is an autodetected count - based on the number of CPU cores and whether WPP is enabled or not. - - Over-allocation of frame threads will not improve performance, it - will generally just increase memory use. - -.. option:: --log-level - - Logging level. Debug level enables per-frame QP, metric, and bitrate - logging. If a CSV file is being generated, debug level makes the log - be per-frame rather than per-encode. Full level enables hash and - weight logging. -1 disables all logging, except certain fatal - errors, and can be specified by the string "none". - - 0. error - 1. warning - 2. info **(default)** - 3. debug - 4. full - -.. option:: --csv - - Writes encoding results to a comma separated value log file. Creates - the file if it doesnt already exist, else adds one line per run. 
if - :option:`--log-level` is debug or above, it writes one line per - frame. Default none - -.. option:: --cu-stats, --no-cu-stats - - Records statistics on how each CU was coded (split depths and other - mode decisions) and reports those statistics at the end of the - encode. Default disabled - -.. option:: --output, -o - - Bitstream output file name. If there are two extra CLI options, the - first is implicitly the input filename and the second is the output - filename, making the :option:`--output` option optional. - - The output file will always contain a raw HEVC bitstream, the CLI - does not support any container file formats. - - **CLI ONLY** - -.. option:: --no-progress + be applied after :option:`--preset` but before all other parameters. Default none. + See :ref:`tunings ` for more detail. - Disable CLI periodic progress reports + **Values:** psnr, ssim, grain, zero-latency, fast-decode, cbr. - **CLI ONLY** - -Quality reporting metrics +Input/Output File Options ========================= -.. option:: --ssim, --no-ssim - - Calculate and report Structural Similarity values. It is - recommended to use :option:`--tune` ssim if you are measuring ssim, - else the results should not be used for comparison purposes. - Default disabled - -.. option:: --psnr, --no-psnr - - Calculate and report Peak Signal to Noise Ratio. It is recommended - to use :option:`--tune` psnr if you are measuring PSNR, else the - results should not be used for comparison purposes. Default - disabled - -Input Options -============= +These options all describe the input video sequence or, in the case of +:option:`--dither`, operations that are performed on the sequence prior +to encode. All options dealing with files (names, formats, offsets or +frame counts) are only applicable to the CLI application. .. option:: --input @@ -242,21 +253,6 @@ Input Options **CLI ONLY** -.. 
option:: --nr - - Noise reduction - an adaptive deadzone applied after DCT - (subtracting from DCT coefficients), before quantization, on inter - blocks. It does no pixel-level filtering, doesn't cross DCT block - boundaries, has no overlap, doesn't affect intra blocks. The higher - the strength value parameter, the more aggressively it will reduce - noise. - - Enabling noise reduction will make outputs diverge between different - numbers of frame threads. Outputs will be deterministic but the - outputs of -F2 will no longer match the outputs of -F3, etc. - - **Values:** any value in range of 100 to 1000. Default disabled. - .. option:: --input-res YUV only: Source picture size [w x h] @@ -285,8 +281,6 @@ Input Options .. option:: --interlaceMode , --no-interlaceMode - **EXPERIMENTAL** Specify interlace type of source pictures. - 0. progressive pictures **(default)** 1. top field first 2. bottom field first @@ -305,61 +299,20 @@ Input Options .. option:: --frames, -f - Number of frames to be encoded. Default 0 (all) + Number of frames of input sequence to be encoded. Default 0 (all) **CLI ONLY** -.. option:: --qpfile - - Specify a text file which contains frametypes and QPs for some or - all frames. The format of each line is: - - framenumber frametype QP - - Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame, - **b** is an unreferenced B frame. **I** is a keyframe (random - access point) while **i** is a I frame that is not a keyframe - (references are not broken). - - Specifying QP (integer) is optional, and if specified they are - clamped within the encoder to qpmin/qpmax. - -.. option:: --scaling-list - - Quantization scaling lists. HEVC supports 6 quantization scaling - lists to be defined; one each for Y, Cb, Cr for intra prediction and - one each for inter prediction. - - x265 does not use scaling lists by default, but this can also be - made explicit by :option:`--scaling-list` *off*. 
- - HEVC specifies a default set of scaling lists which may be enabled - without requiring them to be signaled in the SPS. Those scaling - lists can be enabled via :option:`--scaling-list` *default*. - - All other strings indicate a filename containing custom scaling - lists in the HM format. The encode will abort if the file is not - parsed correctly. Custom lists must be signaled in the SPS +.. option:: --output, -o -.. option:: --lambda-file + Bitstream output file name. If there are two extra CLI options, the + first is implicitly the input filename and the second is the output + filename, making the :option:`--output` option optional. - Specify a text file containing values for x265_lambda_tab and - x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float - values. - - The text file syntax is simple. Comma is considered to be - white-space. All white-space is ignored. Lines must be less than 2k - bytes in length. Content following hash (#) characters are ignored. - The values read from the file are logged at :option:`--log-level` - debug. + The output file will always contain a raw HEVC bitstream, the CLI + does not support any container file formats. - Note that the lambda tables are process-global and so the new values - affect all encoders running in the same process. - - Lambda values affect encoder mode decisions, the lower the lambda - the more bits it will try to spend on signaling information (motion - vectors and splits) and less on residual. This feature is intended - for experimentation. + **CLI ONLY** Profile, Level, Tier ==================== @@ -417,15 +370,41 @@ Profile, Level, Tier parameters to meet those requirements but it will never raise them. -Quad-Tree analysis -================== +Mode decision / Analysis +======================== -.. option:: --wpp, --no-wpp +.. option:: --rd <0..6> - Enable Wavefront Parallel Processing. 
The encoder may begin encoding - a row as soon as the row above it is at least two CTUs ahead in the - encode process. This gives a 3-5x gain in parallelism for about 1% - overhead in compression efficiency. Default: Enabled + Level of RDO in mode decision. The higher the value, the more + exhaustive the analysis and the more rate distortion optimization is + used. The lower the value the faster the encode, the higher the + value the smaller the bitstream (in general). Default 3 + + Note that this table aims for accuracy, but is not necessarily our + final target behavior for each mode. + + +-------+---------------------------------------------------------------+ + | Level | Description | + +=======+===============================================================+ + | 0 | sa8d mode and split decisions, intra w/ source pixels | + +-------+---------------------------------------------------------------+ + | 1 | recon generated (better intra), RDO merge/skip selection | + +-------+---------------------------------------------------------------+ + | 2 | RDO splits and merge/skip selection | + +-------+---------------------------------------------------------------+ + | 3 | RDO mode and split decisions, chroma residual used for sa8d | + +-------+---------------------------------------------------------------+ + | 4 | Adds RDO Quant | + +-------+---------------------------------------------------------------+ + | 5 | Adds RDO prediction decisions | + +-------+---------------------------------------------------------------+ + | 6 | Currently same as 5 | + +-------+---------------------------------------------------------------+ + + **Range of values:** 0: least .. 6: full RDO analysis + +Options which affect the coding unit quad-tree, sometimes referred to as +the prediction quad-tree. .. option:: --ctu, -s <64|32|16> @@ -436,6 +415,108 @@ Quad-Tree analysis and less frame parallelism as well. Because of this the faster presets use a CU size of 32. Default: 64 +.. 
option:: --rect, --no-rect + + Enable analysis of rectangular motion partitions Nx2N and 2NxN + (50/50 splits, two directions). Default disabled + +.. option:: --amp, --no-amp + + Enable analysis of asymmetric motion partitions (75/25 splits, four + directions). At RD levels 0 through 4, AMP partitions are only + considered at CU sizes 32x32 and below. At RD levels 5 and 6, it + will only consider AMP partitions as merge candidates (no motion + search) at 64x64, and as merge or inter candidates below 64x64. + + The AMP partitions which are searched are derived from the current + best inter partition. If Nx2N (vertical rectangular) is the best + current prediction, then left and right asymmetrical splits will be + evaluated. If 2NxN (horizontal rectangular) is the best current + prediction, then top and bottom asymmetrical splits will be + evaluated. If 2Nx2N is the best prediction, and the block is not a + merge/skip, then all four AMP partitions are evaluated. + + This setting has no effect if rectangular partitions are disabled. + Default disabled + +.. option:: --early-skip, --no-early-skip + + Measure full CU size (2Nx2N) merge candidates first; if no residual + is found the analysis is short circuited. Default disabled + +.. option:: --fast-cbf, --no-fast-cbf + + Short circuit analysis if a prediction is found that does not set + the coded block flag (aka: no residual was encoded). It prevents + the encoder from perhaps finding other predictions that also have no + residual but require less signaling bits or have less distortion. + Only applicable for RD levels 5 and 6. Default disabled + +.. option:: --fast-intra, --no-fast-intra + + Perform an initial scan of every fifth intra angular mode, then + check modes +/- 2 distance from the best mode, then +/- 1 distance + from the best mode, effectively performing a gradient descent. When + enabled 10 modes in total are checked. When disabled all 33 angular + modes are checked.
Only applicable for :option:`--rd` levels 4 and + below (medium preset and faster). + +.. option:: --b-intra, --no-b-intra + + Enables the evaluation of intra modes in B slices. Default disabled. + +.. option:: --cu-lossless, --no-cu-lossless + + For each CU, evaluate lossless (transform and quant bypass) encode + of the best non-lossless mode option as a potential rate distortion + optimization. If the global option :option:`--lossless` has been + specified, all CUs will be encoded as lossless unconditionally + regardless of whether this option was enabled. Default disabled. + + Only effective at RD levels 3 and above, which perform RDO mode + decisions. + +.. option:: --tskip, --no-tskip + + Enable evaluation of transform skip (bypass DCT but still use + quantization) coding for 4x4 TU coded blocks. + + Only effective at RD levels 3 and above, which perform RDO mode + decisions. Default disabled + +.. option:: --tskip-fast, --no-tskip-fast + + Only evaluate transform skip for NxN intra predictions (4x4 blocks). + Only applicable if transform skip is enabled. For chroma, only + evaluate if luma used tskip. Inter block tskip analysis is + unmodified. Default disabled + +Analysis re-use options, to improve performance when encoding the same +sequence multiple times (presumably at varying bitrates). The encoder +will not reuse analysis if the resolution and slice type parameters do +not match. + +.. option:: --analysis-mode + + Specify whether analysis information of each frame is output by encoder + or input for reuse. By reading the analysis data written by an + earlier encode of the same sequence, substantial redundant work may + be avoided. + + The following data may be stored and reused: + I frames - split decisions and luma intra directions of all CUs. + P/B frames - motion vectors are dumped at each depth for all CUs. + + **Values:** off(0), save(1): dump analysis data, load(2): read analysis data + +..
option:: --analysis-file + + Specify a filename for analysis data (see :option:`--analysis-mode`) + If no filename is specified, x265_analysis.dat is used. + +Options which affect the transform unit quad-tree, sometimes referred to +as the residual quad-tree (RQT). + .. option:: --tu-intra-depth <1..4> The transform unit (residual) quad-tree begins with the same depth @@ -504,147 +585,44 @@ Temporal / motion search options | 5 | 1 | 8 | 1 | 8 | true | +----+------------+-----------+------------+-----------+-----------+ | 6 | 2 | 8 | 1 | 8 | true | - +----+------------+-----------+------------+-----------+-----------+ - | 7 | 2 | 8 | 2 | 8 | true | - +----+------------+-----------+------------+-----------+-----------+ - -.. option:: --merange - - Motion search range. Default 57 - - The default is derived from the default CTU size (64) minus the luma - interpolation half-length (4) minus maximum subpel distance (2) - minus one extra pixel just in case the hex search method is used. If - the search range were any larger than this, another CTU row of - latency would be required for reference frames. - - **Range of values:** an integer from 0 to 32768 - -.. option:: --max-merge <1..5> - - Maximum number of neighbor (spatial and temporal) candidate blocks - that the encoder may consider for merging motion predictions. If a - merge candidate results in no residual, it is immediately selected - as a "skip". Otherwise the merge candidates are tested as part of - motion estimation when searching for the least cost inter option. - The max candidate number is encoded in the SPS and determines the - bit cost of signaling merge CUs. Default 2 - -.. option:: --temporal-mvp, --no-temporal-mvp - - Enable temporal motion vector predictors in P and B slices. - This enables the use of the motion vector from the collocated block - in the previous frame to be used as a predictor. Default is enabled - -Spatial/intra options -===================== - -.. 
option:: --rdpenalty <0..2> - - When set to 1, transform units of size 32x32 are given a 4x bit cost - penalty compared to smaller transform units, in intra coded CUs in P - or B slices. - - When set to 2, transform units of size 32x32 are not even attempted, - unless otherwise required by the maximum recursion depth. For this - option to be effective with 32x32 intra CUs, - :option:`--tu-intra-depth` must be at least 2. For it to be - effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be - at least 3. - - Note that in HEVC an intra transform unit (a block of the residual - quad-tree) is also a prediction unit, meaning that the intra - prediction signal is generated for each TU block, the residual - subtracted and then coded. The coding unit simply provides the - prediction modes that will be used when predicting all of the - transform units within the CU. This means that when you prevent - 32x32 intra transform units, you are preventing 32x32 intra - predictions. - - Default 0, disabled. - - **Values:** 0:disabled 1:4x cost penalty 2:force splits - -.. option:: --b-intra, --no-b-intra - - Enables the evaluation of intra modes in B slices. Default disabled. - -.. option:: --tskip, --no-tskip - - Enable evaluation of transform skip (bypass DCT but still use - quantization) coding for 4x4 TU coded blocks. - - Only effective at RD levels 3 and above, which perform RDO mode - decisions. Default disabled - -.. option:: --tskip-fast, --no-tskip-fast - - Only evaluate transform skip for NxN intra predictions (4x4 blocks). - Only applicable if transform skip is enabled. For chroma, only - evaluate if luma used tskip. Inter block tskip analysis is - unmodified. Default disabled - -.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing - - Enable strong intra smoothing for 32x32 intra blocks. Default enabled - -.. option:: --constrained-intra, --no-constrained-intra - - Constrained intra prediction. 
When generating intra predictions for - blocks in inter slices, only intra-coded reference pixels are used. - Inter-coded reference pixels are replaced with intra-coded neighbor - pixels or default values. The general idea is to block the - propagation of reference errors that may have resulted from lossy - signals. Default disabled - -Mode decision / Analysis -======================== - -.. option:: --rect, --no-rect - - Enable analysis of rectangular motion partitions Nx2N and 2NxN - (50/50 splits, two directions). Default disabled - -.. option:: --amp, --no-amp - - Enable analysis of asymmetric motion partitions (75/25 splits, four - directions). At RD levels 0 through 4, AMP partitions are only - considered at CU sizes 32x32 and below. At RD levels 5 and 6, it - will only consider AMP partitions as merge candidates (no motion - search) at 64x64, and as merge or inter candidates below 64x64. - - The AMP partitions which are searched are derived from the current - best inter partition. If Nx2N (vertical rectangular) is the best - current prediction, then left and right asymmetrical splits will be - evaluated. If 2NxN (horizontal rectangular) is the best current - prediction, then top and bottom asymmetrical splits will be - evaluated, If 2Nx2N is the best prediction, and the block is not a - merge/skip, then all four AMP partitions are evaluated. + +----+------------+-----------+------------+-----------+-----------+ + | 7 | 2 | 8 | 2 | 8 | true | + +----+------------+-----------+------------+-----------+-----------+ - This setting has no effect if rectangular partitions are disabled. - Default disabled + At --subme values larger than 2, chroma residual cost is included + in all subpel refinement steps and chroma residual is included in + all motion estimation decisions (selecting the best reference + picture in each list, and choosing between merge, uni-directional + motion and bi-directional motion). 
The 'slow' preset is the first + preset to enable the use of chroma residual. -.. option:: --early-skip, --no-early-skip +.. option:: --merange - Measure full CU size (2Nx2N) merge candidates first; if no residual - is found the analysis is short circuited. Default disabled + Motion search range. Default 57 -.. option:: --fast-cbf, --no-fast-cbf + The default is derived from the default CTU size (64) minus the luma + interpolation half-length (4) minus maximum subpel distance (2) + minus one extra pixel just in case the hex search method is used. If + the search range were any larger than this, another CTU row of + latency would be required for reference frames. - Short circuit analysis if a prediction is found that does not set - the coded block flag (aka: no residual was encoded). It prevents - the encoder from perhaps finding other predictions that also have no - residual but require less signaling bits or have less distortion. - Only applicable for RD levels 5 and 6. Default disabled + **Range of values:** an integer from 0 to 32768 -.. option:: --fast-intra, --no-fast-intra +.. option:: --max-merge <1..5> - Perform an initial scan of every fifth intra angular mode, then - check modes +/- 2 distance from the best mode, then +/- 1 distance - from the best mode, effectively performing a gradient descent. When - enabled 10 modes in total are checked. When disabled all 33 angular - modes are checked. Only applicable for :option:`--rd` levels 3 and - below (medium preset and faster). + Maximum number of neighbor (spatial and temporal) candidate blocks + that the encoder may consider for merging motion predictions. If a + merge candidate results in no residual, it is immediately selected + as a "skip". Otherwise the merge candidates are tested as part of + motion estimation when searching for the least cost inter option. + The max candidate number is encoded in the SPS and determines the + bit cost of signaling merge CUs. Default 2 + +.. 
option:: --temporal-mvp, --no-temporal-mvp + + Enable temporal motion vector predictors in P and B slices. + This enables the use of the motion vector from the collocated block + in the previous frame to be used as a predictor. Default is enabled .. option:: --weightp, -w, --no-weightp @@ -660,54 +638,48 @@ Mode decision / Analysis Enable weighted prediction in B slices. Default disabled -.. option:: --rd <0..6> +Spatial/intra options +===================== - Level of RDO in mode decision. The higher the value, the more - exhaustive the analysis and the more rate distortion optimization is - used. The lower the value the faster the encode, the higher the - value the smaller the bitstream (in general). Default 3 +.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing - Note that this table aims for accuracy, but is not necessarily our - final target behavior for each mode. + Enable strong intra smoothing for 32x32 intra blocks. Default enabled - +-------+---------------------------------------------------------------+ - | Level | Description | - +=======+===============================================================+ - | 0 | sa8d mode and split decisions, intra w/ source pixels | - +-------+---------------------------------------------------------------+ - | 1 | recon generated (better intra), RDO merge/skip selection | - +-------+---------------------------------------------------------------+ - | 2 | RDO splits and merge/skip selection | - +-------+---------------------------------------------------------------+ - | 3 | RDO mode and split decisions | - +-------+---------------------------------------------------------------+ - | 4 | Adds RDO Quant | - +-------+---------------------------------------------------------------+ - | 5 | Adds RDO prediction decisions | - +-------+---------------------------------------------------------------+ - | 6 | Currently same as 5 | - +-------+---------------------------------------------------------------+ +.. 
option:: --constrained-intra, --no-constrained-intra - **Range of values:** 0: least .. 6: full RDO analysis + Constrained intra prediction. When generating intra predictions for + blocks in inter slices, only intra-coded reference pixels are used. + Inter-coded reference pixels are replaced with intra-coded neighbor + pixels or default values. The general idea is to block the + propagation of reference errors that may have resulted from lossy + signals. Default disabled -.. option:: --cu-lossless, --no-cu-lossless +.. option:: --rdpenalty <0..2> - For each CU, evaluate lossless (transform and quant bypass) encode - of the best non-lossless mode option as a potential rate distortion - optimization. If the global option :option:`--lossless` has been - specified, all CUs will be encoded as lossless unconditionally - regardless of whether this option was enabled. Default disabled. + When set to 1, transform units of size 32x32 are given a 4x bit cost + penalty compared to smaller transform units, in intra coded CUs in P + or B slices. - Only effective at RD levels 3 and above, which perform RDO mode - decisions. + When set to 2, transform units of size 32x32 are not even attempted, + unless otherwise required by the maximum recursion depth. For this + option to be effective with 32x32 intra CUs, + :option:`--tu-intra-depth` must be at least 2. For it to be + effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be + at least 3. -.. option:: --signhide, --no-signhide + Note that in HEVC an intra transform unit (a block of the residual + quad-tree) is also a prediction unit, meaning that the intra + prediction signal is generated for each TU block, the residual + subtracted and then coded. The coding unit simply provides the + prediction modes that will be used when predicting all of the + transform units within the CU. This means that when you prevent + 32x32 intra transform units, you are preventing 32x32 intra + predictions. + + Default 0, disabled. 
+ + **Values:** 0:disabled 1:4x cost penalty 2:force splits - Hide sign bit of one coeff per TU (rdo). The last sign is implied. - This requires analyzing all the coefficients to determine if a sign - must be toggled, and then to determine which one can be toggled with - the least amount of distortion. Default enabled - Psycho-visual options ===================== @@ -753,8 +725,16 @@ quality and begin introducing artifacts and increase bitrate, which may force rate control to increase global QP. Finding the optimal psycho-visual parameters for a given video requires experimentation. Our recommended defaults (1.0 for both) are generally on the low end of the -spectrum. And generally the lower the bitrate, the lower the optimal -psycho-visual settings. +spectrum. + +The lower the bitrate, the lower the optimal psycho-visual settings. If +the bitrate is too low for the psycho-visual settings, you will begin to +see temporal artifacts (motion judder). This is caused when the encoder +is forced to code skip blocks (no residual) in areas of difficult motion +because it is the best option psycho-visually (they have great amounts +of energy and no residual cost). One can lower psy-rd settings when +judder is happening, and allow the encoder to use some blur in these +areas of high motion. .. option:: --psy-rd @@ -880,9 +860,7 @@ Quality, rate control and rate distortion options .. option:: --crf-min <0..51.0> Specify an lower limit to the rate factor which may be assigned to - any given frame (ensuring a min QP). This is dangerous when CRF is - used in combination with VBV as it may result in buffer underruns. - Default disabled + any given frame (ensuring a min compression factor). .. option:: --vbv-bufsize @@ -904,8 +882,8 @@ Quality, rate control and rate distortion options between 0 and 1, or in kbits. 
In other words these two option pairs are equivalent:: - :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900 - :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9 + --vbv-bufsize 1000 --vbv-init 900 + --vbv-bufsize 1000 --vbv-init 0.9 Default 0.9 @@ -923,18 +901,6 @@ Quality, rate control and rate distortion options **Range of values:** an integer from 0 to 51 -.. option:: --ipratio - - QP ratio factor between I and P slices. This ratio is used in all of - the rate control modes. Some :option:`--tune` options may change the - default value. It is not typically manually specified. Default 1.4 - -.. option:: --pbratio - - QP ratio factor between P and B slices. This ratio is used in all of - the rate control modes. Some :option:`--tune` options may change the - default value. It is not typically manually specified. Default 1.3 - .. option:: --lossless, --no-lossless Enables true lossless coding by bypassing scaling, transform, @@ -954,8 +920,8 @@ Quality, rate control and rate distortion options and not enough in flat areas. 0. disabled - 1. AQ enabled - 2. AQ enabled with auto-variance **(default)** + 1. AQ enabled **(default)** + 2. AQ enabled with auto-variance .. option:: --aq-strength @@ -974,25 +940,23 @@ Quality, rate control and rate distortion options less bits. This tends to improve detail in the backgrounds of video with less detail in areas of high motion. Default enabled -.. option:: --cbqpoffs - - Offset of Cb chroma QP from the luma QP selected by rate control. - This is a general way to spend more or less bits on the chroma - channel. Default 0 - - **Range of values:** -12 to 12 +.. option:: --nr-intra , --nr-inter -.. option:: --crqpoffs + Noise reduction - an adaptive deadzone applied after DCT + (subtracting from DCT coefficients), before quantization. It does + no pixel-level filtering, doesn't cross DCT block boundaries, has no + overlap. The higher the strength value parameter, the more + aggressively it will reduce noise. 
- Offset of Cr chroma QP from the luma QP selected by rate control. - This is a general way to spend more or less bits on the chroma - channel. Default 0 + Enabling noise reduction will make outputs diverge between different + numbers of frame threads. Outputs will be deterministic but the + outputs of -F2 will no longer match the outputs of -F3, etc. - **Range of values:** -12 to 12 + **Values:** any value in range of 0 to 2000. Default 0 (disabled). .. option:: --pass - Enable multipass rate control mode. Input is encoded multiple times, + Enable multi-pass rate control mode. Input is encoded multiple times, storing the encoded information of each pass in a stats file from which the consecutive pass tunes the qp of each frame to improve the quality of the output. Default disabled @@ -1003,12 +967,17 @@ Quality, rate control and rate distortion options **Range of values:** 1 to 3 +.. option:: --stats + + Specify file name of the multi-pass stats file. If unspecified + the encoder will use x265_2pass.log + .. option:: --slow-firstpass, --no-slow-firstpass - Enable a slow and more detailed first pass encode in Multipass rate + Enable a slow and more detailed first pass encode in multi-pass rate control mode. Speed of the first pass encode is slightly lesser and quality midly improved when compared to the default settings in a - multipass encode. Default disabled (turbo mode enabled) + multi-pass encode. Default disabled (turbo mode enabled) When **turbo** first pass is not disabled, these options are set on the first pass to improve performance: @@ -1023,30 +992,146 @@ Quality, rate control and rate distortion options * :option:`--subme` = MIN(2, :option:`--subme`) * :option:`--rd` = MIN(2, :option:`--rd`) -.. option:: --analysis-mode +.. option:: --cbqpoffs - Specify whether analysis information of each frame is output by encoder - or input for reuse. 
By reading the analysis data writen by an - earlier encode of the same sequence, substantial redundant work may - be avoided. + Offset of Cb chroma QP from the luma QP selected by rate control. + This is a general way to spend more or less bits on the chroma + channel. Default 0 - The following data may be stored and reused: - I frames - split decisions and luma intra directions of all CUs. - P/B frames - motion vectors are dumped at each depth for all CUs. + **Range of values:** -12 to 12 - **Values:** off(0), save(1): dump analysis data, load(2): read analysis data +.. option:: --crqpoffs -.. option:: --analysis-file + Offset of Cr chroma QP from the luma QP selected by rate control. + This is a general way to spend more or less bits on the chroma + channel. Default 0 - Specify a filename for analysis data (see :option:`--analysis-mode`) - If no filename is specified, x265_analysis.dat is used. + **Range of values:** -12 to 12 + +.. option:: --ipratio + + QP ratio factor between I and P slices. This ratio is used in all of + the rate control modes. Some :option:`--tune` options may change the + default value. It is not typically manually specified. Default 1.4 + +.. option:: --pbratio + + QP ratio factor between P and B slices. This ratio is used in all of + the rate control modes. Some :option:`--tune` options may change the + default value. It is not typically manually specified. Default 1.3 + +.. option:: --qcomp + + qComp sets the quantizer curve compression factor. It weights the + frame quantizer based on the complexity of residual (measured by + lookahead). Default value is 0.6. Increasing it to 1 will + effectively generate CQP + +.. option:: --qstep + + The maximum single adjustment in QP allowed to rate control. Default + 4 + +.. option:: --ratetol + + The degree of rate fluctuation that x265 tolerates. Rate tolerance + is used along with overflow (difference between actual and target + bitrate), to adjust qp. Default is 1.0 + +.. 
option:: --qblur + + Temporally blur quants. Default 0.5 + +.. option:: --cplxblur + + temporally blur complexity. default 20 + +Quantization Options +==================== + +Note that rate-distortion optimized quantization (RDOQ) is enabled +implicitly at :option:`--rd` 4, 5, and 6 and disabled implicitly at all +other levels. + +.. option:: --signhide, --no-signhide + + Hide sign bit of one coeff per TU (rdo). The last sign is implied. + This requires analyzing all the coefficients to determine if a sign + must be toggled, and then to determine which one can be toggled with + the least amount of distortion. Default enabled + +.. option:: --qpfile + + Specify a text file which contains frametypes and QPs for some or + all frames. The format of each line is: + + framenumber frametype QP + + Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame, + **b** is an unreferenced B frame. **I** is a keyframe (random + access point) while **i** is a I frame that is not a keyframe + (references are not broken). + + Specifying QP (integer) is optional, and if specified they are + clamped within the encoder to qpmin/qpmax. + +.. option:: --scaling-list + + Quantization scaling lists. HEVC supports 6 quantization scaling + lists to be defined; one each for Y, Cb, Cr for intra prediction and + one each for inter prediction. + + x265 does not use scaling lists by default, but this can also be + made explicit by :option:`--scaling-list` *off*. + + HEVC specifies a default set of scaling lists which may be enabled + without requiring them to be signaled in the SPS. Those scaling + lists can be enabled via :option:`--scaling-list` *default*. + + All other strings indicate a filename containing custom scaling + lists in the HM format. The encode will abort if the file is not + parsed correctly. Custom lists must be signaled in the SPS + +.. option:: --lambda-file + + Specify a text file containing values for x265_lambda_tab and + x265_lambda2_tab. 
Each table requires MAX_MAX_QP+1 (70) float + values. + + The text file syntax is simple. Comma is considered to be + white-space. All white-space is ignored. Lines must be less than 2k + bytes in length. Content following hash (#) characters are ignored. + The values read from the file are logged at :option:`--log-level` + debug. + + Note that the lambda tables are process-global and so the new values + affect all encoders running in the same process. + + Lambda values affect encoder mode decisions, the lower the lambda + the more bits it will try to spend on signaling information (motion + vectors and splits) and less on residual. This feature is intended + for experimentation. Loop filters ============ -.. option:: --lft, --no-lft +.. option:: --deblock=:, --no-deblock + + Toggle deblocking loop filter, optionally specify deblocking + strength offsets. + + : - parsed as tC offset and Beta offset + , - parsed as tC offset and Beta offset + - both tC and Beta offsets assigned the same value + + If unspecified, the offsets default to 0. The offsets must be in a + range of -6 (lowest strength) to 6 (highest strength). + + To disable the deblocking filter entirely, use --no-deblock or + --deblock=false. Default enabled, with both offsets defaulting to 0 - Toggle deblocking loop filter, default enabled + If deblocking is disabled, or the offsets are non-zero, these + changes from the default configuration are signaled in the PPS. .. option:: --sao, --no-sao @@ -1172,7 +1257,7 @@ VUI fields must be manually specified. 9. bt2020nc 10. bt2020c -.. option:: --chromalocs <0..5> +.. option:: --chromaloc <0..5> Specify chroma sample location for 4:2:0 inputs. Consult the HEVC specification for a description of these values. Default undefined @@ -1206,7 +1291,7 @@ Bitstream options .. option:: --aud, --no-aud Emit an access unit delimiter NAL at the start of each slice access - unit. If option:`--repeat-headers` is not enabled (indicating the + unit. 
If :option:`--repeat-headers` is not enabled (indicating the user will be writing headers manually at the start of the stream) the very first AUD will be skipped since it cannot be placed at the start of the access unit, where it belongs. Default disabled diff --git a/doc/reST/conf.py b/doc/reST/conf.py index 561f7d0..eea837f 100644 --- a/doc/reST/conf.py +++ b/doc/reST/conf.py @@ -15,3 +15,12 @@ copyright = u'2014 MulticoreWare Inc' # -- Options for HTML output --------------------------------------------------- html_theme = "default" + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'libx265', 'Full x265 Documentation', + ['MulticoreWare Inc'], 3), + ('x265', 'x265', 'x265 CLI Documentation', + ['MulticoreWare Inc'], 1) +] diff --git a/doc/reST/introduction.rst b/doc/reST/introduction.rst index 1d953f4..c503946 100644 --- a/doc/reST/introduction.rst +++ b/doc/reST/introduction.rst @@ -75,7 +75,7 @@ responsible for understanding the laws in your country, and for licensing all applicable patent rights needed for use or distribution of software applications created from the x265 source code. A good place to start is with the `Motion Picture Experts Group - Licensing Authority -- HEVC Licensing Program`_. +- HEVC Licensing Program `_. x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is a trademark of MulticoreWare, and may only be used with explicit written diff --git a/doc/reST/presets.rst b/doc/reST/presets.rst index 99085a2..63134ea 100644 --- a/doc/reST/presets.rst +++ b/doc/reST/presets.rst @@ -1,11 +1,11 @@ Preset Options -------------- +.. _presets: + Presets ======= -.. _preset-tune-ref: - x265 has a number of predefined :option:`--preset` options that make trade-offs between encode speed (encoded frames per second) and compression efficiency (quality per bit in the bitstream). 
The default @@ -66,7 +66,7 @@ The presets adjust encoder parameters to affect these trade-offs. +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | rdLevel | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 6 | 6 | 6 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| lft | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| deblock | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | tu-intra | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ @@ -75,6 +75,8 @@ The presets adjust encoder parameters to affect these trade-offs. Placebo mode enables transform-skip prediction evaluation. +.. _tunings: + Tuning ====== @@ -97,7 +99,46 @@ after the preset. +--------------+-----------------------------------------------------+ | ssim | enables adaptive quant auto-mode, disables psy-rd | +--------------+-----------------------------------------------------+ +| grain | improves retention of film grain. more below | ++--------------+-----------------------------------------------------+ | fastdecode | no loop filters, no weighted pred, no intra in B | +--------------+-----------------------------------------------------+ | zerolatency | no lookahead, no B frames, no cutree | +--------------+-----------------------------------------------------+ +| cbr | --pbratio 1.0 --ratetol 0.5 | ++--------------+-----------------------------------------------------+ + + +Film Grain Retention +~~~~~~~~~~~~~~~~~~~~ + +:option:`--tune` grain tries to improve the retention of film grain in +the reconstructed output. It helps rate distortion optimizations select +modes which preserve high frequency noise: + + * :option:`--psy-rd` 0.5 + * :option:`--psy-rdoq` 30 + +.. 
Note:: + + --psy-rdoq is only effective when RDOQuant is enabled, which is at + RD levels 4, 5, and 6 (presets slow and below). + +It lowers the strength of adaptive quantization, so residual energy can +be more evenly distributed across the (noisy) picture: + + * :option:`--aq-mode` 1 + * :option:`--aq-strength` 0.3 + +And it similarly tunes rate control to prevent the slice QP from +swinging too wildly from frame to frame: + + * :option:`--ipratio` 1.1 + * :option:`--pbratio` 1.1 + * :option:`--qcomp` 0.8 + +And lastly it reduces the strength of deblocking to prevent grain being +blurred on block boundaries: + + * :option:`--deblock` -2 + diff --git a/doc/reST/x265.rst b/doc/reST/x265.rst new file mode 100644 index 0000000..32a416d --- /dev/null +++ b/doc/reST/x265.rst @@ -0,0 +1,49 @@ +x265 CLI Documentation +###################### + + +SYNOPSIS +======== + +**x265** [options] infile [-o] outfile + +Bit depth: 8 + + +**x265-10bit** [options] infile [-o] outfile + +Bit depth: 10 + + +infile can be YUV or Y4M + +outfile is raw HEVC bitstream + + +DESCRIPTION +=========== + +.. toctree:: + :maxdepth: 2 + + introduction + + +OPTIONS +======= + +.. 
toctree:: + :maxdepth: 2 + + cli + presets + lossless + + +SEE ALSO +======== + +**libx265**\(3) + +Online documentation: http://x265.readthedocs.org/en/default/cli.html + diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index ba63f81..75b5eb4 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -21,7 +21,7 @@ include(CheckSymbolExists) include(CheckCXXCompilerFlag) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 35) +set(X265_BUILD 40) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" @@ -63,6 +63,12 @@ if(UNIX) endif() endif(UNIX) +if(X64 AND NOT WIN32) + option(ENABLE_PIC "Enable Position Independent Code" ON) +else() + option(ENABLE_PIC "Enable Position Independent Code" OFF) +endif(X64 AND NOT WIN32) + # Compiler detection if(CMAKE_GENERATOR STREQUAL "Xcode") set(XCODE 1) @@ -121,9 +127,9 @@ endif() if(GCC) add_definitions(-Wall -Wextra -Wshadow) add_definitions(-D__STDC_LIMIT_MACROS=1) - if(X64 AND NOT WIN32) - add_definitions(-fPIC) - endif(X64 AND NOT WIN32) + if(ENABLE_PIC) + add_definitions(-fPIC) + endif(ENABLE_PIC) if(X86 AND NOT X64) add_definitions(-march=i686) endif() diff --git a/source/PPA/ppa.cpp b/source/PPA/ppa.cpp index 607a946..9e924c9 100644 --- a/source/PPA/ppa.cpp +++ b/source/PPA/ppa.cpp @@ -41,8 +41,10 @@ typedef ppa::Base *(FUNC_PPALibInit)(const char **, int); typedef void (FUNC_PPALibRelease)(ppa::Base* &); } +using namespace ppa; + static FUNC_PPALibRelease *_pfuncPpaRelease; -ppa::Base *ppabase; +ppa::Base *ppa::ppabase; static void _ppaReleaseAtExit() { diff --git a/source/PPA/ppa.h b/source/PPA/ppa.h index 42f43b8..abb2a12 100644 --- a/source/PPA/ppa.h +++ b/source/PPA/ppa.h @@ -21,17 +21,8 @@ * For more information, contact us at license @ x265.com. 
*****************************************************************************/ -#ifndef _PPA_H_ -#define _PPA_H_ - -#if !defined(ENABLE_PPA) - -#define PPA_INIT() -#define PPAStartCpuEventFunc(e) -#define PPAStopCpuEventFunc(e) -#define PPAScopeEvent(e) - -#else +#ifndef PPA_H +#define PPA_H /* declare enum list of users CPU events */ #define PPA_REGISTER_CPU_EVENT(x) x, @@ -40,32 +31,13 @@ enum PPACpuEventEnum #include "ppaCPUEvents.h" PPACpuGroupNums }; - #undef PPA_REGISTER_CPU_EVENT -#define PPA_INIT() initializePPA() -#define PPAStartCpuEventFunc(e) if (ppabase) ppabase->triggerStartEvent(ppabase->getEventId(e)) -#define PPAStopCpuEventFunc(e) if (ppabase) ppabase->triggerEndEvent(ppabase->getEventId(e)) -#define PPAScopeEvent(e) _PPAScope __scope_(e) - #include "ppaApi.h" void initializePPA(); -extern ppa::Base *ppabase; - -class _PPAScope -{ -protected: - - ppa::EventID m_id; - -public: - - _PPAScope(int e) { if (ppabase) { m_id = ppabase->getEventId(e); ppabase->triggerStartEvent(m_id); } else m_id = 0; } - ~_PPAScope() { if (ppabase) ppabase->triggerEndEvent(m_id); } -}; - -#endif // if !defined(ENABLE_PPA) +#define PPA_INIT() initializePPA() +#define PPAScopeEvent(e) ppa::ProfileScope ppaScope_(e) -#endif /* _PPA_H_ */ +#endif /* PPA_H */ diff --git a/source/PPA/ppaApi.h b/source/PPA/ppaApi.h index 149de6d..15fa76b 100644 --- a/source/PPA/ppaApi.h +++ b/source/PPA/ppaApi.h @@ -54,6 +54,17 @@ protected: virtual void init(const char **pNames, int eventCount) = 0; }; + +extern ppa::Base *ppabase; + +struct ProfileScope +{ + ppa::EventID id; + + ProfileScope(int e) { if (ppabase) { id = ppabase->getEventId(e); ppabase->triggerStartEvent(id); } else id = 0; } + ~ProfileScope() { if (ppabase) ppabase->triggerEndEvent(id); } +}; + } #endif //_PPA_API_H_ diff --git a/source/PPA/ppaCPUEvents.h b/source/PPA/ppaCPUEvents.h index 1a47b39..203f055 100644 --- a/source/PPA/ppaCPUEvents.h +++ b/source/PPA/ppaCPUEvents.h @@ -1,25 +1,6 @@ 
-PPA_REGISTER_CPU_EVENT(encode_block) -PPA_REGISTER_CPU_EVENT(bitstream_write) -PPA_REGISTER_CPU_EVENT(DPB_prepareEncode) -PPA_REGISTER_CPU_EVENT(FrameEncoder_compressFrame) -PPA_REGISTER_CPU_EVENT(FrameEncoder_compressRows) -PPA_REGISTER_CPU_EVENT(CompressCU) -PPA_REGISTER_CPU_EVENT(CompressCU_Depth1) -PPA_REGISTER_CPU_EVENT(CompressCU_Depth2) -PPA_REGISTER_CPU_EVENT(CompressCU_Depth3) -PPA_REGISTER_CPU_EVENT(CompressCU_Depth4) -PPA_REGISTER_CPU_EVENT(CompressIntraCU) -PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth1) -PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth2) -PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth3) -PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth4) -PPA_REGISTER_CPU_EVENT(CheckRDCostIntra) -PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth1) -PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth2) -PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth3) -PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth4) -PPA_REGISTER_CPU_EVENT(CalcRDCostIntra) -PPA_REGISTER_CPU_EVENT(Thread_ProcessRow) -PPA_REGISTER_CPU_EVENT(Thread_compressCU) -PPA_REGISTER_CPU_EVENT(Thread_encodeCU) -PPA_REGISTER_CPU_EVENT(Thread_filterCU) +PPA_REGISTER_CPU_EVENT(bitstreamWrite) +PPA_REGISTER_CPU_EVENT(frameThread) +PPA_REGISTER_CPU_EVENT(encodeCTU) +PPA_REGISTER_CPU_EVENT(filterCTURow) +PPA_REGISTER_CPU_EVENT(slicetypeDecideEV) +PPA_REGISTER_CPU_EVENT(costEstimateRow) diff --git a/source/cmake/CMakeASM_YASMInformation.cmake b/source/cmake/CMakeASM_YASMInformation.cmake index 0af7c24..7a7586c 100644 --- a/source/cmake/CMakeASM_YASMInformation.cmake +++ b/source/cmake/CMakeASM_YASMInformation.cmake @@ -2,7 +2,10 @@ set(ASM_DIALECT "_YASM") set(CMAKE_ASM${ASM_DIALECT}_SOURCE_FILE_EXTENSIONS asm) if(X64) - list(APPEND ASM_FLAGS -DARCH_X86_64=1 -DPIC) + list(APPEND ASM_FLAGS -DARCH_X86_64=1) + if(ENABLE_PIC) + list(APPEND ASM_FLAGS -DPIC) + endif() if(APPLE) set(ARGS -f macho64 -m amd64 -DPREFIX) elseif(UNIX AND NOT CYGWIN) diff --git a/source/cmake/FindVLD.cmake b/source/cmake/FindVLD.cmake index 
716625c..ece8bae 100644 --- a/source/cmake/FindVLD.cmake +++ b/source/cmake/FindVLD.cmake @@ -54,11 +54,14 @@ ELSEIF (CMAKE_SIZEOF_VOID_P EQUAL 8) LIST (APPEND _VLD_POSSIBLE_LIB_SUFFIXES lib/Win64) ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4) +SET (PFILES "ProgramFiles") +SET (PFILES_X86 "ProgramFiles(x86)") # hack to avoid escaping issues in cmake 3.1 + FIND_PATH (VLD_ROOT_DIR NAMES include/vld.h PATHS ENV VLDROOT - "$ENV{PROGRAMFILES}/Visual Leak Detector" - "$ENV{PROGRAMFILES(X86)}/Visual Leak Detector" + "$ENV{PFILES}/Visual Leak Detector" + "$ENV{PFILES_X86}/Visual Leak Detector" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]" DOC "VLD root directory") diff --git a/source/cmake/version.cmake b/source/cmake/version.cmake index b6adfb9..0662cda 100644 --- a/source/cmake/version.cmake +++ b/source/cmake/version.cmake @@ -6,9 +6,9 @@ endif() find_package(Git QUIET) # present in 2.8.8 # defaults, in case everything below fails -set(X265_VERSION "unknown") -set(X265_LATEST_TAG "0.0") -set(X265_TAG_DISTANCE "0") +set(X265_VERSION "1.4+222+hg5f9f7194267b") +set(X265_LATEST_TAG "1.4") +set(X265_TAG_DISTANCE "222") if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt) # read the lines of the archive summary file to extract the version @@ -22,9 +22,9 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt) set(hg_${key} ${value}) endforeach() if(DEFINED hg_tag) - set(X265_VERSION ${hg_tag} CACHE STRING "x265 version string.") + set(X265_VERSION ${hg_tag}) set(X265_LATEST_TAG ${hg_tag}) - set(X265_TAG_DISTANCE "0") + set(X265_TAG_DISTANCE "222") elseif(DEFINED hg_node) string(SUBSTRING "${hg_node}" 0 16 hg_id) set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}") diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 46929ca..4ead346 100644 --- 
a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -1,44 +1,46 @@ # vim: syntax=cmake -set(SSE3 vec/dct-sse3.cpp) -set(SSSE3 vec/dct-ssse3.cpp) -set(SSE41 vec/dct-sse41.cpp) -if(MSVC AND X86) - set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) - set(WARNDISABLE "/wd4100") # unreferenced formal parameter - if(INTEL_CXX) - add_definitions(/Qwd111) # statement is unreachable - add_definitions(/Qwd128) # loop is unreachable - add_definitions(/Qwd177) # declared function is unused - add_definitions(/Qwd185) # dynamic initialization in unreachable code - add_definitions(/Qwd280) # conditional expression is constant - endif() - if(X64) - set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}") - else() - # x64 implies SSE4, so only add /arch:SSE2 if building for Win32 - set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2") - endif() -endif() -if(GCC AND X86) - if(CLANG) - # llvm intrinsic headers cause shadow warnings - set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter") - else() - set(WARNDISABLE "-Wno-unused-parameter") - endif() - if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3)) +if(ENABLE_ASSEMBLY) + set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) + + set(SSE3 vec/dct-sse3.cpp) + set(SSSE3 vec/dct-ssse3.cpp) + set(SSE41 vec/dct-sse41.cpp) + + if(MSVC AND X86) set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) - set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3") - set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3") - set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1") + set(WARNDISABLE "/wd4100") # unreferenced formal parameter + if(INTEL_CXX) + add_definitions(/Qwd111) # statement is unreachable + add_definitions(/Qwd128) # loop is unreachable + add_definitions(/Qwd177) # declared function is unused + 
add_definitions(/Qwd185) # dynamic initialization in unreachable code + add_definitions(/Qwd280) # conditional expression is constant + endif() + if(X64) + set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}") + else() + # x64 implies SSE4, so only add /arch:SSE2 if building for Win32 + set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2") + endif() endif() -endif() -set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) -source_group(Intrinsics FILES ${VEC_PRIMITIVES}) + if(GCC AND X86) + if(CLANG) + # llvm intrinsic headers cause shadow warnings + set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter") + else() + set(WARNDISABLE "-Wno-unused-parameter") + endif() + if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3)) + set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) + set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3") + set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3") + set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1") + endif() + endif() + set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) + source_group(Intrinsics FILES ${VEC_PRIMITIVES}) -if(ENABLE_ASSEMBLY) - set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm pixel-util8.asm blockcopy8.asm diff --git a/source/common/common.h b/source/common/common.h index b447bb3..a0f8286 100644 --- a/source/common/common.h +++ b/source/common/common.h @@ -41,6 +41,15 @@ #include "x265.h" +#if ENABLE_PPA +#include "PPA/ppa.h" +#define ProfileScopeEvent(x) PPAScopeEvent(x) +#define PROFILE_INIT() PPA_INIT() +#else +#define ProfileScopeEvent(x) +#define PROFILE_INIT() +#endif + #define FENC_STRIDE 64 #define 
NUM_INTRA_MODE 35 @@ -56,6 +65,10 @@ extern "C" intptr_t x265_stack_align(void (*func)(), ...); #define x265_stack_align(func, ...) func(__VA_ARGS__) #endif +#if defined(__MINGW32__) +#define fseeko fseeko64 +#endif + #elif defined(_MSC_VER) #define ALIGN_VAR_8(T, var) __declspec(align(8)) T var @@ -245,9 +258,6 @@ typedef int16_t coeff_t; // transform coefficient #define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE) #define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE) -#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */ -#define MAX_NUM_TR_CATEGORIES 8 /* 32, 16, 8, 4 transform categories each for luma and chroma */ - #define COEF_REMAIN_BIN_REDUCTION 3 // indicates the level at which the VLC // transitions from Golomb-Rice to TU+EG(k) @@ -297,21 +307,12 @@ typedef int16_t coeff_t; // transform coefficient #define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422) #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420) +#define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8 namespace x265 { enum { SAO_NUM_OFFSET = 4 }; -// NOTE: MUST be alignment to 16 or 32 bytes for asm code -struct NoiseReduction -{ - /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32 - * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */ - uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; - uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; - uint32_t count[MAX_NUM_TR_CATEGORIES]; -}; - enum SaoMergeMode { SAO_MERGE_NONE, @@ -358,6 +359,20 @@ struct SAOParam } }; +/* Stores inter (motion estimation) analysis data for a single frame */ +struct analysis_inter_data +{ + int ref; +}; + +/* Stores intra analysis data for a single frame. 
This struct needs better packing */ +struct analysis_intra_data +{ + uint8_t* depth; + uint8_t* modes; + char* partSizes; +}; + enum TextType { TEXT_LUMA = 0, // luma diff --git a/source/common/constants.cpp b/source/common/constants.cpp index 4252cb4..749d888 100644 --- a/source/common/constants.cpp +++ b/source/common/constants.cpp @@ -27,21 +27,46 @@ namespace x265 { -static int initialized /* = 0 */; - -// initialize ROM variables -void initROM() +#if HIGH_BIT_DEPTH +// lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8)); +double x265_lambda_tab[QP_MAX_MAX + 1] = { - if (ATOMIC_CAS32(&initialized, 0, 1) == 1) - return; -} + 1.0000, 1.1225, 1.2599, 1.4142, 1.5874, + 1.7818, 2.0000, 2.2449, 2.5198, 2.8284, + 3.1748, 3.5636, 4.0000, 4.4898, 5.0397, + 5.6569, 6.3496, 7.1272, 8.0000, 8.9797, + 10.0794, 11.3137, 12.6992, 14.2544, 16.0000, + 17.9594, 20.1587, 22.6274, 25.3984, 28.5088, + 32.0000, 35.9188, 40.3175, 45.2548, 50.7968, + 57.0175, 64.0000, 71.8376, 80.6349, 90.5097, + 101.5937, 114.0350, 128.0000, 143.6751, 161.2699, + 181.0193, 203.1873, 228.0701, 256.0000, 287.3503, + 322.5398, 362.0387, 406.3747, 456.1401, 512.0000, + 574.7006, 645.0796, 724.0773, 812.7493, 912.2803, + 1024.0000, 1149.4011, 1290.1592, 1448.1547, 1625.4987, + 1824.5606, 2048.0000, 2298.8023, 2580.3183, 2896.3094, +}; -void destroyROM() +// lambda2 = pow(lambda, 2) * scale (0.85); +double x265_lambda2_tab[QP_MAX_MAX + 1] = { - if (ATOMIC_CAS32(&initialized, 1, 0) == 0) - return; -} + 0.8500, 1.0709, 1.3493, 1.7000, 2.1419, + 2.6986, 3.4000, 4.2837, 5.3972, 6.8000, + 8.5675, 10.7943, 13.6000, 17.1349, 21.5887, + 27.2000, 34.2699, 43.1773, 54.4000, 68.5397, + 86.3546, 108.8000, 137.0794, 172.7092, 217.6000, + 274.1588, 345.4185, 435.2000, 548.3176, 690.8369, + 870.4000, 1096.6353, 1381.6739, 1740.8000, 2193.2706, + 2763.3478, 3481.6000, 4386.5411, 5526.6955, 6963.2000, + 8773.0823, 11053.3910, 13926.4000, 17546.1645, 22106.7820, + 27852.8000, 35092.3290, 44213.5640, 55705.6000, 
70184.6580, + 88427.1280, 111411.2000, 140369.3161, 176854.2561, 222822.4000, + 280738.6321, 353708.5122, 445644.8000, 561477.2643, 707417.0243, + 891289.6000, 1122954.5286, 1414834.0486, 1782579.2000, 2245909.0572, + 2829668.0973, 3565158.4000, 4491818.1144, 5659336.1946, 7130316.8000, +}; +#else /* !HIGH_BIT_DEPTH */ // lambda = pow(2, (double)q / 6 - 2); double x265_lambda_tab[QP_MAX_MAX + 1] = @@ -81,6 +106,8 @@ double x265_lambda2_tab[QP_MAX_MAX + 1] = 176854.2222, 222822.4000, 280738.6627, 353708.5368, 445644.7459 }; +#endif + const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] = { 16, 20, 25, 32, 40, 50, diff --git a/source/common/constants.h b/source/common/constants.h index 9db47db..fa82b80 100644 --- a/source/common/constants.h +++ b/source/common/constants.h @@ -29,9 +29,6 @@ namespace x265 { // private namespace -void initROM(); -void destroyROM(); - void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx); void initRasterToZscan(uint32_t maxFullDepth); diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp index d28e005..c2e3c23 100644 --- a/source/common/cudata.cpp +++ b/source/common/cudata.cpp @@ -227,16 +227,15 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, /* Each CU's data is layed out sequentially within the charMemBlock */ uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance; - m_qp = (char*)charBuf; charBuf += m_numPartitions; + m_qp = (int8_t*)charBuf; charBuf += m_numPartitions; m_log2CUSize = charBuf; charBuf += m_numPartitions; - m_partSize = charBuf; charBuf += m_numPartitions; - m_predMode = charBuf; charBuf += m_numPartitions; m_lumaIntraDir = charBuf; charBuf += m_numPartitions; m_tqBypass = charBuf; charBuf += m_numPartitions; - m_refIdx[0] = (char*)charBuf; charBuf += m_numPartitions; - m_refIdx[1] = (char*)charBuf; charBuf += m_numPartitions; + m_refIdx[0] = (int8_t*)charBuf; charBuf 
+= m_numPartitions; + m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions; m_cuDepth = charBuf; charBuf += m_numPartitions; - m_skipFlag = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */ + m_predMode = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */ + m_partSize = charBuf; charBuf += m_numPartitions; m_mergeFlag = charBuf; charBuf += m_numPartitions; m_interDir = charBuf; charBuf += m_numPartitions; m_mvpIdx[0] = charBuf; charBuf += m_numPartitions; @@ -278,8 +277,6 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp) /* sequential memsets */ m_partSet((uint8_t*)m_qp, (uint8_t)qp); m_partSet(m_log2CUSize, (uint8_t)g_maxLog2CUSize); - m_partSet(m_partSize, (uint8_t)SIZE_NONE); - m_partSet(m_predMode, (uint8_t)MODE_NONE); m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX); m_partSet(m_tqBypass, (uint8_t)frame.m_encData->m_param->bLossless); if (m_slice->m_sliceType != I_SLICE) @@ -291,7 +288,7 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp) X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n"); /* initialize the remaining CU data in one memset */ - memset(m_cuDepth, 0, (BytesPerPartition - 8) * m_numPartitions); + memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions); uint32_t widthInCU = m_slice->m_sps->numCuInWidth; m_cuLeft = (m_cuAddr % widthInCU) ? 
m_encData->getPicCTU(m_cuAddr - 1) : NULL; @@ -318,8 +315,6 @@ void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom) /* sequential memsets */ m_partSet((uint8_t*)m_qp, (uint8_t)ctu.m_qp[0]); m_partSet(m_log2CUSize, (uint8_t)cuGeom.log2CUSize); - m_partSet(m_partSize, (uint8_t)SIZE_NONE); - m_partSet(m_predMode, (uint8_t)MODE_NONE); m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX); m_partSet(m_tqBypass, (uint8_t)m_encData->m_param->bLossless); m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID); @@ -327,7 +322,7 @@ void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom) m_partSet(m_cuDepth, (uint8_t)cuGeom.depth); /* initialize the remaining CU data in one memset */ - memset(m_skipFlag, 0, (BytesPerPartition - 9) * m_numPartitions); + memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions); } /* Copy the results of a sub-part (split) CU to the parent CU */ @@ -339,14 +334,13 @@ void CUData::copyPartFrom(const CUData& subCU, const CUGeom& childGeom, uint32_t m_subPartCopy((uint8_t*)m_qp + offset, (uint8_t*)subCU.m_qp); m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize); - m_subPartCopy(m_partSize + offset, subCU.m_partSize); - m_subPartCopy(m_predMode + offset, subCU.m_predMode); m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir); m_subPartCopy(m_tqBypass + offset, subCU.m_tqBypass); m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]); m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]); m_subPartCopy(m_cuDepth + offset, subCU.m_cuDepth); - m_subPartCopy(m_skipFlag + offset, subCU.m_skipFlag); + m_subPartCopy(m_predMode + offset, subCU.m_predMode); + m_subPartCopy(m_partSize + offset, subCU.m_partSize); m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag); m_subPartCopy(m_interDir + offset, subCU.m_interDir); m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]); @@ -410,7 +404,7 @@ void CUData::initLosslessCU(const CUData& cu, const CUGeom& cuGeom) m_partSet(m_tqBypass, true); 
/* clear residual coding flags */ - m_partSet(m_skipFlag, 0); + m_partSet(m_predMode, cu.m_predMode[0] & (MODE_INTRA | MODE_INTER)); m_partSet(m_tuDepth, 0); m_partSet(m_transformSkip[0], 0); m_partSet(m_transformSkip[1], 0); @@ -427,14 +421,13 @@ void CUData::copyToPic(uint32_t depth) const m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp); m_partCopy(ctu.m_log2CUSize + m_absIdxInCTU, m_log2CUSize); - m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize); - m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode); m_partCopy(ctu.m_lumaIntraDir + m_absIdxInCTU, m_lumaIntraDir); m_partCopy(ctu.m_tqBypass + m_absIdxInCTU, m_tqBypass); m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]); m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]); m_partCopy(ctu.m_cuDepth + m_absIdxInCTU, m_cuDepth); - m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag); + m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode); + m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize); m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag); m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir); m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]); @@ -477,13 +470,13 @@ void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom) /* copy out all prediction info for this part */ m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU); m_partCopy(m_log2CUSize, ctu.m_log2CUSize + m_absIdxInCTU); - m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU); - m_partCopy(m_predMode, ctu.m_predMode + m_absIdxInCTU); m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU); m_partCopy(m_tqBypass, ctu.m_tqBypass + m_absIdxInCTU); m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU); m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU); m_partCopy(m_cuDepth, ctu.m_cuDepth + m_absIdxInCTU); + m_partSet(m_predMode, ctu.m_predMode[m_absIdxInCTU] & (MODE_INTRA | MODE_INTER)); 
/* clear skip flag */ + m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU); m_partCopy(m_mergeFlag, ctu.m_mergeFlag + m_absIdxInCTU); m_partCopy(m_interDir, ctu.m_interDir + m_absIdxInCTU); m_partCopy(m_mvpIdx[0], ctu.m_mvpIdx[0] + m_absIdxInCTU); @@ -496,7 +489,6 @@ void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom) memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV)); /* clear residual coding flags */ - m_partSet(m_skipFlag, 0); m_partSet(m_tuDepth, 0); m_partSet(m_transformSkip[0], 0); m_partSet(m_transformSkip[1], 0); @@ -515,7 +507,7 @@ void CUData::updatePic(uint32_t depth) const m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]); m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]); m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]); - m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag); + m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode); m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth); m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]); m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]); @@ -552,7 +544,7 @@ const CUData* CUData::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) return m_cuLeft; } -const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary) const +const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const { uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx]; @@ -563,15 +555,10 @@ const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx if (isEqualRow(absPartIdx, absZorderCUIdx, s_numPartInCUSize)) return m_encData->getPicCTU(m_cuAddr); else - { aPartUnitIdx -= m_absIdxInCTU; - return this; - } + return this; } - if (planarAtCTUBoundary) - return NULL; - aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize]; return m_cuAbove; } @@ -785,7 +772,7 @@ const CUData* 
CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdx } /* Get reference QP from left QpMinCu or latest coded QP */ -char CUData::getRefQP(uint32_t curAbsIdxInCTU) const +int8_t CUData::getRefQP(uint32_t curAbsIdxInCTU) const { uint32_t lPartIdx = 0, aPartIdx = 0; const CUData* cULeft = getQpMinCuLeft(lPartIdx, m_absIdxInCTU + curAbsIdxInCTU); @@ -807,7 +794,7 @@ int CUData::getLastValidPartIdx(int absPartIdx) const return lastValidPartIdx; } -char CUData::getLastCodedQP(uint32_t absPartIdx) const +int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const { uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2; int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask); @@ -821,7 +808,7 @@ char CUData::getLastCodedQP(uint32_t absPartIdx) const else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth))) return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS); else - return (char)m_slice->m_sliceQp; + return (int8_t)m_slice->m_sliceQp; } } @@ -859,7 +846,7 @@ int CUData::getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred leftIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX; // Get intra direction of above PU - tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx, true); + tempCU = g_zscanToPelY[m_absIdxInCTU + absPartIdx] > 0 ? getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx) : NULL; aboveIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? 
tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX; @@ -912,7 +899,7 @@ uint32_t CUData::getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const void CUData::getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const { uint32_t log2CUSize = m_log2CUSize[absPartIdx]; - uint32_t splitFlag = m_partSize[absPartIdx] == SIZE_NxN; + uint32_t splitFlag = m_partSize[absPartIdx] != SIZE_2Nx2N; tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize; tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize; @@ -949,7 +936,7 @@ uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const return ctx; } -bool CUData::setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth) +bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth) { uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1); uint32_t curPartNumQ = curPartNumb >> 2; @@ -1224,7 +1211,7 @@ void CUData::setPUMv(int list, const MV& mv, int absPartIdx, int puIdx) setAllPU(m_mv[list], mv, absPartIdx, puIdx); } -void CUData::setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx) +void CUData::setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx) { setAllPU(m_refIdx[list], refIdx, absPartIdx, puIdx); } @@ -1250,7 +1237,7 @@ void CUData::getMvField(const CUData* cu, uint32_t absPartIdx, int picList, MVFi else { // OUT OF BOUNDARY - outMvField.mv.word = 0; + outMvField.mv = 0; outMvField.refIdx = REF_NOT_VALID; } } @@ -1412,6 +1399,8 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV for (uint32_t i = 0; i < maxNumMergeCand; ++i) { + mvFieldNeighbours[i][0].mv = 0; + mvFieldNeighbours[i][1].mv = 0; mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID; mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID; } @@ -1441,7 +1430,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV bool isAvailableA1 = cuLeft && cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) && !(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == 
SIZE_nRx2N)) && - !cuLeft->isIntra(leftPartIdx); + cuLeft->isInter(leftPartIdx); if (isAvailableA1) { // get Inter Dir @@ -1465,7 +1454,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV bool isAvailableB1 = cuAbove && cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) && !(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) && - !cuAbove->isIntra(abovePartIdx); + cuAbove->isInter(abovePartIdx); if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx))) { // get Inter Dir @@ -1486,7 +1475,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT); bool isAvailableB0 = cuAboveRight && cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) && - !cuAboveRight->isIntra(aboveRightPartIdx); + cuAboveRight->isInter(aboveRightPartIdx); if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx))) { // get Inter Dir @@ -1507,7 +1496,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB); bool isAvailableA0 = cuLeftBottom && cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) && - !cuLeftBottom->isIntra(leftBottomPartIdx); + cuLeftBottom->isInter(leftBottomPartIdx); if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx))) { // get Inter Dir @@ -1530,7 +1519,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr); bool isAvailableB2 = cuAboveLeft && cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) && - !cuAboveLeft->isIntra(aboveLeftPartIdx); + cuAboveLeft->isInter(aboveLeftPartIdx); if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, 
*cuAboveLeft, aboveLeftPartIdx)) && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx))) { @@ -1659,7 +1648,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV while (count < maxNumMergeCand) { interDirNeighbours[count] = 1; - mvFieldNeighbours[count][0].mv.word = 0; + mvFieldNeighbours[count][0].mv = 0; mvFieldNeighbours[count][0].refIdx = r; if (isInterB) @@ -1966,26 +1955,18 @@ bool CUData::addMVPCandOrder(MV& outMV, int picList, int refIdx, uint32_t partUn bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const { - uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK; - - int colRefPicList; - int colPOC, colRefPOC, curPOC, curRefPOC; - MV colmv; - - // use coldir. - Frame *colPic = m_slice->m_refPicList[m_slice->isInterB() ? 1 - m_slice->m_colFromL0Flag : 0][m_slice->m_colRefIdx]; - CUData *colCU = colPic->m_encData->getPicCTU(cuAddr); + const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx]; + const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr); - if (colCU->m_partSize[partUnitIdx] == SIZE_NONE) + if (colCU->m_predMode[partUnitIdx] == MODE_NONE) return false; - curPOC = m_slice->m_poc; - colPOC = colCU->m_slice->m_poc; + uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK; if (colCU->isIntra(absPartAddr)) return false; - colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag; + int colRefPicList = m_slice->m_bCheckLDC ? 
picList : m_slice->m_colFromL0Flag; int colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr]; @@ -1999,9 +1980,12 @@ bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int p } // Scale the vector - colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx]; - colmv = colCU->m_mv[colRefPicList][absPartAddr]; - curRefPOC = m_slice->m_refPOCList[picList][outRefIdx]; + int colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx]; + int colPOC = colCU->m_slice->m_poc; + MV colmv = colCU->m_mv[colRefPicList][absPartAddr]; + + int curRefPOC = m_slice->m_refPOCList[picList][outRefIdx]; + int curPOC = m_slice->m_poc; scaleMvByPOCDist(outMV, colmv, curPOC, curRefPOC, colPOC, colRefPOC); return true; @@ -2096,7 +2080,7 @@ void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uin #define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag)) -void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const +void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) { // Initialize the coding blocks inside the CTB for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--) @@ -2111,10 +2095,10 @@ void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUS uint32_t depthIdx = g_depthScanIdx[sbY][sbX]; uint32_t cuIdx = rangeCUIdx + depthIdx; uint32_t childIdx = rangeCUIdx + sbWidth * sbWidth + (depthIdx << 2); - uint32_t px = m_cuPelX + sbX * blockSize; - uint32_t py = m_cuPelY + sbY * blockSize; - int32_t presentFlag = px < picWidth && py < picHeight; - int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > picWidth || py + blockSize > picHeight); + uint32_t px = sbX * blockSize; + uint32_t py = sbY * blockSize; + int32_t presentFlag = px < 
ctuWidth && py < ctuHeight; + int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > ctuWidth || py + blockSize > ctuHeight); /* Offset of the luma CU in the X, Y direction in terms of pixels from the CTU origin */ uint32_t xOffset = (sbX * blockSize) >> 3; diff --git a/source/common/cudata.h b/source/common/cudata.h index 7f735d6..e5d88cf 100644 --- a/source/common/cudata.h +++ b/source/common/cudata.h @@ -46,14 +46,15 @@ enum PartSize SIZE_2NxnD, // asymmetric motion partition, 2Nx(3N/2) + 2Nx( N/2) SIZE_nLx2N, // asymmetric motion partition, ( N/2)x2N + (3N/2)x2N SIZE_nRx2N, // asymmetric motion partition, (3N/2)x2N + ( N/2)x2N - SIZE_NONE = 15 + NUM_SIZES }; enum PredMode { - MODE_INTER, - MODE_INTRA, - MODE_NONE = 15 + MODE_NONE = 0, + MODE_INTER = (1 << 0), + MODE_INTRA = (1 << 1), + MODE_SKIP = (1 << 2) | MODE_INTER }; // motion vector predictor direction used in AMVP @@ -126,15 +127,14 @@ public: int m_vChromaShift; /* Per-part data, stored contiguously */ - char* m_qp; // array of QP values + int8_t* m_qp; // array of QP values uint8_t* m_log2CUSize; // array of cu log2Size TODO: seems redundant to depth - uint8_t* m_partSize; // array of partition sizes - uint8_t* m_predMode; // array of prediction modes uint8_t* m_lumaIntraDir; // array of intra directions (luma) uint8_t* m_tqBypass; // array of CU lossless flags - char* m_refIdx[2]; // array of motion reference indices per list + int8_t* m_refIdx[2]; // array of motion reference indices per list uint8_t* m_cuDepth; // array of depths - uint8_t* m_skipFlag; // array of skip flags + uint8_t* m_predMode; // array of prediction modes + uint8_t* m_partSize; // array of partition sizes uint8_t* m_mergeFlag; // array of merge flags uint8_t* m_interDir; // array of inter directions uint8_t* m_mvpIdx[2]; // array of motion vector predictor candidates or merge candidate indices [0] @@ -142,7 +142,7 @@ public: uint8_t* m_transformSkip[3]; // array of transform skipping flags per plane 
uint8_t* m_cbf[3]; // array of coded block flags (CBF) per plane uint8_t* m_chromaIntraDir; // array of intra directions (chroma) - enum { BytesPerPartition = 22 }; // combined sizeof() of all per-part data + enum { BytesPerPartition = 21 }; // combined sizeof() of all per-part data coeff_t* m_trCoeff[3]; // transformed coefficient buffer per plane @@ -158,7 +158,7 @@ public: CUData(); void initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance); - void calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const; + static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]); void initCTU(const Frame& frame, uint32_t cuAddr, int qp); void initSubCU(const CUData& ctu, const CUGeom& cuGeom); @@ -173,12 +173,11 @@ public: void updatePic(uint32_t depth) const; void setPartSizeSubParts(PartSize size) { m_partSet(m_partSize, (uint8_t)size); } - void setSkipFlagSubParts(uint8_t skipFlag) { m_partSet(m_skipFlag, skipFlag); } void setPredModeSubParts(PredMode mode) { m_partSet(m_predMode, (uint8_t)mode); } void clearCbf() { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); } /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */ - void setQPSubParts(char qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); } + void setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); } void setTUDepthSubParts(uint8_t tuDepth, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_tuDepth + absPartIdx, tuDepth); } void setLumaIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_lumaIntraDir + absPartIdx, dir); } void setChromIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { 
s_partSet[depth](m_chromaIntraDir + absPartIdx, dir); } @@ -187,15 +186,15 @@ public: void setTransformSkipSubParts(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_transformSkip[ttype] + absPartIdx, tskip); } void setTransformSkipPartRange(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t coveredPartIdxes) { memset(m_transformSkip[ttype] + absPartIdx, tskip, coveredPartIdxes); } - bool setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth); + bool setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth); void setPUInterDir(uint8_t dir, uint32_t absPartIdx, uint32_t puIdx); void setPUMv(int list, const MV& mv, int absPartIdx, int puIdx); - void setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx); + void setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx); - uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t trDepth) const { return (m_cbf[ttype][absPartIdx] >> trDepth) & 0x1; } + uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; } uint8_t getQtRootCbf(uint32_t absPartIdx) const { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; } - char getRefQP(uint32_t currAbsIdxInCTU) const; + int8_t getRefQP(uint32_t currAbsIdxInCTU) const; uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const; void clipMv(MV& outMV) const; int fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const; @@ -204,7 +203,8 @@ public: uint32_t getNumPartInter() const { return nbPartsTable[(int)m_partSize[0]]; } bool isIntra(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_INTRA; } - bool isSkipped(uint32_t absPartIdx) const { return !!m_skipFlag[absPartIdx]; } + bool isInter(uint32_t absPartIdx) const { return !!(m_predMode[absPartIdx] & MODE_INTER); } + bool 
isSkipped(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_SKIP; } bool isBipredRestriction() const { return m_log2CUSize[0] == 3 && m_partSize[0] != SIZE_2Nx2N; } void getPartIndexAndSize(uint32_t puIdx, uint32_t& absPartIdx, int& puWidth, int& puHeight) const; @@ -221,7 +221,7 @@ public: void getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const; const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const; - const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary = false) const; + const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const; const CUData* getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx) const; const CUData* getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx) const; const CUData* getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx) const; @@ -237,7 +237,7 @@ protected: template<typename T> void setAllPU(T *p, const T& val, int absPartIdx, int puIdx); - char getLastCodedQP(uint32_t absPartIdx) const; + int8_t getLastCodedQP(uint32_t absPartIdx) const; int getLastValidPartIdx(int absPartIdx) const; bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const; diff --git a/source/common/dct.cpp b/source/common/dct.cpp index 714006e..09cf829 100644 --- a/source/common/dct.cpp +++ b/source/common/dct.cpp @@ -41,7 +41,7 @@ namespace { // Fast DST Algorithm. 
Full matrix multiplication for DST and Fast DST algorithm // give identical results -void fastForwardDst(int16_t *block, int16_t *coeff, int shift) // input block, output coeff +void fastForwardDst(const int16_t* block, int16_t* coeff, int shift) // input block, output coeff { int c[4]; int rnd_factor = 1 << (shift - 1); @@ -61,7 +61,7 @@ void fastForwardDst(int16_t *block, int16_t *coeff, int shift) // input block, } } -void inversedst(int16_t *tmp, int16_t *block, int shift) // input tmp, output block +void inversedst(const int16_t* tmp, int16_t* block, int shift) // input tmp, output block { int i, c[4]; int rnd_factor = 1 << (shift - 1); @@ -81,7 +81,7 @@ void inversedst(int16_t *tmp, int16_t *block, int shift) // input tmp, output b } } -void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line) +void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line) { int j, k; int E[8], O[8]; @@ -134,7 +134,7 @@ void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line) +void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line) { int j, k; int E[16], O[16]; @@ -203,7 +203,7 @@ void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line) +void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line) { int j, k; int E[4], O[4]; @@ -240,7 +240,7 @@ void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line) +void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line) { int j; int E[2], O[2]; @@ -265,7 +265,7 @@ void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line) +void partialButterflyInverse8(const 
int16_t* src, int16_t* dst, int shift, int line) { int j, k; int E[4], O[4]; @@ -301,7 +301,7 @@ void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line) +void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line) { int j, k; int E[8], O[8]; @@ -352,7 +352,7 @@ void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line) +void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line) { int j, k; int E[16], O[16]; @@ -416,7 +416,7 @@ void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line) } } -void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line) +void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line) { int j; int E[2], O[2]; @@ -440,7 +440,7 @@ void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line) } } -void dst4_c(int16_t *src, int32_t *dst, intptr_t stride) +void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 1 + X265_DEPTH - 8; const int shift_2nd = 8; @@ -450,25 +450,14 @@ void dst4_c(int16_t *src, int32_t *dst, intptr_t stride) for (int i = 0; i < 4; i++) { - memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t)); + memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t)); } fastForwardDst(block, coef, shift_1st); - fastForwardDst(coef, block, shift_2nd); - -#define N (4) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - dst[i * N + j] = block[i * N + j]; - } - } - -#undef N + fastForwardDst(coef, dst, shift_2nd); } -void dct4_c(int16_t *src, int32_t *dst, intptr_t stride) +void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 1 + X265_DEPTH - 8; const int shift_2nd = 8; @@ -478,24 +467,14 @@ void dct4_c(int16_t *src, 
int32_t *dst, intptr_t stride) for (int i = 0; i < 4; i++) { - memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t)); + memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t)); } partialButterfly4(block, coef, shift_1st, 4); - partialButterfly4(coef, block, shift_2nd, 4); -#define N (4) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - dst[i * N + j] = block[i * N + j]; - } - } - -#undef N + partialButterfly4(coef, dst, shift_2nd, 4); } -void dct8_c(int16_t *src, int32_t *dst, intptr_t stride) +void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 2 + X265_DEPTH - 8; const int shift_2nd = 9; @@ -505,25 +484,14 @@ void dct8_c(int16_t *src, int32_t *dst, intptr_t stride) for (int i = 0; i < 8; i++) { - memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t)); + memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t)); } partialButterfly8(block, coef, shift_1st, 8); - partialButterfly8(coef, block, shift_2nd, 8); - -#define N (8) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - dst[i * N + j] = block[i * N + j]; - } - } - -#undef N + partialButterfly8(coef, dst, shift_2nd, 8); } -void dct16_c(int16_t *src, int32_t *dst, intptr_t stride) +void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 3 + X265_DEPTH - 8; const int shift_2nd = 10; @@ -533,25 +501,14 @@ void dct16_c(int16_t *src, int32_t *dst, intptr_t stride) for (int i = 0; i < 16; i++) { - memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t)); + memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t)); } partialButterfly16(block, coef, shift_1st, 16); - partialButterfly16(coef, block, shift_2nd, 16); - -#define N (16) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - dst[i * N + j] = block[i * N + j]; - } - } - -#undef N + partialButterfly16(coef, dst, shift_2nd, 16); } -void dct32_c(int16_t *src, int32_t *dst, intptr_t stride) +void 
dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 4 + X265_DEPTH - 8; const int shift_2nd = 11; @@ -561,25 +518,14 @@ void dct32_c(int16_t *src, int32_t *dst, intptr_t stride) for (int i = 0; i < 32; i++) { - memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t)); + memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t)); } partialButterfly32(block, coef, shift_1st, 32); - partialButterfly32(coef, block, shift_2nd, 32); - -#define N (32) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - dst[i * N + j] = block[i * N + j]; - } - } - -#undef N + partialButterfly32(coef, dst, shift_2nd, 32); } -void idst4_c(int32_t *src, int16_t *dst, intptr_t stride) +void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -587,27 +533,16 @@ void idst4_c(int32_t *src, int16_t *dst, intptr_t stride) ALIGN_VAR_32(int16_t, coef[4 * 4]); ALIGN_VAR_32(int16_t, block[4 * 4]); -#define N (4) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - block[i * N + j] = (int16_t)src[i * N + j]; - } - } - -#undef N - - inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output + inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output for (int i = 0; i < 4; i++) { - memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t)); + memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t)); } } -void idct4_c(int32_t *src, int16_t *dst, intptr_t stride) +void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -615,27 +550,16 @@ void idct4_c(int32_t *src, int16_t *dst, intptr_t stride) ALIGN_VAR_32(int16_t, coef[4 * 4]); ALIGN_VAR_32(int16_t, block[4 * 4]); -#define N (4) - for 
(int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - block[i * N + j] = (int16_t)src[i * N + j]; - } - } - -#undef N - - partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output + partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output for (int i = 0; i < 4; i++) { - memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t)); + memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t)); } } -void idct8_c(int32_t *src, int16_t *dst, intptr_t stride) +void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -643,26 +567,16 @@ void idct8_c(int32_t *src, int16_t *dst, intptr_t stride) ALIGN_VAR_32(int16_t, coef[8 * 8]); ALIGN_VAR_32(int16_t, block[8 * 8]); -#define N (8) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - block[i * N + j] = (int16_t)src[i * N + j]; - } - } - -#undef N - - partialButterflyInverse8(block, coef, shift_1st, 8); + partialButterflyInverse8(src, coef, shift_1st, 8); partialButterflyInverse8(coef, block, shift_2nd, 8); + for (int i = 0; i < 8; i++) { - memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t)); + memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t)); } } -void idct16_c(int32_t *src, int16_t *dst, intptr_t stride) +void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -670,26 +584,16 @@ void idct16_c(int32_t *src, int16_t *dst, intptr_t stride) ALIGN_VAR_32(int16_t, coef[16 * 16]); ALIGN_VAR_32(int16_t, block[16 * 16]); -#define N (16) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - block[i * N + j] = (int16_t)src[i * N + j]; - } - } - -#undef N - - 
partialButterflyInverse16(block, coef, shift_1st, 16); + partialButterflyInverse16(src, coef, shift_1st, 16); partialButterflyInverse16(coef, block, shift_2nd, 16); + for (int i = 0; i < 16; i++) { - memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t)); + memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t)); } } -void idct32_c(int32_t *src, int16_t *dst, intptr_t stride) +void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -697,27 +601,16 @@ void idct32_c(int32_t *src, int16_t *dst, intptr_t stride) ALIGN_VAR_32(int16_t, coef[32 * 32]); ALIGN_VAR_32(int16_t, block[32 * 32]); -#define N (32) - for (int i = 0; i < N; i++) - { - for (int j = 0; j < N; j++) - { - block[i * N + j] = (int16_t)src[i * N + j]; - } - } - -#undef N - - partialButterflyInverse32(block, coef, shift_1st, 32); + partialButterflyInverse32(src, coef, shift_1st, 32); partialButterflyInverse32(coef, block, shift_2nd, 32); for (int i = 0; i < 32; i++) { - memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t)); + memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t)); } } -void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift) +void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) { #if HIGH_BIT_DEPTH X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale); @@ -737,11 +630,11 @@ void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scal for (int n = 0; n < num; n++) { coeffQ = (quantCoef[n] * scale + add) >> shift; - coef[n] = Clip3(-32768, 32767, coeffQ); + coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ); } } -void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift) +void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, 
int per, int shift) { X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); @@ -756,7 +649,7 @@ void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int for (int n = 0; n < num; n++) { coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per); - coef[n] = Clip3(-32768, 32767, coeffQ); + coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ); } } else @@ -764,12 +657,12 @@ void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int for (int n = 0; n < num; n++) { coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]); - coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift)); + coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ << (per - shift)); } } } -uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff) +uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff) { X265_CHECK(qBits >= 8, "qBits less than 8\n"); X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n"); @@ -793,7 +686,7 @@ uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* q return numSig; } -uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff) +uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff) { X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n"); X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n"); @@ -817,7 +710,7 @@ uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, return numSig; } -int count_nonzero_c(const int16_t *quantCoeff, int numCoeff) +int count_nonzero_c(const int16_t* quantCoeff, int numCoeff) { X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n"); X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", 
numCoeff); @@ -833,22 +726,22 @@ int count_nonzero_c(const int16_t *quantCoeff, int numCoeff) } template<int trSize> -uint32_t copy_count(int16_t* coeff, int16_t* residual, intptr_t stride) +uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride) { uint32_t numSig = 0; for (int k = 0; k < trSize; k++) { for (int j = 0; j < trSize; j++) { - coeff[k * trSize + j] = residual[k * stride + j]; - numSig += (residual[k * stride + j] != 0); + coeff[k * trSize + j] = residual[k * resiStride + j]; + numSig += (residual[k * resiStride + j] != 0); } } return numSig; } -void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff) +void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff) { for (int i = 0; i < numCoeff; i++) { @@ -857,7 +750,7 @@ void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numC level = (level + sign) ^ sign; resSum[i] += level; level -= offset[i]; - dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign; + dctCoef[i] = (int16_t)(level < 0 ? 
0 : (level ^ sign) - sign); } } diff --git a/source/common/deblock.cpp b/source/common/deblock.cpp index c9a2731..7369787 100644 --- a/source/common/deblock.cpp +++ b/source/common/deblock.cpp @@ -33,20 +33,44 @@ using namespace x265; #define DEBLOCK_SMALLEST_BLOCK 8 #define DEFAULT_INTRA_TC_OFFSET 2 -void Deblock::deblockCTU(CUData* cu, int32_t dir) +void Deblock::deblockCTU(const CUData* ctu, int32_t dir) { - uint8_t blockingStrength[MAX_NUM_PARTITIONS]; + uint8_t blockStrength[MAX_NUM_PARTITIONS]; - memset(blockingStrength, 0, sizeof(uint8_t) * m_numPartitions); + memset(blockStrength, 0, sizeof(uint8_t) * m_numPartitions); - deblockCU(cu, 0, 0, dir, blockingStrength); + deblockCU(ctu, 0, 0, dir, blockStrength); +} + +static inline uint8_t bsCuEdge(const CUData* cu, uint32_t absPartIdx, int32_t dir) +{ + if (dir == Deblock::EDGE_VER) + { + if (cu->m_cuPelX + g_zscanToPelX[absPartIdx] > 0) + { + uint32_t tempPartIdx; + const CUData* tempCU = cu->getPULeft(tempPartIdx, absPartIdx); + return tempCU ? 2 : 0; + } + } + else + { + if (cu->m_cuPelY + g_zscanToPelY[absPartIdx] > 0) + { + uint32_t tempPartIdx; + const CUData* tempCU = cu->getPUAbove(tempPartIdx, absPartIdx); + return tempCU ? 
2 : 0; + } + } + + return 0; } /* Deblocking filter process in CU-based (the same function as conventional's) * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */ -void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockingStrength[]) +void Deblock::deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[]) { - if (cu->m_partSize[absPartIdx] == SIZE_NONE) + if (cu->m_predMode[absPartIdx] == MODE_NONE) return; uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1); @@ -60,23 +84,21 @@ void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const i uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY; for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts) if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax) - deblockCU(cu, absPartIdx, depth + 1, dir, blockingStrength); + deblockCU(cu, absPartIdx, depth + 1, dir, blockStrength); return; } - const uint32_t widthInBaseUnits = sps.numPartInCUSize >> depth; - Param params; - setLoopfilterParam(cu, absPartIdx, &params); - setEdgefilterPU(cu, absPartIdx, dir, blockingStrength, widthInBaseUnits); - setEdgefilterTU(cu, absPartIdx, depth, dir, blockingStrength); - setEdgefilterMultiple(cu, absPartIdx, dir, 0, (dir == EDGE_VER ? 
params.leftEdge : params.topEdge), blockingStrength, widthInBaseUnits); + const uint32_t numUnits = sps.numPartInCUSize >> depth; + setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits); + setEdgefilterTU(cu, absPartIdx, depth, dir, blockStrength); + setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits); for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++) { uint32_t bsCheck = !(partIdx & (1 << dir)); - if (bsCheck && blockingStrength[partIdx]) - getBoundaryStrengthSingle(cu, dir, partIdx, blockingStrength); + if (bsCheck && blockStrength[partIdx]) + blockStrength[partIdx] = getBoundaryStrength(cu, dir, partIdx, blockStrength); } const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE; @@ -87,34 +109,33 @@ void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const i for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr) { - edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockingStrength); + edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength); if (!((e0 + e) & chromaMask)) - edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockingStrength); + edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength); } } -static inline uint32_t calcBsIdx(CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx) +static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx) { - uint32_t ctuWidthInBaseUnits = cu->m_slice->m_sps->numPartInCUSize; + uint32_t numPartInCUSize = cu->m_slice->m_sps->numPartInCUSize; if (dir) - return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * ctuWidthInBaseUnits + baseUnitIdx]; + return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numPartInCUSize + baseUnitIdx]; else - return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * ctuWidthInBaseUnits + edgeIdx]; + return g_rasterToZscan[g_zscanToRaster[absPartIdx] + 
baseUnitIdx * numPartInCUSize + edgeIdx]; } -void Deblock::setEdgefilterMultiple(CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits) +void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits) { - const uint32_t numElem = widthInBaseUnits; - X265_CHECK(numElem > 0, "numElem edge filter check\n"); - for (uint32_t i = 0; i < numElem; i++) + X265_CHECK(numUnits > 0, "numUnits edge filter check\n"); + for (uint32_t i = 0; i < numUnits; i++) { const uint32_t bsidx = calcBsIdx(cu, scanIdx, dir, edgeIdx, i); - blockingStrength[bsidx] = value; + blockStrength[bsidx] = value; } } -void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[]) +void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[]) { if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth) { @@ -122,47 +143,47 @@ void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, i const uint32_t qNumParts = curNumParts >> 2; for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts) - setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockingStrength); + setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockStrength); return; } - uint32_t widthInBaseUnits = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE); - setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockingStrength, widthInBaseUnits); + uint32_t numUnits = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE); + setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits); } -void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits) +void 
Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits) { - const uint32_t hWidthInBaseUnits = widthInBaseUnits >> 1; - const uint32_t qWidthInBaseUnits = widthInBaseUnits >> 2; + const uint32_t hNumUnits = numUnits >> 1; + const uint32_t qNumUnits = numUnits >> 2; switch (cu->m_partSize[absPartIdx]) { case SIZE_2NxN: if (EDGE_HOR == dir) - setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits); break; case SIZE_Nx2N: if (EDGE_VER == dir) - setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits); break; case SIZE_NxN: - setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits); break; case SIZE_2NxnU: if (EDGE_HOR == dir) - setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits); break; case SIZE_nLx2N: if (EDGE_VER == dir) - setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits); break; case SIZE_2NxnD: if (EDGE_HOR == dir) - setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits); break; case SIZE_nRx2N: if (EDGE_VER == dir) - setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits); + setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - 
qNumUnits, 1, blockStrength, numUnits); break; case SIZE_2Nx2N: @@ -171,151 +192,65 @@ void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint } } -void Deblock::setLoopfilterParam(CUData* cu, uint32_t absPartIdx, Param *params) +uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]) { - uint32_t x = cu->m_cuPelX + g_zscanToPelX[absPartIdx]; - uint32_t y = cu->m_cuPelY + g_zscanToPelY[absPartIdx]; - - const CUData* tempCU; - uint32_t tempPartIdx; + // Calculate block index + uint32_t partP; + const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ)); - if (!x) - params->leftEdge = 0; - else - { - tempCU = cu->getPULeft(tempPartIdx, absPartIdx); - if (tempCU) - params->leftEdge = 2; - else - params->leftEdge = 0; - } + // Set BS for Intra MB : BS = 2 + if (cuP->isIntra(partP) || cuQ->isIntra(partQ)) + return 2; - if (!y) - params->topEdge = 0; - else - { - tempCU = cu->getPUAbove(tempPartIdx, absPartIdx); - if (tempCU) - params->topEdge = 2; - else - params->topEdge = 0; - } -} + // Set BS for not Intra MB : BS = 1 or 0 + if (blockStrength[partQ] > 1 && + (cuQ->getCbf(partQ, TEXT_LUMA, cuQ->m_tuDepth[partQ]) || + cuP->getCbf(partP, TEXT_LUMA, cuP->m_tuDepth[partP]))) + return 1; -void Deblock::getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t absPartIdx, uint8_t blockingStrength[]) -{ - const Slice* const slice = cu->m_slice; - const uint32_t partQ = absPartIdx; - CUData* const cuQ = cu; + static const MV zeroMv(0, 0); + const Slice* const sliceQ = cuQ->m_slice; + const Slice* const sliceP = cuP->m_slice; - uint32_t partP; - const CUData* cuP; - uint8_t bs = 0; + const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]); + const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]); + const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv; + const MV& mvQ0 = refQ0 ? 
cuQ->m_mv[0][partQ] : zeroMv; - // Calculate block index - if (dir == EDGE_VER) - cuP = cuQ->getPULeft(partP, partQ); - else // (dir == EDGE_HOR) - cuP = cuQ->getPUAbove(partP, partQ); + if (sliceQ->isInterP() && sliceP->isInterP()) + { + return ((refP0 != refQ0) || + (abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4)) ? 1 : 0; + } - // Set BS for Intra MB : BS = 4 or 3 - if (cuP->isIntra(partP) || cuQ->isIntra(partQ)) - bs = 2; + // (sliceQ->isInterB() || sliceP->isInterB()) + const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]); + const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]); + const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv; + const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv; - // Set BS for not Intra MB : BS = 2 or 1 or 0 - if (!cuP->isIntra(partP) && !cuQ->isIntra(partQ)) + if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0))) { - uint32_t nsPartQ = partQ; - uint32_t nsPartP = partP; - - if (blockingStrength[absPartIdx] > 1 && - (cuQ->getCbf(nsPartQ, TEXT_LUMA, cuQ->m_tuDepth[nsPartQ]) || - cuP->getCbf(nsPartP, TEXT_LUMA, cuP->m_tuDepth[nsPartP]))) - bs = 1; - else + if (refP0 != refP1) // Different L0 & L1 { - if (dir == EDGE_HOR) - cuP = cuQ->getPUAbove(partP, partQ); - - if (slice->isInterB() || cuP->m_slice->isInterB()) - { - int32_t refIdx; - Frame *refP0, *refP1, *refQ0, *refQ1; - refIdx = cuP->m_refIdx[0][partP]; - refP0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx]; - refIdx = cuP->m_refIdx[1][partP]; - refP1 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[1][refIdx]; - refIdx = cuQ->m_refIdx[0][partQ]; - refQ0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx]; - refIdx = cuQ->m_refIdx[1][partQ]; - refQ1 = (refIdx < 0) ? 
NULL : slice->m_refPicList[1][refIdx]; - - MV mvp0 = cuP->m_mv[0][partP]; - MV mvp1 = cuP->m_mv[1][partP]; - MV mvq0 = cuQ->m_mv[0][partQ]; - MV mvq1 = cuQ->m_mv[1][partQ]; - - if (!refP0) mvp0 = 0; - if (!refP1) mvp1 = 0; - if (!refQ0) mvq0 = 0; - if (!refQ1) mvq1 = 0; - - if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0))) - { - if (refP0 != refP1) // Different L0 & L1 - { - if (refP0 == refQ0) - { - bs = ((abs(mvq0.x - mvp0.x) >= 4) || - (abs(mvq0.y - mvp0.y) >= 4) || - (abs(mvq1.x - mvp1.x) >= 4) || - (abs(mvq1.y - mvp1.y) >= 4)) ? 1 : 0; - } - else - { - bs = ((abs(mvq1.x - mvp0.x) >= 4) || - (abs(mvq1.y - mvp0.y) >= 4) || - (abs(mvq0.x - mvp1.x) >= 4) || - (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0; - } - } - else // Same L0 & L1 - { - bs = ((abs(mvq0.x - mvp0.x) >= 4) || - (abs(mvq0.y - mvp0.y) >= 4) || - (abs(mvq1.x - mvp1.x) >= 4) || - (abs(mvq1.y - mvp1.y) >= 4)) && - ((abs(mvq1.x - mvp0.x) >= 4) || - (abs(mvq1.y - mvp0.y) >= 4) || - (abs(mvq0.x - mvp1.x) >= 4) || - (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0; - } - } - else // for all different Ref_Idx - bs = 1; - } - else // slice->isInterP() - { - int32_t refIdx; - Frame *refp0, *refq0; - refIdx = cuP->m_refIdx[0][partP]; - refp0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx]; - refIdx = cuQ->m_refIdx[0][partQ]; - refq0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx]; - MV mvp0 = cuP->m_mv[0][partP]; - MV mvq0 = cuQ->m_mv[0][partQ]; - - if (!refp0) mvp0 = 0; - if (!refq0) mvq0 = 0; - - bs = ((refp0 != refq0) || - (abs(mvq0.x - mvp0.x) >= 4) || - (abs(mvq0.y - mvp0.y) >= 4)) ? 1 : 0; - } + if (refP0 == refQ0) + return ((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) || + (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) ? 1 : 0; + else + return ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) || + (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4)) ? 
1 : 0; + } + else // Same L0 & L1 + { + return (((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) || + (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) && + ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) || + (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4))) ? 1 : 0; } } - - blockingStrength[absPartIdx] = bs; + + // for all different Ref_Idx + return 1; } static inline int32_t calcDP(pixel* src, intptr_t offset) @@ -340,46 +275,45 @@ static inline bool useStrongFiltering(intptr_t offset, int32_t beta, int32_t tc, } /* Deblocking for the luminance component with strong or weak filter - * \param src pointer to picture data - * \param offset offset value for picture data - * \param tc tc value - * \param partPNoFilter indicator to disable filtering on partP - * \param partQNoFilter indicator to disable filtering on partQ - * \param filterSecondP decision weak filter/no filter for partP - * \param filterSecondQ decision weak filter/no filter for partQ */ -static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter) + * \param src pointer to picture data + * \param offset offset value for picture data + * \param tc tc value + * \param maskP indicator to enable filtering on partP + * \param maskQ indicator to enable filtering on partQ + * \param maskP1 decision weak filter/no filter for partP + * \param maskQ1 decision weak filter/no filter for partQ */ +static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ) { + int32_t tc2 = 2 * tc; + int32_t tcP = (tc2 & maskP); + int32_t tcQ = (tc2 & maskQ); for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) { int16_t m4 = (int16_t)src[0]; int16_t m3 = (int16_t)src[-offset]; int16_t m5 = (int16_t)src[offset]; int16_t m2 = (int16_t)src[-offset * 2]; - int32_t tc2 = 2 * tc; - if (!partPNoFilter) - { - int16_t m1 = (int16_t)src[-offset * 3]; 
- int16_t m0 = (int16_t)src[-offset * 4]; - src[-offset * 3] = (pixel)(Clip3(-tc2, tc2, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); - src[-offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); - src[-offset] = (pixel)(Clip3(-tc2, tc2, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3); - } - if (!partQNoFilter) - { - int16_t m6 = (int16_t)src[offset * 2]; - int16_t m7 = (int16_t)src[offset * 3]; - src[0] = (pixel)(Clip3(-tc2, tc2, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); - src[offset] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5); - src[offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6); - } + int16_t m6 = (int16_t)src[offset * 2]; + int16_t m1 = (int16_t)src[-offset * 3]; + int16_t m7 = (int16_t)src[offset * 3]; + int16_t m0 = (int16_t)src[-offset * 4]; + src[-offset * 3] = (pixel)(Clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); + src[-offset * 2] = (pixel)(Clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); + src[-offset] = (pixel)(Clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3); + src[0] = (pixel)(Clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); + src[offset] = (pixel)(Clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5); + src[offset * 2] = (pixel)(Clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6); } } /* Weak filter */ -static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter, - bool filterSecondP, bool filterSecondQ) +static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ, + int32_t maskP1, int32_t maskQ1) { int32_t thrCut = tc * 10; + int32_t tc2 = tc >> 1; + maskP1 &= maskP; + maskQ1 &= maskQ; for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) { @@ -394,38 
+328,31 @@ static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, { delta = Clip3(-tc, tc, delta); - int32_t tc2 = tc >> 1; - if (!partPNoFilter) + src[-offset] = Clip(m3 + (delta & maskP)); + src[0] = Clip(m4 - (delta & maskQ)); + if (maskP1) { - src[-offset] = Clip(m3 + delta); - if (filterSecondP) - { - int16_t m1 = (int16_t)src[-offset * 3]; - int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1)); - src[-offset * 2] = Clip(m2 + delta1); - } + int16_t m1 = (int16_t)src[-offset * 3]; + int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1)); + src[-offset * 2] = Clip(m2 + delta1); } - if (!partQNoFilter) + if (maskQ1) { - src[0] = Clip(m4 - delta); - if (filterSecondQ) - { - int16_t m6 = (int16_t)src[offset * 2]; - int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1)); - src[offset] = Clip(m5 + delta2); - } + int16_t m6 = (int16_t)src[offset * 2]; + int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1)); + src[offset] = Clip(m5 + delta2); } } } } /* Deblocking of one line/column for the chrominance component - * \param src pointer to picture data - * \param offset offset value for picture data - * \param tc tc value - * \param partPNoFilter indicator to disable filtering on partP - * \param partQNoFilter indicator to disable filtering on partQ */ -static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter) + * \param src pointer to picture data + * \param offset offset value for picture data + * \param tc tc value + * \param maskP indicator to disable filtering on partP + * \param maskQ indicator to disable filtering on partQ */ +static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ) { for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) { @@ -435,31 +362,25 @@ static inline void pelFilterChroma(pixel* 
src, intptr_t srcStep, intptr_t offset int16_t m2 = (int16_t)src[-offset * 2]; int32_t delta = Clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3)); - if (!partPNoFilter) - src[-offset] = Clip(m3 + delta); - if (!partQNoFilter) - src[0] = Clip(m4 - delta); + src[-offset] = Clip(m3 + (delta & maskP)); + src[0] = Clip(m4 - (delta & maskQ)); } } -void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]) +void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) { - PicYuv* reconYuv = cu->m_encData->m_reconPicYuv; - pixel* src = reconYuv->getLumaAddr(cu->m_cuAddr, absPartIdx); - - intptr_t stride = reconYuv->m_stride; - uint32_t numParts = cu->m_slice->m_sps->numPartInCUSize >> depth; + PicYuv* reconPic = cuQ->m_encData->m_reconPic; + pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx); + intptr_t stride = reconPic->m_stride; + const PPS* pps = cuQ->m_slice->m_pps; intptr_t offset, srcStep; - bool partPNoFilter = false; - bool partQNoFilter = false; - uint32_t partP = 0; - uint32_t partQ = 0; - const CUData* cuP = cu; - const CUData* cuQ = cu; - int32_t betaOffset = cuQ->m_slice->m_pps->deblockingFilterBetaOffsetDiv2 << 1; - int32_t tcOffset = cuQ->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1; + int32_t maskP = -1; + int32_t maskQ = -1; + int32_t betaOffset = pps->deblockingFilterBetaOffsetDiv2 << 1; + int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1; + bool bCheckNoFilter = pps->bTransquantBypassEnabled; if (dir == EDGE_VER) { @@ -474,106 +395,103 @@ void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, in src += (edge << LOG2_UNIT_SIZE) * stride; } - for (uint32_t idx = 0; idx < numParts; idx++) + uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> depth; + for (uint32_t idx = 0; idx < numUnits; idx++) { - uint32_t unitOffset = idx << 
LOG2_UNIT_SIZE; - uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx); - uint32_t bs = blockingStrength[bsAbsIdx]; - if (bs) - { - int32_t qpQ = cu->m_qp[bsAbsIdx]; - partQ = bsAbsIdx; + uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx); + uint32_t bs = blockStrength[partQ]; - // Derive neighboring PU index - if (dir == EDGE_VER) - cuP = cuQ->getPULeft(partP, partQ); - else // (dir == EDGE_HOR) - cuP = cuQ->getPUAbove(partP, partQ); + if (!bs) + continue; - int32_t qpP = cuP->m_qp[partP]; - int32_t qp = (qpP + qpQ + 1) >> 1; + int32_t qpQ = cuQ->m_qp[partQ]; - int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset); + // Derive neighboring PU index + uint32_t partP; + const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ)); - const int32_t bitdepthShift = X265_DEPTH - 8; - int32_t beta = s_betaTable[indexB] << bitdepthShift; + int32_t qpP = cuP->m_qp[partP]; + int32_t qp = (qpP + qpQ + 1) >> 1; - int32_t dp0 = calcDP(src + srcStep * (unitOffset + 0), offset); - int32_t dq0 = calcDQ(src + srcStep * (unitOffset + 0), offset); - int32_t dp3 = calcDP(src + srcStep * (unitOffset + 3), offset); - int32_t dq3 = calcDQ(src + srcStep * (unitOffset + 3), offset); - int32_t d0 = dp0 + dq0; - int32_t d3 = dp3 + dq3; + int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset); - int32_t d = d0 + d3; + const int32_t bitdepthShift = X265_DEPTH - 8; + int32_t beta = s_betaTable[indexB] << bitdepthShift; - if (d < beta) - { - if (cu->m_slice->m_pps->bTransquantBypassEnabled) - { - // check if each of PUs is lossless coded - partPNoFilter = !!cuP->m_tqBypass[partP]; - partQNoFilter = !!cuQ->m_tqBypass[partQ]; - } - - int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset)); - int32_t tc = s_tcTable[indexTC] << bitdepthShift; - - bool sw = (2 * d0 < (beta >> 2) && - 2 * d3 < (beta >> 2) && - useStrongFiltering(offset, beta, tc, src + srcStep * 
(unitOffset + 0)) && - useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 3))); - - if (sw) - pelFilterLumaStrong(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter); - else - { - int32_t sideThreshold = (beta + (beta >> 1)) >> 3; - int32_t dp = dp0 + dp3; - int32_t dq = dq0 + dq3; - bool filterP = (dp < sideThreshold); - bool filterQ = (dq < sideThreshold); - - pelFilterLuma(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter, filterP, filterQ); - } - } + intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE; + int32_t dp0 = calcDP(src + unitOffset , offset); + int32_t dq0 = calcDQ(src + unitOffset , offset); + int32_t dp3 = calcDP(src + unitOffset + srcStep * 3, offset); + int32_t dq3 = calcDQ(src + unitOffset + srcStep * 3, offset); + int32_t d0 = dp0 + dq0; + int32_t d3 = dp3 + dq3; + + int32_t d = d0 + d3; + + if (d >= beta) + continue; + + if (bCheckNoFilter) + { + // check if each of PUs is lossless coded + maskP = (cuP->m_tqBypass[partP] ? 0 : -1); + maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1); + } + + int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset)); + int32_t tc = s_tcTable[indexTC] << bitdepthShift; + + bool sw = (2 * d0 < (beta >> 2) && + 2 * d3 < (beta >> 2) && + useStrongFiltering(offset, beta, tc, src + unitOffset ) && + useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3)); + + if (sw) + pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ); + else + { + int32_t sideThreshold = (beta + (beta >> 1)) >> 3; + int32_t dp = dp0 + dp3; + int32_t dq = dq0 + dq3; + int32_t maskP1 = (dp < sideThreshold ? -1 : 0); + int32_t maskQ1 = (dq < sideThreshold ? 
-1 : 0); + + pelFilterLuma(src + unitOffset, srcStep, offset, tc, maskP, maskQ, maskP1, maskQ1); } } } -void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]) +void Deblock::edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) { - int32_t chFmt = cu->m_chromaFormat, chromaShift; + int32_t chFmt = cuQ->m_chromaFormat, chromaShift; intptr_t offset, srcStep; + const PPS* pps = cuQ->m_slice->m_pps; - bool partPNoFilter = false; - bool partQNoFilter = false; - uint32_t partP; - uint32_t partQ; - const CUData* cuP; - const CUData* cuQ = cu; - int32_t tcOffset = cu->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1; + int32_t maskP = -1; + int32_t maskQ = -1; + int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1; X265_CHECK(((dir == EDGE_VER) - ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cu->m_hChromaShift) - : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cu->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0, + ? 
((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_hChromaShift) + : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0, "invalid edge\n"); - PicYuv* reconPic = cu->m_encData->m_reconPicYuv; + PicYuv* reconPic = cuQ->m_encData->m_reconPic; intptr_t stride = reconPic->m_strideC; - intptr_t srcOffset = reconPic->getChromaAddrOffset(cu->m_cuAddr, absPartIdx); + intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx); + bool bCheckNoFilter = pps->bTransquantBypassEnabled; if (dir == EDGE_VER) { - chromaShift = cu->m_vChromaShift; - srcOffset += (edge << (LOG2_UNIT_SIZE - cu->m_hChromaShift)); + chromaShift = cuQ->m_vChromaShift; + srcOffset += (edge << (LOG2_UNIT_SIZE - cuQ->m_hChromaShift)); offset = 1; srcStep = stride; } else // (dir == EDGE_HOR) { - chromaShift = cu->m_hChromaShift; - srcOffset += edge * stride << (LOG2_UNIT_SIZE - cu->m_vChromaShift); + chromaShift = cuQ->m_hChromaShift; + srcOffset += edge * stride << (LOG2_UNIT_SIZE - cuQ->m_vChromaShift); offset = stride; srcStep = 1; } @@ -582,53 +500,50 @@ void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth, srcChroma[0] = reconPic->m_picOrg[1] + srcOffset; srcChroma[1] = reconPic->m_picOrg[2] + srcOffset; - uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift); + uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift); for (uint32_t idx = 0; idx < numUnits; idx++) { - uint32_t unitOffset = idx << LOG2_UNIT_SIZE; - uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx << chromaShift); - uint32_t bs = blockingStrength[bsAbsIdx]; + uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift); + uint32_t bs = blockStrength[partQ]; - if (bs > 1) - { - int32_t qpQ = cu->m_qp[bsAbsIdx]; - partQ = bsAbsIdx; + if (bs <= 1) + continue; - // Derive neighboring PU index - if (dir == EDGE_VER) - cuP = cuQ->getPULeft(partP, partQ); - else // 
(dir == EDGE_HOR) - cuP = cuQ->getPUAbove(partP, partQ); + int32_t qpQ = cuQ->m_qp[partQ]; - int32_t qpP = cuP->m_qp[partP]; + // Derive neighboring PU index + uint32_t partP; + const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ)); - if (cu->m_slice->m_pps->bTransquantBypassEnabled) - { - // check if each of PUs is lossless coded - partPNoFilter = !!cuP->m_tqBypass[partP]; - partQNoFilter = !!cuQ->m_tqBypass[partQ]; - } + int32_t qpP = cuP->m_qp[partP]; - for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++) + if (bCheckNoFilter) + { + // check if each of PUs is lossless coded + maskP = (cuP->m_tqBypass[partP] ? 0 : -1); + maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1); + } + + intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE; + for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++) + { + int32_t chromaQPOffset = pps->chromaQpOffset[chromaIdx]; + int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset; + if (qp >= 30) { - int32_t chromaQPOffset = !chromaIdx ? 
cu->m_slice->m_pps->chromaCbQpOffset : cu->m_slice->m_pps->chromaCrQpOffset; - int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset; - if (qp >= 30) - { - if (chFmt == X265_CSP_I420) - qp = g_chromaScale[qp]; - else - qp = X265_MIN(qp, 51); - } - - int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset)); - const int32_t bitdepthShift = X265_DEPTH - 8; - int32_t tc = s_tcTable[indexTC] << bitdepthShift; - pixel* srcC = srcChroma[chromaIdx]; - - pelFilterChroma(srcC + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter); + if (chFmt == X265_CSP_I420) + qp = g_chromaScale[qp]; + else + qp = X265_MIN(qp, 51); } + + int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset)); + const int32_t bitdepthShift = X265_DEPTH - 8; + int32_t tc = s_tcTable[indexTC] << bitdepthShift; + pixel* srcC = srcChroma[chromaIdx]; + + pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ); } } } diff --git a/source/common/deblock.h b/source/common/deblock.h index 4bdfeff..f872625 100644 --- a/source/common/deblock.h +++ b/source/common/deblock.h @@ -42,31 +42,24 @@ public: void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); } - void deblockCTU(CUData* cu, int32_t dir); + void deblockCTU(const CUData* ctu, int32_t dir); protected: // CU-level deblocking function - void deblockCU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, const int32_t Edge, uint8_t blockingStrength[]); - - struct Param - { - uint8_t leftEdge; - uint8_t topEdge; - }; + void deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[]); // set filtering functions - void setLoopfilterParam(CUData* cu, uint32_t absZOrderIdx, Param *params); - void setEdgefilterTU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[]); - void setEdgefilterPU(CUData* cu, uint32_t absZOrderIdx, int32_t 
dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits); - void setEdgefilterMultiple(CUData* cu, uint32_t absZOrderIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits); + void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[]); + void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits); + void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits); // get filtering functions - void getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t partIdx, uint8_t blockingStrength[]); + uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]); // filter luma/chroma functions - void edgeFilterLuma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]); - void edgeFilterChroma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]); + void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]); + void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]); static const uint8_t s_tcTable[54]; static const uint8_t s_betaTable[52]; diff --git a/source/common/frame.cpp b/source/common/frame.cpp index 8ae912f..9c7abee 100644 --- a/source/common/frame.cpp +++ b/source/common/frame.cpp @@ -34,7 +34,7 @@ Frame::Frame() m_reconRowCount.set(0); m_countRefEncoders = 0; m_encData = NULL; - m_reconPicYuv = NULL; + m_reconPic = NULL; m_next = NULL; m_prev = NULL; memset(&m_lowres, 0, sizeof(m_lowres)); @@ -42,26 +42,26 @@ Frame::Frame() bool Frame::create(x265_param *param) { - m_origPicYuv = new PicYuv; + m_fencPic 
= new PicYuv; - return m_origPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp) && - m_lowres.create(m_origPicYuv, param->bframes, !!param->rc.aqMode); + return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) && + m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode); } bool Frame::allocEncodeData(x265_param *param, const SPS& sps) { m_encData = new FrameData; - m_reconPicYuv = new PicYuv; - m_encData->m_reconPicYuv = m_reconPicYuv; - bool ok = m_encData->create(param, sps) && m_reconPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp); + m_reconPic = new PicYuv; + m_encData->m_reconPic = m_reconPic; + bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp); if (ok) { /* initialize right border of m_reconpicYuv as SAO may read beyond the * end of the picture accessing uninitialized pixels */ int maxHeight = sps.numCuInHeight * g_maxCUSize; - memset(m_reconPicYuv->m_picOrg[0], 0, m_reconPicYuv->m_stride * maxHeight); - memset(m_reconPicYuv->m_picOrg[1], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift)); - memset(m_reconPicYuv->m_picOrg[2], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift)); + memset(m_reconPic->m_picOrg[0], 0, m_reconPic->m_stride * maxHeight); + memset(m_reconPic->m_picOrg[1], 0, m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift)); + memset(m_reconPic->m_picOrg[2], 0, m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift)); } return ok; } @@ -70,7 +70,7 @@ bool Frame::allocEncodeData(x265_param *param, const SPS& sps) void Frame::reinit(const SPS& sps) { m_bChromaExtended = false; - m_reconPicYuv = m_encData->m_reconPicYuv; + m_reconPic = m_encData->m_reconPic; m_encData->reinit(sps); } @@ -83,18 +83,18 @@ void Frame::destroy() m_encData = NULL; } - if (m_origPicYuv) + if (m_fencPic) { - m_origPicYuv->destroy(); - delete 
m_origPicYuv; - m_origPicYuv = NULL; + m_fencPic->destroy(); + delete m_fencPic; + m_fencPic = NULL; } - if (m_reconPicYuv) + if (m_reconPic) { - m_reconPicYuv->destroy(); - delete m_reconPicYuv; - m_reconPicYuv = NULL; + m_reconPic->destroy(); + delete m_reconPic; + m_reconPic = NULL; } m_lowres.destroy(); diff --git a/source/common/frame.h b/source/common/frame.h index 0fae62a..d023946 100644 --- a/source/common/frame.h +++ b/source/common/frame.h @@ -43,30 +43,29 @@ public: /* These two items will be NULL until the Frame begins to be encoded, at which point * it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */ - FrameData* m_encData; - PicYuv* m_reconPicYuv; + FrameData* m_encData; + PicYuv* m_reconPic; /* Data associated with x265_picture */ - PicYuv* m_origPicYuv; - int m_poc; - int64_t m_pts; // user provided presentation time stamp - int64_t m_reorderedPts; - int64_t m_dts; - int32_t m_forceqp; // Force to use the qp specified in qp file - x265_intra_data* m_intraData; - x265_inter_data* m_interData; - void* m_userData; // user provided pointer passed in with this picture + PicYuv* m_fencPic; + int m_poc; + int64_t m_pts; // user provided presentation time stamp + int64_t m_reorderedPts; + int64_t m_dts; + int32_t m_forceqp; // Force to use the qp specified in qp file + void* m_userData; // user provided pointer passed in with this picture - Lowres m_lowres; - bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis + Lowres m_lowres; + bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */ - ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference - volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount + ThreadSafeInteger m_reconRowCount; // count of CTU rows completely 
reconstructed and extended for motion reference + volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount - Frame* m_next; // PicList doubly linked list pointers - Frame* m_prev; + Frame* m_next; // PicList doubly linked list pointers + Frame* m_prev; + x265_analysis_data m_analysisData; Frame(); bool create(x265_param *param); diff --git a/source/common/framedata.h b/source/common/framedata.h index f6ea9d4..92754ce 100644 --- a/source/common/framedata.h +++ b/source/common/framedata.h @@ -49,7 +49,7 @@ public: x265_param* m_param; FrameData* m_freeListNext; - PicYuv* m_reconPicYuv; + PicYuv* m_reconPic; bool m_bHasReferences; /* used during DPB/RPS updates */ int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */ diff --git a/source/common/ipfilter.cpp b/source/common/ipfilter.cpp index 4982cba..8467654 100644 --- a/source/common/ipfilter.cpp +++ b/source/common/ipfilter.cpp @@ -35,7 +35,7 @@ using namespace x265; namespace { template -void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height) { int shift = IF_INTERNAL_PREC - X265_DEPTH; int row, col; @@ -74,9 +74,9 @@ void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, in } template -void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx) { - int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + const int16_t* coeff = (N == 4) ? 
g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; int headRoom = IF_FILTER_PREC; int offset = (1 << (headRoom - 1)); uint16_t maxVal = (1 << X265_DEPTH) - 1; @@ -115,9 +115,9 @@ void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstS } template -void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) { - int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; int headRoom = IF_INTERNAL_PREC - X265_DEPTH; int shift = IF_FILTER_PREC - headRoom; int offset = -IF_INTERNAL_OFFS << shift; @@ -160,9 +160,9 @@ void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t ds } template -void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx) { - int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; int shift = IF_FILTER_PREC; int offset = 1 << (shift - 1); uint16_t maxVal = (1 << X265_DEPTH) - 1; @@ -201,9 +201,9 @@ void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstSt } template -void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx) { - int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + const int16_t* c = (N == 4) ? 
g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; int headRoom = IF_INTERNAL_PREC - X265_DEPTH; int shift = IF_FILTER_PREC - headRoom; int offset = -IF_INTERNAL_OFFS << shift; @@ -239,13 +239,13 @@ void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dst } template -void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx) { int headRoom = IF_INTERNAL_PREC - X265_DEPTH; int shift = IF_FILTER_PREC + headRoom; int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC); uint16_t maxVal = (1 << X265_DEPTH) - 1; - const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); + const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); src -= (N / 2 - 1) * srcStride; @@ -282,9 +282,9 @@ void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dst } template -void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx) { - const int16_t *const c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); + const int16_t* c = (N == 8 ? 
g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); int shift = IF_FILTER_PREC; int row, col; @@ -317,13 +317,13 @@ void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t d } template -void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx) +void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int coeffIdx) { int headRoom = IF_INTERNAL_PREC - X265_DEPTH; int shift = IF_FILTER_PREC + headRoom; int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC); uint16_t maxVal = (1 << X265_DEPTH) - 1; - const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); + const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); src -= (N / 2 - 1) * srcStride; @@ -360,7 +360,7 @@ void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t } template -void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) +void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY) { short immedVals[(64 + 8) * (64 + 8)]; @@ -509,9 +509,9 @@ void Setup_C_IPFilterPrimitives(EncoderPrimitives& p) CHROMA_444(16, 64); p.luma_p2s = filterConvertPelToShort_c; - p.chroma_p2s[X265_CSP_I444] = filterConvertPelToShort_c; - p.chroma_p2s[X265_CSP_I420] = filterConvertPelToShort_c; - p.chroma_p2s[X265_CSP_I422] = filterConvertPelToShort_c; + p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c; + p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c; + p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c; p.extendRowBorder = extendCURowColBorder; } diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp index fe4f7b9..50bbc89 100644 --- a/source/common/lowres.cpp +++ b/source/common/lowres.cpp @@ -69,6 +69,7 @@ bool Lowres::create(PicYuv *origPic, int 
_bframes, bool bAQEnabled) lowresPlane[3] = buffer[3] + padoffset; CHECKED_MALLOC(intraCost, int32_t, cuCount); + CHECKED_MALLOC(intraMode, uint8_t, cuCount); for (int i = 0; i < bframes + 2; i++) { @@ -99,6 +100,7 @@ void Lowres::destroy() X265_FREE(buffer[i]); X265_FREE(intraCost); + X265_FREE(intraMode); for (int i = 0; i < bframes + 2; i++) { @@ -155,7 +157,7 @@ void Lowres::init(PicYuv *origPic, int poc, int type) intraMbs[i] = 0; /* downscale and generate 4 hpel planes for lookahead */ - primitives.frame_init_lowres_core(origPic->m_picOrg[0], + primitives.frameInitLowres(origPic->m_picOrg[0], lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3], origPic->m_stride, lumaStride, width, lines); @@ -164,5 +166,5 @@ void Lowres::init(PicYuv *origPic, int poc, int type) extendPicBorder(lowresPlane[1], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); extendPicBorder(lowresPlane[2], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); extendPicBorder(lowresPlane[3], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); - fpelPlane = lowresPlane[0]; + fpelPlane[0] = lowresPlane[0]; } diff --git a/source/common/lowres.h b/source/common/lowres.h index b88ad3e..a206c96 100644 --- a/source/common/lowres.h +++ b/source/common/lowres.h @@ -26,27 +26,36 @@ #include "primitives.h" #include "common.h" +#include "picyuv.h" #include "mv.h" namespace x265 { // private namespace -class PicYuv; - struct ReferencePlanes { ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); } - pixel* fpelPlane; + pixel* fpelPlane[3]; pixel* lowresPlane[4]; + PicYuv* reconPic; bool isWeighted; bool isLowres; + intptr_t lumaStride; - int weight; - int offset; - int shift; - int round; + intptr_t chromaStride; + + struct { + int weight; + int offset; + int shift; + int round; + } w[3]; + + pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[0] + reconPic->m_cuOffsetY[ctuAddr] + 
reconPic->m_buOffsetY[absPartIdx]; } + pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[1] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; } + pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[2] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; } /* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels * in case QPEL is required. Else it returns a pointer to the HPEL pixels */ @@ -56,11 +65,10 @@ struct ReferencePlanes { int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1); pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride; - - MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2); - int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1); - - pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride; + int qmvx = qmv.x + (qmv.x & 1); + int qmvy = qmv.y + (qmv.y & 1); + int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1); + pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride; primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32); return buf; } @@ -79,9 +87,10 @@ struct ReferencePlanes ALIGN_VAR_16(pixel, subpelbuf[8 * 8]); int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1); pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride; - MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2); - int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1); - pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride; + int qmvx = qmv.x + (qmv.x & 1); + int qmvy = qmv.y + (qmv.y & 1); + int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1); + pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride; primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32); return comp(fenc, FENC_STRIDE, subpelbuf, 8); } @@ -116,6 +125,7 @@ 
struct Lowres : public ReferencePlanes int32_t* rowSatds[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; int intraMbs[X265_BFRAME_MAX + 2]; int32_t* intraCost; + uint8_t* intraMode; int64_t satdCost; uint16_t* lowresCostForRc; uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]); diff --git a/source/common/mv.h b/source/common/mv.h index 22a7073..dad3729 100644 --- a/source/common/mv.h +++ b/source/common/mv.h @@ -44,19 +44,19 @@ public: int32_t word; }; - MV() : word(0) {} - + MV() {} + MV(int32_t w) : word(w) {} MV(int16_t _x, int16_t _y) : x(_x), y(_y) {} - const MV& operator =(uint32_t w) { word = w; return *this; } + MV& operator =(uint32_t w) { word = w; return *this; } - const MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; } + MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; } - const MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; } + MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; } - const MV& operator >>=(int i) { x >>= i; y >>= i; return *this; } + MV& operator >>=(int i) { x >>= i; y >>= i; return *this; } - const MV& operator <<=(int i) { x <<= i; y <<= i; return *this; } + MV& operator <<=(int i) { x <<= i; y <<= i; return *this; } MV operator >>(int i) const { return MV(x >> i, y >> i); } @@ -64,16 +64,18 @@ public: MV operator *(int16_t i) const { return MV(x * i, y * i); } - const MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); } + MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); } - const MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); } + MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); } bool operator ==(const MV& other) const { return word == other.word; } bool operator !=(const MV& other) const { return word != other.word; } + bool operator !() const { return !word; } + // Scale down a QPEL mv to FPEL mv, rounding 
up by one HPEL offset - MV roundToFPel() const { return MV(x + 2, y + 2) >> 2; } + MV roundToFPel() const { return MV((x + 2) >> 2, (y + 2) >> 2); } // Scale up an FPEL mv to QPEL by shifting up two bits MV toQPel() const { return *this << 2; } diff --git a/source/common/param.cpp b/source/common/param.cpp index af70058..2159fd9 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -176,6 +176,8 @@ void x265_param_default(x265_param *param) param->rdPenalty = 0; param->psyRd = 0.0; param->psyRdoq = 0.0; + param->analysisMode = 0; + param->analysisFileName = NULL; param->bIntraInBFrames = 0; param->bLossless = 0; param->bCULossless = 0; @@ -193,7 +195,7 @@ void x265_param_default(x265_param *param) param->rc.qpStep = 4; param->rc.rateControlMode = X265_RC_CRF; param->rc.qp = 32; - param->rc.aqMode = X265_AQ_AUTO_VARIANCE; + param->rc.aqMode = X265_AQ_VARIANCE; param->rc.aqStrength = 1.0; param->rc.cuTree = 1; param->rc.rfConstantMax = 0; @@ -406,6 +408,24 @@ int x265_param_default_preset(x265_param *param, const char *preset, const char param->scenecutThreshold = 0; param->rc.cuTree = 0; } + else if (!strcmp(tune, "grain")) + { + param->deblockingFilterBetaOffset = -2; + param->deblockingFilterTCOffset = -2; + param->bIntraInBFrames = 0; + param->psyRdoq = 30; + param->psyRd = 0.5; + param->rc.ipFactor = 1.1; + param->rc.pbFactor = 1.1; + param->rc.aqMode = X265_AQ_VARIANCE; + param->rc.aqStrength = 0.3; + param->rc.qCompress = 0.8; + } + else if (!strcmp(tune, "cbr")) + { + param->rc.pbFactor = 1.0; + param->rc.rateTolerance = 0.5; + } else return -1; } @@ -532,9 +552,6 @@ int x265_param_parse(x265_param *p, const char *name, const char *value) } } } - OPT("csv") p->csvfn = value; - OPT("scaling-list") p->scalingLists = value; - OPT("lambda-file") p->rc.lambdaFileName = value; OPT("threads") p->poolNumThreads = atoi(value); OPT("frame-threads") p->frameNumThreads = atoi(value); OPT("pmode") p->bDistributeModeAnalysis = atobool(value); @@ -623,7 
+640,22 @@ int x265_param_parse(x265_param *p, const char *name, const char *value) OPT("psy-rdoq") p->psyRdoq = atof(value); OPT("signhide") p->bEnableSignHiding = atobool(value); OPT("b-intra") p->bIntraInBFrames = atobool(value); - OPT("lft") p->bEnableLoopFilter = atobool(value); + OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */ + OPT("deblock") + { + if (2 == sscanf(value, "%d:%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset) || + 2 == sscanf(value, "%d,%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset)) + { + p->bEnableLoopFilter = true; + } + else if (sscanf(value, "%d", &p->deblockingFilterTCOffset)) + { + p->bEnableLoopFilter = 1; + p->deblockingFilterBetaOffset = p->deblockingFilterTCOffset; + } + else + p->bEnableLoopFilter = atobool(value); + } OPT("sao") p->bEnableSAO = atobool(value); OPT("sao-non-deblock") p->bSaoNonDeblocked = atobool(value); OPT("ssim") p->bEnableSsim = atobool(value); @@ -635,6 +667,11 @@ int x265_param_parse(x265_param *p, const char *name, const char *value) OPT("hrd") p->bEmitHRDSEI = atobool(value); OPT2("ipratio", "ip-factor") p->rc.ipFactor = atof(value); OPT2("pbratio", "pb-factor") p->rc.pbFactor = atof(value); + OPT("qcomp") p->rc.qCompress = atof(value); + OPT("qpstep") p->rc.qpStep = atoi(value); + OPT("ratetol") p->rc.rateTolerance = atof(value); + OPT("cplxblur") p->rc.complexityBlur = atof(value); + OPT("qblur") p->rc.qblur = atof(value); OPT("aq-mode") p->rc.aqMode = atoi(value); OPT("aq-strength") p->rc.aqStrength = atof(value); OPT("vbv-maxrate") p->rc.vbvMaxBitrate = atoi(value); @@ -729,7 +766,8 @@ int x265_param_parse(x265_param *p, const char *name, const char *value) &p->vui.defDispWinRightOffset, &p->vui.defDispWinBottomOffset) != 4; } - OPT("nr") p->noiseReduction = atoi(value); + OPT("nr-intra") p->noiseReductionIntra = atoi(value); + OPT("nr-inter") p->noiseReductionInter = atoi(value); OPT("pass") { int pass = Clip3(0, 3, atoi(value)); @@ -737,6 +775,10 
@@ int x265_param_parse(x265_param *p, const char *name, const char *value) p->rc.bStatRead = pass & 2; } OPT("stats") p->rc.statFileName = strdup(value); + OPT("csv") p->csvfn = strdup(value); + OPT("scaling-list") p->scalingLists = strdup(value); + OPT("lambda-file") p->rc.lambdaFileName = strdup(value); + OPT("analysis-file") p->analysisFileName = strdup(value); else return X265_PARAM_BAD_NAME; #undef OPT @@ -960,6 +1002,10 @@ int x265_check_params(x265_param *param) "Aq-Mode is out of range"); CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3, "Aq-Strength is out of range"); + CHECK(param->deblockingFilterTCOffset < -6 || param->deblockingFilterTCOffset > 6, + "deblocking filter tC offset must be in the range of -6 to +6"); + CHECK(param->deblockingFilterBetaOffset < -6 || param->deblockingFilterBetaOffset > 6, + "deblocking filter Beta offset must be in the range of -6 to +6"); CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0"); CHECK(param->psyRdoq < 0 || 50.0 < param->psyRdoq, "Psy-rdoq strength must be between 0 and 50.0"); CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative"); @@ -1031,8 +1077,12 @@ int x265_check_params(x265_param *param) "Valid initial VBV buffer occupancy must be a fraction 0 - 1, or size in kbits"); CHECK(param->rc.bitrate < 0, "Target bitrate can not be less than zero"); - if (param->noiseReduction) - CHECK(100 > param->noiseReduction || param->noiseReduction > 1000, "Valid noise reduction range 100 - 1000"); + CHECK(param->rc.qCompress < 0.5 || param->rc.qCompress > 1.0, + "qCompress must be between 0.5 and 1.0"); + if (param->noiseReductionIntra) + CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000"); + if (param->noiseReductionInter) + CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000"); CHECK(param->rc.rateControlMode == X265_RC_CRF && 
param->rc.bStatRead, "Constant rate-factor is incompatible with 2pass"); CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead, @@ -1061,7 +1111,7 @@ int x265_set_globals(x265_param *param) { static int once /* = 0 */; - if (ATOMIC_CAS32(&once, 0, 1) == 1) + if (ATOMIC_INC(&once) > 1) { if (param->maxCUSize != g_maxCUSize) { @@ -1152,11 +1202,19 @@ void x265_print_params(x265_param *param) fprintf(stderr, "psy-rd=%.2lf ", param->psyRd); if (param->psyRdoq > 0.) fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq); - TOOLOPT(param->bEnableEarlySkip, "esd"); - TOOLOPT(param->bEnableCbfFastMode, "cfm"); - if (param->noiseReduction) - fprintf(stderr, "nr=%d ", param->noiseReduction); - TOOLOPT(param->bEnableLoopFilter, "lft"); + TOOLOPT(param->bEnableEarlySkip, "early-skip"); + TOOLOPT(param->bEnableCbfFastMode, "fast-cbf"); + if (param->noiseReductionIntra) + fprintf(stderr, "nr-intra=%d ", param->noiseReductionIntra); + if (param->noiseReductionInter) + fprintf(stderr, "nr-inter=%d ", param->noiseReductionInter); + if (param->bEnableLoopFilter) + { + if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset) + fprintf(stderr, "deblock(tC=%d:B=%d) ", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset); + else + TOOLOPT(param->bEnableLoopFilter, "deblock"); + } if (param->bEnableSAO) fprintf(stderr, "sao%s ", param->bSaoNonDeblocked ? "-non-deblock" : ""); TOOLOPT(param->bEnableSignHiding, "signhide"); diff --git a/source/common/picyuv.h b/source/common/picyuv.h index 1e18d8c..a702517 100644 --- a/source/common/picyuv.h +++ b/source/common/picyuv.h @@ -76,12 +76,21 @@ public: pixel* getCrAddr(uint32_t ctuAddr) { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; } pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; } pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) { return m_picOrg[plane] + (plane ? 
m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); } + const pixel* getLumaAddr(uint32_t ctuAddr) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr]; } + const pixel* getCbAddr(uint32_t ctuAddr) const { return m_picOrg[1] + m_cuOffsetC[ctuAddr]; } + const pixel* getCrAddr(uint32_t ctuAddr) const { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; } + const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; } + const pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) const { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); } /* get pointer to CU start address */ pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; } pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } + const pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; } + const pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } + const pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } + const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } }; void updateChecksum(const pixel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, intptr_t stride, int row, uint32_t cuHeight); diff --git a/source/common/pixel.cpp 
b/source/common/pixel.cpp index 3e0530d..a56b8d7 100644 --- a/source/common/pixel.cpp +++ b/source/common/pixel.cpp @@ -32,32 +32,32 @@ using namespace x265; -#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \ - p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \ - 
p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \ - p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>; +#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \ + p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_24x32] = 
FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>; #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \ p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \ @@ -90,16 +90,14 @@ namespace { // place functions in anonymous namespace (file static) template -int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { int sum = 0; for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) - { sum += abs(pix1[x] - pix2[x]); - } pix1 += stride_pix1; pix2 += stride_pix2; @@ -109,16 +107,14 @@ int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) } template -int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2) +int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2) { int sum = 0; for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) - { sum += abs(pix1[x] - pix2[x]); - } pix1 += stride_pix1; pix2 += stride_pix2; @@ -128,7 +124,7 @@ int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2 } template -void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res) +void sad_x3(const pixel* pix1, const pixel* 
pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res) { res[0] = 0; res[1] = 0; @@ -150,7 +146,7 @@ void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstr } template -void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res) +void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res) { res[0] = 0; res[1] = 0; @@ -175,17 +171,17 @@ void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, int } template -int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2) +int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2) { int sum = 0; - int iTemp; + int tmp; for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) { - iTemp = pix1[x] - pix2[x]; - sum += (iTemp * iTemp); + tmp = pix1[x] - pix2[x]; + sum += (tmp * tmp); } pix1 += stride_pix1; @@ -217,7 +213,7 @@ inline sum2_t abs2(sum2_t a) return (a + s) ^ s; } -int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { sum2_t tmp[4][2]; sum2_t a0, a1, a2, a3, b0, b1; @@ -245,7 +241,7 @@ int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix return (int)(sum >> 1); } -int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2) +int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2) { ssum2_t tmp[4][2]; ssum2_t a0, a1, a2, a3, b0, b1; @@ -274,7 +270,7 @@ int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride } // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once -int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, 
intptr_t stride_pix2) { sum2_t tmp[4][4]; sum2_t a0, a1, a2, a3; @@ -300,41 +296,33 @@ int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix template // calculate satd in blocks of 4x4 -int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { int satd = 0; for (int row = 0; row < h; row += 4) - { for (int col = 0; col < w; col += 4) - { satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1, pix2 + row * stride_pix2 + col, stride_pix2); - } - } return satd; } template // calculate satd in blocks of 8x4 -int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { int satd = 0; for (int row = 0; row < h; row += 4) - { for (int col = 0; col < w; col += 8) - { satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1, pix2 + row * stride_pix2 + col, stride_pix2); - } - } return satd; } -inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -371,12 +359,12 @@ inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) return (int)sum; } -int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); } -inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2) +inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2) { ssum2_t tmp[8][4]; ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -413,12 +401,12 @@ inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, 
int16_t *pix2, intptr_t i_p return (int)sum; } -int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2) +int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2) { return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); } -int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) @@ -432,159 +420,129 @@ int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) template // Calculate sa8d in blocks of 8x8 -int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { int cost = 0; for (int y = 0; y < h; y += 8) - { for (int x = 0; x < w; x += 8) - { cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); - } - } return cost; } template // Calculate sa8d in blocks of 16x16 -int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { int cost = 0; for (int y = 0; y < h; y += 16) - { for (int x = 0; x < w; x += 16) - { cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); - } - } return cost; } template -int pixel_ssd_s_c(short *a, intptr_t dstride) +int pixel_ssd_s_c(const int16_t* a, intptr_t dstride) { int sum = 0; for (int y = 0; y < size; y++) { for (int x = 0; x < size; x++) - { sum += a[x] * a[x]; - } + a += dstride; } return sum; } template -void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val) +void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val) { for (int y = 0; y < size; y++) - { for (int x = 0; x < size; x++) - { dst[y * dstride + x] = val; - } - } -} - -void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int 
shift, int size) -{ - for (int i = 0; i < size; i++) - { - for (int j = 0; j < size; j++) - { - dst[i * size + j] = ((int)src[i * stride + j]) << shift; - } - } } template -void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset) +void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) { - for (int i = 0; i < size; i++) - { - for (int j = 0; j < size; j++) - { - dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift; - } - } -} - -void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size) -{ - int round = 1 << (shift - 1); + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n"); + X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n"); + X265_CHECK(shift >= 0, "invalid shift\n"); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = (int16_t)((src[j] + round) >> shift); - } + dst[j] = src[j] << shift; - src += size; - dst += stride; + src += srcStride; + dst += size; } } -void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size) +template +void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) { - int round = 1 << (shift - 1); + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n"); + X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n"); + X265_CHECK(shift > 0, "invalid shift\n"); + int16_t round = 1 << (shift - 1); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = (int16_t)((src[j] + round) >> shift); - } + dst[j] = (src[j] + round) >> shift; - src += size; - dst += stride; + src += srcStride; + dst += size; } } template -void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) { + X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment 
error\n"); + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n"); + X265_CHECK(shift >= 0, "invalid shift\n"); + for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = ((int16_t)src[j] << shift); - } + dst[j] = src[j] << shift; src += size; - dst += stride; + dst += dstStride; } } template -void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) { + X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n"); + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n"); + X265_CHECK(shift > 0, "invalid shift\n"); + + int16_t round = 1 << (shift - 1); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = (src[j] << shift); - } + dst[j] = (src[j] + round) >> shift; src += size; - dst += stride; + dst += dstStride; } } template -void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) +void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride) { for (int y = 0; y < blockSize; y++) { for (int x = 0; x < blockSize; x++) - { residual[x] = static_cast(fenc[x]) - static_cast(pred[x]); - } fenc += stride; residual += stride; @@ -593,18 +551,14 @@ void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) } template -void transpose(pixel* dst, pixel* src, intptr_t stride) +void transpose(pixel* dst, const pixel* src, intptr_t stride) { for (int k = 0; k < blockSize; k++) - { for (int l = 0; l < blockSize; l++) - { dst[k * blockSize + l] = src[l * stride + k]; - } - } } -void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) +void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) { int x, y; @@ -622,7 +576,7 @@ void 
weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStrid } } -void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) +void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) { int x, y; @@ -646,14 +600,12 @@ void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, } template -void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int) +void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) { for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) - { dst[x] = (src0[x] + src1[x] + 1) >> 1; - } src0 += sstride0; src1 += sstride1; @@ -661,7 +613,7 @@ void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, p } } -void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) +void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/) { int x; @@ -675,9 +627,9 @@ void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) } } -void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) +void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride) { - int x, y; + uint32_t x, y; for (y = 0; y < 64; y += 2) { @@ -694,13 +646,13 @@ void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) } } -void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc, +void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, intptr_t src_stride, intptr_t dst_stride, int width, int height) { for (int y = 0; y < height; y++) { - pixel *src1 = src0 + src_stride; - pixel *src2 = src1 + src_stride; + const pixel* src1 = src0 + src_stride; + const pixel* src2 = src1 + src_stride; for (int x = 0; x < width; x++) { // slower than naive bilinear, but matches 
asm @@ -720,7 +672,7 @@ void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, } /* structural similarity metric */ -void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]) +void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]) { for (int z = 0; z < 2; z++) { @@ -794,7 +746,7 @@ float ssim_end_4(int sum0[5][4], int sum1[5][4], int width) } template -uint64_t pixel_var(pixel *pix, intptr_t i_stride) +uint64_t pixel_var(const pixel* pix, intptr_t i_stride) { uint32_t sum = 0, sqr = 0; @@ -817,7 +769,7 @@ uint64_t pixel_var(pixel *pix, intptr_t i_stride) #endif template -int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) +int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) { static pixel zeroBuf[8] /* = { 0 } */; @@ -850,7 +802,7 @@ int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) } template -int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) +int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) { static int16_t zeroBuf[8] /* = { 0 } */; @@ -882,28 +834,13 @@ int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstri } } -void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, - pixel *src, intptr_t srcStride, int w, int h) -{ - for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride) - { - for (int x = 0; x < w; x++) - { - dstu[x] = src[2 * x]; - dstv[x] = src[2 * x + 1]; - } - } -} - template -void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb) +void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = b[x]; - } a += stridea; b += strideb; @@ 
-911,14 +848,12 @@ void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb) } template -void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb) +void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = b[x]; - } a += stridea; b += strideb; @@ -926,7 +861,7 @@ void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb) } template -void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb) +void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb) { for (int y = 0; y < by; y++) { @@ -942,14 +877,12 @@ void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb) } template -void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb) +void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = (int16_t)b[x]; - } a += stridea; b += strideb; @@ -957,14 +890,12 @@ void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb) } template -void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1) +void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = (int16_t)(b0[x] - b1[x]); - } b0 += sstride0; b1 += sstride1; @@ -973,14 +904,12 @@ void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t } template -void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1) +void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = Clip(b0[x] 
+ b1[x]); - } b0 += sstride0; b1 += sstride1; @@ -989,7 +918,7 @@ void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t } template -void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) +void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) { int shiftNum, offset; @@ -1010,28 +939,24 @@ void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intpt } } -void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift) +void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift) { for (int r = 0; r < height; r++) { for (int c = 0; c < width; c++) - { dst[c] = ((pixel)src[c]) << shift; - } dst += dstStride; src += srcStride; } } -void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) +void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) { for (int r = 0; r < height; r++) { for (int c = 0; c < width; c++) - { dst[c] = (pixel)((src[c] >> shift) & mask); - } dst += dstStride; src += srcStride; @@ -1040,8 +965,8 @@ void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstS /* Estimate the total amount of influence on future quality that could be had if we * were to improve the reference samples used to inter predict any given CU. 
*/ -void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, - int32_t *invQscales, double *fpsFactor, int len) +void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, + const int32_t* invQscales, const double* fpsFactor, int len) { double fps = *fpsFactor / 256; @@ -1068,12 +993,12 @@ void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int mar primitives.extendRowBorder(pic, stride, width, height, marginX); /* copy top row to create above margin */ - pixel *top = pic - marginX; + pixel* top = pic - marginX; for (int y = 0; y < marginY; y++) memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel)); /* copy bottom row to create below margin */ - pixel *bot = pic - marginX + (height - 1) * stride; + pixel* bot = pic - marginX + (height - 1) * stride; for (int y = 0; y < marginY; y++) memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel)); } @@ -1113,6 +1038,62 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.satd[LUMA_64x16] = satd8<64, 16>; p.satd[LUMA_16x64] = satd8<16, 64>; + p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4; + p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>; + + p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4; + p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>; + + p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL; + 
p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>; + p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>; + p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>; + p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>; + + p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>; + + p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4; + p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>; + + p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>; + p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>; + p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>; + p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>; + p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] 
= satd8<24, 64>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>; + #define CHROMA_420(W, H) \ p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg; \ p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c; \ @@ -1121,13 +1102,14 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c; #define CHROMA_422(W, H) \ - p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg; \ + p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg; \ p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c; \ p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c; \ p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c; \ p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c; #define CHROMA_444(W, H) \ + p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \ p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg; \ p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c; \ p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c; \ @@ -1157,8 +1139,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c; \ p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c; - - LUMA(4, 4); LUMA(8, 8); CHROMA_420(4, 4); @@ -1278,9 +1258,9 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) CHROMA_444(64, 16); CHROMA_444(16, 64); - SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel) - SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel) - SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t) + SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel) + SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel) + 
SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t) p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>; p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>; @@ -1288,22 +1268,22 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>; p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>; - p.cvt16to32_shl = convert16to32_shl; - p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>; - p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>; - p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>; - p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>; - p.cvt32to16_shr = convert32to16_shr; - p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>; - p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>; - p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>; - p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>; - - p.copy_shr = copy_shr; - p.copy_shl[BLOCK_4x4] = copy_shl<4>; - p.copy_shl[BLOCK_8x8] = copy_shl<8>; - p.copy_shl[BLOCK_16x16] = copy_shl<16>; - p.copy_shl[BLOCK_32x32] = copy_shl<32>; + p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>; + p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>; + p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>; + p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>; + p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>; + p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>; + p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>; + p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>; + p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>; + p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>; + p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>; + p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>; + p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>; + p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>; + p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>; + p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>; p.sa8d[BLOCK_4x4] = satd_4x4; p.sa8d[BLOCK_8x8] = sa8d_8x8; @@ -1371,7 +1351,7 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.scale1D_128to64 = scale1D_128to64; 
p.scale2D_64to32 = scale2D_64to32; - p.frame_init_lowres_core = frame_init_lowres_core; + p.frameInitLowres = frame_init_lowres_core; p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; @@ -1379,7 +1359,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.var[BLOCK_16x16] = pixel_var<16>; p.var[BLOCK_32x32] = pixel_var<32>; p.var[BLOCK_64x64] = pixel_var<64>; - p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; p.planecopy_cp = planecopy_cp_c; p.planecopy_sp = planecopy_sp_c; p.propagateCost = estimateCUPropagateCost; diff --git a/source/common/predict.cpp b/source/common/predict.cpp index a142c5a..6be4f19 100644 --- a/source/common/predict.cpp +++ b/source/common/predict.cpp @@ -83,7 +83,8 @@ void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, ui { int tuSize = 1 << log2TrSize; - pixel *refLft, *refAbv; + pixel* refLft; + pixel* refAbv; if (!(g_intraFilterFlags[dirMode] & tuSize)) { @@ -187,18 +188,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) ShortYuv& shortYuv = m_predShortYuv[0]; if (bLuma) - predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); if (bChroma) - predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); } else { if (bLuma) - predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); if (bChroma) - predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterChromaPixel(predYuv, 
*m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); } } else @@ -253,13 +254,13 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) if (bLuma) { - predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); - predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); } if (bChroma) { - predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); - predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); } if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag)) @@ -277,18 +278,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) ShortYuv& shortYuv = m_predShortYuv[0]; if (bLuma) - predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); if (bChroma) - predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); } else { if (bLuma) - predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterLumaPixel(predYuv, 
*m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); if (bChroma) - predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); } } else @@ -302,18 +303,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) ShortYuv& shortYuv = m_predShortYuv[0]; if (bLuma) - predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); if (bChroma) - predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); } else { if (bLuma) - predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); if (bChroma) - predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); } } } @@ -321,13 +322,13 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const { - pixel *dst = dstYuv.getLumaAddr(m_puAbsPartIdx); + pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx); intptr_t dstStride = dstYuv.m_size; intptr_t srcStride = refPic.m_stride; intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride; int partEnum = partitionFromSizes(m_puWidth, m_puHeight); - pixel* src = const_cast(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; + const pixel* src = 
refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; int xFrac = mv.x & 0x3; int yFrac = mv.y & 0x3; @@ -350,12 +351,12 @@ void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const { - int16_t *dst = dstSYuv.getLumaAddr(m_puAbsPartIdx); + int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx); int dstStride = dstSYuv.m_size; intptr_t srcStride = refPic.m_stride; intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride; - pixel *src = const_cast(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; + const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; int xFrac = mv.x & 0x3; int yFrac = mv.y & 0x3; @@ -391,8 +392,8 @@ void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride; - pixel* refCb = const_cast(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; - pixel* refCr = const_cast(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx); pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx); @@ -441,8 +442,8 @@ void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, cons intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride; - pixel* refCb = const_cast(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; - pixel* refCr = const_cast(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + const pixel* refCr = 
refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx); int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx); @@ -459,8 +460,8 @@ void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, cons if (!(yFrac | xFrac)) { - primitives.chroma_p2s[m_csp](refCb, refStride, dstCb, cxWidth, cxHeight); - primitives.chroma_p2s[m_csp](refCr, refStride, dstCr, cxWidth, cxHeight); + primitives.chroma[m_csp].p2s(refCb, refStride, dstCb, cxWidth, cxHeight); + primitives.chroma[m_csp].p2s(refCr, refStride, dstCr, cxWidth, cxHeight); } else if (!yFrac) { @@ -492,20 +493,12 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& int w0, w1, offset, shiftNum, shift, round; uint32_t src0Stride, src1Stride, dststride; - pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); - pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); - pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); - - const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx); - const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx); - const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx); - - const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx); - const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx); - const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx); - if (bLuma) { + pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); + const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx); + const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx); + // Luma w0 = wp0[0].w; offset = wp0[0].o + wp1[0].o; @@ -542,6 +535,13 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& if (bChroma) { + pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); + pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); + const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx); + const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx); + const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx); + const 
int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx); + // Chroma U w0 = wp0[1].w; offset = wp0[1].o + wp1[1].o; @@ -602,19 +602,14 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& /* weighted averaging for uni-pred */ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const { - pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); - pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); - pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); - - const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx); - const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx); - const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx); - int w0, offset, shiftNum, shift, round; uint32_t srcStride, dstStride; if (bLuma) { + pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); + const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx); + // Luma w0 = wp[0].w; offset = wp[0].offset; @@ -624,11 +619,16 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal srcStride = srcYuv.m_size; dstStride = predYuv.m_size; - primitives.weight_sp(const_cast(srcY0), dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset); + primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset); } if (bChroma) { + pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); + pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); + const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx); + const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx); + // Chroma U w0 = wp[1].w; offset = wp[1].offset; @@ -642,7 +642,7 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift; uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift; - primitives.weight_sp(const_cast(srcU0), dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); + primitives.weight_sp(srcU0, dstU, 
srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); // Chroma V w0 = wp[2].w; @@ -650,7 +650,7 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal shift = wp[2].shift + shiftNum; round = shift ? (1 << (shift - 1)) : 0; - primitives.weight_sp(const_cast(srcV0), dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); + primitives.weight_sp(srcV0, dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); } } @@ -668,8 +668,8 @@ void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t ab int tuSize = intraNeighbors.tuSize; int tuSize2 = tuSize << 1; - pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = cu.m_encData->m_reconPicYuv->m_stride; + pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = cu.m_encData->m_reconPic->m_stride; fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors); @@ -744,8 +744,8 @@ void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint3 initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors); uint32_t tuSize = intraNeighbors.tuSize; - const pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = cu.m_encData->m_reconPicYuv->m_strideC; + const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = cu.m_encData->m_reconPic->m_strideC; pixel* adiRef = getAdiChromaBuf(chromaId, tuSize); fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors); @@ -765,7 +765,7 @@ void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t } int numIntraNeighbor = 0; - bool *bNeighborFlags = intraNeighbors->bNeighborFlags; + bool* bNeighborFlags = intraNeighbors->bNeighborFlags; 
uint32_t partIdxLT, partIdxRT, partIdxLB; @@ -829,15 +829,15 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p } else // reference samples are partially available { - const bool *bNeighborFlags = intraNeighbors.bNeighborFlags; - const bool *pNeighborFlags; + const bool* bNeighborFlags = intraNeighbors.bNeighborFlags; + const bool* pNeighborFlags; int aboveUnits = intraNeighbors.aboveUnits; int leftUnits = intraNeighbors.leftUnits; int unitWidth = intraNeighbors.unitWidth; int unitHeight = intraNeighbors.unitHeight; int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth); pixel adiLineBuffer[5 * MAX_CU_SIZE]; - pixel *adi; + pixel* adi; // Initialize for (int i = 0; i < totalSamples; i++) @@ -893,7 +893,7 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p while (next < totalUnits && !bNeighborFlags[next]) next++; - pixel *pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth))); + pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? 
(next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth))); const pixel refSample = *pAdiLineNext; // Pad unavailable samples with new value int nextOrTop = X265_MIN(next, leftUnits); @@ -959,12 +959,12 @@ bool Predict::isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT) return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft); } -int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags) +int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags) { const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT]; const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1; const uint32_t idxStep = 1; - bool *validFlagPtr = bValidFlags; + bool* validFlagPtr = bValidFlags; int numIntra = 0; for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep) @@ -985,12 +985,12 @@ int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t par return numIntra; } -int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags) +int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags) { const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT]; const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1; const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize; - bool *validFlagPtr = bValidFlags; + bool* validFlagPtr = bValidFlags; int numIntra = 0; for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep) @@ -1011,10 +1011,10 @@ int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t part return numIntra; } -int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags) +int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags) { const uint32_t numUnitsInPU = 
g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1; - bool *validFlagPtr = bValidFlags; + bool* validFlagPtr = bValidFlags; int numIntra = 0; for (uint32_t offset = 1; offset <= numUnitsInPU; offset++) @@ -1035,10 +1035,10 @@ int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_ return numIntra; } -int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags) +int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags) { const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1; - bool *validFlagPtr = bValidFlags; + bool* validFlagPtr = bValidFlags; int numIntra = 0; for (uint32_t offset = 1; offset <= numUnitsInPU; offset++) diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp index 7592d27..ebb8af6 100644 --- a/source/common/primitives.cpp +++ b/source/common/primitives.cpp @@ -75,7 +75,8 @@ void Setup_Alias_Primitives(EncoderPrimitives &p) p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i]; p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i]; p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i]; - p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i]; + p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i]; + p.chroma[X265_CSP_I444].satd[i] = p.satd[i]; } for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) @@ -84,15 +85,6 @@ void Setup_Alias_Primitives(EncoderPrimitives &p) p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i]; } - for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) - { - int partL = partitionFromLog2Size(i + 2); - p.square_copy_pp[i] = p.luma_copy_pp[partL]; - p.square_copy_ps[i] = p.luma_copy_ps[partL]; - p.square_copy_sp[i] = p.luma_copy_sp[partL]; - p.square_copy_ss[i] = p.luma_copy_ss[partL]; - } - primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4]; primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8]; 
primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16]; @@ -107,6 +99,52 @@ void Setup_Alias_Primitives(EncoderPrimitives &p) primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4]; primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12]; primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16]; + + // Chroma SATD can often reuse luma primitives + p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = primitives.satd[LUMA_4x4]; + p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = primitives.satd[LUMA_8x8]; + p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = primitives.satd[LUMA_16x16]; + p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = primitives.satd[LUMA_32x32]; + + p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = primitives.satd[LUMA_8x4]; + p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = primitives.satd[LUMA_4x8]; + p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = primitives.satd[LUMA_16x8]; + p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = primitives.satd[LUMA_8x16]; + p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = primitives.satd[LUMA_32x16]; + p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = primitives.satd[LUMA_16x32]; + + p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = primitives.satd[LUMA_16x12]; + p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = primitives.satd[LUMA_12x16]; + p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = primitives.satd[LUMA_16x4]; + p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = primitives.satd[LUMA_4x16]; + p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = primitives.satd[LUMA_32x24]; + p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = primitives.satd[LUMA_24x32]; + p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = primitives.satd[LUMA_32x8]; + p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = primitives.satd[LUMA_8x32]; + + p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = primitives.satd[LUMA_4x8]; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = primitives.satd[LUMA_8x16]; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = primitives.satd[LUMA_16x32]; + 
p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = primitives.satd[LUMA_32x64]; + + p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = primitives.satd[LUMA_4x4]; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = primitives.satd[LUMA_8x8]; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = primitives.satd[LUMA_4x16]; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = primitives.satd[LUMA_16x16]; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = primitives.satd[LUMA_8x32]; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = primitives.satd[LUMA_32x32]; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = primitives.satd[LUMA_16x64]; + + //p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = primitives.satd[LUMA_8x4]; + //p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>; + //p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = primitives.satd[LUMA_16x8]; + //p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>; + //p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>; + //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16]; + //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>; } } using namespace x265; @@ -123,17 +161,15 @@ void x265_setup_primitives(x265_param *param, int cpuid) if (!primitives.sad[0]) { Setup_C_Primitives(primitives); - Setup_Instrinsic_Primitives(primitives, cpuid); #if ENABLE_ASSEMBLY + Setup_Instrinsic_Primitives(primitives, cpuid); Setup_Assembly_Primitives(primitives, cpuid); #else x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n"); #endif Setup_Alias_Primitives(primitives); - - initROM(); } if (param->logLevel >= X265_LOG_INFO) @@ -169,74 +205,14 @@ void x265_setup_primitives(x265_param *param, int cpuid) } } -#if !defined(ENABLE_ASSEMBLY) -#if defined(_MSC_VER) -#include -#endif - +#if 
ENABLE_ASSEMBLY +/* these functions are implemented in assembly. When assembly is not being + * compiled, they are unnecessary and can be NOPs */ +#else extern "C" { -// the intrinsic primitives will not use MMX instructions, so if assembly -// is disabled there should be no reason to use EMMS. +int x265_cpu_cpuid_test(void) { return 0; } void x265_cpu_emms(void) {} - -#if defined(X265_ARCH_X86) - -#if defined(_MSC_VER) -# pragma warning(disable: 4100) -#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax -# define __cpuidex(regsArray, level, index) \ - __asm__ __volatile__ ("cpuid" \ - : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \ - : "0" (level), "2" (index)); -#else -# error "compiler not supported" -#endif - -int x265_cpu_cpuid_test(void) -{ - return 0; +void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {} +void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {} } - -void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) -{ - int output[4]; - - __cpuidex(output, op, 0); - *eax = output[0]; - *ebx = output[1]; - *ecx = output[2]; - *edx = output[3]; -} - -void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx) -{ - uint64_t out = 0; - -#if X265_ARCH_X86 - -#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) - - // MSVC 2010 SP1 or later, or similar Intel release - out = _xgetbv(op); - -#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax - - uint32_t a, d; - __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op) :); - *eax = a; - *edx = d; - return; - -#elif defined(_WIN64) // On x64 with older compilers, this is impossible - -#endif // if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) - -#endif // if x86 - - *eax = (uint32_t)out; - *edx = 
(uint32_t)(out >> 32); -} - -#endif // X265_ARCH_X86 -} -#endif // if !ENABLE_ASSEMBLY +#endif diff --git a/source/common/primitives.h b/source/common/primitives.h index 8300c21..0c93f98 100644 --- a/source/common/primitives.h +++ b/source/common/primitives.h @@ -132,162 +132,147 @@ inline int partitionFromLog2Size(int log2Size) return log2Size - 2; } -typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned -typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride); -typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); -typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride); -typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res); -typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res); -typedef void (*blockcpy_sp_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned -typedef void (*blockcpy_sc_t)(int bx, int by, int16_t *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned -typedef void (*pixelsub_ps_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); -typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight); -typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val); - -typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter); -typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); - -typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int); -typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, 
intptr_t, int, int); -typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int); -typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int); -typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride); -typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size); -typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift); - -typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride); -typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride); -typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff); - -typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride); -typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); -typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); -typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift); -typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); -typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff); - -typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); -typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); -typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride); -typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, 
pixel *dstv, pixel *dstc, +typedef int (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned +typedef int (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride); +typedef int (*pixelcmp_sp_t)(const int16_t* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); +typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride); +typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res); +typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res); +typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight); +typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val); + +typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter); +typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma); + +typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride); + +typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride); +typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride); +typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff); + +typedef 
void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride); +typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff); +typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); +typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift); +typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); +typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff); + +typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); +typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride); +typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc, intptr_t src_stride, intptr_t dst_stride, int width, int height); typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX); -typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]); +typedef void (*ssim_4x4x2_core_t)(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]); typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width); -typedef uint64_t (*var_t)(pixel *pix, intptr_t stride); -typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h); +typedef uint64_t (*var_t)(const pixel* 
pix, intptr_t stride); +typedef void (*plane_copy_deinterleave_t)(pixel* dstu, intptr_t dstuStride, pixel* dstv, intptr_t dstvStride, const pixel* src, intptr_t srcStride, int w, int h); -typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_hps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); -typedef void (*filter_ps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_sp_t) (int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_ss_t) (int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY); -typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); +typedef void (*filter_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_hps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); +typedef void (*filter_ps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); +typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); -typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned -typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t 
srcStride); -typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride); -typedef void (*copy_ss_t)(int16_t *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride); +typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned +typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); +typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); -typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); -typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1); -typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride); +typedef void (*pixel_sub_ps_t)(int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1); +typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); +typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride); -typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft); -typedef void (*planecopy_cp_t) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift); -typedef void (*planecopy_sp_t) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); +typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft); +typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int 
width, int height, int shift); +typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); -typedef void (*cutree_propagate_cost) (int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len); +typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len); /* Define a structure containing function pointers to optimized encoder * primitives. Each pointer can reference either an assembly routine, * a vectorized primitive, or a C function. */ struct EncoderPrimitives { - pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size - pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size - pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size - pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed - pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed - pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed - pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed - pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD) - pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions - pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks - pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks - pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS]; - - blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value - cvt16to32_shl_t cvt16to32_shl; - cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1]; 
- cvt32to16_shr_t cvt32to16_shr; - cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1]; - copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1]; - copy_shr_t copy_shr; - copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1]; - - copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS]; - copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS]; - copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS]; - copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS]; - pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS]; - pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS]; - copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS]; - copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS]; - copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS]; - copy_ss_t square_copy_ss[NUM_SQUARE_BLOCKS]; - - filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS]; - filter_hps_t luma_hps[NUM_LUMA_PARTITIONS]; - filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS]; - filter_ps_t luma_vps[NUM_LUMA_PARTITIONS]; - filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS]; - filter_ss_t luma_vss[NUM_LUMA_PARTITIONS]; - filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS]; - filter_p2s_t luma_p2s; - filter_p2s_t chroma_p2s[X265_CSP_COUNT]; - - weightp_sp_t weight_sp; - weightp_pp_t weight_pp; - pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS]; - addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS]; - - intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE]; - intra_allangs_t intra_pred_allangs[NUM_TR_SIZE]; - scale_t scale1D_128to64; - scale_t scale2D_64to32; - - dct_t dct[NUM_DCTS]; - idct_t idct[NUM_IDCTS]; - quant_t quant; - nquant_t nquant; - dequant_scaling_t dequant_scaling; - dequant_normal_t dequant_normal; - count_nonzero_t count_nonzero; - denoiseDct_t denoiseDct; - - calcresidual_t calcresidual[NUM_SQUARE_BLOCKS]; - transpose_t transpose[NUM_SQUARE_BLOCKS]; - - var_t var[NUM_SQUARE_BLOCKS]; - ssim_4x4x2_core_t ssim_4x4x2_core; - ssim_end4_t ssim_end_4; - - downscale_t frame_init_lowres_core; - plane_copy_deinterleave_t plane_copy_deinterleave_c; - extendCURowBorder_t extendRowBorder; - // sao primitives - saoCuOrgE0_t saoCuOrgE0; - planecopy_cp_t 
planecopy_cp; - planecopy_sp_t planecopy_sp; - - cutree_propagate_cost propagateCost; + pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size + pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size + pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size + pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed + pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed + pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed + pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed + pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD) + pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions + pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks + pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks + pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS]; + + dct_t dct[NUM_DCTS]; + idct_t idct[NUM_IDCTS]; + quant_t quant; + nquant_t nquant; + dequant_scaling_t dequant_scaling; + dequant_normal_t dequant_normal; + count_nonzero_t count_nonzero; + denoiseDct_t denoiseDct; + calcresidual_t calcresidual[NUM_SQUARE_BLOCKS]; + blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value + cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1]; + cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1]; + cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1]; + cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1]; + copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1]; + + intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE]; + intra_allangs_t intra_pred_allangs[NUM_TR_SIZE]; + transpose_t transpose[NUM_SQUARE_BLOCKS]; + scale_t scale1D_128to64; + scale_t scale2D_64to32; + + var_t 
var[NUM_SQUARE_BLOCKS]; + ssim_4x4x2_core_t ssim_4x4x2_core; + ssim_end4_t ssim_end_4; + + saoCuOrgE0_t saoCuOrgE0; + + downscale_t frameInitLowres; + cutree_propagate_cost propagateCost; + + extendCURowBorder_t extendRowBorder; + planecopy_cp_t planecopy_cp; + planecopy_sp_t planecopy_sp; + + weightp_sp_t weight_sp; + weightp_pp_t weight_pp; + pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS]; + addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS]; + + filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS]; + filter_hps_t luma_hps[NUM_LUMA_PARTITIONS]; + filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS]; + filter_ps_t luma_vps[NUM_LUMA_PARTITIONS]; + filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS]; + filter_ss_t luma_vss[NUM_LUMA_PARTITIONS]; + filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS]; + filter_p2s_t luma_p2s; + + copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS]; + copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS]; + copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS]; + copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS]; + pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS]; + pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS]; struct { + pixelcmp_t satd[NUM_LUMA_PARTITIONS]; filter_pp_t filter_vpp[NUM_LUMA_PARTITIONS]; filter_ps_t filter_vps[NUM_LUMA_PARTITIONS]; filter_sp_t filter_vsp[NUM_LUMA_PARTITIONS]; @@ -301,7 +286,8 @@ struct EncoderPrimitives copy_ss_t copy_ss[NUM_LUMA_PARTITIONS]; pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS]; pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS]; - } chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here + filter_p2s_t p2s; + } chroma[X265_CSP_COUNT]; }; void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY); diff --git a/source/common/quant.cpp b/source/common/quant.cpp index 387962c..a6b50fd 100644 --- a/source/common/quant.cpp +++ b/source/common/quant.cpp @@ -50,7 +50,7 @@ inline int fastMin(int x, int y) return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y) } -inline int getICRate(uint32_t absLevel, int32_t diffLevel, 
const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) +inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) { X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n"); X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); @@ -81,7 +81,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOne // NOTE: mapping to x86 hardware instruction BSR unsigned long size; - CLZ32(size, absLevel); + CLZ(size, absLevel); int egs = size * 2 + 1; rate += egs << 15; @@ -106,7 +106,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOne } /* Calculates the cost for specific absolute transform level */ -inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) +inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) { X265_CHECK(absLevel, "absLevel should not be zero\n"); @@ -135,7 +135,7 @@ inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *g if (symbol) { unsigned long idx; - CLZ32(idx, symbol + 1); + CLZ(idx, symbol + 1); length = idx; } @@ -166,7 +166,7 @@ bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, m_useRDOQ = useRDOQ; m_psyRdoqScale = (int64_t)(psyScale * 256.0); m_scalingList = &scalingList; - m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); + m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); @@ -195,8 +195,8 @@ void Quant::setQPforQuant(const CUData& ctu) m_nr = m_frameNr ? 
&m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL; int qpy = ctu.m_qp[0]; m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET); - setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat); - setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat); + setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat); + setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat); } void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) @@ -216,7 +216,7 @@ void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams) { const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG; - const uint16_t *scan = codeParams.scan; + const uint16_t* scan = codeParams.scan; bool lastCG = true; for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--) @@ -322,58 +322,55 @@ uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSi return numSig; } -uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride, +uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip) { + const uint32_t sizeIdx = log2TrSize - 2; if (cu.m_tqBypass[absPartIdx]) { X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n"); - return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride); + return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride); } bool isLuma = ttype == TEXT_LUMA; bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip; - bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA; int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling 
through forward transform - int trSize = 1 << log2TrSize; X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n"); if (useTransformSkip) { #if X265_DEPTH <= 10 - primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize); + X265_CHECK(transformShift >= 0, "invalid transformShift\n"); + primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift); #else if (transformShift >= 0) - primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize); + primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift); else - { - int shift = -transformShift; - int offset = (1 << (shift - 1)); - primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset); - } + primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift); #endif } else { - const uint32_t sizeIdx = log2TrSize - 2; + bool isIntra = cu.isIntra(absPartIdx); int useDST = !sizeIdx && isLuma && isIntra; int index = DCT_4x4 + sizeIdx - useDST; - primitives.dct[index](residual, m_resiDctCoeff, stride); + primitives.dct[index](residual, m_resiDctCoeff, resiStride); /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so * there is no risk of performing this DCT unnecessarily */ if (usePsy) { + int trSize = 1 << log2TrSize; /* perform DCT on source pixels for psy-rdoq */ - primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride); + primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride); primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize); } - if (m_nr && !isIntra) + if (m_nr) { /* denoise is not applied to intra residual, so DST can be ignored */ - int cat = sizeIdx + 4 * !isLuma; + int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra; int numCoeff = 1 << (log2TrSize * 2); primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff); 
m_nr->count[cat]++; @@ -389,7 +386,7 @@ uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16 int scalingListType = ttype + (isLuma ? 3 : 0); int rem = m_qpParam[ttype].rem; int per = m_qpParam[ttype].per; - int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; + const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; int qbits = QUANT_SHIFT + per + transformShift; int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9); @@ -408,12 +405,13 @@ uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16 } } -void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, +void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) { + const uint32_t sizeIdx = log2TrSize - 2; if (transQuantBypass) { - primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0); + primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0); return; } @@ -427,7 +425,7 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s if (m_scalingList->m_bEnabled) { int scalingListType = (bIntra ? 
0 : 3) + ttype; - int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; + const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem]; primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift); } else @@ -438,20 +436,18 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s if (useTransformSkip) { - int trSize = 1 << log2TrSize; - #if X265_DEPTH <= 10 - primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize); + X265_CHECK(transformShift > 0, "invalid transformShift\n"); + primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift); #else if (transformShift > 0) - primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize); + primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift); else - primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift); + primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift); #endif } else { - const uint32_t sizeIdx = log2TrSize - 2; int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n"); @@ -459,23 +455,23 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s // DC only if (numSig == 1 && coeff[0] != 0 && !useDST) { - const int shift_1st = 7; + const int shift_1st = 7 - 6; const int add_1st = 1 << (shift_1st - 1); - const int shift_2nd = 12 - (X265_DEPTH - 8); + const int shift_2nd = 12 - (X265_DEPTH - 8) - 3; const int add_2nd = 1 << (shift_2nd - 1); - int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd; - primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val); + int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> 
shift_2nd; + primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val); return; } - primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride); + primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride); } } /* Rate distortion optimized quantization for entropy coding engines using * probability models like CABAC */ -uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy) +uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy) { int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; @@ -486,7 +482,7 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex int per = m_qpParam[ttype].per; int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ int add = (1 << (qbits - 1)); - int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; + const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; int numCoeff = 1 << (log2TrSize * 2); @@ -503,7 +499,7 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) * scale applied that must be removed during unquant. Note that in real dequant there is clipping * at several stages. We skip the clipping for simplicity when measuring RD cost */ - int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; + const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 
4 : 0); int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; int scaleBits = SCALE_BITS - 2 * transformShift; @@ -616,8 +612,8 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex // coefficient level estimation const uint32_t oneCtx = 4 * ctxSet + c1; const uint32_t absCtx = ctxSet + c2; - const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx]; - const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx]; + const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx]; + const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx]; uint16_t level = 0; uint32_t sigCoefBits = 0; diff --git a/source/common/quant.h b/source/common/quant.h index ac575f7..2801f00 100644 --- a/source/common/quant.h +++ b/source/common/quant.h @@ -58,6 +58,20 @@ struct QpParam } }; +#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */ +#define MAX_NUM_TR_CATEGORIES 16 /* 32, 16, 8, 4 transform categories each for luma and chroma */ + +// NOTE: MUST be 16-byte aligned for asm code +struct NoiseReduction +{ + /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32 + * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 + * Intra 0..7 - Inter 8..15 */ + uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; + uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; + uint32_t count[MAX_NUM_TR_CATEGORIES]; +}; + class Quant { protected: @@ -69,8 +83,8 @@ protected: bool m_useRDOQ; int64_t m_psyRdoqScale; - int32_t* m_resiDctCoeff; - int32_t* m_fencDctCoeff; + int16_t* m_resiDctCoeff; + int16_t* m_fencDctCoeff; int16_t* m_fencShortBuf; enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */ @@ -90,10 +104,10 @@ public: /* CU setup */ void setQPforQuant(const CUData& ctu); - uint32_t transformNxN(CUData& cu, pixel *fenc, uint32_t fencstride, int16_t* residual, uint32_t stride, coeff_t* coeff, + uint32_t 
transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip); - void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, + void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig); /* static methods shared with entropy.cpp */ @@ -107,7 +121,7 @@ protected: uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters); - uint32_t rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy); + uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy); inline uint32_t getRateLast(uint32_t posx, uint32_t posy) const; }; diff --git a/source/common/shortyuv.cpp b/source/common/shortyuv.cpp index 2a7e153..0b95f0e 100644 --- a/source/common/shortyuv.cpp +++ b/source/common/shortyuv.cpp @@ -84,7 +84,7 @@ void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_ const int16_t* src = getLumaAddr(absPartIdx); int16_t* dst = dstYuv.getLumaAddr(absPartIdx); - primitives.square_copy_ss[log2Size - 2](dst, dstYuv.m_size, const_cast(src), m_size); + primitives.luma_copy_ss[log2Size - 2](dst, dstYuv.m_size, src, m_size); } void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const @@ -92,7 +92,7 @@ void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log const int16_t* src = getLumaAddr(absPartIdx); pixel* dst = dstYuv.getLumaAddr(absPartIdx); - primitives.square_copy_sp[log2Size - 2](dst, dstYuv.m_size, const_cast(src), m_size); + primitives.luma_copy_sp[log2Size - 2](dst, dstYuv.m_size, src, 
m_size); } void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const @@ -103,8 +103,8 @@ void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint3 int16_t* dstU = dstYuv.getCbAddr(absPartIdx); int16_t* dstV = dstYuv.getCrAddr(absPartIdx); - primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, const_cast(srcU), m_csize); - primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, const_cast(srcV), m_csize); + primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, srcU, m_csize); + primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, srcV, m_csize); } void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const @@ -115,6 +115,6 @@ void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t l pixel* dstU = dstYuv.getCbAddr(absPartIdx); pixel* dstV = dstYuv.getCrAddr(absPartIdx); - primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, const_cast(srcU), m_csize); - primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, const_cast(srcV), m_csize); + primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, srcU, m_csize); + primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, srcV, m_csize); } diff --git a/source/common/slice.h b/source/common/slice.h index bd0ba63..90712b9 100644 --- a/source/common/slice.h +++ b/source/common/slice.h @@ -242,8 +242,7 @@ struct PPS { uint32_t maxCuDQPDepth; - int chromaCbQpOffset; // use param - int chromaCrQpOffset; // use param + int chromaQpOffset[2]; // use param bool bUseWeightPred; // use param bool bUseWeightedBiPred; // use param @@ -334,6 +333,8 @@ public: void setRefPicList(PicList& picList); + const Frame* getRefPic(int list, int refIdx) const { return refIdx >= 0 ? 
m_refPicList[list][refIdx] : NULL; } + bool getRapPicFlag() const { return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL diff --git a/source/common/threading.cpp b/source/common/threading.cpp index cb50eb2..1d888ae 100644 --- a/source/common/threading.cpp +++ b/source/common/threading.cpp @@ -1,6 +1,4 @@ /***************************************************************************** - * x265: threading class and intrinsics - ***************************************************************************** * Copyright (C) 2013 x265 project * * Authors: Steve Borho @@ -48,21 +46,21 @@ bool Thread::start() { DWORD threadId; - this->thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId); + thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId); return threadId > 0; } void Thread::stop() { - if (this->thread) - WaitForSingleObject(this->thread, INFINITE); + if (thread) + WaitForSingleObject(thread, INFINITE); } Thread::~Thread() { - if (this->thread) - CloseHandle(this->thread); + if (thread) + CloseHandle(thread); } #else /* POSIX / pthreads */ @@ -79,10 +77,9 @@ static void *ThreadShim(void *opaque) bool Thread::start() { - if (pthread_create(&this->thread, NULL, ThreadShim, this)) + if (pthread_create(&thread, NULL, ThreadShim, this)) { - this->thread = 0; - + thread = 0; return false; } @@ -91,8 +88,8 @@ bool Thread::start() void Thread::stop() { - if (this->thread) - pthread_join(this->thread, NULL); + if (thread) + pthread_join(thread, NULL); } Thread::~Thread() {} @@ -101,6 +98,7 @@ Thread::~Thread() {} Thread::Thread() { - this->thread = 0; + thread = 0; } + } diff --git a/source/common/threading.h b/source/common/threading.h index ef5642a..5f95782 100644 --- a/source/common/threading.h +++ b/source/common/threading.h @@ -1,6 +1,4 @@ /***************************************************************************** - * x265: threading class and intrinsics - 
***************************************************************************** * Copyright (C) 2013 x265 project * * Authors: Steve Borho @@ -49,11 +47,10 @@ #include #include -#define CLZ32(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 -#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x) -#define ATOMIC_OR(ptr, mask) __sync_or_and_fetch(ptr, mask) -#define ATOMIC_CAS(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval) -#define ATOMIC_CAS32(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval) +#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 +#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x) +#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask) +#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask) #define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1) #define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1) #define GIVE_UP_TIME() usleep(0) @@ -62,53 +59,12 @@ #include -#if !_WIN64 -inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ -{ - uint32_t high32 = (uint32_t)(x64 >> 32); - uint32_t low32 = (uint32_t)x64; - - if (high32) - { - _BitScanReverse(id, high32); - *id += 32; - return 1; - } - else if (low32) - return _BitScanReverse(id, low32); - else - return *id = 0; -} - -inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ -{ - uint32_t high32 = (uint32_t)(x64 >> 32); - uint32_t low32 = (uint32_t)x64; - - if (high32) - { - _BitScanForward(id, high32); - *id += 32; - return 1; - } - else if (low32) - return _BitScanForward(id, low32); - else - return *id = 0; -} - -#endif // if !_WIN64 - -#ifndef ATOMIC_OR -#define ATOMIC_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, mask) -#endif - -#define CLZ32(id, x) _BitScanReverse(&id, x) -#define CTZ64(id, x) _BitScanForward64(&id, x) -#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval) -#define 
ATOMIC_CAS32(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval) +#define CLZ(id, x) _BitScanReverse(&id, x) +#define CTZ(id, x) _BitScanForward(&id, x) #define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr) #define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr) +#define ATOMIC_OR(ptr, mask) _InterlockedOr((volatile LONG*)ptr, (LONG)mask) +#define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask) #define GIVE_UP_TIME() Sleep(0) #endif // ifdef __GNUC__ diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp index 8a2ab9d..d32e55f 100644 --- a/source/common/threadpool.cpp +++ b/source/common/threadpool.cpp @@ -1,6 +1,4 @@ /***************************************************************************** - * x265: singleton thread pool and interface classes - ***************************************************************************** * Copyright (C) 2013 x265 project * * Authors: Steve Borho @@ -87,7 +85,7 @@ private: int m_numThreads; int m_numSleepMapWords; PoolThread *m_threads; - volatile uint64_t *m_sleepMap; + volatile uint32_t *m_sleepMap; /* Lock for write access to the provider lists. 
Threads are * always allowed to read m_firstProvider and follow the @@ -174,8 +172,8 @@ void PoolThread::threadMain() void ThreadPoolImpl::markThreadAsleep(int id) { - int word = id >> 6; - uint64_t bit = 1LL << (id & 63); + int word = id >> 5; + uint32_t bit = 1 << (id & 31); ATOMIC_OR(&m_sleepMap[word], bit); } @@ -186,16 +184,16 @@ void ThreadPoolImpl::pokeIdleThread() * not give up until a thread is awakened or all of them are awake */ for (int i = 0; i < m_numSleepMapWords; i++) { - uint64_t oldval = m_sleepMap[i]; + uint32_t oldval = m_sleepMap[i]; while (oldval) { unsigned long id; - CTZ64(id, oldval); + CTZ(id, oldval); - uint64_t newval = oldval & ~(1LL << id); - if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval) + uint32_t bit = 1 << id; + if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit) { - m_threads[(i << 6) | id].poke(); + m_threads[i * 32 + id].poke(); return; } @@ -249,8 +247,8 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads) , m_firstProvider(NULL) , m_lastProvider(NULL) { - m_numSleepMapWords = (numThreads + 63) >> 6; - m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords); + m_numSleepMapWords = (numThreads + 31) >> 5; + m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords); char *buffer = (char*)X265_MALLOC(PoolThread, numThreads); m_threads = reinterpret_cast(buffer); @@ -259,9 +257,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads) if (m_threads && m_sleepMap) { for (int i = 0; i < m_numSleepMapWords; i++) - { m_sleepMap[i] = 0; - } m_ok = true; int i; @@ -277,9 +273,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads) } if (m_ok) - { waitForAllIdle(); - } else { // stop threads that did start up @@ -300,12 +294,10 @@ void ThreadPoolImpl::waitForAllIdle() int id = 0; do { - int word = id >> 6; - uint64_t bit = 1LL << (id & 63); + int word = id >> 5; + uint32_t bit = 1 << (id & 31); if (m_sleepMap[word] & bit) - { id++; - } else { GIVE_UP_TIME(); @@ -338,9 +330,7 @@ ThreadPoolImpl::~ThreadPoolImpl() { // cleanup thread handles for (int i = 
0; i < m_numThreads; i++) - { m_threads[i].~PoolThread(); - } X265_FREE(reinterpret_cast(m_threads)); } diff --git a/source/common/threadpool.h b/source/common/threadpool.h index 7616670..2c192ca 100644 --- a/source/common/threadpool.h +++ b/source/common/threadpool.h @@ -1,6 +1,4 @@ /***************************************************************************** - * x265: singleton thread pool and interface classes - ***************************************************************************** * Copyright (C) 2013 x265 project * * Authors: Steve Borho diff --git a/source/common/vec/dct-sse3.cpp b/source/common/vec/dct-sse3.cpp index c435b52..53333e6 100644 --- a/source/common/vec/dct-sse3.cpp +++ b/source/common/vec/dct-sse3.cpp @@ -52,30 +52,22 @@ ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) = { 83, 36, 83, 36, 83, 36, 83, 36 }, { 36, -83, 36, -83, 36, -83, 36, -83 } }; -void idct8(int32_t *src, int16_t *dst, intptr_t stride) +void idct8(const int16_t* src, int16_t* dst, intptr_t stride) { __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; __m128i T00, T01, T02, T03, T04, T05, T06, T07; m128iAdd = _mm_set1_epi32(64); - T00 = _mm_load_si128((__m128i*)&src[8 + 0]); - T01 = _mm_load_si128((__m128i*)&src[8 + 4]); - m128iS1 = _mm_packs_epi32(T00, T01); - T00 = _mm_load_si128((__m128i*)&src[24 + 0]); - T01 = _mm_load_si128((__m128i*)&src[24 + 4]); - m128iS3 = _mm_packs_epi32(T00, T01); + m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]); + m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]); m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); - T00 = 
_mm_load_si128((__m128i*)&src[40 + 0]); - T01 = _mm_load_si128((__m128i*)&src[40 + 4]); - m128iS5 = _mm_packs_epi32(T00, T01); - T00 = _mm_load_si128((__m128i*)&src[56 + 0]); - T01 = _mm_load_si128((__m128i*)&src[56 + 4]); - m128iS7 = _mm_packs_epi32(T00, T01); + m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]); + m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); @@ -107,12 +99,8 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride) /* ------- */ - T00 = _mm_load_si128((__m128i*)&src[0 + 0]); - T01 = _mm_load_si128((__m128i*)&src[0 + 4]); - m128iS0 = _mm_packs_epi32(T00, T01); - T00 = _mm_load_si128((__m128i*)&src[32 + 0]); - T01 = _mm_load_si128((__m128i*)&src[32 + 4]); - m128iS4 = _mm_packs_epi32(T00, T01); + m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]); + m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]); m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); @@ -123,12 +111,8 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride) /* ------- */ - T00 = _mm_load_si128((__m128i*)&src[16 + 0]); - T01 = _mm_load_si128((__m128i*)&src[16 + 4]); - m128iS2 = _mm_packs_epi32(T00, T01); - T00 = _mm_load_si128((__m128i*)&src[48 + 0]); - T01 = _mm_load_si128((__m128i*)&src[48 + 4]); - m128iS6 = _mm_packs_epi32(T00, T01); + m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]); + m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]); m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); @@ -305,7 +289,7 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride) _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11)); } -void idct16(int32_t *src, 
int16_t *dst, intptr_t stride) +void idct16(const int16_t *src, int16_t *dst, intptr_t stride) { const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050); @@ -367,71 +351,22 @@ void idct16(int32_t *src, int16_t *dst, intptr_t stride) for (int i = 0; i < 2; i++) { const int offset = (i << 3); - __m128i T00, T01; - - T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]); - in00[i] = _mm_packs_epi32(T00, T01); // [07 06 05 04 03 02 01 00] - - T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]); - in01[i] = _mm_packs_epi32(T00, T01); // [17 16 15 14 13 12 11 10] - - T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]); - in02[i] = _mm_packs_epi32(T00, T01); // [27 26 25 24 23 22 21 20] - - T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]); - in03[i] = _mm_packs_epi32(T00, T01); // [37 36 35 34 33 32 31 30] - - T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]); - in04[i] = _mm_packs_epi32(T00, T01); // [47 46 45 44 43 42 41 40] - - T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]); - in05[i] = _mm_packs_epi32(T00, T01); // [57 56 55 54 53 52 51 50] - - T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]); - in06[i] = _mm_packs_epi32(T00, T01); // [67 66 65 64 63 62 61 60] - - T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]); - in07[i] = _mm_packs_epi32(T00, T01); // [77 76 75 74 
73 72 71 70] - - T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]); - in08[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]); - in09[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]); - in10[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]); - in11[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]); - in12[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]); - in13[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]); - in14[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]); - in15[i] = _mm_packs_epi32(T00, T01); + in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00] + in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10] + in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20] + in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30] + in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40] + in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50] + 
in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60] + in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70] + in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]); + in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]); + in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]); + in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]); + in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]); + in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]); + in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]); + in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]); } for (int pass = 0; pass < 2; pass++) @@ -716,7 +651,7 @@ void idct16(int32_t *src, int16_t *dst, intptr_t stride) _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]); } -void idct32(int32_t *src, int16_t *dst, intptr_t stride) +void idct32(const int16_t *src, int16_t *dst, intptr_t stride) { //Odd const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0 @@ -909,135 +844,38 @@ void idct32(int32_t *src, int16_t *dst, intptr_t stride) for (int i = 0; i < 4; i++) { const int offset = (i << 3); - __m128i T00, T01; - - T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]); - in00[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]); - in01[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]); - in02[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]); - in03[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const 
__m128i*)&src[4 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]); - in04[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]); - in05[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]); - in06[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]); - in07[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]); - in08[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]); - in09[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]); - in10[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]); - in11[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]); - in12[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]); - in13[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]); - in14[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 
+ offset + 4]); - in15[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]); - in16[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]); - in17[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]); - in18[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]); - in19[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]); - in20[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]); - in21[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]); - in22[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]); - in23[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]); - in24[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]); - in25[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]); - in26[i] = _mm_packs_epi32(T00, T01); - - T00 = 
_mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]); - in27[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]); - in28[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]); - in29[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]); - in30[i] = _mm_packs_epi32(T00, T01); - - T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]); - T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]); - in31[i] = _mm_packs_epi32(T00, T01); + in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]); + in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]); + in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]); + in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]); + in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]); + in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]); + in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]); + in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]); + in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]); + in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]); + in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]); + in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]); + in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]); + in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]); + in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]); + in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]); + in16[i] = _mm_loadu_si128((const 
__m128i*)&src[16 * 32 + offset]); + in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]); + in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]); + in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]); + in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]); + in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]); + in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]); + in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]); + in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]); + in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]); + in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]); + in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]); + in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]); + in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]); + in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]); + in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]); } for (int pass = 0; pass < 2; pass++) diff --git a/source/common/vec/dct-sse41.cpp b/source/common/vec/dct-sse41.cpp index aa52709..81a7889 100644 --- a/source/common/vec/dct-sse41.cpp +++ b/source/common/vec/dct-sse41.cpp @@ -36,7 +36,7 @@ using namespace x265; namespace { -void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift) +void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift) { X265_CHECK(num <= 32 * 32, "dequant num too large\n"); @@ -66,11 +66,7 @@ void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32 quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per)); quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); - sign = _mm_srai_epi16(quantCoef12, 15); - quantCoef1 = 
_mm_unpacklo_epi16(quantCoef12, sign); - _mm_storeu_si128((__m128i*)(coef + n), quantCoef1); - quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); - _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2); + _mm_storeu_si128((__m128i*)(coef + n), quantCoef12); } } else @@ -100,11 +96,7 @@ void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32 quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift)); quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); - sign = _mm_srai_epi16(quantCoef12, 15); - quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); - _mm_storeu_si128((__m128i*)(coef + n), quantCoef1); - quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); - _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2); + _mm_storeu_si128((__m128i*)(coef + n), quantCoef12); } } } diff --git a/source/common/vec/dct-ssse3.cpp b/source/common/vec/dct-ssse3.cpp index bbb7858..251d500 100644 --- a/source/common/vec/dct-ssse3.cpp +++ b/source/common/vec/dct-ssse3.cpp @@ -100,7 +100,7 @@ ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) = #undef MAKE_COEF }; -void dct16(int16_t *src, int32_t *dst, intptr_t stride) +void dct16(const int16_t *src, int16_t *dst, intptr_t stride) { // Const __m128i c_4 = _mm_set1_epi32(4); @@ -344,8 +344,10 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T41 = _mm_hsub_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40); - _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41); + T40 = _mm_packs_epi32(T40, T40); + T41 = _mm_packs_epi32(T41, T41); + _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40); + _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8])); @@ -366,7 +368,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T40 = 
_mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); + _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9])); @@ -387,7 +390,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); + _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10])); @@ -408,7 +412,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); + _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11])); @@ -429,7 +434,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); + _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12])); @@ -450,7 +456,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); + _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40); 
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13])); @@ -471,7 +478,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); - _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); + _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40); #define MAKE_ODD(tab, dstPos) \ T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \ @@ -493,7 +501,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride) \ T40 = _mm_hadd_epi32(T30, T31); \ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \ - _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40); + T40 = _mm_packs_epi32(T40, T40); \ + _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40); MAKE_ODD(14, 1); MAKE_ODD(16, 3); @@ -657,7 +666,7 @@ ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) = #undef MAKE_COEF16 }; -void dct32(int16_t *src, int32_t *dst, intptr_t stride) +void dct32(const int16_t *src, int16_t *dst, intptr_t stride) { // Const __m128i c_8 = _mm_set1_epi32(8); @@ -1050,7 +1059,8 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride) T60 = _mm_hadd_epi32(T60, T61); \ \ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \ - _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \ + T60 = _mm_packs_epi32(T60, T60); \ + _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \ MAKE_ODD(44, 44, 44, 44, 0); MAKE_ODD(45, 45, 45, 45, 16); diff --git a/source/common/wavefront.cpp b/source/common/wavefront.cpp index 17c44aa..533e768 100644 --- a/source/common/wavefront.cpp +++ b/source/common/wavefront.cpp @@ -33,14 +33,14 @@ bool WaveFront::init(int numRows) { m_numRows = numRows; - m_numWords = (numRows + 63) >> 6; - m_internalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords); + m_numWords = (numRows + 31) >> 5; 
+ m_internalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords); if (m_internalDependencyBitmap) - memset((void*)m_internalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords); + memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords); - m_externalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords); + m_externalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords); if (m_externalDependencyBitmap) - memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords); + memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords); return m_internalDependencyBitmap && m_externalDependencyBitmap; } @@ -53,58 +53,31 @@ WaveFront::~WaveFront() void WaveFront::clearEnabledRowMask() { - memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords); + memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords); } void WaveFront::enqueueRow(int row) { - // thread safe - uint64_t bit = 1LL << (row & 63); - - X265_CHECK(row < m_numRows, "invalid row\n"); - ATOMIC_OR(&m_internalDependencyBitmap[row >> 6], bit); + uint32_t bit = 1 << (row & 31); + ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit); if (m_pool) m_pool->pokeIdleThread(); } void WaveFront::enableRow(int row) { - // thread safe - uint64_t bit = 1LL << (row & 63); - - X265_CHECK(row < m_numRows, "invalid row\n"); - ATOMIC_OR(&m_externalDependencyBitmap[row >> 6], bit); + uint32_t bit = 1 << (row & 31); + ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit); } void WaveFront::enableAllRows() { - memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint64_t) * m_numWords); -} - -bool WaveFront::checkHigherPriorityRow(int curRow) -{ - int fullwords = curRow >> 6; - uint64_t mask = (1LL << (curRow & 63)) - 1; - - // Check full bitmap words before curRow - for (int i = 0; i < fullwords; i++) - { - if (m_internalDependencyBitmap[i] & m_externalDependencyBitmap[i]) - return true; - } - - // check the partially masked bitmap word of curRow - if 
(m_internalDependencyBitmap[fullwords] & m_externalDependencyBitmap[fullwords] & mask) - return true; - return false; + memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint32_t) * m_numWords); } bool WaveFront::dequeueRow(int row) { - uint64_t oldval, newval; - - oldval = m_internalDependencyBitmap[row >> 6]; - newval = oldval & ~(1LL << (row & 63)); - return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval; + uint32_t bit = 1 << (row & 31); + return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit); } bool WaveFront::findJob(int threadId) @@ -114,22 +87,21 @@ bool WaveFront::findJob(int threadId) // thread safe for (int w = 0; w < m_numWords; w++) { - uint64_t oldval = m_internalDependencyBitmap[w]; - while (oldval & m_externalDependencyBitmap[w]) + uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w]; + while (oldval) { - uint64_t mask = oldval & m_externalDependencyBitmap[w]; - - CTZ64(id, mask); + CTZ(id, oldval); - uint64_t newval = oldval & ~(1LL << id); - if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval) + uint32_t bit = 1 << id; + if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit) { - // we cleared the bit, process row - processRow(w * 64 + id, threadId); + /* we cleared the bit, we get to process the row */ + processRow(w * 32 + id, threadId); return true; } + // some other thread cleared the bit, try another bit - oldval = m_internalDependencyBitmap[w]; + oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w]; } } diff --git a/source/common/wavefront.h b/source/common/wavefront.h index a34b9a4..4692a41 100644 --- a/source/common/wavefront.h +++ b/source/common/wavefront.h @@ -43,8 +43,8 @@ private: // Dependencies are categorized as internal and external. Internal dependencies // are caused by neighbor block availability. External dependencies are generally // reference frame reconstructed pixels being available. 
- uint64_t volatile *m_internalDependencyBitmap; - uint64_t volatile *m_externalDependencyBitmap; + uint32_t volatile *m_internalDependencyBitmap; + uint32_t volatile *m_externalDependencyBitmap; // number of words in the bitmap int m_numWords; @@ -92,10 +92,6 @@ public: // Start or resume encode processing of this row, must be implemented by // derived classes. virtual void processRow(int row, int threadId) = 0; - - // Returns true if a row above curRow is available for processing. The processRow() - // method may call this function periodically and voluntarily exit - bool checkHigherPriorityRow(int curRow); }; } // end namespace x265 diff --git a/source/common/winxp.h b/source/common/winxp.h index b105804..0446265 100644 --- a/source/common/winxp.h +++ b/source/common/winxp.h @@ -56,30 +56,6 @@ void cond_destroy(ConditionVariable *cond); #define WakeAllConditionVariable x265::cond_broadcast #define XP_CONDITION_VAR_FREE x265::cond_destroy -#if defined(_MSC_VER) - -/* Windows XP did not define atomic OR 64, but gcc has a good version, so - * only use this workaround when targeting XP with MSVC */ -FORCEINLINE LONGLONG interlocked_OR64(__inout LONGLONG volatile *Destination, - __in LONGLONG Value) -{ - LONGLONG Old; - - do - { - Old = *Destination; - } - while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old); - - return Old; -} - -#define ATOMIC_OR(ptr, mask) x265::interlocked_OR64((volatile LONG64*)ptr, mask) - -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) -#pragma intrinsic(_InterlockedCompareExchange64) -#endif -#endif // defined(_MSC_VER) } // namespace x265 #else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600) diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp index ec1607d..b81115b 100644 --- a/source/common/x86/asm-primitives.cpp +++ b/source/common/x86/asm-primitives.cpp @@ -1336,11 +1336,22 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.sad_x4[LUMA_16x4] = 
x265_pixel_sad_x4_16x4_sse2; p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2; - p.cvt32to16_shr = x265_cvt32to16_shr_sse2; - p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2; - p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2; - p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2; - p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2; + p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2; + p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2; + p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2; + p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2; + p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2; + p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2; + p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2; + p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2; + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2; + p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2; + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2; + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2; + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2; + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2; + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2; + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2; CHROMA_PIXELSUB_PS(_sse2); CHROMA_PIXELSUB_PS_422(_sse2); @@ -1354,9 +1365,9 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) CHROMA_VERT_FILTERS_422(_sse2); CHROMA_VERT_FILTERS_444(_sse2); p.luma_p2s = x265_luma_p2s_sse2; - p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2; - p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2; - p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s + p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2; + p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2; + p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s 
p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2; p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2; @@ -1376,6 +1387,9 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.dct[DCT_4x4] = x265_dct4_sse2; p.idct[IDCT_4x4] = x265_idct4_sse2; +#if X86_64 + p.idct[IDCT_8x8] = x265_idct8_sse2; +#endif p.idct[IDST_4x4] = x265_idst4_sse2; LUMA_SS_FILTERS(_sse2); @@ -1407,11 +1421,6 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.quant = x265_quant_sse4; p.nquant = x265_nquant_sse4; p.dequant_normal = x265_dequant_normal_sse4; - p.cvt16to32_shl = x265_cvt16to32_shl_sse4; - p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4; - p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4; - p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4; - p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4; p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4; p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4; p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4; @@ -1428,7 +1437,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) } if (cpuMask & X265_CPU_XOP) { - p.frame_init_lowres_core = x265_frame_init_lowres_core_xop; + p.frameInitLowres = x265_frame_init_lowres_core_xop; SA8D_INTER_FROM_BLOCK(xop); INIT7(satd, _xop); HEVC_SATD(xop); @@ -1440,6 +1449,14 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.nquant = x265_nquant_avx2; p.dequant_normal = x265_dequant_normal_avx2; p.scale1D_128to64 = x265_scale1D_128to64_avx2; + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2; + p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2; + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2; + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2; + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2; + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2; + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2; + p.cpy1Dto2D_shr[BLOCK_32x32] = 
x265_cpy1Dto2D_shr_32_avx2; #if X86_64 p.dct[DCT_8x8] = x265_dct8_avx2; p.dct[DCT_16x16] = x265_dct16_avx2; @@ -1448,7 +1465,6 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.idct[IDCT_8x8] = x265_idct8_avx2; p.idct[IDCT_16x16] = x265_idct16_avx2; p.idct[IDCT_32x32] = x265_idct32_avx2; - p.transpose[BLOCK_8x8] = x265_transpose8_avx2; p.transpose[BLOCK_16x16] = x265_transpose16_avx2; p.transpose[BLOCK_32x32] = x265_transpose32_avx2; @@ -1500,7 +1516,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) INIT8(sad_x4, _mmx2); p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; - p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2; + p.frameInitLowres = x265_frame_init_lowres_core_mmx2; PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); @@ -1548,14 +1564,26 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2; p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2; - p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2; + p.frameInitLowres = x265_frame_init_lowres_core_sse2; SA8D_INTER_FROM_BLOCK(sse2); - p.cvt32to16_shr = x265_cvt32to16_shr_sse2; - p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2; - p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2; - p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2; - p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2; + p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2; + p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2; + p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2; + p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2; + p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2; + p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2; + p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2; + p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2; + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2; + 
p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2; + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2; + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2; + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2; + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2; + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2; + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2; + p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2; p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2; p.transpose[BLOCK_4x4] = x265_transpose4_sse2; @@ -1565,18 +1593,19 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.transpose[BLOCK_64x64] = x265_transpose64_sse2; p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; p.ssim_end_4 = x265_pixel_ssim_end4_sse2; + p.dct[DCT_4x4] = x265_dct4_sse2; p.idct[IDCT_4x4] = x265_idct4_sse2; +#if X86_64 + p.idct[IDCT_8x8] = x265_idct8_sse2; +#endif p.idct[IDST_4x4] = x265_idst4_sse2; + p.planecopy_sp = x265_downShift_16_sse2; - p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2; - p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2; - p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2; - p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2; } if (cpuMask & X265_CPU_SSSE3) { - p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3; + p.frameInitLowres = x265_frame_init_lowres_core_ssse3; SA8D_INTER_FROM_BLOCK(ssse3); p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3; ASSGN_SSE(ssse3); @@ -1601,9 +1630,9 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3; p.luma_p2s = x265_luma_p2s_ssse3; - p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3; - p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3; - p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s + p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3; + p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3; + p.chroma[X265_CSP_I444].p2s = 
x265_luma_p2s_ssse3; // for i444, chroma_p2s can use luma_p2s p.dct[DST_4x4] = x265_dst4_ssse3; p.idct[IDCT_8x8] = x265_idct8_ssse3; @@ -1616,11 +1645,6 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) LUMA_ADDAVG(_sse4); CHROMA_ADDAVG(_sse4); CHROMA_ADDAVG_422(_sse4); - p.cvt16to32_shl = x265_cvt16to32_shl_sse4; - p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4; - p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4; - p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4; - p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4; // TODO: check POPCNT flag! p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4; @@ -1690,12 +1714,11 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) INTRA_ANG_SSE4(sse4); p.dct[DCT_8x8] = x265_dct8_sse4; - p.copy_shr = x265_copy_shr_sse4; - p.denoiseDct = x265_denoise_dct_sse4; +// p.denoiseDct = x265_denoise_dct_sse4; } if (cpuMask & X265_CPU_AVX) { - p.frame_init_lowres_core = x265_frame_init_lowres_core_avx; + p.frameInitLowres = x265_frame_init_lowres_core_avx; HEVC_SATD(avx); SA8D_INTER_FROM_BLOCK(avx); ASSGN_SSE(avx); @@ -1736,7 +1759,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) } if (cpuMask & X265_CPU_XOP) { - p.frame_init_lowres_core = x265_frame_init_lowres_core_xop; + p.frameInitLowres = x265_frame_init_lowres_core_xop; SA8D_INTER_FROM_BLOCK(xop); INIT7(satd, _xop); INIT5_NAME(sse_pp, ssd, _xop); @@ -1761,15 +1784,21 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2; p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2; - p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2; - p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2; - p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2; - p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2; - p.denoiseDct = x265_denoise_dct_avx2; + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2; + 
p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2; + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2; + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2; + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2; + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2; + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2; + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2; + +// p.denoiseDct = x265_denoise_dct_avx2; p.dct[DCT_4x4] = x265_dct4_avx2; p.quant = x265_quant_avx2; p.nquant = x265_nquant_avx2; p.dequant_normal = x265_dequant_normal_avx2; + p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx; p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx; p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx; @@ -1785,6 +1814,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.weight_pp = x265_weight_pp_avx2; #if X86_64 + p.dct[DCT_8x8] = x265_dct8_avx2; p.dct[DCT_16x16] = x265_dct16_avx2; p.dct[DCT_32x32] = x265_dct32_avx2; @@ -1797,8 +1827,83 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.transpose[BLOCK_16x16] = x265_transpose16_avx2; p.transpose[BLOCK_32x32] = x265_transpose32_avx2; p.transpose[BLOCK_64x64] = x265_transpose64_avx2; + + p.luma_vpp[LUMA_12x16] = x265_interp_8tap_vert_pp_12x16_avx2; + + p.luma_vpp[LUMA_16x4] = x265_interp_8tap_vert_pp_16x4_avx2; + p.luma_vpp[LUMA_16x8] = x265_interp_8tap_vert_pp_16x8_avx2; + p.luma_vpp[LUMA_16x12] = x265_interp_8tap_vert_pp_16x12_avx2; + p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2; + p.luma_vpp[LUMA_16x32] = x265_interp_8tap_vert_pp_16x32_avx2; + p.luma_vpp[LUMA_16x64] = x265_interp_8tap_vert_pp_16x64_avx2; + + p.luma_vpp[LUMA_24x32] = x265_interp_8tap_vert_pp_24x32_avx2; + + p.luma_vpp[LUMA_32x8] = x265_interp_8tap_vert_pp_32x8_avx2; + p.luma_vpp[LUMA_32x16] = x265_interp_8tap_vert_pp_32x16_avx2; + p.luma_vpp[LUMA_32x24] = 
x265_interp_8tap_vert_pp_32x24_avx2; + p.luma_vpp[LUMA_32x32] = x265_interp_8tap_vert_pp_32x32_avx2; + p.luma_vpp[LUMA_32x64] = x265_interp_8tap_vert_pp_32x64_avx2; + + p.luma_vpp[LUMA_48x64] = x265_interp_8tap_vert_pp_48x64_avx2; + + p.luma_vpp[LUMA_64x16] = x265_interp_8tap_vert_pp_64x16_avx2; + p.luma_vpp[LUMA_64x32] = x265_interp_8tap_vert_pp_64x32_avx2; + p.luma_vpp[LUMA_64x48] = x265_interp_8tap_vert_pp_64x48_avx2; + p.luma_vpp[LUMA_64x64] = x265_interp_8tap_vert_pp_64x64_avx2; #endif p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2; + + p.luma_hpp[LUMA_8x4] = x265_interp_8tap_horiz_pp_8x4_avx2; + p.luma_hpp[LUMA_8x8] = x265_interp_8tap_horiz_pp_8x8_avx2; + p.luma_hpp[LUMA_8x16] = x265_interp_8tap_horiz_pp_8x16_avx2; + p.luma_hpp[LUMA_8x32] = x265_interp_8tap_horiz_pp_8x32_avx2; + + p.luma_hpp[LUMA_16x4] = x265_interp_8tap_horiz_pp_16x4_avx2; + p.luma_hpp[LUMA_16x8] = x265_interp_8tap_horiz_pp_16x8_avx2; + p.luma_hpp[LUMA_16x12] = x265_interp_8tap_horiz_pp_16x12_avx2; + p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2; + p.luma_hpp[LUMA_16x32] = x265_interp_8tap_horiz_pp_16x32_avx2; + p.luma_hpp[LUMA_16x64] = x265_interp_8tap_horiz_pp_16x64_avx2; + + p.luma_hpp[LUMA_32x8] = x265_interp_8tap_horiz_pp_32x8_avx2; + p.luma_hpp[LUMA_32x16] = x265_interp_8tap_horiz_pp_32x16_avx2; + p.luma_hpp[LUMA_32x24] = x265_interp_8tap_horiz_pp_32x24_avx2; + p.luma_hpp[LUMA_32x32] = x265_interp_8tap_horiz_pp_32x32_avx2; + p.luma_hpp[LUMA_32x64] = x265_interp_8tap_horiz_pp_32x64_avx2; + + p.luma_hpp[LUMA_64x64] = x265_interp_8tap_horiz_pp_64x64_avx2; + p.luma_hpp[LUMA_64x48] = x265_interp_8tap_horiz_pp_64x48_avx2; + p.luma_hpp[LUMA_64x32] = x265_interp_8tap_horiz_pp_64x32_avx2; + p.luma_hpp[LUMA_64x16] = x265_interp_8tap_horiz_pp_64x16_avx2; + + p.luma_hpp[LUMA_48x64] = x265_interp_8tap_horiz_pp_48x64_avx2; + + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_8x8] = x265_interp_4tap_horiz_pp_8x8_avx2; + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_4x4] = 
x265_interp_4tap_horiz_pp_4x4_avx2; + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_32x32] = x265_interp_4tap_horiz_pp_32x32_avx2; + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_16x16] = x265_interp_4tap_horiz_pp_16x16_avx2; + + p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2; + + p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2; + p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2; + p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2; + p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2; + + // color space i420 + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2; + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_8x8] = x265_interp_4tap_vert_pp_8x8_avx2; + + // color space i422 + p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_4x4] = x265_interp_4tap_vert_pp_4x4_avx2; + + p.luma_vps[LUMA_4x4] = x265_interp_8tap_vert_ps_4x4_avx2; + +#if X86_64 + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2; + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2; +#endif } #endif // if HIGH_BIT_DEPTH } diff --git a/source/common/x86/blockcopy8.asm b/source/common/x86/blockcopy8.asm index e892157..f82ff79 100644 --- a/source/common/x86/blockcopy8.asm +++ b/source/common/x86/blockcopy8.asm @@ -41,7 +41,7 @@ cextern pb_128 SECTION .text ;----------------------------------------------------------------------------- -; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x4, 4, 7, 0 @@ -59,7 +59,7 @@ cglobal blockcopy_pp_2x4, 4, 7, 0 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void 
blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x8, 4, 7, 0 @@ -97,7 +97,7 @@ cglobal blockcopy_pp_2x8, 4, 7, 0 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x16, 4, 7, 0 @@ -115,7 +115,7 @@ cglobal blockcopy_pp_2x16, 4, 7, 0 ;----------------------------------------------------------------------------- -; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x2, 4, 6, 0 @@ -127,7 +127,7 @@ cglobal blockcopy_pp_4x2, 4, 6, 0 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x4, 4, 4, 4 @@ -145,7 +145,7 @@ cglobal blockcopy_pp_4x4, 4, 4, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W4_H8 2 
INIT_XMM sse2 @@ -192,7 +192,7 @@ BLOCKCOPY_PP_W4_H8 4, 16 BLOCKCOPY_PP_W4_H8 4, 32 ;----------------------------------------------------------------------------- -; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_6x8, 4, 7, 8 @@ -257,7 +257,7 @@ cglobal blockcopy_pp_6x8, 4, 7, 8 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_6x16, 4, 7, 2 @@ -279,7 +279,7 @@ cglobal blockcopy_pp_6x16, 4, 7, 2 ;----------------------------------------------------------------------------- -; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x2, 4, 4, 2 @@ -291,7 +291,7 @@ cglobal blockcopy_pp_8x2, 4, 4, 2 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x4, 4, 4, 4 @@ -309,7 +309,7 @@ cglobal blockcopy_pp_8x4, 4, 4, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_8x6(pixel 
*dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x6, 4, 7, 6 @@ -333,7 +333,7 @@ cglobal blockcopy_pp_8x6, 4, 7, 6 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x12, 4, 5, 2 @@ -350,7 +350,7 @@ cglobal blockcopy_pp_8x12, 4, 5, 2 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W8_H8 2 INIT_XMM sse2 @@ -397,7 +397,7 @@ BLOCKCOPY_PP_W8_H8 8, 32 BLOCKCOPY_PP_W8_H8 8, 64 ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W12_H4 2 INIT_XMM sse2 @@ -439,7 +439,7 @@ BLOCKCOPY_PP_W12_H4 12, 16 BLOCKCOPY_PP_W12_H4 12, 32 ;----------------------------------------------------------------------------- -; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) 
;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W16_H4 2 INIT_XMM sse2 @@ -471,7 +471,7 @@ BLOCKCOPY_PP_W16_H4 16, 4 BLOCKCOPY_PP_W16_H4 16, 12 ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W16_H8 2 INIT_XMM sse2 @@ -519,7 +519,7 @@ BLOCKCOPY_PP_W16_H8 16, 64 BLOCKCOPY_PP_W16_H8 16, 24 ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W24_H4 2 INIT_XMM sse2 @@ -560,7 +560,7 @@ BLOCKCOPY_PP_W24_H4 24, 32 BLOCKCOPY_PP_W24_H4 24, 64 ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W32_H4 2 INIT_XMM sse2 @@ -684,7 +684,7 @@ cglobal blockcopy_pp_32x16, 4, 6, 6 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_YMM avx cglobal blockcopy_pp_32x24, 4, 7, 6 @@ -722,7 +722,7 @@ mov 
r6d, 24/8 RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W32_H16_avx 2 INIT_YMM avx @@ -788,7 +788,7 @@ BLOCKCOPY_PP_W32_H16_avx 32, 48 BLOCKCOPY_PP_W32_H16_avx 32, 64 ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W48_H2 2 INIT_XMM sse2 @@ -836,7 +836,7 @@ cglobal blockcopy_pp_%1x%2, 4, 5, 6 BLOCKCOPY_PP_W48_H2 48, 64 ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4 2 INIT_XMM sse2 @@ -897,7 +897,7 @@ BLOCKCOPY_PP_W64_H4 64, 48 BLOCKCOPY_PP_W64_H4 64, 64 ;----------------------------------------------------------------------------- -; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_2x4, 4, 5, 2 @@ -926,7 +926,7 @@ RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t 
*src, intptr_t srcStride) +; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_2x8, 4, 5, 2 @@ -974,11 +974,11 @@ pextrw [r0 + r1], m0, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W2_H2 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride add r3, r3 mov r6d, %2/2 .loop: @@ -1003,10 +1003,10 @@ BLOCKCOPY_SP_W2_H2 2, 8 BLOCKCOPY_SP_W2_H2 2, 16 ;----------------------------------------------------------------------------- -; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride +cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride add r3, r3 @@ -1022,10 +1022,10 @@ movd [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride +cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride add 
r3, r3 @@ -1049,10 +1049,10 @@ movd [r0 + r1], m2 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride add r3, r3 @@ -1092,11 +1092,11 @@ movd [r0 + r1], m6 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W4_H8 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/8 @@ -1150,7 +1150,7 @@ BLOCKCOPY_SP_W4_H8 4, 16 BLOCKCOPY_SP_W4_H8 4, 32 ;----------------------------------------------------------------------------- -; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_6x8, 4, 4, 2 @@ -1213,11 +1213,11 @@ cglobal blockcopy_sp_6x8, 4, 4, 2 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) 
;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W6_H2 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride add r3, r3 mov r6d, %2/2 .loop: @@ -1247,10 +1247,10 @@ BLOCKCOPY_SP_W6_H2 6, 8 BLOCKCOPY_SP_W6_H2 6, 16 ;----------------------------------------------------------------------------- -; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride +cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride add r3, r3 @@ -1265,10 +1265,10 @@ movhps [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride +cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride add r3, r3 @@ -1290,10 +1290,10 @@ movhps [r0 + r1], m2 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride +cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride add r3, r3 @@ 
-1322,10 +1322,10 @@ movhps [r0 + r1], m4 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride add r3, r3 @@ -1361,11 +1361,11 @@ movhps [r0 + r1], m6 RET ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W8_H4 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride add r3, r3 mov r4d, %2/4 .loop: @@ -1391,11 +1391,11 @@ cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride BLOCKCOPY_SP_W8_H4 8, 12 ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W8_H8 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/8 @@ -1446,11 +1446,11 @@ BLOCKCOPY_SP_W8_H8 8, 32 BLOCKCOPY_SP_W8_H8 8, 64 ;----------------------------------------------------------------------------- -; void 
blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W12_H4 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/4 @@ -1503,11 +1503,11 @@ BLOCKCOPY_SP_W12_H4 12, 16 BLOCKCOPY_SP_W12_H4 12, 32 ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W16_H4 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/4 @@ -1554,11 +1554,11 @@ BLOCKCOPY_SP_W16_H4 16, 64 BLOCKCOPY_SP_W16_H4 16, 24 ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W24_H2 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2/2 @@ -1595,11 +1595,11 @@ BLOCKCOPY_SP_W24_H2 24, 32 BLOCKCOPY_SP_W24_H2 24, 64 ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void 
blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W32_H2 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/2 @@ -1643,11 +1643,11 @@ BLOCKCOPY_SP_W32_H2 32, 64 BLOCKCOPY_SP_W32_H2 32, 48 ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W48_H2 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2 @@ -1681,11 +1681,11 @@ RET BLOCKCOPY_SP_W48_H2 48, 64 ;----------------------------------------------------------------------------- -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W64_H1 2 INIT_XMM sse2 -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2 @@ -1726,10 +1726,10 @@ BLOCKCOPY_SP_W64_H1 64, 48 BLOCKCOPY_SP_W64_H1 64, 64 ;----------------------------------------------------------------------------- -; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val) +; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal 
blockfill_s_4x4, 3, 3, 1, dest, destStride, val +cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val add r1, r1 @@ -1745,10 +1745,10 @@ movh [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val) +; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val +cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val add r1, r1 @@ -1774,11 +1774,11 @@ movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val) +; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- %macro BLOCKFILL_S_W16_H8 2 INIT_XMM sse2 -cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val +cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val mov r3d, %2/8 @@ -1855,11 +1855,11 @@ movu [r0 + r3], m0 RET ;----------------------------------------------------------------------------- -; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val) +; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- %macro BLOCKFILL_S_W32_H4 2 INIT_XMM sse2 -cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val +cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val mov r3d, %2/4 @@ -1983,10 +1983,10 @@ movu [r0 + r3 + 32], m0 RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); 
;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2013,10 +2013,10 @@ RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2065,10 +2065,10 @@ RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride +cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride add r1, r1 mov r4d, 16/2 .loop: @@ -2086,10 +2086,10 @@ cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride ;----------------------------------------------------------------------------- -; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2105,10 +2105,10 @@ RET 
;----------------------------------------------------------------------------- -; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2135,11 +2135,11 @@ RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W4_H4 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 @@ -2180,11 +2180,11 @@ BLOCKCOPY_PS_W4_H4 4, 32 ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W6_H4 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 @@ -2227,10 +2227,10 @@ BLOCKCOPY_PS_W6_H4 6, 8 BLOCKCOPY_PS_W6_H4 6, 16 ;----------------------------------------------------------------------------- -; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_8x2(int16_t* dst, 
intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2245,10 +2245,10 @@ movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2274,10 +2274,10 @@ movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 @@ -2314,11 +2314,11 @@ movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W8_H4 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride 
add r1, r1 mov r4d, %2/4 @@ -2361,11 +2361,11 @@ BLOCKCOPY_PS_W8_H4 8, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W12_H2 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 @@ -2398,10 +2398,10 @@ BLOCKCOPY_PS_W12_H2 12, 16 BLOCKCOPY_PS_W12_H2 12, 32 ;----------------------------------------------------------------------------- -; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride add r1, r1 pxor m0, m0 @@ -2436,11 +2436,11 @@ movu [r0 + r1 + 16], m1 RET ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W16_H4 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 @@ -2492,11 +2492,11 @@ BLOCKCOPY_PS_W16_H4 16, 64 BLOCKCOPY_PS_W16_H4 16, 24 
;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W24_H2 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 @@ -2537,11 +2537,11 @@ BLOCKCOPY_PS_W24_H2 24, 32 BLOCKCOPY_PS_W24_H2 24, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W32_H2 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 @@ -2590,11 +2590,11 @@ BLOCKCOPY_PS_W32_H2 32, 64 BLOCKCOPY_PS_W32_H2 32, 48 ;----------------------------------------------------------------------------- -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W48_H2 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 @@ -2649,11 +2649,11 @@ RET BLOCKCOPY_PS_W48_H2 48, 64 ;----------------------------------------------------------------------------- -; void 
blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W64_H2 2 INIT_XMM sse4 -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 @@ -2723,7 +2723,7 @@ BLOCKCOPY_PS_W64_H2 64, 48 BLOCKCOPY_PS_W64_H2 64, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x4, 4, 6, 0 @@ -2746,7 +2746,7 @@ cglobal blockcopy_ss_2x4, 4, 6, 0 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x8, 4, 6, 0 @@ -2785,7 +2785,7 @@ cglobal blockcopy_ss_2x8, 4, 6, 0 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x16, 4, 7, 0 @@ -2805,7 +2805,7 @@ cglobal blockcopy_ss_2x16, 4, 7, 0 
;----------------------------------------------------------------------------- -; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_4x2, 4, 4, 2 @@ -2821,7 +2821,7 @@ cglobal blockcopy_ss_4x2, 4, 4, 2 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_4x4, 4, 4, 4 @@ -2841,7 +2841,7 @@ cglobal blockcopy_ss_4x4, 4, 4, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W4_H8 2 INIT_XMM sse2 @@ -2889,7 +2889,7 @@ BLOCKCOPY_SS_W4_H8 4, 16 BLOCKCOPY_SS_W4_H8 4, 32 ;----------------------------------------------------------------------------- -; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_6x8, 4, 4, 4 @@ -2944,7 +2944,7 @@ cglobal blockcopy_ss_6x8, 4, 4, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t 
*src, intptr_t srcstride) +; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_6x16, 4, 5, 4 @@ -2968,7 +2968,7 @@ cglobal blockcopy_ss_6x16, 4, 5, 4 ;----------------------------------------------------------------------------- -; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x2, 4, 4, 2 @@ -2984,7 +2984,7 @@ cglobal blockcopy_ss_8x2, 4, 4, 2 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x4, 4, 4, 4 @@ -3005,7 +3005,7 @@ cglobal blockcopy_ss_8x4, 4, 4, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x6, 4, 4, 4 @@ -3034,7 +3034,7 @@ cglobal blockcopy_ss_8x6, 4, 4, 4 RET ;----------------------------------------------------------------------------- -; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) 
;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x12, 4, 5, 2 @@ -3054,7 +3054,7 @@ cglobal blockcopy_ss_8x12, 4, 5, 2 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W8_H8 2 INIT_XMM sse2 @@ -3105,7 +3105,7 @@ BLOCKCOPY_SS_W8_H8 8, 32 BLOCKCOPY_SS_W8_H8 8, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W12_H4 2 INIT_XMM sse2 @@ -3149,7 +3149,7 @@ BLOCKCOPY_SS_W12_H4 12, 16 BLOCKCOPY_SS_W12_H4 12, 32 ;----------------------------------------------------------------------------- -; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H4 2 INIT_XMM sse2 @@ -3192,7 +3192,7 @@ BLOCKCOPY_SS_W16_H4 16, 4 BLOCKCOPY_SS_W16_H4 16, 12 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H4_avx 2 INIT_YMM 
avx @@ -3229,7 +3229,7 @@ BLOCKCOPY_SS_W16_H4_avx 16, 32 BLOCKCOPY_SS_W16_H4_avx 16, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H8 2 INIT_XMM sse2 @@ -3302,7 +3302,7 @@ BLOCKCOPY_SS_W16_H8 16, 64 BLOCKCOPY_SS_W16_H8 16, 24 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W24_H4 2 INIT_XMM sse2 @@ -3354,7 +3354,7 @@ BLOCKCOPY_SS_W24_H4 24, 32 BLOCKCOPY_SS_W24_H4 24, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W32_H4 2 INIT_XMM sse2 @@ -3422,7 +3422,7 @@ BLOCKCOPY_SS_W32_H4 32, 64 BLOCKCOPY_SS_W32_H4 32, 48 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W48_H2 2 INIT_XMM sse2 @@ -3500,11 +3500,11 @@ RET BLOCKCOPY_SS_W48_H2 48, 64 
;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W64_H4 2 INIT_XMM sse2 -cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride +cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2/4 add r1, r1 add r3, r3 @@ -3606,11 +3606,11 @@ BLOCKCOPY_SS_W64_H4 64, 48 BLOCKCOPY_SS_W64_H4 64, 64 ;----------------------------------------------------------------------------- -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W64_H4_avx 2 INIT_YMM avx -cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride +cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r1, r1 add r3, r3 @@ -3669,229 +3669,83 @@ BLOCKCOPY_SS_W64_H4_avx 64, 32 BLOCKCOPY_SS_W64_H4_avx 64, 48 BLOCKCOPY_SS_W64_H4_avx 64, 64 -;----------------------------------------------------------------------------- -; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride -%define rnd m2 -%define shift m1 - - ; make shift - mov r5d, r3m - movd shift, r5d - - ; make round - dec r5 - xor r6, r6 - bts r6, r5 - - movd rnd, r6d - pshufd rnd, rnd, 0 - - ; register alloc - ; r0 - dst - ; r1 - src - ; r2 - stride * 2 (short*) - ; r3 - lx - ; r4 - size - ; r5 - ly - ; r6 - diff - add r2d, r2d - - mov r4d, r4m - mov 
r5, r4 - mov r6, r2 - sub r6, r4 - add r6, r6 - - shr r5, 1 -.loop_row: - - mov r3, r4 - shr r3, 2 -.loop_col: - ; row 0 - movu m0, [r1] - paddd m0, rnd - psrad m0, shift - packssdw m0, m0 - movh [r0], m0 - - ; row 1 - movu m0, [r1 + r4 * 4] - paddd m0, rnd - psrad m0, shift - packssdw m0, m0 - movh [r0 + r2], m0 - - ; move col pointer - add r1, 16 - add r0, 8 - - dec r3 - jg .loop_col - - ; update pointer - lea r1, [r1 + r4 * 4] - add r0, r6 - - ; end of loop_row - dec r5 - jg .loop_row - - RET - - -;-------------------------------------------------------------------------------------- -; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size); -;-------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size -%define shift m1 - - ; make shift - mov r5d, r3m - movd shift, r5d - - ; register alloc - ; r0 - dst - ; r1 - src - ; r2 - stride - ; r3 - shift - ; r4 - size - - sub r2d, r4d - add r2d, r2d - mov r5d, r4d - shr r4d, 2 -.loop_row: - mov r6d, r4d - -.loop_col: - pmovsxwd m0, [r1] - pslld m0, shift - movu [r0], m0 - - add r1, 8 - add r0, 16 - - dec r6d - jnz .loop_col - - add r1, r2 - dec r5d - jnz .loop_row - RET - - ;-------------------------------------------------------------------------------------- -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal cvt16to32_shr_4, 3,3,3 +INIT_XMM sse2 +cglobal cpy2Dto1D_shr_4, 3, 4, 4 add r2d, r2d movd m0, r3m - movd m1, r4m - pshufd m1, m1, 0 + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 ; register alloc ; r0 - dst ; r1 - src - ; r2 - stride + ; r2 - srcStride ; m0 - shift - ; m1 - dword [offset] - - ; Row 0 - pmovsxwd m2, [r1] - paddd m2, m1 - psrad m2, 
m0 - movu [r0 + 0 * mmsize], m2 - - ; Row 1 - pmovsxwd m2, [r1 + r2] - paddd m2, m1 - psrad m2, m0 - movu [r0 + 1 * mmsize], m2 + ; m1 - word [-round] - ; Row 2 + ; Row 0-3 + movh m2, [r1] + movhps m2, [r1 + r2] lea r1, [r1 + r2 * 2] - pmovsxwd m2, [r1] - paddd m2, m1 - psrad m2, m0 - movu [r0 + 2 * mmsize], m2 - - ; Row 3 - pmovsxwd m2, [r1 + r2] - paddd m2, m1 - psrad m2, m0 - movu [r0 + 3 * mmsize], m2 + movh m3, [r1] + movhps m3, [r1 + r2] + psubw m2, m1 + psubw m3, m1 + psraw m2, m0 + psraw m3, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 RET ;-------------------------------------------------------------------------------------- -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal cvt16to32_shr_8, 3,5,3 +INIT_XMM sse2 +cglobal cpy2Dto1D_shr_8, 3, 5, 4 add r2d, r2d movd m0, r3m - movd m1, r4m - pshufd m1, m1, 0 + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 mov r3d, 8/4 lea r4, [r2 * 3] ; register alloc ; r0 - dst ; r1 - src - ; r2 - stride + ; r2 - srcStride ; r3 - loop counter ; r4 - stride * 3 ; m0 - shift - ; m1 - dword [offset] + ; m1 - word [-round] .loop: - ; Row 0 - pmovsxwd m2, [r1] - pmovsxwd m3, [r1 + mmsize/2] - paddd m2, m1 - paddd m3, m1 - psrad m2, m0 - psrad m3, m0 - movu [r0 + 0 * mmsize], m2 - movu [r0 + 1 * mmsize], m3 + ; Row 0-1 + mova m2, [r1] + mova m3, [r1 + r2] + psubw m2, m1 + psubw m3, m1 + psraw m2, m0 + psraw m3, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 - ; Row 1 - pmovsxwd m2, [r1 + r2] - pmovsxwd m3, [r1 + r2 + mmsize/2] - paddd m2, m1 - paddd m3, m1 - psrad m2, m0 - psrad m3, m0 - movu [r0 + 2 * mmsize], m2 - movu [r0 + 3 * mmsize], m3 - - ; Row 2 - pmovsxwd m2, [r1 + r2 * 2] - pmovsxwd m3, [r1 + r2 * 2 + mmsize/2] - paddd m2, m1 - paddd m3, m1 - psrad m2, m0 - 
psrad m3, m0 - movu [r0 + 4 * mmsize], m2 - movu [r0 + 5 * mmsize], m3 - - ; Row 3 - pmovsxwd m2, [r1 + r4] - pmovsxwd m3, [r1 + r4 + mmsize/2] - paddd m2, m1 - paddd m3, m1 - psrad m2, m0 - psrad m3, m0 - movu [r0 + 6 * mmsize], m2 - movu [r0 + 7 * mmsize], m3 - - add r0, 8 * mmsize + ; Row 2-3 + mova m2, [r1 + r2 * 2] + mova m3, [r1 + r4] + psubw m2, m1 + psubw m3, m1 + psraw m2, m0 + psraw m3, m0 + mova [r0 + 2 * mmsize], m2 + mova [r0 + 3 * mmsize], m3 + + add r0, 4 * mmsize lea r1, [r1 + r2 * 4] dec r3d jnz .loop @@ -3899,62 +3753,47 @@ cglobal cvt16to32_shr_8, 3,5,3 ;-------------------------------------------------------------------------------------- -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal cvt16to32_shr_16, 3,4,6 +INIT_XMM sse2 +cglobal cpy2Dto1D_shr_16, 3, 4, 4 add r2d, r2d movd m0, r3m - movd m1, r4m - pshufd m1, m1, 0 + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 mov r3d, 16/2 ; register alloc ; r0 - dst ; r1 - src - ; r2 - stride + ; r2 - srcStride ; r3 - loop counter ; m0 - shift - ; m1 - dword [offset] + ; m1 - word [-round] .loop: ; Row 0 - pmovsxwd m2, [r1 + 0 * mmsize/2] - pmovsxwd m3, [r1 + 1 * mmsize/2] - pmovsxwd m4, [r1 + 2 * mmsize/2] - pmovsxwd m5, [r1 + 3 * mmsize/2] - paddd m2, m1 - paddd m3, m1 - paddd m4, m1 - paddd m5, m1 - psrad m2, m0 - psrad m3, m0 - psrad m4, m0 - psrad m5, m0 - movu [r0 + 0 * mmsize], m2 - movu [r0 + 1 * mmsize], m3 - movu [r0 + 2 * mmsize], m4 - movu [r0 + 3 * mmsize], m5 + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + psubw m2, m1 + psubw m3, m1 + psraw m2, m0 + psraw m3, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 ; Row 1 - pmovsxwd m2, [r1 + r2 + 0 * mmsize/2] - pmovsxwd m3, [r1 + r2 +1 * mmsize/2] - pmovsxwd m4, [r1 + r2 +2 * mmsize/2] 
- pmovsxwd m5, [r1 + r2 +3 * mmsize/2] - paddd m2, m1 - paddd m3, m1 - paddd m4, m1 - paddd m5, m1 - psrad m2, m0 - psrad m3, m0 - psrad m4, m0 - psrad m5, m0 - movu [r0 + 4 * mmsize], m2 - movu [r0 + 5 * mmsize], m3 - movu [r0 + 6 * mmsize], m4 - movu [r0 + 7 * mmsize], m5 - - add r0, 8 * mmsize + mova m2, [r1 + r2 + 0 * mmsize] + mova m3, [r1 + r2 + 1 * mmsize] + psubw m2, m1 + psubw m3, m1 + psraw m2, m0 + psraw m3, m0 + mova [r0 + 2 * mmsize], m2 + mova [r0 + 3 * mmsize], m3 + + add r0, 4 * mmsize lea r1, [r1 + r2 * 2] dec r3d jnz .loop @@ -3962,61 +3801,45 @@ cglobal cvt16to32_shr_16, 3,4,6 ;-------------------------------------------------------------------------------------- -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal cvt16to32_shr_32, 3,4,6 +INIT_XMM sse2 +cglobal cpy2Dto1D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m - movd m1, r4m - pshufd m1, m1, 0 + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 mov r3d, 32/1 ; register alloc ; r0 - dst ; r1 - src - ; r2 - stride + ; r2 - srcStride ; r3 - loop counter ; m0 - shift - ; m1 - dword [offset] + ; m1 - word [-round] .loop: ; Row 0 - pmovsxwd m2, [r1 + 0 * mmsize/2] - pmovsxwd m3, [r1 + 1 * mmsize/2] - pmovsxwd m4, [r1 + 2 * mmsize/2] - pmovsxwd m5, [r1 + 3 * mmsize/2] - paddd m2, m1 - paddd m3, m1 - paddd m4, m1 - paddd m5, m1 - psrad m2, m0 - psrad m3, m0 - psrad m4, m0 - psrad m5, m0 - movu [r0 + 0 * mmsize], m2 - movu [r0 + 1 * mmsize], m3 - movu [r0 + 2 * mmsize], m4 - movu [r0 + 3 * mmsize], m5 - - pmovsxwd m2, [r1 + 4 * mmsize/2] - pmovsxwd m3, [r1 + 5 * mmsize/2] - pmovsxwd m4, [r1 + 6 * mmsize/2] - pmovsxwd m5, [r1 + 7 * mmsize/2] - paddd m2, m1 - paddd m3, m1 - paddd m4, m1 - paddd m5, m1 - psrad m2, m0 - psrad m3, m0 - psrad m4, m0 - psrad m5, m0 - movu [r0 + 4 * 
mmsize], m2 - movu [r0 + 5 * mmsize], m3 - movu [r0 + 6 * mmsize], m4 - movu [r0 + 7 * mmsize], m5 - - add r0, 8 * mmsize + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + mova m4, [r1 + 2 * mmsize] + mova m5, [r1 + 3 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 + mova [r0 + 2 * mmsize], m4 + mova [r0 + 3 * mmsize], m5 + + add r0, 4 * mmsize add r1, r2 dec r3d jnz .loop @@ -4024,172 +3847,150 @@ cglobal cvt16to32_shr_32, 3,4,6 ;-------------------------------------------------------------------------------------- -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal cvt32to16_shl_4, 3,3,5 +cglobal cpy1Dto2D_shl_4, 3, 3, 3 add r2d, r2d movd m0, r3m ; Row 0-3 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + mova m1, [r1 + 0 * mmsize] + mova m2, [r1 + 1 * mmsize] psllw m1, m0 - psllw m3, m0 + psllw m2, m0 movh [r0], m1 movhps [r0 + r2], m1 - movh [r0 + r2 * 2], m3 + movh [r0 + r2 * 2], m2 lea r2, [r2 * 3] - movhps [r0 + r2], m3 + movhps [r0 + r2], m2 RET INIT_YMM avx2 -cglobal cvt32to16_shl_4, 3,3,3 +cglobal cpy1Dto2D_shl_4, 3, 3, 2 add r2d, r2d movd xm0, r3m ; Row 0-3 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - packssdw m1, m2 + movu m1, [r1] psllw m1, xm0 vextracti128 xm0, m1, 1 movq [r0], xm1 - movq [r0 + r2], xm0 + movhps [r0 + r2], xm1 lea r0, [r0 + r2 * 2] - movhps [r0], xm1 + movq [r0], xm0 movhps [r0 + r2], xm0 RET ;-------------------------------------------------------------------------------------- -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t 
stride, int shift) +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal cvt32to16_shl_8, 3,5,5 +cglobal cpy1Dto2D_shl_8, 3, 4, 5 add r2d, r2d movd m0, r3m - mov r3d, 8/4 - lea r4, [r2 * 3] + lea r3, [r2 * 3] -.loop: - ; Row 0-1 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + ; Row 0-3 + mova m1, [r1 + 0 * mmsize] + mova m2, [r1 + 1 * mmsize] + mova m3, [r1 + 2 * mmsize] + mova m4, [r1 + 3 * mmsize] psllw m1, m0 + psllw m2, m0 psllw m3, m0 - movu [r0], m1 - movu [r0 + r2], m3 + psllw m4, m0 + mova [r0], m1 + mova [r0 + r2], m2 + mova [r0 + r2 * 2], m3 + mova [r0 + r3], m4 + lea r0, [r0 + r2 * 4] - ; Row 2-3 - movu m1, [r1 + 4 * mmsize] - movu m2, [r1 + 5 * mmsize] - movu m3, [r1 + 6 * mmsize] - movu m4, [r1 + 7 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + ; Row 4-7 + mova m1, [r1 + 4 * mmsize] + mova m2, [r1 + 5 * mmsize] + mova m3, [r1 + 6 * mmsize] + mova m4, [r1 + 7 * mmsize] psllw m1, m0 + psllw m2, m0 psllw m3, m0 - movu [r0 + r2 * 2], m1 - movu [r0 + r4], m3 - - add r1, 8 * mmsize - lea r0, [r0 + r2 * 4] - dec r3d - jnz .loop + psllw m4, m0 + mova [r0], m1 + mova [r0 + r2], m2 + mova [r0 + r2 * 2], m3 + mova [r0 + r3], m4 RET INIT_YMM avx2 -cglobal cvt32to16_shl_8, 3,4,3 +cglobal cpy1Dto2D_shl_8, 3, 4, 3 add r2d, r2d movd xm0, r3m lea r3, [r2 * 3] - ; Row 0-1 - movu xm1, [r1 + 0 * mmsize] - vinserti128 m1, m1, [r1 + 1 * mmsize], 1 - movu xm2, [r1 + 0 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 - packssdw m1, m2 - psllw m1, xm0 - movu [r0], xm1 - vextracti128 [r0 + r2], m1, 1 - - ; Row 2-3 - movu xm1, [r1 + 2 * mmsize] - vinserti128 m1, m1, [r1 + 3 * mmsize], 1 - movu xm2, [r1 + 2 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1 - packssdw m1, m2 - psllw 
m1, xm0 - movu [r0 + r2 * 2], xm1 - vextracti128 [r0 + r3], m1, 1 - - add r1, 4 * mmsize - lea r0, [r0 + r2 * 4] - - ; Row 4-5 + ; Row 0-3 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] - packssdw m1, m2 - vpermq m1, m1, 11011000b psllw m1, xm0 + psllw m2, xm0 movu [r0], xm1 vextracti128 [r0 + r2], m1, 1 + movu [r0 + r2 * 2], xm2 + vextracti128 [r0 + r3], m2, 1 - ; Row 6-7 + ; Row 4-7 movu m1, [r1 + 2 * mmsize] movu m2, [r1 + 3 * mmsize] - packssdw m1, m2 - vpermq m1, m1, 11011000b + lea r0, [r0 + r2 * 4] psllw m1, xm0 - movu [r0 + r2 * 2], xm1 - vextracti128 [r0 + r3], m1, 1 + psllw m2, xm0 + movu [r0], xm1 + vextracti128 [r0 + r2], m1, 1 + movu [r0 + r2 * 2], xm2 + vextracti128 [r0 + r3], m2, 1 RET + ;-------------------------------------------------------------------------------------- -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal cvt32to16_shl_16, 3,4,5 +cglobal cpy1Dto2D_shl_16, 3, 4, 5 add r2d, r2d movd m0, r3m - mov r3d, 16/2 + mov r3d, 16/4 .loop: - ; Row 0 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + ; Row 0-1 + mova m1, [r1 + 0 * mmsize] + mova m2, [r1 + 1 * mmsize] + mova m3, [r1 + 2 * mmsize] + mova m4, [r1 + 3 * mmsize] psllw m1, m0 + psllw m2, m0 psllw m3, m0 - movu [r0], m1 - movu [r0 + mmsize], m3 + psllw m4, m0 + mova [r0], m1 + mova [r0 + 16], m2 + mova [r0 + r2], m3 + mova [r0 + r2 + 16], m4 - ; Row 1 - movu m1, [r1 + 4 * mmsize] - movu m2, [r1 + 5 * mmsize] - movu m3, [r1 + 6 * mmsize] - movu m4, [r1 + 7 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + ; Row 2-3 + mova m1, [r1 + 4 * mmsize] + mova m2, [r1 + 5 * mmsize] + mova m3, [r1 + 6 * mmsize] + mova m4, [r1 + 7 * mmsize] + lea r0, [r0 + r2 * 2] 
psllw m1, m0 + psllw m2, m0 psllw m3, m0 - movu [r0 + r2], m1 - movu [r0 + r2 + mmsize], m3 + psllw m4, m0 + mova [r0], m1 + mova [r0 + 16], m2 + mova [r0 + r2], m3 + mova [r0 + r2 + 16], m4 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] @@ -4199,49 +4000,28 @@ cglobal cvt32to16_shl_16, 3,4,5 INIT_YMM avx2 -cglobal cvt32to16_shl_16, 3,5,3 +cglobal cpy1Dto2D_shl_16, 3, 5, 3 add r2d, r2d movd xm0, r3m mov r3d, 16/4 lea r4, [r2 * 3] .loop: - ; Row 0 - movu xm1, [r1 + 0 * mmsize] - vinserti128 m1, m1, [r1 + 1 * mmsize], 1 - movu xm2, [r1 + 0 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 - packssdw m1, m2 + ; Row 0-1 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] psllw m1, xm0 + psllw m2, xm0 movu [r0], m1 + movu [r0 + r2], m2 - ; Row 1 - movu xm1, [r1 + 2 * mmsize] - vinserti128 m1, m1, [r1 + 3 * mmsize], 1 - movu xm2, [r1 + 2 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1 - packssdw m1, m2 - psllw m1, xm0 - movu [r0 + r2], m1 - - add r1, 4 * mmsize - - ; Row 2 - movu xm1, [r1 + 0 * mmsize] - vinserti128 m1, m1, [r1 + 1 * mmsize], 1 - movu xm2, [r1 + 0 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 - packssdw m1, m2 - psllw m1, xm0 - movu [r0 + r2 * 2], m1 - - ; Row 3 + ; Row 2-3 movu m1, [r1 + 2 * mmsize] movu m2, [r1 + 3 * mmsize] - packssdw m1, m2 psllw m1, xm0 - vpermq m1, m1, 11011000b - movu [r0 + r4], m1 + psllw m2, xm0 + movu [r0 + r2 * 2], m1 + movu [r0 + r4], m2 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] @@ -4251,84 +4031,70 @@ cglobal cvt32to16_shl_16, 3,5,3 ;-------------------------------------------------------------------------------------- -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal cvt32to16_shl_32, 3,4,5 +cglobal cpy1Dto2D_shl_32, 3, 
4, 5 add r2d, r2d movd m0, r3m - mov r3d, 32/1 + mov r3d, 32/2 .loop: ; Row 0 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + mova m1, [r1 + 0 * mmsize] + mova m2, [r1 + 1 * mmsize] + mova m3, [r1 + 2 * mmsize] + mova m4, [r1 + 3 * mmsize] psllw m1, m0 + psllw m2, m0 psllw m3, m0 - movu [r0 + 0 * mmsize], m1 - movu [r0 + 1 * mmsize], m3 - - movu m1, [r1 + 4 * mmsize] - movu m2, [r1 + 5 * mmsize] - movu m3, [r1 + 6 * mmsize] - movu m4, [r1 + 7 * mmsize] - packssdw m1, m2 - packssdw m3, m4 + psllw m4, m0 + mova [r0 + 0 * mmsize], m1 + mova [r0 + 1 * mmsize], m2 + mova [r0 + 2 * mmsize], m3 + mova [r0 + 3 * mmsize], m4 + + ; Row 1 + mova m1, [r1 + 4 * mmsize] + mova m2, [r1 + 5 * mmsize] + mova m3, [r1 + 6 * mmsize] + mova m4, [r1 + 7 * mmsize] psllw m1, m0 + psllw m2, m0 psllw m3, m0 - movu [r0 + 2 * mmsize], m1 - movu [r0 + 3 * mmsize], m3 + psllw m4, m0 + mova [r0 + r2 + 0 * mmsize], m1 + mova [r0 + r2 + 1 * mmsize], m2 + mova [r0 + r2 + 2 * mmsize], m3 + mova [r0 + r2 + 3 * mmsize], m4 add r1, 8 * mmsize - add r0, r2 + lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 -cglobal cvt32to16_shl_32, 3,4,5 +cglobal cpy1Dto2D_shl_32, 3, 4, 5 add r2d, r2d movd xm0, r3m mov r3d, 32/2 .loop: - ; Row 0 - movu xm1, [r1 + 0 * mmsize] - vinserti128 m1, m1, [r1 + 1 * mmsize], 1 - movu xm2, [r1 + 0 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 - movu xm3, [r1 + 2 * mmsize] - vinserti128 m3, m3, [r1 + 3 * mmsize], 1 - movu xm4, [r1 + 2 * mmsize + mmsize/2] - vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1 - packssdw m1, m2 - packssdw m3, m4 - psllw m1, xm0 - psllw m3, xm0 - movu [r0], m1 - movu [r0 + mmsize], m3 - - add r1, 4 * mmsize - - ; Row 1 - movu xm1, [r1 + 0 * mmsize] - vinserti128 m1, m1, [r1 + 1 * mmsize], 1 - movu xm2, [r1 + 0 * mmsize + mmsize/2] - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 + ; Row 0-1 + movu m1, 
[r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] movu m3, [r1 + 2 * mmsize] movu m4, [r1 + 3 * mmsize] - packssdw m1, m2 - packssdw m3, m4 psllw m1, xm0 + psllw m2, xm0 psllw m3, xm0 - vpermq m3, m3, 11011000b - movu [r0 + r2], m1 - movu [r0 + r2 + mmsize], m3 + psllw m4, xm0 + movu [r0], m1 + movu [r0 + mmsize], m2 + movu [r0 + r2], m3 + movu [r0 + r2 + mmsize], m4 add r1, 4 * mmsize lea r0, [r0 + r2 * 2] @@ -4338,7 +4104,7 @@ cglobal cvt32to16_shl_32, 3,4,5 ;-------------------------------------------------------------------------------------- -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_4, 3,3,3 @@ -4377,7 +4143,7 @@ cglobal copy_cnt_4, 3,3,3 ;-------------------------------------------------------------------------------------- -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_8, 3,3,6 @@ -4481,7 +4247,7 @@ cglobal copy_cnt_8, 3,4,5 ;-------------------------------------------------------------------------------------- -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_16, 3,4,6 @@ -4592,7 +4358,7 @@ cglobal copy_cnt_16, 3, 5, 5 RET ;-------------------------------------------------------------------------------------- -; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride); +; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride); 
;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_32, 3,4,6 @@ -4699,227 +4465,470 @@ cglobal copy_cnt_32, 3, 5, 5 movd eax, xm4 RET -;----------------------------------------------------------------------------- -; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal copy_shr, 4, 7, 4, dst, src, stride -%define rnd m2 -%define shift m1 +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cpy2Dto1D_shl_4, 4, 4, 4 + add r2d, r2d + movd m0, r3d + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - srcStride + ; m0 - shift - ; make shift - mov r5d, r3m - movd shift, r5d + ; Row 0-3 + movh m2, [r1] + movhps m2, [r1 + r2] + lea r1, [r1 + r2 * 2] + movh m3, [r1] + movhps m3, [r1 + r2] + psllw m2, m0 + psllw m3, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 - ; make round - dec r5 - xor r6, r6 - bts r6, r5 + RET - movd rnd, r6d - pshufd rnd, rnd, 0 + +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cpy2Dto1D_shl_8, 4, 5, 4 + add r2d, r2d + movd m0, r3d + mov r3d, 8/4 + lea r4, [r2 * 3] ; register alloc ; r0 - dst ; r1 - src - ; r2 - stride * 2 (short*) - ; r3 - lx - ; r4 - size - ; r5 - ly - ; r6 - diff - add r2d, r2d + ; r2 - srcStride + ; r3 - loop counter + ; r4 - stride * 3 + ; m0 - shift + +.loop: + ; Row 0, 1 + mova m2, [r1] + mova m3, [r1 + r2] + psllw m2, m0 + psllw m3, m0 + mova [r0 + 
0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 + + ; Row 2, 3 + mova m2, [r1 + r2 * 2] + mova m3, [r1 + r4] + psllw m2, m0 + psllw m3, m0 + mova [r0 + 2 * mmsize], m2 + mova [r0 + 3 * mmsize], m3 + + add r0, 4 * mmsize + lea r1, [r1 + r2 * 4] + dec r3d + jnz .loop + RET - mov r4d, r4m - mov r5, r4 ; size - mov r6, r2 ; stride - sub r6, r4 - add r6, r6 - shr r5, 1 -.loop_row: +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cpy2Dto1D_shl_16, 4, 4, 4 + add r2d, r2d + movd m0, r3d + mov r3d, 16/2 - mov r3, r4 - shr r3, 2 -.loop_col: - ; row 0 - movh m3, [r1] - pmovsxwd m0, m3 - paddd m0, rnd - psrad m0, shift - packssdw m0, m0 - movh [r0], m0 + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - srcStride + ; r3 - loop counter + ; m0 - shift - ; row 1 - movh m3, [r1 + r4 * 2] - pmovsxwd m0, m3 - paddd m0, rnd - psrad m0, shift - packssdw m0, m0 - movh [r0 + r2], m0 +.loop: + ; Row 0 + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + psllw m2, m0 + psllw m3, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 - ; move col pointer - add r1, 8 - add r0, 8 + ; Row 1 + mova m2, [r1 + r2 + 0 * mmsize] + mova m3, [r1 + r2 + 1 * mmsize] + psllw m2, m0 + psllw m3, m0 + mova [r0 + 2 * mmsize], m2 + mova [r0 + 3 * mmsize], m3 + + add r0, 4 * mmsize + lea r1, [r1 + r2 * 2] + dec r3d + jnz .loop + RET - dec r3 - jg .loop_col - ; update pointer - lea r1, [r1 + r4 * 2] - add r0, r6 +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cpy2Dto1D_shl_32, 4, 4, 6 + add r2d, r2d + movd m0, r3d + mov 
r3d, 32/1 - ; end of loop_row - dec r5 - jg .loop_row + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - srcStride + ; r3 - loop counter + ; m0 - shift +.loop: + ; Row 0 + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + mova m4, [r1 + 2 * mmsize] + mova m5, [r1 + 3 * mmsize] + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + psllw m5, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 + mova [r0 + 2 * mmsize], m4 + mova [r0 + 3 * mmsize], m5 + + add r0, 4 * mmsize + add r1, r2 + dec r3d + jnz .loop RET + ;-------------------------------------------------------------------------------------- -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal copy_shl_4, 3,3,3 +cglobal cpy1Dto2D_shr_4, 3, 3, 4 add r2d, r2d movd m0, r3m + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 ; Row 0-3 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - psllw m1, m0 - psllw m2, m0 - movh [r0], m1 - movhps [r0 + r2], m1 - movh [r0 + r2 * 2], m2 - lea r2, [r2 * 3] + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + psubw m2, m1 + psubw m3, m1 + psraw m2, m0 + psraw m3, m0 + movh [r0], m2 movhps [r0 + r2], m2 + movh [r0 + r2 * 2], m3 + lea r2, [r2 * 3] + movhps [r0 + r2], m3 RET + +INIT_YMM avx2 +cglobal cpy1Dto2D_shr_4, 3, 3, 3 + add r2d, r2d + movd xm0, r3m + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + + ; Row 0-3 + movu m2, [r1] + psubw m2, m1 + psraw m2, xm0 + vextracti128 xm1, m2, 1 + movq [r0], xm2 + movhps [r0 + r2], xm2 + lea r0, [r0 + r2 * 2] + movq [r0], xm1 + movhps [r0 + r2], xm1 + RET + + ;-------------------------------------------------------------------------------------- -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) 
;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal copy_shl_8, 3,4,5 +cglobal cpy1Dto2D_shr_8, 3, 4, 6 add r2d, r2d movd m0, r3m + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 + lea r3, [r2 * 3] ; Row 0-3 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - psllw m1, m0 - psllw m2, m0 - psllw m3, m0 - psllw m4, m0 - movu [r0], m1 - movu [r0 + r2], m2 - movu [r0 + 2 * r2], m3 - lea r0, [r0 + 2 * r2] - movu [r0 + r2], m4 + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + mova m4, [r1 + 2 * mmsize] + mova m5, [r1 + 3 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0], m2 + mova [r0 + r2], m3 + mova [r0 + r2 * 2], m4 + mova [r0 + r3], m5 ; Row 4-7 - movu m1, [r1 + 4 * mmsize] - movu m2, [r1 + 5 * mmsize] - movu m3, [r1 + 6 * mmsize] - movu m4, [r1 + 7 * mmsize] - psllw m1, m0 - psllw m2, m0 - psllw m3, m0 - psllw m4, m0 - movu [r0 + r2 * 2], m1 - lea r0, [r0 + 2 * r2] - movu [r0 + r2], m2 - movu [r0 + 2 * r2], m3 - lea r0, [r0 + 2 * r2] - movu [r0 + r2], m4 + mova m2, [r1 + 4 * mmsize] + mova m3, [r1 + 5 * mmsize] + mova m4, [r1 + 6 * mmsize] + mova m5, [r1 + 7 * mmsize] + lea r0, [r0 + r2 * 4] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0], m2 + mova [r0 + r2], m3 + mova [r0 + r2 * 2], m4 + mova [r0 + r3], m5 + RET + + +INIT_YMM avx2 +cglobal cpy1Dto2D_shr_8, 3, 4, 4 + add r2d, r2d + movd xm0, r3m + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + lea r3, [r2 * 3] + + ; Row 0-3 + movu m2, [r1 + 0 * mmsize] + movu m3, [r1 + 1 * mmsize] + psubw m2, m1 + psubw m3, m1 + psraw m2, xm0 + psraw m3, xm0 + movu [r0], xm2 + vextracti128 [r0 + r2], m2, 1 + movu [r0 + r2 * 2], xm3 + vextracti128 [r0 + r3], m3, 1 + + ; Row 4-7 + movu m2, [r1 + 2 * mmsize] + movu m3, [r1 + 3 * 
mmsize] + lea r0, [r0 + r2 * 4] + psubw m2, m1 + psubw m3, m1 + psraw m2, xm0 + psraw m3, xm0 + movu [r0], xm2 + vextracti128 [r0 + r2], m2, 1 + movu [r0 + r2 * 2], xm3 + vextracti128 [r0 + r3], m3, 1 RET + ;-------------------------------------------------------------------------------------- -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal copy_shl_16, 3,4,5 +cglobal cpy1Dto2D_shr_16, 3, 5, 6 add r2d, r2d movd m0, r3m - mov r3d, 256/64 + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 + mov r3d, 16/4 + lea r4, [r2 * 3] .loop: - ; Row 0-3 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - psllw m1, m0 - psllw m2, m0 - psllw m3, m0 - psllw m4, m0 - movu [r0], m1 - movu [r0 + 16], m2 - movu [r0 + r2], m3 - movu [r0 + r2 + 16], m4 + ; Row 0-1 + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + mova m4, [r1 + 2 * mmsize] + mova m5, [r1 + 3 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0], m2 + mova [r0 + mmsize], m3 + mova [r0 + r2], m4 + mova [r0 + r2 + mmsize], m5 - ; Row 4-7 - movu m1, [r1 + 4 * mmsize] - movu m2, [r1 + 5 * mmsize] - movu m3, [r1 + 6 * mmsize] - movu m4, [r1 + 7 * mmsize] - psllw m1, m0 - psllw m2, m0 - psllw m3, m0 - psllw m4, m0 - movu [r0 + r2 * 2], m1 - movu [r0 + r2 * 2 + 16], m2 - lea r0, [r0 + r2 * 2] - movu [r0 + r2], m3 - movu [r0 + r2 + 16], m4 + ; Row 2-3 + mova m2, [r1 + 4 * mmsize] + mova m3, [r1 + 5 * mmsize] + mova m4, [r1 + 6 * mmsize] + mova m5, [r1 + 7 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0 + r2 * 2], m2 + mova [r0 + r2 * 2 + mmsize], m3 + mova [r0 + 
r4], m4 + mova [r0 + r4 + mmsize], m5 add r1, 8 * mmsize - lea r0, [r0 + r2 * 2] + lea r0, [r0 + r2 * 4] + dec r3d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal cpy1Dto2D_shr_16, 3, 5, 4 + add r2d, r2d + movd xm0, r3m + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + mov r3d, 16/4 + lea r4, [r2 * 3] + +.loop: + ; Row 0-1 + movu m2, [r1 + 0 * mmsize] + movu m3, [r1 + 1 * mmsize] + psubw m2, m1 + psubw m3, m1 + psraw m2, xm0 + psraw m3, xm0 + movu [r0], m2 + movu [r0 + r2], m3 + + ; Row 2-3 + movu m2, [r1 + 2 * mmsize] + movu m3, [r1 + 3 * mmsize] + psubw m2, m1 + psubw m3, m1 + psraw m2, xm0 + psraw m3, xm0 + movu [r0 + r2 * 2], m2 + movu [r0 + r4], m3 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] dec r3d jnz .loop RET + ;-------------------------------------------------------------------------------------- -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 -cglobal copy_shl_32, 3,4,5 +cglobal cpy1Dto2D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m - mov r3d, 1024/64 + pcmpeqw m1, m1 + psllw m1, m0 + psraw m1, 1 + mov r3d, 32/2 .loop: - ; Row 0-3 - movu m1, [r1 + 0 * mmsize] - movu m2, [r1 + 1 * mmsize] - movu m3, [r1 + 2 * mmsize] - movu m4, [r1 + 3 * mmsize] - psllw m1, m0 - psllw m2, m0 - psllw m3, m0 - psllw m4, m0 - movu [r0], m1 - movu [r0 + 16], m2 - movu [r0 + 32], m3 - movu [r0 + 48], m4 + ; Row 0 + mova m2, [r1 + 0 * mmsize] + mova m3, [r1 + 1 * mmsize] + mova m4, [r1 + 2 * mmsize] + mova m5, [r1 + 3 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0 + 0 * mmsize], m2 + mova [r0 + 1 * mmsize], m3 + mova [r0 + 2 * mmsize], m4 + mova [r0 + 3 * mmsize], m5 - ; Row 4-7 - movu m1, [r1 + 4 * mmsize] - movu m2, [r1 + 5 * mmsize] - movu m3, [r1 + 6 * mmsize] - movu m4, [r1 
+ 7 * mmsize] - psllw m1, m0 - psllw m2, m0 - psllw m3, m0 - psllw m4, m0 - movu [r0 + r2], m1 - movu [r0 + r2 + 16], m2 - movu [r0 + r2 + 32], m3 - movu [r0 + r2 + 48], m4 + ; Row 1 + mova m2, [r1 + 4 * mmsize] + mova m3, [r1 + 5 * mmsize] + mova m4, [r1 + 6 * mmsize] + mova m5, [r1 + 7 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, m0 + psraw m3, m0 + psraw m4, m0 + psraw m5, m0 + mova [r0 + r2 + 0 * mmsize], m2 + mova [r0 + r2 + 1 * mmsize], m3 + mova [r0 + r2 + 2 * mmsize], m4 + mova [r0 + r2 + 3 * mmsize], m5 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET + + +INIT_YMM avx2 +cglobal cpy1Dto2D_shr_32, 3, 4, 6 + add r2d, r2d + movd xm0, r3m + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + mov r3d, 32/2 + +.loop: + ; Row 0-1 + movu m2, [r1 + 0 * mmsize] + movu m3, [r1 + 1 * mmsize] + movu m4, [r1 + 2 * mmsize] + movu m5, [r1 + 3 * mmsize] + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + psraw m2, xm0 + psraw m3, xm0 + psraw m4, xm0 + psraw m5, xm0 + movu [r0], m2 + movu [r0 + mmsize], m3 + movu [r0 + r2], m4 + movu [r0 + r2 + mmsize], m5 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 2] + dec r3d + jnz .loop + RET diff --git a/source/common/x86/blockcopy8.h b/source/common/x86/blockcopy8.h index 115e340..9fbbeea 100644 --- a/source/common/x86/blockcopy8.h +++ b/source/common/x86/blockcopy8.h @@ -24,48 +24,53 @@ #ifndef X265_BLOCKCOPY8_H #define X265_BLOCKCOPY8_H -void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int); -void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_32_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_4_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int); -void 
x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int); -void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int); -void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int); -void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int); -void x265_copy_shl_32_sse2(int16_t * dst, int16_t *src, intptr_t, int); -uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t); +void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* 
src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \ - void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \ - void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); \ - void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb); + void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \ + void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \ + void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define SETUP_BLOCKCOPY_PS(W, H, cpu) \ - void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride); + void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); #define SETUP_BLOCKCOPY_SP(W, H, cpu) \ - void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); + void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \ - void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, 
intptr_t strideb); \ - void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb); + void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \ + void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define BLOCKCOPY_COMMON(cpu) \ SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \ @@ -178,31 +183,31 @@ BLOCKCOPY_PS(_sse4); BLOCKCOPY_SP(_sse2); -void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockcopy_ss_16x4_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x8_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x12_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x24_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x48_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); - -void 
x265_blockcopy_pp_32x8_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x16_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x24_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x32_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); - -void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t 
srcStride); +void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); + +void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x24_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); + +void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val); #undef BLOCKCOPY_COMMON #undef BLOCKCOPY_SS_PP diff --git a/source/common/x86/dct8.asm b/source/common/x86/dct8.asm index 5323a42..7e1ebbc 100644 --- a/source/common/x86/dct8.asm +++ b/source/common/x86/dct8.asm @@ -245,7 +245,7 @@ avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64 avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83 -const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 +const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11 @@ -318,7 +318,7 @@ cextern pd_2048 cextern pw_ppppmmmm ;------------------------------------------------------ -;void dct4(int16_t *src, int32_t *dst, intptr_t stride) +;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------ INIT_XMM sse2 cglobal dct4, 3, 4, 8 @@ -384,28 +384,28 @@ cglobal dct4, 3, 4, 8 paddd m1, m3 paddd m1, m7 psrad m1, 8 - movu 
[r1 + 0 * 16], m1 - pmaddwd m1, m2, m5 + pmaddwd m4, m2, m5 pmaddwd m3, m0, m5 - psubd m1, m3 - paddd m1, m7 - psrad m1, 8 - movu [r1 + 1 * 16], m1 + psubd m4, m3 + paddd m4, m7 + psrad m4, 8 + packssdw m1, m4 + movu [r1 + 0 * 16], m1 pmaddwd m1, m2, m6 pmaddwd m3, m0, m6 paddd m1, m3 paddd m1, m7 psrad m1, 8 - movu [r1 + 2 * 16], m1 pmaddwd m2, [r3 + 3 * 16] pmaddwd m0, [r3 + 3 * 16] psubd m2, m0 paddd m2, m7 psrad m2, 8 - movu [r1 + 3 * 16], m2 + packssdw m1, m2 + movu [r1 + 1 * 16], m1 RET ; DCT 4x4 @@ -470,14 +470,12 @@ cglobal dct4, 3, 4, 8, src, dst, srcStride paddd m2, m7 psrad m2, 8 - movu [r1], xm3 - movu [r1 + mmsize/2], m2 - vextracti128 [r1 + mmsize], m3, 1 - vextracti128 [r1 + mmsize + mmsize/2], m2, 1 + packssdw m3, m2 + movu [r1], m3 RET ;------------------------------------------------------- -;void idct4(int32_t *src, int16_t *dst, intptr_t stride) +;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_XMM sse2 cglobal idct4, 3, 4, 7 @@ -497,11 +495,6 @@ cglobal idct4, 3, 4, 7 movu m0, [r0 + 0 * 16] movu m1, [r0 + 1 * 16] - packssdw m0, m1 - - movu m1, [r0 + 2 * 16] - movu m2, [r0 + 3 * 16] - packssdw m1, m2 punpcklwd m2, m0, m1 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1 @@ -572,7 +565,7 @@ cglobal idct4, 3, 4, 7 RET ;------------------------------------------------------ -;void dst4(int16_t *src, int32_t *dst, intptr_t stride) +;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------ INIT_XMM ssse3 %if ARCH_X86_64 @@ -638,33 +631,33 @@ cglobal dst4, 3, 4, 8 phaddd m0, m1 paddd m0, m5 psrad m0, 8 - movu [r1 + 0 * 16], m0 - pmaddwd m0, m2, coef1 + pmaddwd m4, m2, coef1 pmaddwd m1, m3, coef1 - phaddd m0, m1 - paddd m0, m5 - psrad m0, 8 - movu [r1 + 1 * 16], m0 + phaddd m4, m1 + paddd m4, m5 + psrad m4, 8 + packssdw m0, m4 + movu [r1 + 0 * 16], m0 pmaddwd m0, m2, coef2 pmaddwd m1, m3, coef2 phaddd m0, m1 paddd m0, m5 
psrad m0, 8 - movu [r1 + 2 * 16], m0 pmaddwd m2, coef3 pmaddwd m3, coef3 phaddd m2, m3 paddd m2, m5 psrad m2, 8 - movu [r1 + 3 * 16], m2 + packssdw m0, m2 + movu [r1 + 1 * 16], m0 RET ;------------------------------------------------------- -;void idst4(int32_t *src, int16_t *dst, intptr_t stride) +;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_XMM sse2 cglobal idst4, 3, 4, 7 @@ -683,11 +676,6 @@ cglobal idst4, 3, 4, 7 movu m0, [r0 + 0 * 16] movu m1, [r0 + 1 * 16] - packssdw m0, m1 - - movu m1, [r0 + 2 * 16] - movu m2, [r0 + 3 * 16] - packssdw m1, m2 punpcklwd m2, m0, m1 ; m2 = m128iAC punpckhwd m0, m1 ; m0 = m128iBD @@ -762,7 +750,7 @@ cglobal idst4, 3, 4, 7 ;------------------------------------------------------- -; void dct8(int16_t *src, int32_t *dst, intptr_t stride) +; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------- INIT_XMM sse4 cglobal dct8, 3,6,7,0-16*mmsize @@ -935,10 +923,16 @@ cglobal dct8, 3,6,7,0-16*mmsize phsubd m4, m2 ; m4 = [Row6 Row4] paddd m4, m6 psrad m4, 9 - movh [r1 + 0*2*mmsize], m3 - movhps [r1 + 2*2*mmsize], m3 - movh [r1 + 4*2*mmsize], m4 - movhps [r1 + 6*2*mmsize], m4 + + packssdw m3, m3 + movd [r1 + 0*mmsize], m3 + pshufd m3, m3, 1 + movd [r1 + 2*mmsize], m3 + + packssdw m4, m4 + movd [r1 + 4*mmsize], m4 + pshufd m4, m4, 1 + movd [r1 + 6*mmsize], m4 ; odd pmulld m2, m0, [r4 + 2*16] @@ -950,8 +944,11 @@ cglobal dct8, 3,6,7,0-16*mmsize phaddd m2, m4 ; m2 = [Row3 Row1] paddd m2, m6 psrad m2, 9 - movh [r1 + 1*2*mmsize], m2 - movhps [r1 + 3*2*mmsize], m2 + + packssdw m2, m2 + movd [r1 + 1*mmsize], m2 + pshufd m2, m2, 1 + movd [r1 + 3*mmsize], m2 pmulld m2, m0, [r4 + 4*16] pmulld m3, m1, [r4 + 4*16] @@ -962,10 +959,13 @@ cglobal dct8, 3,6,7,0-16*mmsize phaddd m2, m4 ; m2 = [Row7 Row5] paddd m2, m6 psrad m2, 9 - movh [r1 + 5*2*mmsize], m2 - movhps [r1 + 7*2*mmsize], m2 - add r1, mmsize/2 + 
packssdw m2, m2 + movd [r1 + 5*mmsize], m2 + pshufd m2, m2, 1 + movd [r1 + 7*mmsize], m2 + + add r1, mmsize/4 add r0, 2*2*mmsize %endrep @@ -974,17 +974,392 @@ cglobal dct8, 3,6,7,0-16*mmsize RET ;------------------------------------------------------- -; void idct8(int32_t *src, int16_t *dst, intptr_t stride) +; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) +;------------------------------------------------------- +%if ARCH_X86_64 +INIT_XMM sse2 +%if BIT_DEPTH == 10 + %define IDCT_SHIFT 10 + %define IDCT_ADD pd_512 +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT 12 + %define IDCT_ADD pd_2048 +%else + %error Unsupported BIT_DEPTH! +%endif + +cglobal idct8, 3, 6, 16, 0-5*mmsize + mova m9, [r0 + 1 * mmsize] + mova m1, [r0 + 3 * mmsize] + mova m7, m9 + punpcklwd m7, m1 + punpckhwd m9, m1 + mova m14, [tab_idct8_3] + mova m3, m14 + pmaddwd m14, m7 + pmaddwd m3, m9 + mova m0, [r0 + 5 * mmsize] + mova m10, [r0 + 7 * mmsize] + mova m2, m0 + punpcklwd m2, m10 + punpckhwd m0, m10 + mova m15, [tab_idct8_3 + 1 * mmsize] + mova m11, [tab_idct8_3 + 1 * mmsize] + pmaddwd m15, m2 + mova m4, [tab_idct8_3 + 2 * mmsize] + pmaddwd m11, m0 + mova m1, [tab_idct8_3 + 2 * mmsize] + paddd m15, m14 + mova m5, [tab_idct8_3 + 4 * mmsize] + mova m12, [tab_idct8_3 + 4 * mmsize] + paddd m11, m3 + mova [rsp + 0 * mmsize], m11 + mova [rsp + 1 * mmsize], m15 + pmaddwd m4, m7 + pmaddwd m1, m9 + mova m14, [tab_idct8_3 + 3 * mmsize] + mova m3, [tab_idct8_3 + 3 * mmsize] + pmaddwd m14, m2 + pmaddwd m3, m0 + paddd m14, m4 + paddd m3, m1 + mova [rsp + 2 * mmsize], m3 + pmaddwd m5, m9 + pmaddwd m9, [tab_idct8_3 + 6 * mmsize] + mova m6, [tab_idct8_3 + 5 * mmsize] + pmaddwd m12, m7 + pmaddwd m7, [tab_idct8_3 + 6 * mmsize] + mova m4, [tab_idct8_3 + 5 * mmsize] + pmaddwd m6, m2 + paddd m6, m12 + pmaddwd m2, [tab_idct8_3 + 7 * mmsize] + paddd m7, m2 + mova [rsp + 3 * mmsize], m6 + pmaddwd m4, m0 + pmaddwd m0, [tab_idct8_3 + 7 * mmsize] + paddd m9, m0 + paddd m5, m4 + mova m6, [r0 + 0 * mmsize] + 
mova m0, [r0 + 4 * mmsize] + mova m4, m6 + punpcklwd m4, m0 + punpckhwd m6, m0 + mova m12, [r0 + 2 * mmsize] + mova m0, [r0 + 6 * mmsize] + mova m13, m12 + mova m8, [tab_dct4] + punpcklwd m13, m0 + mova m10, [tab_dct4] + punpckhwd m12, m0 + pmaddwd m8, m4 + mova m3, m8 + pmaddwd m4, [tab_dct4 + 2 * mmsize] + pmaddwd m10, m6 + mova m2, [tab_dct4 + 1 * mmsize] + mova m1, m10 + pmaddwd m6, [tab_dct4 + 2 * mmsize] + mova m0, [tab_dct4 + 1 * mmsize] + pmaddwd m2, m13 + paddd m3, m2 + psubd m8, m2 + mova m2, m6 + pmaddwd m13, [tab_dct4 + 3 * mmsize] + pmaddwd m0, m12 + paddd m1, m0 + psubd m10, m0 + mova m0, m4 + pmaddwd m12, [tab_dct4 + 3 * mmsize] + paddd m3, [pd_64] + paddd m1, [pd_64] + paddd m8, [pd_64] + paddd m10, [pd_64] + paddd m0, m13 + paddd m2, m12 + paddd m0, [pd_64] + paddd m2, [pd_64] + psubd m4, m13 + psubd m6, m12 + paddd m4, [pd_64] + paddd m6, [pd_64] + mova m12, m8 + psubd m8, m7 + psrad m8, 7 + paddd m15, m3 + psubd m3, [rsp + 1 * mmsize] + psrad m15, 7 + paddd m12, m7 + psrad m12, 7 + paddd m11, m1 + mova m13, m14 + psrad m11, 7 + packssdw m15, m11 + psubd m1, [rsp + 0 * mmsize] + psrad m1, 7 + mova m11, [rsp + 2 * mmsize] + paddd m14, m0 + psrad m14, 7 + psubd m0, m13 + psrad m0, 7 + paddd m11, m2 + mova m13, [rsp + 3 * mmsize] + psrad m11, 7 + packssdw m14, m11 + mova m11, m6 + psubd m6, m5 + paddd m13, m4 + psrad m13, 7 + psrad m6, 7 + paddd m11, m5 + psrad m11, 7 + packssdw m13, m11 + mova m11, m10 + psubd m4, [rsp + 3 * mmsize] + psubd m10, m9 + psrad m4, 7 + psrad m10, 7 + packssdw m4, m6 + packssdw m8, m10 + paddd m11, m9 + psrad m11, 7 + packssdw m12, m11 + psubd m2, [rsp + 2 * mmsize] + mova m5, m15 + psrad m2, 7 + packssdw m0, m2 + mova m2, m14 + psrad m3, 7 + packssdw m3, m1 + mova m6, m13 + punpcklwd m5, m8 + punpcklwd m2, m4 + mova m1, m12 + punpcklwd m6, m0 + punpcklwd m1, m3 + mova m9, m5 + punpckhwd m13, m0 + mova m0, m2 + punpcklwd m9, m6 + punpckhwd m5, m6 + punpcklwd m0, m1 + punpckhwd m2, m1 + punpckhwd m15, m8 + mova m1, m5 + 
punpckhwd m14, m4 + punpckhwd m12, m3 + mova m6, m9 + punpckhwd m9, m0 + punpcklwd m1, m2 + mova m4, [tab_idct8_3 + 0 * mmsize] + punpckhwd m5, m2 + punpcklwd m6, m0 + mova m2, m15 + mova m0, m14 + mova m7, m9 + punpcklwd m2, m13 + punpcklwd m0, m12 + punpcklwd m7, m5 + punpckhwd m14, m12 + mova m10, m2 + punpckhwd m15, m13 + punpckhwd m9, m5 + pmaddwd m4, m7 + mova m13, m1 + punpckhwd m2, m0 + punpcklwd m10, m0 + mova m0, m15 + punpckhwd m15, m14 + mova m12, m1 + mova m3, [tab_idct8_3 + 0 * mmsize] + punpcklwd m0, m14 + pmaddwd m3, m9 + mova m11, m2 + punpckhwd m2, m15 + punpcklwd m11, m15 + mova m8, [tab_idct8_3 + 1 * mmsize] + punpcklwd m13, m0 + punpckhwd m12, m0 + pmaddwd m8, m11 + paddd m8, m4 + mova [rsp + 4 * mmsize], m8 + mova m4, [tab_idct8_3 + 2 * mmsize] + pmaddwd m4, m7 + mova m15, [tab_idct8_3 + 2 * mmsize] + mova m5, [tab_idct8_3 + 1 * mmsize] + pmaddwd m15, m9 + pmaddwd m5, m2 + paddd m5, m3 + mova [rsp + 3 * mmsize], m5 + mova m14, [tab_idct8_3 + 3 * mmsize] + mova m5, [tab_idct8_3 + 3 * mmsize] + pmaddwd m14, m11 + paddd m14, m4 + mova [rsp + 2 * mmsize], m14 + pmaddwd m5, m2 + paddd m5, m15 + mova [rsp + 1 * mmsize], m5 + mova m15, [tab_idct8_3 + 4 * mmsize] + mova m5, [tab_idct8_3 + 4 * mmsize] + pmaddwd m15, m7 + pmaddwd m7, [tab_idct8_3 + 6 * mmsize] + pmaddwd m5, m9 + pmaddwd m9, [tab_idct8_3 + 6 * mmsize] + mova m4, [tab_idct8_3 + 5 * mmsize] + pmaddwd m4, m2 + paddd m5, m4 + mova m4, m6 + mova m8, [tab_idct8_3 + 5 * mmsize] + punpckhwd m6, m10 + pmaddwd m2, [tab_idct8_3 + 7 * mmsize] + punpcklwd m4, m10 + paddd m9, m2 + pmaddwd m8, m11 + mova m10, [tab_dct4] + paddd m8, m15 + pmaddwd m11, [tab_idct8_3 + 7 * mmsize] + paddd m7, m11 + mova [rsp + 0 * mmsize], m8 + pmaddwd m10, m6 + pmaddwd m6, [tab_dct4 + 2 * mmsize] + mova m1, m10 + mova m8, [tab_dct4] + mova m3, [tab_dct4 + 1 * mmsize] + pmaddwd m8, m4 + pmaddwd m4, [tab_dct4 + 2 * mmsize] + mova m0, m8 + mova m2, [tab_dct4 + 1 * mmsize] + pmaddwd m3, m13 + psubd m8, m3 + paddd m0, m3 + 
mova m3, m6 + pmaddwd m13, [tab_dct4 + 3 * mmsize] + pmaddwd m2, m12 + paddd m1, m2 + psubd m10, m2 + mova m2, m4 + pmaddwd m12, [tab_dct4 + 3 * mmsize] + paddd m0, [IDCT_ADD] + paddd m1, [IDCT_ADD] + paddd m8, [IDCT_ADD] + paddd m10, [IDCT_ADD] + paddd m2, m13 + paddd m3, m12 + paddd m2, [IDCT_ADD] + paddd m3, [IDCT_ADD] + psubd m4, m13 + psubd m6, m12 + paddd m4, [IDCT_ADD] + paddd m6, [IDCT_ADD] + mova m15, [rsp + 4 * mmsize] + mova m12, m8 + psubd m8, m7 + psrad m8, IDCT_SHIFT + mova m11, [rsp + 3 * mmsize] + paddd m15, m0 + psrad m15, IDCT_SHIFT + psubd m0, [rsp + 4 * mmsize] + psrad m0, IDCT_SHIFT + paddd m12, m7 + paddd m11, m1 + mova m14, [rsp + 2 * mmsize] + psrad m11, IDCT_SHIFT + packssdw m15, m11 + psubd m1, [rsp + 3 * mmsize] + psrad m1, IDCT_SHIFT + mova m11, [rsp + 1 * mmsize] + paddd m14, m2 + psrad m14, IDCT_SHIFT + packssdw m0, m1 + psrad m12, IDCT_SHIFT + psubd m2, [rsp + 2 * mmsize] + paddd m11, m3 + mova m13, [rsp + 0 * mmsize] + psrad m11, IDCT_SHIFT + packssdw m14, m11 + mova m11, m6 + psubd m6, m5 + paddd m13, m4 + psrad m13, IDCT_SHIFT + mova m1, m15 + paddd m11, m5 + psrad m11, IDCT_SHIFT + packssdw m13, m11 + mova m11, m10 + psubd m10, m9 + psrad m10, IDCT_SHIFT + packssdw m8, m10 + psrad m6, IDCT_SHIFT + psubd m4, [rsp + 0 * mmsize] + paddd m11, m9 + psrad m11, IDCT_SHIFT + packssdw m12, m11 + punpcklwd m1, m14 + mova m5, m13 + psrad m4, IDCT_SHIFT + packssdw m4, m6 + psubd m3, [rsp + 1 * mmsize] + psrad m2, IDCT_SHIFT + mova m6, m8 + psrad m3, IDCT_SHIFT + punpcklwd m5, m12 + packssdw m2, m3 + punpcklwd m6, m4 + punpckhwd m8, m4 + mova m4, m1 + mova m3, m2 + punpckhdq m1, m5 + punpckldq m4, m5 + punpcklwd m3, m0 + punpckhwd m2, m0 + mova m0, m6 + lea r2, [r2 + r2] + lea r4, [r2 + r2] + lea r3, [r4 + r2] + lea r4, [r4 + r3] + lea r0, [r4 + r2 * 2] + movq [r1], m4 + punpckhwd m15, m14 + movhps [r1 + r2], m4 + punpckhdq m0, m3 + movq [r1 + r2 * 2], m1 + punpckhwd m13, m12 + movhps [r1 + r3], m1 + mova m1, m6 + punpckldq m1, m3 + movq [r1 + 
8], m1 + movhps [r1 + r2 + 8], m1 + movq [r1 + r2 * 2 + 8], m0 + movhps [r1 + r3 + 8], m0 + mova m0, m15 + punpckhdq m15, m13 + punpckldq m0, m13 + movq [r1 + r2 * 4], m0 + movhps [r1 + r4], m0 + mova m0, m8 + punpckhdq m8, m2 + movq [r1 + r3 * 2], m15 + punpckldq m0, m2 + movhps [r1 + r0], m15 + movq [r1 + r2 * 4 + 8], m0 + movhps [r1 + r4 + 8], m0 + movq [r1 + r3 * 2 + 8], m8 + movhps [r1 + r0 + 8], m8 + RET + +%undef IDCT_SHIFT +%undef IDCT_ADD +%endif + +;------------------------------------------------------- +; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_XMM ssse3 cglobal patial_butterfly_inverse_internal_pass1 - movu m0, [r0] - movu m1, [r0 + 4 * 32] - movu m2, [r0 + 2 * 32] - movu m3, [r0 + 6 * 32] - packssdw m0, m2 - packssdw m1, m3 + movh m0, [r0] + movhps m0, [r0 + 2 * 16] + movh m1, [r0 + 4 * 16] + movhps m1, [r0 + 6 * 16] + punpckhwd m2, m0, m1 ; [2 6] punpcklwd m0, m1 ; [0 4] pmaddwd m1, m0, [r6] ; EE[0] @@ -1004,12 +1379,10 @@ cglobal patial_butterfly_inverse_internal_pass1 paddd m3, m5 paddd m4, m5 - movu m2, [r0 + 32] - movu m5, [r0 + 5 * 32] - packssdw m2, m5 - movu m5, [r0 + 3 * 32] - movu m6, [r0 + 7 * 32] - packssdw m5, m6 + movh m2, [r0 + 16] + movhps m2, [r0 + 5 * 16] + movh m5, [r0 + 3 * 16] + movhps m5, [r0 + 7 * 16] punpcklwd m6, m2, m5 ;[1 3] punpckhwd m2, m5 ;[5 7] @@ -1136,7 +1509,7 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize call patial_butterfly_inverse_internal_pass1 - add r0, 16 + add r0, 8 add r5, 8 call patial_butterfly_inverse_internal_pass1 @@ -1167,27 +1540,35 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize ;----------------------------------------------------------------------------- -; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size) +; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal denoise_dct, 4, 4, 6 pxor 
m5, m5 - shr r3d, 2 + shr r3d, 3 .loop: mova m0, [r0] - pabsd m1, m0 + pabsw m1, m0 + mova m2, [r1] - paddd m2, m1 + pmovsxwd m3, m1 + paddd m2, m3 mova [r1], m2 - pmovzxwd m3, [r2] - psubd m1, m3 - pcmpgtd m4, m1, m5 + mova m2, [r1 + 16] + psrldq m3, m1, 8 + pmovsxwd m4, m3 + paddd m2, m4 + mova [r1 + 16], m2 + + movu m3, [r2] + psubsw m1, m3 + pcmpgtw m4, m1, m5 pand m1, m4 - psignd m1, m0 + psignw m1, m0 mova [r0], m1 add r0, 16 - add r1, 16 - add r2, 8 + add r1, 32 + add r2, 16 dec r3d jnz .loop RET @@ -1195,25 +1576,32 @@ cglobal denoise_dct, 4, 4, 6 INIT_YMM avx2 cglobal denoise_dct, 4, 4, 6 pxor m5, m5 - shr r3d, 3 + shr r3d, 4 .loop: movu m0, [r0] - pabsd m1, m0 + pabsw m1, m0 movu m2, [r1] - paddd m2, m1 + pmovsxwd m4, xm1 + paddd m2, m4 movu [r1], m2 - pmovzxwd m3, [r2] - psubd m1, m3 - pcmpgtd m4, m1, m5 + vextracti128 xm4, m1, 1 + movu m2, [r1 + 32] + pmovsxwd m3, xm4 + paddd m2, m3 + movu [r1 + 32], m2 + movu m3, [r2] + psubw m1, m3 + pcmpgtw m4, m1, m5 pand m1, m4 - psignd m1, m0 + psignw m1, m0 movu [r0], m1 add r0, 32 - add r1, 32 - add r2, 16 + add r1, 64 + add r2, 32 dec r3d jnz .loop RET + %if ARCH_X86_64 == 1 %macro DCT8_PASS_1 4 vpbroadcastq m0, [r6 + %1] @@ -1227,7 +1615,7 @@ cglobal denoise_dct, 4, 4, 6 mova [r5 + %2], xm2 %endmacro -%macro DCT8_PASS_2 1 +%macro DCT8_PASS_2 2 vbroadcasti128 m4, [r6 + %1] pmaddwd m6, m0, m4 pmaddwd m7, m1, m4 @@ -1238,10 +1626,25 @@ cglobal denoise_dct, 4, 4, 6 phaddd m6, m8 paddd m6, m5 psrad m6, DCT_SHIFT2 + + vbroadcasti128 m4, [r6 + %2] + pmaddwd m10, m0, m4 + pmaddwd m7, m1, m4 + pmaddwd m8, m2, m4 + pmaddwd m9, m3, m4 + phaddd m10, m7 + phaddd m8, m9 + phaddd m10, m8 + paddd m10, m5 + psrad m10, DCT_SHIFT2 + + packssdw m6, m10 + vpermq m10, m6, 0xD8 + %endmacro INIT_YMM avx2 -cglobal dct8, 3, 7, 10, 0-8*16 +cglobal dct8, 3, 7, 11, 0-8*16 %if BIT_DEPTH == 10 %define DCT_SHIFT 4 vbroadcasti128 m5, [pd_8] @@ -1294,9 +1697,6 @@ cglobal dct8, 3, 7, 10, 0-8*16 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 ;pass2 - mov r2d, 
32 - lea r3, [r2 * 3] - lea r4, [r1 + r2 * 4] vbroadcasti128 m5, [pd_256] mova m0, [r5] @@ -1304,22 +1704,14 @@ cglobal dct8, 3, 7, 10, 0-8*16 mova m2, [r5 + 64] mova m3, [r5 + 96] - DCT8_PASS_2 0 * 16 - movu [r1], m6 - DCT8_PASS_2 1 * 16 - movu [r1 + r2], m6 - DCT8_PASS_2 2 * 16 - movu [r1 + r2 * 2], m6 - DCT8_PASS_2 3 * 16 - movu [r1 + r3], m6 - DCT8_PASS_2 4 * 16 - movu [r4], m6 - DCT8_PASS_2 5 * 16 - movu [r4 + r2], m6 - DCT8_PASS_2 6 * 16 - movu [r4 + r2 * 2], m6 - DCT8_PASS_2 7 * 16 - movu [r4 + r3], m6 + DCT8_PASS_2 0 * 16, 1 * 16 + movu [r1], m10 + DCT8_PASS_2 2 * 16, 3 * 16 + movu [r1 + 32], m10 + DCT8_PASS_2 4 * 16, 5 * 16 + movu [r1 + 64], m10 + DCT8_PASS_2 6 * 16, 7 * 16 + movu [r1 + 96], m10 RET %macro DCT16_PASS_1_E 2 @@ -1360,7 +1752,7 @@ cglobal dct8, 3, 7, 10, 0-8*16 mova [r5 + %2], xm10 %endmacro -%macro DCT16_PASS_2 1 +%macro DCT16_PASS_2 2 vbroadcasti128 m8, [r7 + %1] vbroadcasti128 m13, [r8 + %1] @@ -1385,9 +1777,40 @@ cglobal dct8, 3, 7, 10, 0-8*16 phaddd m10, m11 paddd m10, m9 psrad m10, DCT_SHIFT2 + + + vbroadcasti128 m8, [r7 + %2] + vbroadcasti128 m13, [r8 + %2] + + pmaddwd m14, m0, m8 + pmaddwd m11, m1, m13 + paddd m14, m11 + + pmaddwd m11, m2, m8 + pmaddwd m12, m3, m13 + paddd m11, m12 + phaddd m14, m11 + + pmaddwd m11, m4, m8 + pmaddwd m12, m5, m13 + paddd m11, m12 + + pmaddwd m12, m6, m8 + pmaddwd m13, m7, m13 + paddd m12, m13 + phaddd m11, m12 + + phaddd m14, m11 + paddd m14, m9 + psrad m14, DCT_SHIFT2 + + packssdw m10, m14 + vextracti128 xm14, m10, 1 + movlhps xm15, xm10, xm14 + movhlps xm14, xm10 %endmacro INIT_YMM avx2 -cglobal dct16, 3, 9, 15, 0-16*mmsize +cglobal dct16, 3, 9, 16, 0-16*mmsize %if BIT_DEPTH == 10 %define DCT_SHIFT 5 vbroadcasti128 m9, [pd_16] @@ -1487,7 +1910,7 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize mov r5, rsp mov r4d, 2 - mov r2d, 64 + mov r2d, 32 lea r3, [r2 * 3] vbroadcasti128 m9, [pd_512] @@ -1504,46 +1927,42 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize mova m6, [r5 + 3 * 32] ; [row3lo row7lo] mova m7, [r5 + 11 * 
32] ; [row3hi row7hi] - DCT16_PASS_2 -8 * 16 - movu [r1], m10 - DCT16_PASS_2 -7 * 16 - movu [r1 + r2], m10 - DCT16_PASS_2 -6 * 16 - movu [r1 + r2 * 2], m10 - DCT16_PASS_2 -5 * 16 - movu [r1 + r3], m10 + DCT16_PASS_2 -8 * 16, -7 * 16 + movu [r1], xm15 + movu [r1 + r2], xm14 + + DCT16_PASS_2 -6 * 16, -5 * 16 + movu [r1 + r2 * 2], xm15 + movu [r1 + r3], xm14 lea r6, [r1 + r2 * 4] - DCT16_PASS_2 -4 * 16 - movu [r6], m10 - DCT16_PASS_2 -3 * 16 - movu [r6 + r2], m10 - DCT16_PASS_2 -2 * 16 - movu [r6 + r2 * 2], m10 - DCT16_PASS_2 -1 * 16 - movu [r6 + r3], m10 + DCT16_PASS_2 -4 * 16, -3 * 16 + movu [r6], xm15 + movu [r6 + r2], xm14 + + DCT16_PASS_2 -2 * 16, -1 * 16 + movu [r6 + r2 * 2], xm15 + movu [r6 + r3], xm14 lea r6, [r6 + r2 * 4] - DCT16_PASS_2 0 * 16 - movu [r6], m10 - DCT16_PASS_2 1 * 16 - movu [r6 + r2], m10 - DCT16_PASS_2 2 * 16 - movu [r6 + r2 * 2], m10 - DCT16_PASS_2 3 * 16 - movu [r6 + r3], m10 + DCT16_PASS_2 0 * 16, 1 * 16 + movu [r6], xm15 + movu [r6 + r2], xm14 + + DCT16_PASS_2 2 * 16, 3 * 16 + movu [r6 + r2 * 2], xm15 + movu [r6 + r3], xm14 lea r6, [r6 + r2 * 4] - DCT16_PASS_2 4 * 16 - movu [r6], m10 - DCT16_PASS_2 5 * 16 - movu [r6 + r2], m10 - DCT16_PASS_2 6 * 16 - movu [r6 + r2 * 2], m10 - DCT16_PASS_2 7 * 16 - movu [r6 + r3], m10 - - add r1, 32 + DCT16_PASS_2 4 * 16, 5 * 16 + movu [r6], xm15 + movu [r6 + r2], xm14 + + DCT16_PASS_2 6 * 16, 7 * 16 + movu [r6 + r2 * 2], xm15 + movu [r6 + r3], xm14 + + add r1, 16 add r5, 128 dec r4d @@ -1609,6 +2028,7 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize paddd xm11, xm9 psrad xm11, DCT_SHIFT2 + packssdw xm11, xm11 %endmacro @@ -1704,7 +2124,7 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize dec r4d jnz .pass1 - mov r2d, 128 + mov r2d, 64 lea r3, [r2 * 3] mov r5, rsp mov r4d, 8 @@ -1724,86 +2144,86 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize mova m7, [r5 + 3 * 64 + 32] DCT32_PASS_2 0 * 32 - movu [r1], xm11 + movq [r1], xm11 DCT32_PASS_2 1 * 32 - movu [r1 + r2], xm11 + movq [r1 + r2], xm11 DCT32_PASS_2 2 * 32 - movu [r1 + r2 * 2], 
xm11 + movq [r1 + r2 * 2], xm11 DCT32_PASS_2 3 * 32 - movu [r1 + r3], xm11 + movq [r1 + r3], xm11 lea r6, [r1 + r2 * 4] DCT32_PASS_2 4 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 5 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 6 * 32 - movu [r6 + r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 7 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 8 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 9 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 10 * 32 - movu [r6 + r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 11 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 12 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 13 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 14 * 32 - movu [r6 + r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 15 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 16 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 17 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 18 * 32 - movu [r6 + r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 19 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 20 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 21 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 22 * 32 - movu [r6 + r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 23 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 24 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 25 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 26 * 32 - movu [r6 + r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 27 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 28 * 32 - movu [r6], xm11 + movq [r6], xm11 DCT32_PASS_2 29 * 32 - movu [r6 + r2], xm11 + movq [r6 + r2], xm11 DCT32_PASS_2 30 * 32 - movu [r6 + 
r2 * 2], xm11 + movq [r6 + r2 * 2], xm11 DCT32_PASS_2 31 * 32 - movu [r6 + r3], xm11 + movq [r6 + r3], xm11 add r5, 256 - add r1, 16 + add r1, 8 dec r4d jnz .pass2 @@ -1926,28 +2346,25 @@ cglobal idct8, 3, 7, 13, 0-8*16 lea r6, [avx2_idct8_2] ;pass1 - mova m0, [r0 + 0 * 32] - mova m1, [r0 + 4 * 32] - packssdw m0, m1 ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4] - mova m1, [r0 + 2 * 32] - mova m2, [r0 + 6 * 32] - packssdw m1, m2 ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6] - mova m2, [r0 + 1 * 32] - mova m3, [r0 + 5 * 32] - packssdw m2, m3 ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5] - mova m3, [r0 + 3 * 32] - mova m4, [r0 + 7 * 32] - packssdw m3, m4 ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7] + mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1] + mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3] + vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] + vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] + vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2] + vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3] + vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3] + + mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5] + mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7] + vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] + vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] + vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6] + vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7] + vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7] mova m5, [idct8_shuf1] - - punpcklwd m4, m0, m1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2] - punpckhwd m0, m1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6] vpermd m4, m5, m4 vpermd m0, m5, m0 - - punpcklwd m1, m2, m3 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3] - punpckhwd m2, m3 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7] vpermd m1, m5, m1 vpermd m2, m5, m2 @@ -2065,7 +2482,7 @@ cglobal idct8, 3, 7, 13, 0-8*16 %endmacro ;------------------------------------------------------- -; void idct16(int32_t *src, int16_t *dst, intptr_t 
stride) +; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_YMM avx2 cglobal idct16, 3, 7, 16, 0-16*mmsize @@ -2087,37 +2504,53 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize mov r4d, 2 .pass1: - movu m0, [r0 + 0 * 64] - movu m1, [r0 + 8 * 64] - packssdw m0, m1 ;[0L 8L 0H 8H] - - movu m1, [r0 + 1 * 64] - movu m2, [r0 + 9 * 64] - packssdw m1, m2 ;[1L 9L 1H 9H] - - movu m2, [r0 + 2 * 64] - movu m3, [r0 + 10 * 64] - packssdw m2, m3 ;[2L 10L 2H 10H] - - movu m3, [r0 + 3 * 64] - movu m4, [r0 + 11 * 64] - packssdw m3, m4 ;[3L 11L 3H 11H] - - movu m4, [r0 + 4 * 64] - movu m5, [r0 + 12 * 64] - packssdw m4, m5 ;[4L 12L 4H 12H] - - movu m5, [r0 + 5 * 64] - movu m6, [r0 + 13 * 64] - packssdw m5, m6 ;[5L 13L 5H 13H] - - movu m6, [r0 + 6 * 64] - movu m7, [r0 + 14 * 64] - packssdw m6, m7 ;[6L 14L 6H 14H] - - movu m7, [r0 + 7 * 64] - movu m8, [r0 + 15 * 64] - packssdw m7, m8 ;[7L 15L 7H 15H] + movu xm0, [r0 + 0 * 32] + movu xm1, [r0 + 8 * 32] + punpckhqdq xm2, xm0, xm1 + punpcklqdq xm0, xm1 + vinserti128 m0, m0, xm2, 1 + + movu xm1, [r0 + 1 * 32] + movu xm2, [r0 + 9 * 32] + punpckhqdq xm3, xm1, xm2 + punpcklqdq xm1, xm2 + vinserti128 m1, m1, xm3, 1 + + movu xm2, [r0 + 2 * 32] + movu xm3, [r0 + 10 * 32] + punpckhqdq xm4, xm2, xm3 + punpcklqdq xm2, xm3 + vinserti128 m2, m2, xm4, 1 + + movu xm3, [r0 + 3 * 32] + movu xm4, [r0 + 11 * 32] + punpckhqdq xm5, xm3, xm4 + punpcklqdq xm3, xm4 + vinserti128 m3, m3, xm5, 1 + + movu xm4, [r0 + 4 * 32] + movu xm5, [r0 + 12 * 32] + punpckhqdq xm6, xm4, xm5 + punpcklqdq xm4, xm5 + vinserti128 m4, m4, xm6, 1 + + movu xm5, [r0 + 5 * 32] + movu xm6, [r0 + 13 * 32] + punpckhqdq xm7, xm5, xm6 + punpcklqdq xm5, xm6 + vinserti128 m5, m5, xm7, 1 + + movu xm6, [r0 + 6 * 32] + movu xm7, [r0 + 14 * 32] + punpckhqdq xm8, xm6, xm7 + punpcklqdq xm6, xm7 + vinserti128 m6, m6, xm8, 1 + + movu xm7, [r0 + 7 * 32] + movu xm8, [r0 + 15 * 32] + punpckhqdq xm9, xm7, xm8 + punpcklqdq xm7, xm8 + 
vinserti128 m7, m7, xm9, 1 punpckhwd m8, m0, m2 ;[8 10] punpcklwd m0, m2 ;[0 2] @@ -2160,7 +2593,7 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize IDCT_PASS1 4, 10 IDCT_PASS1 6, 8 - add r0, 32 + add r0, 16 add r3, 16 dec r4d jnz .pass1 @@ -2328,7 +2761,7 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize %endmacro ;------------------------------------------------------- -; void idct32(int32_t *src, int16_t *dst, intptr_t stride) +; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- ; TODO: Reduce PHADDD instruction by PADDD @@ -2345,54 +2778,69 @@ cglobal idct32, 3, 6, 16, 0-32*64 mov r5d, 8 .pass1: - movu xm0, [r0 + 2 * 128] - movu xm1, [r0 + 18 * 128] - vinserti128 m0, m0, [r0 + 0 * 128], 1 - vinserti128 m1, m1, [r0 + 16 * 128], 1 - - packssdw m0, m1 ;[2 18 0 16] - - movu xm1, [r0 + 1 * 128] - movu xm2, [r0 + 9 * 128] - vinserti128 m1, m1, [r0 + 17 * 128], 1 - vinserti128 m2, m2, [r0 + 25 * 128], 1 - packssdw m1, m2 ;[1 9 17 25] - - movu xm2, [r0 + 6 * 128] - movu xm3, [r0 + 22 * 128] - vinserti128 m2, m2, [r0 + 4 * 128], 1 - vinserti128 m3, m3, [r0 + 20 * 128], 1 - packssdw m2, m3 ;[6 22 4 20] - - movu xm3, [r0 + 3 * 128] - movu xm4, [r0 + 11 * 128] - vinserti128 m3, m3, [r0 + 19 * 128], 1 - vinserti128 m4, m4, [r0 + 27 * 128], 1 - packssdw m3, m4 ;[3 11 19 27] - - movu xm4, [r0 + 10 * 128] - movu xm5, [r0 + 26 * 128] - vinserti128 m4, m4, [r0 + 8 * 128], 1 - vinserti128 m5, m5, [r0 + 24 * 128], 1 - packssdw m4, m5 ;[10 26 8 24] - - movu xm5, [r0 + 5 * 128] - movu xm6, [r0 + 13 * 128] - vinserti128 m5, m5, [r0 + 21 * 128], 1 - vinserti128 m6, m6, [r0 + 29 * 128], 1 - packssdw m5, m6 ;[5 13 21 29] - - movu xm6, [r0 + 14 * 128] - movu xm7, [r0 + 30 * 128] - vinserti128 m6, m6, [r0 + 12 * 128], 1 - vinserti128 m7, m7, [r0 + 28 * 128], 1 - packssdw m6, m7 ;[14 30 12 28] - - movu xm7, [r0 + 7 * 128] - movu xm8, [r0 + 15 * 128] - vinserti128 m7, m7, [r0 + 23 * 128], 1 - vinserti128 m8, m8, [r0 + 31 * 128], 1 - 
packssdw m7, m8 ;[7 15 23 31] + movq xm0, [r0 + 2 * 64] + movq xm1, [r0 + 18 * 64] + punpcklqdq xm0, xm0, xm1 + movq xm1, [r0 + 0 * 64] + movq xm2, [r0 + 16 * 64] + punpcklqdq xm1, xm1, xm2 + vinserti128 m0, m0, xm1, 1 ;[2 18 0 16] + + movq xm1, [r0 + 1 * 64] + movq xm2, [r0 + 9 * 64] + punpcklqdq xm1, xm1, xm2 + movq xm2, [r0 + 17 * 64] + movq xm3, [r0 + 25 * 64] + punpcklqdq xm2, xm2, xm3 + vinserti128 m1, m1, xm2, 1 ;[1 9 17 25] + + movq xm2, [r0 + 6 * 64] + movq xm3, [r0 + 22 * 64] + punpcklqdq xm2, xm2, xm3 + movq xm3, [r0 + 4 * 64] + movq xm4, [r0 + 20 * 64] + punpcklqdq xm3, xm3, xm4 + vinserti128 m2, m2, xm3, 1 ;[6 22 4 20] + + movq xm3, [r0 + 3 * 64] + movq xm4, [r0 + 11 * 64] + punpcklqdq xm3, xm3, xm4 + movq xm4, [r0 + 19 * 64] + movq xm5, [r0 + 27 * 64] + punpcklqdq xm4, xm4, xm5 + vinserti128 m3, m3, xm4, 1 ;[3 11 17 25] + + movq xm4, [r0 + 10 * 64] + movq xm5, [r0 + 26 * 64] + punpcklqdq xm4, xm4, xm5 + movq xm5, [r0 + 8 * 64] + movq xm6, [r0 + 24 * 64] + punpcklqdq xm5, xm5, xm6 + vinserti128 m4, m4, xm5, 1 ;[10 26 8 24] + + movq xm5, [r0 + 5 * 64] + movq xm6, [r0 + 13 * 64] + punpcklqdq xm5, xm5, xm6 + movq xm6, [r0 + 21 * 64] + movq xm7, [r0 + 29 * 64] + punpcklqdq xm6, xm6, xm7 + vinserti128 m5, m5, xm6, 1 ;[5 13 21 9] + + movq xm6, [r0 + 14 * 64] + movq xm7, [r0 + 30 * 64] + punpcklqdq xm6, xm6, xm7 + movq xm7, [r0 + 12 * 64] + movq xm8, [r0 + 28 * 64] + punpcklqdq xm7, xm7, xm8 + vinserti128 m6, m6, xm7, 1 ;[14 30 12 28] + + movq xm7, [r0 + 7 * 64] + movq xm8, [r0 + 15 * 64] + punpcklqdq xm7, xm7, xm8 + movq xm8, [r0 + 23 * 64] + movq xm9, [r0 + 31 * 64] + punpcklqdq xm8, xm8, xm9 + vinserti128 m7, m7, xm8, 1 ;[7 15 23 31] punpckhwd m8, m0, m2 ;[18 22 16 20] punpcklwd m0, m2 ;[2 6 0 4] @@ -2451,7 +2899,7 @@ cglobal idct32, 3, 6, 16, 0-32*64 IDCT32_PASS1 6 IDCT32_PASS1 7 - add r0, 16 + add r0, 8 add r3, 4 add r4, 4 dec r5d @@ -2612,7 +3060,7 @@ cglobal idct32, 3, 6, 16, 0-32*64 RET ;------------------------------------------------------- -; void 
idct4(int32_t *src, int16_t *dst, intptr_t stride) +; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_YMM avx2 cglobal idct4, 3, 4, 6 @@ -2632,13 +3080,14 @@ cglobal idct4, 3, 4, 6 add r2d, r2d lea r3, [r2 * 3] - movu m0, [r0] ;[00 01 02 03 10 11 12 13] - movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33] + movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33] - packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33] - pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33] - vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23] - vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33] + pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33] + vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33] + punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23] + punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33] + vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23] + vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33] mova m1, [avx2_idct4_1] mova m3, [avx2_idct4_1 + 32] diff --git a/source/common/x86/dct8.h b/source/common/x86/dct8.h index 3b74f2a..f9516d6 100644 --- a/source/common/x86/dct8.h +++ b/source/common/x86/dct8.h @@ -23,23 +23,24 @@ #ifndef X265_DCT8_H #define X265_DCT8_H -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride); -void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); -void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride); -void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride); -void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride); -void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride); -void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride); -void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride); +void x265_dct4_sse2(const int16_t* src, int16_t* dst, 
intptr_t srcStride); +void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride); +void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride); +void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride); +void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride); +void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride); +void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride); -void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride); -void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride); -void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride); -void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride); -void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride); -void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride); +void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride); +void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride); -void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size); -void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size); +void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); +void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); #endif // ifndef X265_DCT8_H diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm index 52fc42c..c62eef6 
100644 --- a/source/common/x86/ipfilter8.asm +++ b/source/common/x86/ipfilter8.asm @@ -31,6 +31,13 @@ tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 +ALIGN 32 +const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15 + +ALIGN 32 +const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 + dd 2, 3, 3, 4, 4, 5, 5, 6 + ALIGN 32 tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 @@ -42,7 +49,6 @@ tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3 -tab_c_512: times 8 dw 512 tab_c_526336: times 4 dd 8192*64+2048 tab_ChromaCoeff: db 0, 64, 0, 0 @@ -123,13 +129,63 @@ tab_LumaCoeffVer: times 8 db 0, 0 times 8 db 58, -10 times 8 db 4, -1 -tab_c_128: times 16 db 0x80 +ALIGN 32 +tab_LumaCoeffVer_32: times 16 db 0, 0 + times 16 db 0, 64 + times 16 db 0, 0 + times 16 db 0, 0 + + times 16 db -1, 4 + times 16 db -10, 58 + times 16 db 17, -5 + times 16 db 1, 0 + + times 16 db -1, 4 + times 16 db -11, 40 + times 16 db 40, -11 + times 16 db 4, -1 + + times 16 db 0, 1 + times 16 db -5, 17 + times 16 db 58, -10 + times 16 db 4, -1 + +ALIGN 32 +tab_ChromaCoeffVer_32: times 16 db 0, 64 + times 16 db 0, 0 + + times 16 db -2, 58 + times 16 db 10, -2 + + times 16 db -4, 54 + times 16 db 16, -2 + + times 16 db -6, 46 + times 16 db 28, -4 + + times 16 db -4, 36 + times 16 db 36, -4 + + times 16 db -4, 28 + times 16 db 46, -6 + + times 16 db -2, 16 + times 16 db 54, -4 + + times 16 db -2, 10 + times 16 db 58, -2 + tab_c_64_n64: times 8 db 64, -64 +const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +ALIGN 32 +interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 SECTION .text -cextern idct4_shuf1 +cextern pb_128 cextern pw_1 
cextern pw_512 cextern pw_2000 @@ -171,7 +227,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 2 @@ -203,7 +259,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 4 @@ -235,7 +291,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] mov r5d, 16/2 @@ -285,7 +341,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] FILTER_H4_w4_2 t0, t1, t2 @@ -313,7 +369,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 2 @@ -345,7 +401,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 4 @@ -377,7 +433,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 8 @@ -409,7 +465,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] mov r5d, 32/2 @@ -423,6 +479,9 @@ jnz .loop RET +ALIGN 32 +const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 + %macro FILTER_H4_w6 3 movu %1, [srcq - 1] @@ -606,7 +665,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] mov r5d, %2 pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] @@ -662,7 +721,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4] mov r5d, %2 pshufd coef2, coef2, 0 -mova t2, [tab_c_512] +mova t2, [pw_512] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] @@ -749,7 +808,7 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8 punpcklqdq m3, m3 %ifidn %3, pp - mova m2, [tab_c_512] + mova m2, [pw_512] %else mova m2, [pw_2000] %endif @@ -845,7 
+904,7 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6 pmulhrsw m3, [pw_512] vextracti128 xm4, m3, 1 packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] - pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0] + pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0] lea r0, [r3 * 3] movd [r2], xm3 @@ -854,7 +913,664 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6 pextrd [r2+r0], xm3, 3 RET +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [tab_Lm] + mova m2, [tab_Lm + 32] + + ; register map + ; m0 - interpolate coeff + ; m1, m2 - shuffle order table + + sub r0, 3 + lea r5, [r1 * 3] + lea r4, [r3 * 3] + + ; Row 0 + vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m2 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddubsw m4, m0 + phaddw m3, m4 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + + phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] + pmulhrsw m3, [pw_512] + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + ; Row 3 + vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m6, m5, m2 + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddubsw m6, m0 + phaddw m5, m6 + + phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + punpcklwd xm5, xm3, xm4 + + movq [r2], xm5 + movhps [r2 + r3], xm5 + + punpckhwd xm5, xm3, xm4 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm5 + RET + +%macro IPFILTER_LUMA_AVX2_8xN 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7 + mov 
r4d, r4m + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [tab_Lm] + mova m2, [tab_Lm + 32] + + ; register map + ; m0 - interpolate coeff + ; m1, m2 - shuffle order table + + sub r0, 3 + lea r5, [r1 * 3] + lea r6, [r3 * 3] + mov r4d, %2 / 4 +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m2 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddubsw m4, m0 + phaddw m3, m4 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + + phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] + pmulhrsw m3, [pw_512] + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + ; Row 3 + vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m6, m5, m2 + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddubsw m6, m0 + phaddw m5, m6 + + phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + punpcklwd xm5, xm3, xm4 + + movq [r2], xm5 + movhps [r2 + r3], xm5 + + punpckhwd xm5, xm3, xm4 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + dec r4d + jnz .loop + RET +%endmacro + +IPFILTER_LUMA_AVX2_8xN 8, 8 +IPFILTER_LUMA_AVX2_8xN 8, 16 +IPFILTER_LUMA_AVX2_8xN 8, 32 + +%macro IPFILTER_LUMA_AVX2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [tab_Tm + 16] + vpbroadcastd m7, [pw_1] 
+ + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + mov r4d, %2/2 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0 + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m2, m3 + pshufb m2, [tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row0 + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2+r3], xm5 + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET +%endmacro + +%macro IPFILTER_LUMA_32x_avx2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, %2 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, 
m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2 + 16], xm5 + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4d + jnz .loop + RET +%endmacro + +%macro IPFILTER_LUMA_64x_avx2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, %2 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + 
pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2 + 16], xm5 + + vbroadcasti128 m4, [r0 + 32] + pshufb m5, m4, m3 + pshufb m4, [tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 40] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + 48] + pshufb m5, m2, m3 + pshufb m2, [tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 56] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2 +32], xm4 + movu [r2 + 48], xm5 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4d + jnz .loop + RET +%endmacro + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_48x64, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, 64 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + + 
vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2 + 16], xm5 + + vbroadcasti128 m4, [r0 + 32] + pshufb m5, m4, m3 + pshufb m4, [tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 40] + pshufb m6, m5, m3 + pshufb m5, [tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + packuswb m4, m4 + vpermq m4, m4, 11011000b + pshufd xm4, xm4, 11011000b + movu [r2 + 32], xm4 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_4x4, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + vpbroadcastd m2, [pw_1] + vbroadcasti128 m1, [tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + + lea r0, [r3 * 3] + movd [r2], xm3 + pextrd [r2+r3], xm3, 2 + pextrd [r2+r3*2], xm3, 1 + pextrd [r2+r0], xm3, 3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_32x32, 4,6,7 + mov r4d, 
r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 32 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + movu [r2], m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + dec r4d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 8 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, 
m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop + RET ;-------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- @@ -863,6 +1579,91 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6 IPFILTER_LUMA 12, 16, pp IPFILTER_LUMA 4, 16, pp +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x8, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + movu m1, [tab_Tm] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + sub r0, 1 + mov r4d, 2 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, [pw_512] + lea r0, [r0 + r1 * 2] + + ; Row 2 + vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + mova m5, [interp_4tap_8x8_horiz_shuf] + vpermd m3, m5, m3 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movhps [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] + movq [r2], xm4 + movhps [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1*2] + dec r4d + jnz .loop + RET + + IPFILTER_LUMA_AVX2 16, 4 + IPFILTER_LUMA_AVX2 16, 8 + IPFILTER_LUMA_AVX2 16, 12 + IPFILTER_LUMA_AVX2 16, 16 + IPFILTER_LUMA_AVX2 
16, 32 + IPFILTER_LUMA_AVX2 16, 64 + + IPFILTER_LUMA_32x_avx2 32 , 8 + IPFILTER_LUMA_32x_avx2 32 , 16 + IPFILTER_LUMA_32x_avx2 32 , 24 + IPFILTER_LUMA_32x_avx2 32 , 32 + IPFILTER_LUMA_32x_avx2 32 , 64 + + IPFILTER_LUMA_64x_avx2 64 , 64 + IPFILTER_LUMA_64x_avx2 64 , 48 + IPFILTER_LUMA_64x_avx2 64 , 32 + IPFILTER_LUMA_64x_avx2 64 , 16 + ;-------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- @@ -1040,7 +1841,7 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 mov r4, rsp .loopH: - FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3] + FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3] psubw m1, [pw_2000] mova [r4], m1 @@ -1108,7 +1909,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4] lea r4, [r1 * 3] lea r5, [r0 + 4 * r1] pshufb m0, [tab_Cm] -mova m1, [tab_c_512] +mova m1, [pw_512] movd m2, [r0] movd m3, [r0 + r1] @@ -1181,7 +1982,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4] pshufb m0, [tab_Cm] -mova m1, [tab_c_512] +mova m1, [pw_512] mov r4d, %2 lea r5, [3 * r1] @@ -1289,7 +2090,7 @@ pmaddubsw m3, m0 phaddw m2, m3 -pmulhrsw m2, [tab_c_512] +pmulhrsw m2, [pw_512] packuswb m2, m2 movd [r2], m2 pextrd [r2 + r3], m2, 1 @@ -1313,7 +2114,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] -mova m1, [tab_c_512] +mova m1, [pw_512] lea r5, [r0 + 4 * r1] lea r4, [r1 * 3] @@ -1369,6 +2170,51 @@ pextrd [r2 + r3], m2, 3 RET +INIT_YMM avx2 +cglobal interp_4tap_vert_pp_4x4, 4, 6, 3 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 
* 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] + mova m2, [interp4_vpp_shuf1] + vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] + mova m2, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] + + mova m2, [interp4_vpp_shuf] + pshufb m0, m0, m2 + pshufb m1, m1, m2 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + pmulhrsw m0, [pw_512] + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r5], xm0, 3 + RET + ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- @@ -1388,7 +2234,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4] pshufb m0, [tab_Cm] -mova m1, [tab_c_512] +mova m1, [pw_512] mov r4d, %2 @@ -1590,7 +2436,7 @@ pmaddubsw m4, m5 paddw m0, m4 -mova m4, [tab_c_512] +mova m4, [pw_512] pmulhrsw m0, m4 packuswb m0, m0 @@ -2495,7 +3341,7 @@ movd m5, [tab_ChromaCoeff + r4 * 4] pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] -mova m4, [tab_c_512] +mova m4, [pw_512] lea r5, [r1 * 3] mov r4d, %2 @@ -2573,6 +3419,84 @@ FILTER_V4_W8_H8_H16_H32 8, 32 FILTER_V4_W8_H8_H16_H32 8, 12 FILTER_V4_W8_H8_H16_H32 8, 64 +%macro PROCESS_CHROMA_AVX2_W8_8R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + 
punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m0, [r5 + 1 * mmsize] + paddw m4, m0 +%endmacro + +INIT_YMM avx2 +cglobal interp_4tap_vert_pp_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + 
PROCESS_CHROMA_AVX2_W8_8R + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r4], xm4 + RET ;----------------------------------------------------------------------------- ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) @@ -2593,7 +3517,7 @@ movd m5, [tab_ChromaCoeff + r4 * 4] pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] -mova m4, [tab_c_512] +mova m4, [pw_512] mov r4d, %2 lea r5, [3 * r1] @@ -2716,7 +3640,7 @@ punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 -mova m6, [tab_c_512] +mova m6, [pw_512] pmulhrsw m4, m6 pmulhrsw m2, m6 @@ -2806,7 +3730,7 @@ punpcklbw m7, m5, m6 pmaddubsw m7, m0 paddw m4, m7 -mova m7, [tab_c_512] +mova m7, [pw_512] pmulhrsw m4, m7 pmulhrsw m2, m7 @@ -2855,6 +3779,217 @@ FILTER_V4_W16_H2 16, 32 FILTER_V4_W16_H2 16, 24 FILTER_V4_W16_H2 16, 64 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_pp_16x16, 4, 6, 15 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m12, [r5] + mova m13, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 + lea r5, [r3 * 3] + mova m14, [pw_512] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, m12 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, m12 + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + 
punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m13 + paddw m0, m4 + pmaddubsw m2, m12 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m13 + paddw m1, m5 + pmaddubsw m3, m12 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, m13 + paddw m2, m6 + pmaddubsw m4, m12 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, m13 + paddw m3, m7 + pmaddubsw m5, m12 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, m13 + paddw m4, m8 + pmaddubsw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, m13 + paddw m5, m9 + pmaddubsw m7, m12 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, m13 + paddw m6, m10 + pmaddubsw m8, m12 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, m13 + paddw m7, m11 + pmaddubsw m9, m12 + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + 
r3 * 2], xm2 + movu [r2 + r5], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r5], xm7 + lea r2, [r2 + r3 * 4] + + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm6, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm6, 1 + pmaddubsw m6, m10, m13 + paddw m8, m6 + pmaddubsw m10, m12 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm7, xm11, xm6 + punpcklbw xm11, xm6 + vinserti128 m11, m11, xm7, 1 + pmaddubsw m7, m11, m13 + paddw m9, m7 + pmaddubsw m11, m12 + + movu xm7, [r0 + r1] ; m7 = row 13 + punpckhbw xm0, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm0, 1 + pmaddubsw m0, m6, m13 + paddw m10, m0 + pmaddubsw m6, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm7, xm0 + punpcklbw xm7, xm0 + vinserti128 m7, m7, xm1, 1 + pmaddubsw m1, m7, m13 + paddw m11, m1 + pmaddubsw m7, m12 + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, m13 + paddw m6, m2 + pmaddubsw m0, m12 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m13 + paddw m7, m3 + pmaddubsw m1, m12 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m2, m13 + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m3, m13 + paddw m1, m3 + + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m6, m14 ; m6 = word: row 12 + pmulhrsw m7, m14 ; m7 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m6, m7 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 
11011000b + vpermq m6, m6, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm7, m6, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r5], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm6 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm0 + movu [r2 + r5], xm1 + RET +%endif + ;----------------------------------------------------------------------------- ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- @@ -2899,7 +4034,7 @@ punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 -mova m6, [tab_c_512] +mova m6, [pw_512] pmulhrsw m4, m6 pmulhrsw m2, m6 @@ -2998,7 +4133,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4] pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] -mova m7, [tab_c_512] +mova m7, [pw_512] mov r4d, %2 @@ -3076,6 +4211,96 @@ FILTER_V4_W32 32, 32 FILTER_V4_W32 32, 48 FILTER_V4_W32 32, 64 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_pp_32x32, 4, 7, 13 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m10, [r5] + mova m11, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 + lea r5, [r3 * 3] + mova m12, [pw_512] + mov r6d, 8 +.loopW: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], m2 + + lea r0, [r0 + r1 * 4] + movu m0, [r0] ; m0 
= row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], m4 + + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 + + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 + + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loopW + RET +%endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) @@ -3126,7 +4351,7 @@ punpcklbw m7, m5, m6 pmaddubsw m7, m0 paddw m4, m7 -mova m7, [tab_c_512] +mova m7, [pw_512] pmulhrsw m4, m7 pmulhrsw m2, m7 @@ -3190,7 +4415,7 @@ cglobal luma_p2s, 3, 7, 6 mov r4d, r4m ; load constant - mova m4, [tab_c_128] + mova m4, [pb_128] mova m5, [tab_c_64_n64] .loopH: @@ -3379,7 +4604,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 %endif %ifidn %3,pp - mova m3, [tab_c_512] + mova m3, [pw_512] %else mova m3, [pw_2000] %endif @@ -3421,6 +4646,149 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 RET %endmacro + +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_4x4, 4,6,8 + mov r4d, r4m + lea r5, [r1 * 3] + sub r0, r5 + + ; TODO: VPGATHERDD + movd xm1, [r0] ; m1 = row0 + movd xm2, [r0 + r1] ; m2 = row1 + punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00] + + movd xm3, [r0 + r1 * 2] ; m3 = row2 + punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] + movd xm4, [r0 + r5] + punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] + punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] 
+ + lea r0, [r0 + r1 * 4] + movd xm5, [r0] ; m5 = row4 + punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] + punpcklwd xm2, xm4 ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] + movd xm2, [r0 + r1] ; m2 = row5 + punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] + punpcklwd xm3, xm5 ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm6, [r0 + r1 * 2] ; m6 = row6 + punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] + punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] + vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm4, [r0 + r5] ; m4 = row7 + punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] + punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + + lea r0, [r0 + r1 * 4] + movd xm7, [r0] ; m7 = row8 + punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] + punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] + vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + movd xm2, [r0 + r1] ; m2 = row9 + punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] + punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + movd xm7, [r0 + r1 * 2] ; m7 = rowA + punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] + punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] + vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + + ; load filter coeff +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8 + 0] + vpbroadcastd m2, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] + vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 
4] +%endif + + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddubsw m5, m2 + pmaddubsw m6, m2 + vbroadcasti128 m0, [pw_1] + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m5, m0 + pmaddwd m6, m0 + paddd m1, m5 ; m1 = DQWORD ROW[1 0] + paddd m3, m6 ; m3 = DQWORD ROW[3 2] + packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] + + ; TODO: does it overflow? + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 2 + pextrd [r2 + r3 * 2], xm1, 1 + lea r4, [r3 * 3] + pextrd [r2 + r4], xm1, 3 + RET + +INIT_YMM avx2 +cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + + add r3d, r3d + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] + mova m3, [interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] + + mova m3, [interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m4, m4, m3 + pshufb m2, m2, m3 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m4, [r5 + 2 * mmsize] + pmaddubsw m2, [r5 + 3 * mmsize] + paddw m0, m1 + paddw m0, m4 + paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] + + vbroadcasti128 m3, [pw_2000] + psubw m0, m3 + vextracti128 xm2, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 
+ r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 + RET + ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- @@ -3451,6 +4819,122 @@ FILTER_VER_LUMA_4xN 4, 8, ps ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 16, ps +%macro PROCESS_LUMA_AVX2_W8_8R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] 
+ movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m0, [r5 + 1 * mmsize] + paddw m4, m0 + + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 12 + punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] + vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m6, [r5 + 2 * mmsize] + paddw m4, m6 + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + movq xm6, [r0 + r1 * 2] ; m6 = row 14 + punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] + vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + pmaddubsw m0, [r5 + 3 * mmsize] + paddw m4, m0 
+%endmacro + +%macro PROCESS_LUMA_AVX2_W8_4R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] 
+ movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 +%endmacro + ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- @@ -3473,7 +4957,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 %endif %ifidn %3,pp - mova m3, [tab_c_512] + mova m3, [pw_512] %else mova m3, [pw_2000] %endif @@ -3520,6 +5004,115 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 RET %endmacro +%macro FILTER_VER_LUMA_AVX2_8xN 2 +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] + mov word [rsp], %2 / 8 + mova m7, [pw_512] + +.loop: + PROCESS_LUMA_AVX2_W8_8R + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] + movhps [r2], xm5 + movhps [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] + movq [r2], xm1 + movq [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm1 + movhps [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + sub r0, r6 + dec word [rsp] + jnz .loop + RET +%endmacro + +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 
+%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_8R + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r4], xm4 + RET + +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_8x4, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_4R + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + packuswb m5, m2 + vextracti128 xm2, m5, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + RET + ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- @@ -3534,11 +5127,13 @@ FILTER_VER_LUMA_8xN 8, 8, pp ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 16, pp +FILTER_VER_LUMA_AVX2_8xN 8, 16 ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, 
pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 32, pp +FILTER_VER_LUMA_AVX2_8xN 8, 32 ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) @@ -3581,7 +5176,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 %endif %ifidn %3,pp - mova m3, [tab_c_512] + mova m3, [pw_512] %else mova m3, [pw_2000] %endif @@ -3674,6 +5269,2260 @@ FILTER_VER_LUMA_12xN 12, 16, pp ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_12xN 12, 16, ps +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_12x16, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 
+ pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, 
m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movq [r2], xm0 + pextrd [r2 + 8], xm0, 2 + movq [r2 + r3], xm1 + pextrd [r2 + r3 + 8], xm1, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r6], xm3 + pextrd [r2 + r6 + 8], xm3, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + pextrd [r2 + 8], xm4, 2 + movq [r2 + r3], xm5 + pextrd [r2 + r3 + 8], xm5, 2 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movq [r2 + r3 * 2], xm6 + pextrd [r2 + r3 * 2 + 8], xm6, 2 + movq [r2 + r6], xm7 + pextrd [r2 + r6 + 8], xm7, 2 + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + 
paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 
+ pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movq [r2], xm8 + pextrd [r2 + 8], xm8, 2 + movq [r2 + r3], xm9 + pextrd [r2 + r3 + 8], xm9, 2 + movq [r2 + r3 * 2], xm10 + pextrd [r2 + r3 * 2 + 8], xm10, 2 + movq [r2 + r6], xm11 + pextrd [r2 + r6 + 8], xm11, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm12 + pextrd [r2 + 8], xm12, 2 + movq [r2 + r3], xm13 + pextrd [r2 + r3 + 8], xm13, 2 + movq [r2 + r3 * 2], xm0 + pextrd [r2 + r3 * 2 + 8], xm0, 2 + movq [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm1, 2 + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_16x16, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw 
m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw 
m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 
+ 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 
xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_16x12, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + 
pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + 
vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + packuswb m8, m9 + packuswb 
m10, m11 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_16x8, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 
4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, 
xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_16x4, 4, 7, 13 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m12, [pw_512] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, 
xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + + pmulhrsw m0, m12 ; m0 = word: row 0 + pmulhrsw m1, m12 ; m1 = word: row 1 + pmulhrsw m2, m12 ; m2 = word: row 2 + pmulhrsw m3, m12 ; m3 = word: row 3 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + RET +%endif + +%macro FILTER_VER_LUMA_AVX2_16xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mova m14, [pw_512] + mov r8d, %2 / 16 + +.loop: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 
* 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + 
pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, 
xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + 
pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loop + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16xN 16, 32 +FILTER_VER_LUMA_AVX2_16xN 16, 64 + +%macro PROCESS_LUMA_AVX2_W16_16R 0 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, 
m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw 
m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 + + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 + lea r8, [r8 + r3 * 4] + + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhbw xm4, 
xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r7 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu 
[r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 +%endmacro + +%macro PROCESS_LUMA_AVX2_W16_8R 0 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, 
m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 + + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq 
m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endmacro + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_24x32, 4, 11, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + lea r10, [r1 * 4] + mova m14, [pw_512] + mov r9d, 2 +.loopH: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] + + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm7, [r7] ; m7 = row 12 + punpcklbw xm3, 
xm7 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m3, m6, [r5 + 2 * mmsize] + paddw m4, m3 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm7, xm3 + movq xm8, [r7 + r1 * 2] ; m8 = row 14 + punpcklbw xm3, xm8 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m3, m7, [r5 + 3 * mmsize] + paddw m4, m3 + pmaddubsw m3, m7, [r5 + 2 * mmsize] + paddw m0, m3 + pmaddubsw m3, m7, [r5 + 1 * mmsize] + paddw m6, m3 + pmaddubsw m7, [r5] + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm8, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 16 + punpcklbw xm3, xm9 + vinserti128 m8, m8, xm3, 1 + pmaddubsw m3, m8, [r5 + 3 * mmsize] + paddw m0, m3 + pmaddubsw m3, m8, [r5 + 2 * mmsize] + paddw m6, m3 + pmaddubsw m3, m8, [r5 + 1 * mmsize] + paddw m7, m3 + pmaddubsw m8, [r5] + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 18 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m6, m3 + pmaddubsw m3, m9, [r5 + 2 * mmsize] + paddw m7, m3 + pmaddubsw m3, m9, [r5 + 1 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r4] ; m3 = row 19 + punpcklbw xm10, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 20 + punpcklbw xm3, xm9 + vinserti128 m10, m10, xm3, 1 + pmaddubsw m3, m10, [r5 + 3 * mmsize] + paddw m7, m3 + pmaddubsw m3, m10, [r5 + 2 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r1] ; m3 = row 21 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 22 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m8, m3 + + pmulhrsw m5, m14 ; m5 = word: row 0, row 1 + pmulhrsw m2, m14 ; m2 = word: row 2, row 3 + pmulhrsw m1, m14 ; m1 = word: row 4, row 5 + pmulhrsw m4, m14 ; m4 = word: row 6, row 7 + pmulhrsw m0, m14 ; m0 = word: row 8, row 9 + pmulhrsw m6, m14 ; m6 = word: row 10, row 11 + pmulhrsw m7, m14 ; m7 = word: row 12, row 13 + pmulhrsw 
m8, m14 ; m8 = word: row 14, row 15 + packuswb m5, m2 + packuswb m1, m4 + packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + vextracti128 xm6, m0, 1 + vextracti128 xm8, m7, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm4 + lea r8, [r8 + r3 * 4] + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm7 + movq [r8 + r3], xm8 + movhps [r8 + r3 * 2], xm7 + movhps [r8 + r6], xm8 + + sub r7, r10 + lea r0, [r7 - 16] + lea r2, [r8 + r3 * 4 - 16] + dec r9d + jnz .loopH + RET +%endif + +%macro FILTER_VER_LUMA_AVX2_32xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mova m14, [pw_512] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 16] + lea r2, [r8 + r3 * 4 - 16] + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_32xN 32, 32 +FILTER_VER_LUMA_AVX2_32xN 32, 64 + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_32x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_32x24, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, 
r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 16] + lea r2, [r8 + r3 * 4 - 16] + mov r9d, 2 +.loop: + PROCESS_LUMA_AVX2_W16_8R + add r2, 16 + add r0, 16 + dec r9d + jnz .loop + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_32x8, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_8R + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_48x64, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mova m14, [pw_512] + mov r9d, 4 +.loopH: + mov r10d, 3 +.loopW: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 32] + lea r2, [r8 + r3 * 4 - 32] + dec r9d + jnz .loopH + RET +%endif + +%macro FILTER_VER_LUMA_AVX2_64xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mova m14, [pw_512] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 48] + lea r2, [r8 + r3 * 4 - 48] + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_64xN 
64, 32 +FILTER_VER_LUMA_AVX2_64xN 64, 48 +FILTER_VER_LUMA_AVX2_64xN 64, 64 + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_pp_64x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r3 * 3] + mova m14, [pw_512] + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W16_16R + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif + ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- @@ -3695,7 +7544,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize %endif %ifidn %3,pp - mova m3, [tab_c_512] + mova m3, [pw_512] %else mova m3, [pw_2000] %endif @@ -3959,7 +7808,7 @@ cglobal chroma_p2s, 3, 7, 4 mov r4d, r4m ; load constant - mova m2, [tab_c_128] + mova m2, [pb_128] mova m3, [tab_c_64_n64] .loopH: diff --git a/source/common/x86/ipfilter8.h b/source/common/x86/ipfilter8.h index 3949409..6c8683f 100644 --- a/source/common/x86/ipfilter8.h +++ b/source/common/x86/ipfilter8.h @@ -25,10 +25,10 @@ #define X265_IPFILTER8_H #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \ - void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ - void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + void 
x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ + void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); #define LUMA_FILTERS(cpu) \ SETUP_LUMA_FUNC_DEF(4, 4, cpu); \ @@ -58,7 +58,7 @@ SETUP_LUMA_FUNC_DEF(16, 64, cpu) #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \ - void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); + void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); #define LUMA_SP_FILTERS(cpu) \ SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \ @@ -88,7 +88,7 @@ SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu); #define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \ - void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); #define LUMA_SS_FILTERS(cpu) \ SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \ @@ -120,10 +120,10 @@ #if HIGH_BIT_DEPTH #define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \ - void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel 
* dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); #define CHROMA_VERT_FILTERS(cpu) \ SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \ @@ -208,8 +208,8 @@ SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu) #define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \ - void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); + void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); #define CHROMA_HORIZ_FILTERS(cpu) \ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \ @@ -289,8 +289,8 @@ SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu) -void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); -void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); +void x265_chroma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, 
int height); +void x265_luma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); CHROMA_VERT_FILTERS(_sse2); CHROMA_HORIZ_FILTERS(_sse4); @@ -319,10 +319,10 @@ CHROMA_HORIZ_FILTERS_444(_sse4); #else // if HIGH_BIT_DEPTH #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \ - void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ - void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ - void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ + void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); #define CHROMA_FILTERS(cpu) \ SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \ @@ -403,7 +403,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4); SETUP_CHROMA_FUNC_DEF(16, 64, cpu); #define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \ - void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); + void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); #define CHROMA_SP_FILTERS(cpu) \ SETUP_CHROMA_SP_FUNC_DEF(8, 2, 
cpu); \ @@ -488,7 +488,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4); SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); #define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \ - void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); #define CHROMA_SS_FILTERS(cpu) \ SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \ @@ -573,12 +573,14 @@ CHROMA_HORIZ_FILTERS_444(_sse4); SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); CHROMA_FILTERS(_sse4); +CHROMA_FILTERS(_avx2); CHROMA_SP_FILTERS(_sse2); CHROMA_SP_FILTERS_SSE4(_sse4); CHROMA_SS_FILTERS(_sse2); CHROMA_SS_FILTERS_SSE4(_sse4); CHROMA_FILTERS_422(_sse4); +CHROMA_FILTERS_422(_avx2); CHROMA_SP_FILTERS_422(_sse2); CHROMA_SP_FILTERS_422_SSE4(_sse4); CHROMA_SS_FILTERS_422(_sse2); @@ -588,7 +590,7 @@ CHROMA_FILTERS_444(_sse4); CHROMA_SP_FILTERS_444(_sse4); CHROMA_SS_FILTERS_444(_sse2); -void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); +void x265_chroma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); #undef SETUP_CHROMA_FUNC_DEF #undef SETUP_CHROMA_SP_FUNC_DEF @@ -616,8 +618,8 @@ LUMA_SP_FILTERS(_sse4); LUMA_SS_FILTERS(_sse2); LUMA_FILTERS(_avx2); -void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY); -void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); +void x265_interp_8tap_hv_pp_8x8_ssse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); +void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); #undef LUMA_FILTERS #undef LUMA_SP_FILTERS diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h index 95cb609..9bf4611 100644 --- a/source/common/x86/mc.h +++ 
b/source/common/x86/mc.h @@ -25,7 +25,7 @@ #define X265_MC_H #define LOWRES(cpu) \ - void x265_frame_init_lowres_core_ ## cpu(pixel * src0, pixel * dst0, pixel * dsth, pixel * dstv, pixel * dstc, \ + void x265_frame_init_lowres_core_ ## cpu(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \ intptr_t src_stride, intptr_t dst_stride, int width, int height); LOWRES(mmx2) LOWRES(sse2) @@ -37,31 +37,31 @@ LOWRES(xop) void func ## _mmx2 args; \ void func ## _sse2 args; \ void func ## _ssse3 args; -DECL_SUF(x265_pixel_avg_64x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_64x48, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_64x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_64x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_48x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_32x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_32x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_32x24, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_32x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_32x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_24x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_16x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_16x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_16x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_16x12, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_16x8, (pixel *, 
intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_16x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_12x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_8x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_8x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_8x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_8x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_4x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_4x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) -DECL_SUF(x265_pixel_avg_4x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x48, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_48x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x24, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_24x32, (pixel*, 
intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x12, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_12x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_4x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_4x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) +DECL_SUF(x265_pixel_avg_4x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int)) #undef LOWRES #undef DECL_SUF diff --git a/source/common/x86/pixel-util.h b/source/common/x86/pixel-util.h index 90bb4fc..3c8fe3e 100644 --- a/source/common/x86/pixel-util.h +++ b/source/common/x86/pixel-util.h @@ -24,59 +24,52 @@ #ifndef X265_PIXEL_UTIL_H #define X265_PIXEL_UTIL_H -void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons8_sse2(pixel* pred, int16_t* 
residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); - -void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -void x265_getResidual16_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -void x265_getResidual32_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); - -void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride); - -void x265_transpose8_avx2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride); -void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride); - -uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); -uint32_t 
x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); -uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); -uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); -void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); -void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); -int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff); - -void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); -void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); -void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); - -void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1, - const uint8_t * pix2, intptr_t stride2, int sums[2][4]); -void x265_pixel_ssim_4x4x2_core_sse2(const pixel * pix1, intptr_t stride1, - const pixel * pix2, intptr_t stride2, int sums[2][4]); -void x265_pixel_ssim_4x4x2_core_avx(const pixel * pix1, intptr_t stride1, - const pixel * pix2, intptr_t stride2, int sums[2][4]); +void x265_getResidual4_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void x265_getResidual8_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void x265_getResidual16_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void 
x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); + +void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose16_sse2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose32_sse2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose64_sse2(pixel* dest, const pixel* src, intptr_t stride); + +void x265_transpose8_avx2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose16_avx2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose32_avx2(pixel* dest, const pixel* src, intptr_t stride); +void x265_transpose64_avx2(pixel* dest, const pixel* src, intptr_t stride); + +uint32_t x265_quant_sse4(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff); +uint32_t x265_quant_avx2(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff); +uint32_t x265_nquant_sse4(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); +uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); +void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); +void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); +int x265_count_nonzero_ssse3(const int16_t* quantCoeff, int numCoeff); + +void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +void x265_weight_sp_sse4(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, 
int height, int w0, int round, int shift, int offset); + +void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t* pix1, intptr_t stride1, + const uint8_t* pix2, intptr_t stride2, int sums[2][4]); +void x265_pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1, + const pixel* pix2, intptr_t stride2, int sums[2][4]); +void x265_pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1, + const pixel* pix2, intptr_t stride2, int sums[2][4]); float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width); float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width); -void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t); -void x265_scale1D_128to64_avx2(pixel *, pixel *, intptr_t); -void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t); +void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t); +void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t); +void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t); #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \ - void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \ - void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1); + void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \ + void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1); #define CHROMA_PIXELSUB_DEF(cpu) \ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \ @@ -91,8 +84,8 @@ void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t); SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu); #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \ - void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t 
destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \ - void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1); + void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \ + void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1); #define LUMA_PIXELSUB_DEF(cpu) \ SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \ @@ -109,7 +102,7 @@ CHROMA_PIXELSUB_DEF_422(_sse4); CHROMA_PIXELSUB_DEF_422(_sse2); #define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \ - uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride); + uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(const pixel* pix, intptr_t pixstride); #define LUMA_PIXELVAR_DEF(cpu) \ SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \ diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm index 38fb52e..8adeb84 100644 --- a/source/common/x86/pixel-util8.asm +++ b/source/common/x86/pixel-util8.asm @@ -61,447 +61,6 @@ cextern pd_1 cextern pd_32767 cextern pd_n32768 -;----------------------------------------------------------------------------- -; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -%if HIGH_BIT_DEPTH -%if ARCH_X86_64 == 1 -cglobal calcRecons4, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons4, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 4/2 -.loop: - movh m0, [r0] - movh m1, [r0 + r4] - punpcklqdq m0, m1 - movh m2, [r1] - movh m3, [r1 + r4] - 
punpcklqdq m2, m3 - paddw m0, m2 - CLIPW m0, m4, m5 - - ; store recipred[] - movh [r3], m0 - movhps [r3 + r6], m0 - - ; store recqt[] - movh [r2], m0 - movhps [r2 + r5], m0 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH - -%if ARCH_X86_64 == 1 -cglobal calcRecons4, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons4, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 4/2 -.loop: - movd m1, [r0] - movd m2, [r0 + r4] - punpckldq m1, m2 - punpcklbw m1, m0 - movh m2, [r1] - movh m3, [r1 + r4 * 2] - punpcklqdq m2, m3 - paddw m1, m2 - packuswb m1, m1 - - ; store recon[] and recipred[] - movd [r3], m1 - pshufd m2, m1, 1 - movd [r3 + r6], m2 - - ; store recqt[] - punpcklbw m1, m0 - movh [r2], m1 - movhps [r2 + r5], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 4] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - - -INIT_XMM sse2 -%if ARCH_X86_64 == 1 -cglobal calcRecons8, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons8, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - -%if HIGH_BIT_DEPTH - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 8/2 -.loop: - movu m0, [r0] - movu m1, [r0 + r4] - movu m2, [r1] - movu m3, [r1 + r4] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recipred[] - movu [r3], m0 - movu [r3 + r6], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + r5], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 8/2 -.loop: - movh m1, [r0] - movh m2, [r0 + r4] - punpcklbw m1, m0 - punpcklbw m2, m0 - movu m3, [r1] - movu m4, [r1 + r4 
* 2] - paddw m1, m3 - paddw m2, m4 - packuswb m1, m2 - - ; store recon[] and recipred[] - movh [r3], m1 - movhps [r3 + r6], m1 - - ; store recqt[] - punpcklbw m2, m1, m0 - punpckhbw m1, m0 - movu [r2], m2 - movu [r2 + r5], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 4] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - - - -%if HIGH_BIT_DEPTH -INIT_XMM sse2 -%if ARCH_X86_64 == 1 -cglobal calcRecons16, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons16, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 16/2 -.loop: - movu m0, [r0] - movu m1, [r0 + 16] - movu m2, [r1] - movu m3, [r1 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recipred[] - movu [r3], m0 - movu [r3 + 16], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + 16], m1 - - movu m0, [r0 + r4] - movu m1, [r0 + r4 + 16] - movu m2, [r1 + r4] - movu m3, [r1 + r4 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + r6], m0 - movu [r3 + r6 + 16], m1 - - ; store recqt[] - movu [r2 + r5], m0 - movu [r2 + r5 + 16], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH - -INIT_XMM sse4 -%if ARCH_X86_64 == 1 -cglobal calcRecons16, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons16, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 16 -.loop: - movu m2, [r0] - pmovzxbw m1, m2 - punpckhbw m2, m0 - paddw m1, [r1] - paddw m2, [r1 + 16] - packuswb m1, m2 - - ; store recon[] and recipred[] - movu [r3], m1 - - ; store recqt[] - pmovzxbw m2, m1 - punpckhbw m1, m0 - movu [r2], m2 - movu [r2 + 16], m1 - - add r2, r5 - add r3, r6 - add r0, r4 - lea r1, [r1 + r4 * 2] - - 
dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - -%if HIGH_BIT_DEPTH -INIT_XMM sse2 -%if ARCH_X86_64 == 1 -cglobal calcRecons32, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons32, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 32/2 -.loop: - - movu m0, [r0] - movu m1, [r0 + 16] - movu m2, [r1] - movu m3, [r1 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recipred[] - movu [r3], m0 - movu [r3 + 16], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + 16], m1 - - movu m0, [r0 + 32] - movu m1, [r0 + 48] - movu m2, [r1 + 32] - movu m3, [r1 + 48] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + 32], m0 - movu [r3 + 48], m1 - - ; store recqt[] - movu [r2 + 32], m0 - movu [r2 + 48], m1 - add r2, r5 - - movu m0, [r0 + r4] - movu m1, [r0 + r4 + 16] - movu m2, [r1 + r4] - movu m3, [r1 + r4 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + r6], m0 - movu [r3 + r6 + 16], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + 16], m1 - - movu m0, [r0 + r4 + 32] - movu m1, [r0 + r4 + 48] - movu m2, [r1 + r4 + 32] - movu m3, [r1 + r4 + 48] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + r6 + 32], m0 - movu [r3 + r6 + 48], m1 - lea r3, [r3 + r6 * 2] - - ; store recqt[] - movu [r2 + 32], m0 - movu [r2 + 48], m1 - add r2, r5 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH -INIT_XMM sse4 -%if ARCH_X86_64 == 1 -cglobal calcRecons32, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons32, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 32 -.loop: - movu m2, [r0] - movu m4, [r0 + 16] - pmovzxbw m1, m2 - punpckhbw m2, m0 - 
pmovzxbw m3, m4 - punpckhbw m4, m0 - - paddw m1, [r1 + 0 * 16] - paddw m2, [r1 + 1 * 16] - packuswb m1, m2 - - paddw m3, [r1 + 2 * 16] - paddw m4, [r1 + 3 * 16] - packuswb m3, m4 - - ; store recon[] and recipred[] - movu [r3], m1 - movu [r3 + 16], m3 - - ; store recqt[] - pmovzxbw m2, m1 - punpckhbw m1, m0 - movu [r2 + 0 * 16], m2 - movu [r2 + 1 * 16], m1 - pmovzxbw m4, m3 - punpckhbw m3, m0 - movu [r2 + 2 * 16], m4 - movu [r2 + 3 * 16], m3 - - add r2, r5 - add r3, r6 - add r0, r4 - lea r1, [r1 + r4 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - ;----------------------------------------------------------------------------- ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) @@ -861,7 +420,7 @@ cglobal getResidual32, 4,5,7 ;----------------------------------------------------------------------------- -; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); +; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal quant, 5,6,8 @@ -883,7 +442,7 @@ cglobal quant, 5,6,8 pxor m7, m7 ; m7 = numZero .loop: ; 4 coeff - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -901,7 +460,7 @@ cglobal quant, 5,6,8 movh [r3], m3 ; 4 coeff - movu m0, [r0 + 16] ; m0 = level + pmovsxwd m0, [r0 + 8] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + 16] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -916,7 +475,7 @@ cglobal quant, 5,6,8 packssdw m3, m3 movh [r3 + 8], m3 - add r0, 32 + add r0, 16 add r1, 32 add r2, 32 add r3, 16 @@ -953,7 +512,7 @@ cglobal quant, 5,5,10 pxor m7, m7 ; m7 = numZero .loop: ; 8 coeff - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -966,7 +525,7 @@ cglobal 
quant, 5,5,10 psignd m2, m0 ; 8 coeff - movu m0, [r0 + mmsize] ; m0 = level + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 paddd m3, m1, m5 @@ -987,7 +546,7 @@ cglobal quant, 5,5,10 pminuw m2, m9 paddw m7, m2 - add r0, mmsize*2 + add r0, mmsize add r1, mmsize*2 add r2, mmsize*2 add r3, mmsize @@ -1025,7 +584,7 @@ cglobal quant, 5,6,8 pxor m7, m7 ; m7 = numZero .loop: ; 8 coeff - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -1044,7 +603,7 @@ cglobal quant, 5,6,8 movu [r3], xm3 ; 8 coeff - movu m0, [r0 + mmsize] ; m0 = level + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -1062,7 +621,7 @@ cglobal quant, 5,6,8 vpermq m3, m3, q0020 movu [r3 + mmsize/2], xm3 - add r0, mmsize*2 + add r0, mmsize add r1, mmsize*2 add r2, mmsize*2 add r3, mmsize @@ -1083,7 +642,7 @@ IACA_END ;----------------------------------------------------------------------------- -; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); +; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal nquant, 3,5,8 @@ -1096,8 +655,8 @@ cglobal nquant, 3,5,8 shr r4d, 3 .loop: - movu m0, [r0] ; m0 = level - movu m1, [r0 + 16] ; m1 = level + pmovsxwd m0, [r0] ; m0 = level + pmovsxwd m1, [r0 + 8] ; m1 = level pabsd m2, m0 pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff @@ -1114,7 +673,7 @@ cglobal nquant, 3,5,8 packssdw m2, m3 movu [r2], m2 - add r0, 32 + add r0, 16 add r1, 32 add r2, 16 @@ -1144,14 +703,14 @@ cglobal nquant, 3,5,7 shr r4d, 4 .loop: - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff paddd m1, m4 psrad m1, xm3 ; m0 = level1 psignd m1, m0 - movu 
m0, [r0 + mmsize] ; m0 = level + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m2, m0 pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff paddd m2, m4 @@ -1162,7 +721,7 @@ cglobal nquant, 3,5,7 vpermq m2, m1, q3120 movu [r2], m2 - add r0, mmsize * 2 + add r0, mmsize add r1, mmsize * 2 add r2, mmsize @@ -1211,15 +770,11 @@ cglobal dequant_normal, 5,5,5 pmaddwd m4, m1 psrad m3, m0 psrad m4, m0 - packssdw m3, m3 ; OPT_ME: store must be 32 bits - pmovsxwd m3, m3 - packssdw m4, m4 - pmovsxwd m4, m4 + packssdw m3, m4 mova [r1], m3 - mova [r1 + 16], m4 add r0, 16 - add r1, 32 + add r1, 16 sub r2d, 8 jnz .loop @@ -1259,13 +814,12 @@ cglobal dequant_normal, 5,5,7 pmaxsd m3, m6 pminsd m4, m5 pmaxsd m4, m6 + packssdw m3, m4 mova [r1 + 0 * mmsize/2], xm3 - mova [r1 + 1 * mmsize/2], xm4 - vextracti128 [r1 + 2 * mmsize/2], m3, 1 - vextracti128 [r1 + 3 * mmsize/2], m4, 1 + vextracti128 [r1 + 1 * mmsize/2], m3, 1 add r0, mmsize - add r1, mmsize * 2 + add r1, mmsize dec r2d jnz .loop diff --git a/source/common/x86/pixel.h b/source/common/x86/pixel.h index e99b1ee..cb85dbd 100644 --- a/source/common/x86/pixel.h +++ b/source/common/x86/pixel.h @@ -57,17 +57,17 @@ ret x265_pixel_ ## name ## _12x16_ ## suffix args; \ #define DECL_X1(name, suffix) \ - DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t)) + DECL_PIXELS(int, name, suffix, (const pixel*, intptr_t, const pixel*, intptr_t)) #define DECL_X1_SS(name, suffix) \ - DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t)) + DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const int16_t*, intptr_t)) #define DECL_X1_SP(name, suffix) \ - DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, pixel *, intptr_t)) + DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const pixel*, intptr_t)) #define DECL_X4(name, suffix) \ - DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \ - DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, 
pixel *, pixel *, intptr_t, int *)) + DECL_PIXELS(void, name ## _x3, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) \ + DECL_PIXELS(void, name ## _x4, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) /* sad-a.asm */ DECL_X1(sad, mmx2) @@ -103,11 +103,11 @@ DECL_X1(satd, sse4) DECL_X1(satd, avx) DECL_X1(satd, xop) DECL_X1(satd, avx2) -int x265_pixel_satd_8x32_sse2(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_satd_16x4_sse2(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_satd_16x12_sse2(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_16x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_16x64_sse2(const pixel*, intptr_t, const pixel*, intptr_t); DECL_X1(sa8d, mmx2) DECL_X1(sa8d, sse2) @@ -138,42 +138,42 @@ DECL_X1_SS(ssd_ss, xop) DECL_X1_SS(ssd_ss, avx2) DECL_X1_SP(ssd_sp, sse4) #define DECL_HEVC_SSD(suffix) \ - int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_16x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_32x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_32x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_16x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_32x24_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_24x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_32x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int 
x265_pixel_ssd_8x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_16x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_16x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_8x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_16x12_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_16x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_8x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ - int x265_pixel_ssd_8x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); + int x265_pixel_ssd_32x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_16x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_32x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_32x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_16x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_32x24_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_24x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_32x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_8x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_16x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_16x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_8x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_16x12_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_16x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_8x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \ + int x265_pixel_ssd_8x4_ ## suffix(const pixel*, 
intptr_t, const pixel*, intptr_t); DECL_HEVC_SSD(sse2) DECL_HEVC_SSD(ssse3) DECL_HEVC_SSD(avx) -int x265_pixel_ssd_12x16_sse4(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_ssd_24x32_sse4(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_ssd_48x64_sse4(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_ssd_64x16_sse4(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t); -int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_ssd_12x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_ssd_24x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_ssd_48x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_ssd_64x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_ssd_64x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_ssd_64x48_sse4(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_ssd_64x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t); -int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t); -int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t); -int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t); -int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t); -int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t); +int x265_pixel_ssd_s_4_sse2(const int16_t*, intptr_t); +int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t); +int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t); +int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t); +int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t); #define ADDAVG(func) \ - void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t); + void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); ADDAVG(addAvg_2x4) ADDAVG(addAvg_2x8) ADDAVG(addAvg_4x2); @@ -216,8 +216,8 @@ ADDAVG(addAvg_16x24) 
ADDAVG(addAvg_24x64) ADDAVG(addAvg_32x48) -void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); -void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift); +void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); +void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); #undef DECL_PIXELS #undef DECL_HEVC_SSD diff --git a/source/common/yuv.cpp b/source/common/yuv.cpp index fffc215..eedb5b2 100644 --- a/source/common/yuv.cpp +++ b/source/common/yuv.cpp @@ -43,21 +43,31 @@ bool Yuv::create(uint32_t size, int csp) m_hChromaShift = CHROMA_H_SHIFT(csp); m_vChromaShift = CHROMA_V_SHIFT(csp); - // set width and height m_size = size; - m_csize = size >> m_hChromaShift; m_part = partitionFromSizes(size, size); - size_t sizeL = size * size; - size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift); + if (csp == X265_CSP_I400) + { + CHECKED_MALLOC(m_buf[0], pixel, size * size + 8); + m_buf[1] = m_buf[2] = 0; + m_csize = MAX_INT; + return true; + } + else + { + m_csize = size >> m_hChromaShift; - X265_CHECK((sizeC & 15) == 0, "invalid size"); + size_t sizeL = size * size; + size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift); - // memory allocation (padded for SIMD reads) - CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8); - m_buf[1] = m_buf[0] + sizeL; - m_buf[2] = m_buf[0] + sizeL + sizeC; - return true; + X265_CHECK((sizeC & 15) == 0, "invalid size"); + + // memory allocation (padded for SIMD reads) + CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8); + m_buf[1] = m_buf[0] + sizeL; + m_buf[2] = m_buf[0] + sizeL + sizeC; + return true; + } fail: return false; @@ -71,7 +81,6 @@ void Yuv::destroy() void Yuv::copyToPicYuv(PicYuv& 
dstPic, uint32_t cuAddr, uint32_t absPartIdx) const { pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx); - primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size); pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx); @@ -82,29 +91,41 @@ void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) con void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx) { - /* We cheat with const_cast internally because the get methods are not capable of - * returning const buffers and the primitives are not const aware, but we know - * this function does not modify srcPic */ - PicYuv& srcPicSafe = const_cast(srcPic); - pixel* srcY = srcPicSafe.getLumaAddr(cuAddr, absPartIdx); - + const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx); primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride); - pixel* srcU = srcPicSafe.getCbAddr(cuAddr, absPartIdx); - pixel* srcV = srcPicSafe.getCrAddr(cuAddr, absPartIdx); - primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPicSafe.m_strideC); - primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPicSafe.m_strideC); + const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx); + const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx); + primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPic.m_strideC); + primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPic.m_strideC); } void Yuv::copyFromYuv(const Yuv& srcYuv) { - X265_CHECK(m_size <= srcYuv.m_size, "invalid size\n"); + X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n"); primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size); primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize); primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize); } +/* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */ +void 
Yuv::copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma) +{ + X265_CHECK(m_size == FENC_STRIDE && m_size >= srcYuv.m_size, "PU buffer size mismatch\n"); + + const pixel* srcY = srcYuv.m_buf[0] + getAddrOffset(absPartIdx, srcYuv.m_size); + primitives.luma_copy_pp[partEnum](m_buf[0], m_size, srcY, srcYuv.m_size); + + if (bChroma) + { + const pixel* srcU = srcYuv.m_buf[1] + srcYuv.getChromaAddrOffset(absPartIdx); + const pixel* srcV = srcYuv.m_buf[2] + srcYuv.getChromaAddrOffset(absPartIdx); + primitives.chroma[m_csp].copy_pp[partEnum](m_buf[1], m_csize, srcU, srcYuv.m_csize); + primitives.chroma[m_csp].copy_pp[partEnum](m_buf[2], m_csize, srcV, srcYuv.m_csize); + } +} + void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const { pixel* dstY = dstYuv.getLumaAddr(absPartIdx); @@ -120,7 +141,6 @@ void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const { pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size); pixel* dstY = dstYuv.m_buf[0]; - primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size); pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx); @@ -144,21 +164,19 @@ void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absP if (bLuma) { - int16_t* srcY0 = const_cast(srcYuv0).getLumaAddr(absPartIdx); - int16_t* srcY1 = const_cast(srcYuv1).getLumaAddr(absPartIdx); + const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx); + const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx); pixel* dstY = getLumaAddr(absPartIdx); - primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size); } if (bChroma) { - int16_t* srcU0 = const_cast(srcYuv0).getCbAddr(absPartIdx); - int16_t* srcV0 = const_cast(srcYuv0).getCrAddr(absPartIdx); - int16_t* srcU1 = const_cast(srcYuv1).getCbAddr(absPartIdx); - int16_t* srcV1 = const_cast(srcYuv1).getCrAddr(absPartIdx); + const int16_t* srcU0 = srcYuv0.getCbAddr(absPartIdx); + const int16_t* srcV0 = srcYuv0.getCrAddr(absPartIdx); 
+ const int16_t* srcU1 = srcYuv1.getCbAddr(absPartIdx); + const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx); pixel* dstU = getCbAddr(absPartIdx); pixel* dstV = getCrAddr(absPartIdx); - primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize); primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize); } @@ -168,7 +186,7 @@ void Yuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size { const pixel* src = getLumaAddr(absPartIdx); pixel* dst = dstYuv.getLumaAddr(absPartIdx); - primitives.square_copy_pp[log2Size - 2](dst, dstYuv.m_size, const_cast(src), m_size); + primitives.luma_copy_pp[log2Size - 2](dst, dstYuv.m_size, src, m_size); } void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const @@ -178,7 +196,6 @@ void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Si const pixel* srcV = getCrAddr(absPartIdx); pixel* dstU = dstYuv.getCbAddr(absPartIdx); pixel* dstV = dstYuv.getCrAddr(absPartIdx); - - primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, const_cast(srcU), m_csize); - primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, const_cast(srcV), m_csize); + primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, srcU, m_csize); + primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, srcV, m_csize); } diff --git a/source/common/yuv.h b/source/common/yuv.h index a02987c..5911c30 100644 --- a/source/common/yuv.h +++ b/source/common/yuv.h @@ -63,6 +63,9 @@ public: // Copy from same size YUV buffer void copyFromYuv(const Yuv& srcYuv); + // Copy portion of srcYuv into ME prediction buffer + void copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma); + // Copy Small YUV buffer to the part of other Big YUV buffer void copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const; diff --git a/source/encoder/CMakeLists.txt b/source/encoder/CMakeLists.txt index 
020364f..0e995ed 100644 --- a/source/encoder/CMakeLists.txt +++ b/source/encoder/CMakeLists.txt @@ -3,6 +3,9 @@ if(GCC) add_definitions(-Wno-uninitialized) endif() +if(MSVC) + add_definitions(/wd4701) # potentially uninitialized local variable 'foo' used +endif() add_library(encoder OBJECT ../x265.h analysis.cpp analysis.h diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index c62f5f0..8b8d103 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -33,8 +33,6 @@ #include "rdcost.h" #include "encoder.h" -#include "PPA/ppa.h" - using namespace x265; /* An explanation of rate distortion levels (--rd-level) @@ -61,9 +59,12 @@ using namespace x265; * * RDO selection between merge and skip * sa8d selection of best inter mode + * sa8d decisions include chroma residual cost * RDO selection between (merge/skip) / best inter mode / intra / split * * rd-level 4 enables RDOQuant + * chroma residual cost included in satd decisions, including subpel refine + * (as a result of --subme 3 being used by preset slow) * * rd-level 5,6 does RDO for each inter mode */ @@ -71,12 +72,15 @@ using namespace x265; Analysis::Analysis() { m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0; + m_reuseIntraDataCTU = NULL; + m_reuseInterDataCTU = NULL; } bool Analysis::create(ThreadLocalData *tld) { m_tld = tld; m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2; + m_bChromaSa8d = m_param->rdLevel >= 3; int csp = m_param->internalCsp; uint32_t cuSize = g_maxCUSize; @@ -116,7 +120,7 @@ void Analysis::destroy() } } -Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext) +Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext) { m_slice = ctu.m_slice; m_frame = &frame; @@ -124,27 +128,26 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG invalidateContexts(0); 
m_quant.setQPforQuant(ctu); m_rqt[0].cur.load(initialContext); - m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0); + m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0); uint32_t numPartition = ctu.m_numPartitions; + if (m_param->analysisMode) + { + m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData; + int numPredDir = m_slice->isInterP() ? 1 : 2; + m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir; + } + if (m_slice->m_sliceType == I_SLICE) { uint32_t zOrder = 0; - if (m_param->analysisMode == X265_ANALYSIS_LOAD) - compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder); - else + compressIntraCU(ctu, cuGeom, zOrder); + if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData) { - compressIntraCU(ctu, cuGeom, NULL, zOrder); - - if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData) - { - CUData *bestCU = &m_modeDepth[0].bestMode->cu; - memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition); - memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition); - memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition); - m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr; - m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc; - } + CUData *bestCU = &m_modeDepth[0].bestMode->cu; + memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition); + memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition); + memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition); } } else @@ -152,10 
+155,10 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG if (!m_param->rdLevel) { /* In RD Level 0/1, copy source pixels into the reconstructed block so - * they are available for intra predictions */ - m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0); - - compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1 + * they are available for intra predictions */ + m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0); + + compressInterCU_rd0_4(ctu, cuGeom); /* generate residual for entire CTU at once and copy to reconPic */ encodeResidue(ctu, cuGeom); @@ -178,7 +181,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom) if (!md.bestMode->distortion) /* already lossless */ return; - else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA) + else if (md.bestMode->cu.isIntra(0)) { md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0]; @@ -195,7 +198,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom) } } -void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder) +void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder) { uint32_t depth = cuGeom.depth; ModeDepth& md = m_modeDepth[depth]; @@ -204,20 +207,20 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); - if (shared) + if (m_param->analysisMode == X265_ANALYSIS_LOAD) { - uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; - char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; - uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + uint8_t* reuseDepth = 
&m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; - if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx) + if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx) { m_quant.setQPforQuant(parentCTU); - PartSize size = (PartSize)sharedPartSizes[zOrder]; + PartSize size = (PartSize)reusePartSizes[zOrder]; Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN]; mode.cu.initSubCU(parentCTU, cuGeom); - checkIntra(mode, cuGeom, size, sharedModes); + checkIntra(mode, cuGeom, size, &reuseModes[zOrder]); checkBestMode(mode, depth); if (m_bTryLossless) @@ -227,7 +230,7 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2 addSplitFlagCost(*md.bestMode, cuGeom.depth); // increment zOrder offset to point to next best depth in sharedDepth buffer - zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]]; + zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]]; mightSplit = false; } } @@ -267,23 +270,23 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - if (childCuData.flags & CUGeom::PRESENT) + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); m_rqt[nextDepth].cur.load(*nextContext); - compressIntraCU(parentCTU, childCuData, shared, zOrder); + compressIntraCU(parentCTU, childGeom, zOrder); // Save best CU and pred data for this sub CU - 
splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); splitPred->addSubCosts(*nd.bestMode); - nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx); nextContext = &nd.bestMode->contexts; } else { /* record the depth of this non-present sub-CU */ - splitCU->setEmptyPart(childCuData, subPartIdx); + splitCU->setEmptyPart(childGeom, subPartIdx); zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth]; } } @@ -300,38 +303,45 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2 /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT]) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx); } bool Analysis::findJob(int threadId) { /* try to acquire a CU mode to analyze */ + m_pmodeLock.acquire(); if (m_totalNumJobs > m_numAcquiredJobs) { - /* ATOMIC_INC returns the incremented value */ - int id = ATOMIC_INC(&m_numAcquiredJobs); - if (m_totalNumJobs >= id) - { - parallelModeAnalysis(threadId, id - 1); + int id = m_numAcquiredJobs++; + m_pmodeLock.release(); - if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs) - m_modeCompletionEvent.trigger(); - return true; - } + parallelModeAnalysis(threadId, id); + + m_pmodeLock.acquire(); + if (++m_numCompletedJobs == m_totalNumJobs) + m_modeCompletionEvent.trigger(); + m_pmodeLock.release(); + return true; } + else + m_pmodeLock.release(); + m_meLock.acquire(); if (m_totalNumME > m_numAcquiredME) { - int id = ATOMIC_INC(&m_numAcquiredME); - if (m_totalNumME >= id) - { - parallelME(threadId, id - 1); + int id = m_numAcquiredME++; + m_meLock.release(); - if (ATOMIC_INC(&m_numCompletedME) == 
m_totalNumME) - m_meCompletionEvent.trigger(); - return true; - } + parallelME(threadId, id); + + m_meLock.acquire(); + if (++m_numCompletedME == m_totalNumME) + m_meCompletionEvent.trigger(); + m_meLock.release(); + return true; } + else + m_meLock.release(); return false; } @@ -349,18 +359,14 @@ void Analysis::parallelME(int threadId, int meId) slave->m_slice = m_slice; slave->m_frame = m_frame; - PicYuv* fencPic = m_frame->m_origPicYuv; - pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx); - slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride); - slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight); - - slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart); + slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); + slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart); } if (meId < m_slice->m_numRefIdx[0]) - slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId); + slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId); else - slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]); + slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]); } void Analysis::parallelModeAnalysis(int threadId, int jobId) @@ -376,8 +382,6 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId) slave->m_frame = m_frame; slave->setQP(*m_slice, m_rdCost.m_qp); slave->invalidateContexts(0); - if (jobId) - slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride); } ModeDepth& md = m_modeDepth[m_curGeom->depth]; @@ -389,13 +393,15 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId) case 0: if (slave != this) slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); - 
slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom); + slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom); if (m_param->rdLevel > 2) slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom); break; case 1: slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N); + if (m_slice->m_sliceType == B_SLICE) + slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom); break; case 2: @@ -446,6 +452,13 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId) case 1: slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false); + md.pred[PRED_BIDIR].rdCost = MAX_INT64; + if (m_slice->m_sliceType == B_SLICE) + { + slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom); + if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64) + slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom); + } break; case 2: @@ -499,6 +512,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo /* Initialize all prediction CUs based on parentCTU */ md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); if (m_param->bEnableRectInter) @@ -520,12 +534,14 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); } + m_pmodeLock.acquire(); m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4; m_numAcquiredJobs = !bTryIntra; m_numCompletedJobs = m_numAcquiredJobs; m_curGeom = &cuGeom; m_bJobsQueued = true; JobProvider::enqueue(); + m_pmodeLock.release(); for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++) m_pool->pokeIdleThread(); @@ -572,17 +588,26 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo if (m_param->rdLevel > 2) { - /* encode best inter */ - for (uint32_t puIdx = 0; puIdx < 
bestInter->cu.getNumPartInter(); puIdx++) + /* RD selection between merge, inter, bidir and intra */ + if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */ { - prepMotionCompensation(bestInter->cu, cuGeom, puIdx); - motionCompensation(bestInter->predYuv, false, true); + for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) + { + prepMotionCompensation(bestInter->cu, cuGeom, puIdx); + motionCompensation(bestInter->predYuv, false, true); + } } encodeResAndCalcRdInterCU(*bestInter, cuGeom); - - /* RD selection between merge, inter and intra */ checkBestMode(*bestInter, depth); + /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */ + if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 && + md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17) + { + encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom); + checkBestMode(md.pred[PRED_BIDIR], depth); + } + if (bTryIntra) checkBestMode(md.pred[PRED_INTRA], depth); } @@ -591,6 +616,9 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost) md.bestMode = bestInter; + if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost) + md.bestMode = &md.pred[PRED_BIDIR]; + if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost) { md.bestMode = &md.pred[PRED_INTRA]; @@ -614,6 +642,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo m_modeCompletionEvent.wait(); checkBestMode(md.pred[PRED_2Nx2N], depth); + checkBestMode(md.pred[PRED_BIDIR], depth); if (m_param->bEnableRectInter) { @@ -640,7 +669,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra) { md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); + 
checkIntraInInter(md.pred[PRED_INTRA], cuGeom); encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); checkBestMode(md.pred[PRED_INTRA], depth); } @@ -655,7 +684,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo bool bNoSplit = false; if (md.bestMode) { - bNoSplit = !!md.bestMode->cu.isSkipped(0); + bNoSplit = md.bestMode->cu.isSkipped(0); if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4) bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode); } @@ -674,22 +703,22 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - if (childCuData.flags & CUGeom::PRESENT) + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); m_rqt[nextDepth].cur.load(*nextContext); - compressInterCU_dist(parentCTU, childCuData); + compressInterCU_dist(parentCTU, childGeom); // Save best CU and pred data for this sub CU - splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); splitPred->addSubCosts(*nd.bestMode); - nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx); nextContext = &nd.bestMode->contexts; } else - splitCU->setEmptyPart(childCuData, subPartIdx); + splitCU->setEmptyPart(childGeom, subPartIdx); } nextContext->store(splitPred->contexts); @@ -701,10 +730,10 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo checkBestMode(*splitPred, depth); } - if (!depth || md.bestMode->cu.m_predMode[0] != 
MODE_INTRA) + if (mightNotSplit) { /* early-out statistics */ - FrameData& curEncData = const_cast(*m_frame->m_encData); + FrameData& curEncData = *m_frame->m_encData; FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth]; cuStat.count[depth] += 1; @@ -716,7 +745,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT]) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx); } void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom) @@ -734,24 +763,9 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe { bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames; - /* Initialize all prediction CUs based on parentCTU */ - md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); + /* Compute Merge Cost */ md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); - if (m_param->bEnableRectInter) - { - md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); - } - if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6) - { - md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); - } - - /* Compute Merge Cost */ checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); bool earlyskip = false; @@ -760,14 +774,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe if (!earlyskip) { + md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); 
checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N); - Mode *bestInter = &md.pred[PRED_2Nx2N]; + if (m_slice->m_sliceType == B_SLICE) + { + md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom); + checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom); + } + + Mode *bestInter = &md.pred[PRED_2Nx2N]; if (m_param->bEnableRectInter) { + md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N); if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost) bestInter = &md.pred[PRED_Nx2N]; + + md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN); if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost) bestInter = &md.pred[PRED_2NxN]; @@ -789,18 +813,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe if (bHor) { + md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU); if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost) bestInter = &md.pred[PRED_2NxnU]; + + md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD); if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost) bestInter = &md.pred[PRED_2NxnD]; } if (bVer) { + md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N); if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost) bestInter = &md.pred[PRED_nLx2N]; + + md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N); if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost) bestInter = &md.pred[PRED_nRx2N]; @@ -810,37 +840,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe if (m_param->rdLevel >= 3) { /* Calculate RD cost of best inter option */ - for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) + if (!m_bChromaSa8d) /* When m_bChromaSa8d is 
enabled, chroma MC has already been done */ { - prepMotionCompensation(bestInter->cu, cuGeom, puIdx); - motionCompensation(bestInter->predYuv, false, true); + for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) + { + prepMotionCompensation(bestInter->cu, cuGeom, puIdx); + motionCompensation(bestInter->predYuv, false, true); + } } - encodeResAndCalcRdInterCU(*bestInter, cuGeom); + checkBestMode(*bestInter, depth); - if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost) - md.bestMode = bestInter; + /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */ + if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 && + md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17) + { + encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom); + checkBestMode(md.pred[PRED_BIDIR], depth); + } if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) || md.bestMode->sa8dCost == MAX_INT64) { md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); + checkIntraInInter(md.pred[PRED_INTRA], cuGeom); encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); - if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost) - md.bestMode = &md.pred[PRED_INTRA]; + checkBestMode(md.pred[PRED_INTRA], depth); } } else { - /* SA8D choice between merge/skip, inter, and intra */ + /* SA8D choice between merge/skip, inter, bidir, and intra */ if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost) md.bestMode = bestInter; + if (m_slice->m_sliceType == B_SLICE && + md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost) + md.bestMode = &md.pred[PRED_BIDIR]; + if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64) { md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); + checkIntraInInter(md.pred[PRED_INTRA], cuGeom); if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost) md.bestMode = &md.pred[PRED_INTRA]; } @@ -854,7 +895,7 
@@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe /* prediction already generated for this CU, and if rd level * is not 0, it is already fully encoded */ } - else if (md.bestMode->cu.m_predMode[0] == MODE_INTER) + else if (md.bestMode->cu.isInter(0)) { for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++) { @@ -865,8 +906,23 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); else if (m_param->rdLevel == 1) { - m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize); - generateCoeffRecon(*md.bestMode, cuGeom); + /* generate recon pixels with no rate distortion considerations */ + CUData& cu = md.bestMode->cu; + m_quant.setQPforQuant(cu); + + uint32_t tuDepthRange[2]; + cu.getInterTUQtDepthRange(tuDepthRange, 0); + + m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize); + residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange); + if (cu.getQtRootCbf(0)) + md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]); + else + { + md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv); + if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) + cu.setPredModeSubParts(MODE_SKIP); + } } } else @@ -874,7 +930,20 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe if (m_param->rdLevel == 2) encodeIntraInInter(*md.bestMode, cuGeom); else if (m_param->rdLevel == 1) - generateCoeffRecon(*md.bestMode, cuGeom); + { + /* generate recon pixels with no rate distortion considerations */ + CUData& cu = md.bestMode->cu; + m_quant.setQPforQuant(cu); + + uint32_t tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; + residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, 
tuDepthRange); + getBestIntraModeChroma(*md.bestMode, cuGeom); + residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0); + md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO: + } } } } // !earlyskip @@ -889,7 +958,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe bool bNoSplit = false; if (md.bestMode) { - bNoSplit = !!md.bestMode->cu.isSkipped(0); + bNoSplit = md.bestMode->cu.isSkipped(0); if (mightSplit && depth && depth >= minDepth && !bNoSplit) bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode); } @@ -908,54 +977,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - if (childCuData.flags & CUGeom::PRESENT) + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); m_rqt[nextDepth].cur.load(*nextContext); - compressInterCU_rd0_4(parentCTU, childCuData); + compressInterCU_rd0_4(parentCTU, childGeom); // Save best CU and pred data for this sub CU - splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); splitPred->addSubCosts(*nd.bestMode); if (m_param->rdLevel) - nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx); else - nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx); + nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx); if (m_param->rdLevel > 1) nextContext = &nd.bestMode->contexts; } else - 
splitCU->setEmptyPart(childCuData, subPartIdx); + splitCU->setEmptyPart(childGeom, subPartIdx); } nextContext->store(splitPred->contexts); if (mightNotSplit) addSplitFlagCost(*splitPred, cuGeom.depth); - else if (m_param->rdLevel <= 1) - splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits); - else + else if (m_param->rdLevel > 1) updateModeCost(*splitPred); + else + splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits); if (!md.bestMode) md.bestMode = splitPred; - else if (m_param->rdLevel >= 1) - { - if (splitPred->rdCost < md.bestMode->rdCost) - md.bestMode = splitPred; - } - else - { - if (splitPred->sa8dCost < md.bestMode->sa8dCost) - md.bestMode = splitPred; - } + else if (m_param->rdLevel > 1) + checkBestMode(*splitPred, cuGeom.depth); + else if (splitPred->sa8dCost < md.bestMode->sa8dCost) + md.bestMode = splitPred; } - if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA) + if (mightNotSplit) { /* early-out statistics */ - FrameData& curEncData = const_cast(*m_frame->m_encData); + FrameData& curEncData = *m_frame->m_encData; FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth]; cuStat.count[depth] += 1; @@ -967,7 +1030,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx); } void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom) @@ -981,27 +1044,39 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe if (mightNotSplit) { - for (int i = 0; i < MAX_PRED_TYPES; i++) - md.pred[i].cu.initSubCU(parentCTU, 
cuGeom); - + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); if (!earlySkip) { + md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false); checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); + if (m_slice->m_sliceType == B_SLICE) + { + md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom); + checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom); + if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64) + { + encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom); + checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth); + } + } + if (m_param->bEnableRectInter) { - // Nx2N rect if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) { + md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false); checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth); } if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) { + md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false); checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); } @@ -1027,11 +1102,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe { if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) { + md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly); checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth); } if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) { + md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly); checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); } @@ -1040,11 +1117,13 @@ void 
Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe { if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) { + md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly); checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth); } if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) { + md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly); checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); } @@ -1054,11 +1133,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))) { + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); checkBestMode(md.pred[PRED_INTRA], depth); if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) { + md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL); checkBestMode(md.pred[PRED_INTRA_NxN], depth); } @@ -1087,21 +1168,21 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - if (childCuData.flags & CUGeom::PRESENT) + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); m_rqt[nextDepth].cur.load(*nextContext); - compressInterCU_rd5_6(parentCTU, childCuData); + compressInterCU_rd5_6(parentCTU, childGeom); // Save best CU and pred data for this sub CU - 
splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); splitPred->addSubCosts(*nd.bestMode); - nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx); nextContext = &nd.bestMode->contexts; } else - splitCU->setEmptyPart(childCuData, subPartIdx); + splitCU->setEmptyPart(childGeom, subPartIdx); } nextContext->store(splitPred->contexts); if (mightNotSplit) @@ -1117,7 +1198,7 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT]) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx); } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ @@ -1148,7 +1229,12 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe bestPred->sa8dCost = MAX_INT64; int bestSadCand = -1; - int sizeIdx = cuGeom.log2CUSize - 2; + int cpart, sizeIdx = cuGeom.log2CUSize - 2; + if (m_bChromaSa8d) + { + int cuSize = 1 << cuGeom.log2CUSize; + cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); + } for (uint32_t i = 0; i < maxNumMergeCand; ++i) { if (m_bFrameParallel && @@ -1159,16 +1245,20 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx tempPred->cu.m_interDir[0] = interDirNeighbours[i]; tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; - tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; + tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx; 
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; - tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; + tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx; - // do MC only for Luma part prepMotionCompensation(tempPred->cu, cuGeom, 0); - motionCompensation(tempPred->predYuv, true, false); + motionCompensation(tempPred->predYuv, true, m_bChromaSa8d); tempPred->sa8dBits = getTUBits(i, maxNumMergeCand); tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size); + if (m_bChromaSa8d) + { + tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize); + tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize); + } tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits); if (tempPred->sa8dCost < bestPred->sa8dCost) @@ -1183,8 +1273,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe return; /* calculate the motion compensation for chroma for the best mode selected */ - prepMotionCompensation(bestPred->cu, cuGeom, 0); - motionCompensation(bestPred->predYuv, false, true); + if (!m_bChromaSa8d) /* Chroma MC was done above */ + { + prepMotionCompensation(bestPred->cu, cuGeom, 0); + motionCompensation(bestPred->predYuv, false, true); + } if (m_param->rdLevel) { @@ -1197,9 +1290,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand; tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); - tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); + tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); 
tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); - tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); + tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); tempPred->sa8dCost = bestPred->sa8dCost; tempPred->predYuv.copyFromYuv(bestPred->predYuv); @@ -1213,9 +1306,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe /* broadcast sets of MV field data */ bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); - bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); + bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); - bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); + bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ @@ -1269,10 +1362,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */ tempPred->cu.m_interDir[0] = interDirNeighbours[i]; tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; - tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; + tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx; tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; - tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; - tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */ + tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx; + tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */ prepMotionCompensation(tempPred->cu, cuGeom, 0); 
motionCompensation(tempPred->predYuv, true, true); @@ -1302,10 +1395,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; tempPred->cu.m_interDir[0] = interDirNeighbours[i]; tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; - tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; + tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx; tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; - tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; - tempPred->cu.setSkipFlagSubParts(false); + tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx; + tempPred->cu.setPredModeSubParts(MODE_INTER); tempPred->predYuv.copyFromYuv(bestPred->predYuv); } @@ -1324,9 +1417,9 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0]; bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0); bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0); - bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0); + bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0); bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0); - bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0); + bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0); } } @@ -1335,14 +1428,48 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize interMode.initCosts(); interMode.cu.setPartSizeSubParts(partSize); interMode.cu.setPredModeSubParts(MODE_INTER); + int numPredDir = m_slice->isInterP() ? 
1 : 2; - if (predInterSearch(interMode, cuGeom, false, false)) + if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU) + { + for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++) + { + MotionData* bestME = interMode.bestME[part]; + for (int32_t i = 0; i < numPredDir; i++) + { + bestME[i].ref = m_reuseInterDataCTU->ref; + m_reuseInterDataCTU++; + } + } + } + if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d)) { /* predInterSearch sets interMode.sa8dBits */ const Yuv& fencYuv = *interMode.fencYuv; Yuv& predYuv = interMode.predYuv; - interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size); + int part = partitionFromLog2Size(cuGeom.log2CUSize); + interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size); + if (m_bChromaSa8d) + { + uint32_t cuSize = 1 << cuGeom.log2CUSize; + int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); + interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize); + interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize); + } interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits); + + if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU) + { + for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++) + { + MotionData* bestME = interMode.bestME[puIdx]; + for (int32_t i = 0; i < numPredDir; i++) + { + m_reuseInterDataCTU->ref = bestME[i].ref; + m_reuseInterDataCTU++; + } + } + } } else { @@ -1356,11 +1483,37 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize interMode.initCosts(); interMode.cu.setPartSizeSubParts(partSize); interMode.cu.setPredModeSubParts(MODE_INTER); + int numPredDir = m_slice->isInterP() ? 
1 : 2; + if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU) + { + for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++) + { + MotionData* bestME = interMode.bestME[puIdx]; + for (int32_t i = 0; i < numPredDir; i++) + { + bestME[i].ref = m_reuseInterDataCTU->ref; + m_reuseInterDataCTU++; + } + } + } if (predInterSearch(interMode, cuGeom, bMergeOnly, true)) { /* predInterSearch sets interMode.sa8dBits, but this is ignored */ encodeResAndCalcRdInterCU(interMode, cuGeom); + + if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU) + { + for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++) + { + MotionData* bestME = interMode.bestME[puIdx]; + for (int32_t i = 0; i < numPredDir; i++) + { + m_reuseInterDataCTU->ref = bestME[i].ref; + m_reuseInterDataCTU++; + } + } + } } else { @@ -1369,221 +1522,151 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize } } -/* Note that this function does not save the best intra prediction, it must - * be generated later. 
It records the best mode in the cu */ -void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom) +void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom) { - CUData& cu = intraMode.cu; - uint32_t depth = cu.m_cuDepth[0]; + CUData& cu = bidir2Nx2N.cu; - cu.setPartSizeSubParts(SIZE_2Nx2N); - cu.setPredModeSubParts(MODE_INTRA); - - uint32_t initTrDepth = 0; - uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; - uint32_t tuSize = 1 << log2TrSize; - const uint32_t absPartIdx = 0; - - // Reference sample smoothing - initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX); - - pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0]; - uint32_t stride = m_modeDepth[depth].fencYuv.m_size; - - pixel *above = m_refAbove + tuSize - 1; - pixel *aboveFiltered = m_refAboveFlt + tuSize - 1; - pixel *left = m_refLeft + tuSize - 1; - pixel *leftFiltered = m_refLeftFlt + tuSize - 1; - int sad, bsad; - uint32_t bits, bbits, mode, bmode; - uint64_t cost, bcost; - - // 33 Angle modes once - ALIGN_VAR_32(pixel, bufScale[32 * 32]); - ALIGN_VAR_32(pixel, bufTrans[32 * 32]); - ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); - int scaleTuSize = tuSize; - int scaleStride = stride; - int costShift = 0; - int sizeIdx = log2TrSize - 2; - - if (tuSize > 32) + if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT) { - // origin is 64x64, we scale to 32x32 and setup required parameters - primitives.scale2D_64to32(bufScale, fenc, stride); - fenc = bufScale; - - // reserve space in case primitives need to store data in above - // or left buffers - pixel _above[4 * 32 + 1]; - pixel _left[4 * 32 + 1]; - pixel *aboveScale = _above + 2 * 32; - pixel *leftScale = _left + 2 * 32; - aboveScale[0] = leftScale[0] = above[0]; - primitives.scale1D_128to64(aboveScale + 1, above + 1, 0); - primitives.scale1D_128to64(leftScale + 1, left + 1, 0); - - scaleTuSize = 32; - scaleStride = 32; - costShift = 2; 
- sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 - - // Filtered and Unfiltered refAbove and refLeft pointing to above and left. - above = aboveScale; - left = leftScale; - aboveFiltered = aboveScale; - leftFiltered = leftScale; + bidir2Nx2N.sa8dCost = MAX_INT64; + bidir2Nx2N.rdCost = MAX_INT64; + return; } - pixelcmp_t sa8d = primitives.sa8d[sizeIdx]; - int predsize = scaleTuSize * scaleTuSize; - - m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + const Yuv& fencYuv = *bidir2Nx2N.fencYuv; + MV mvzero(0, 0); + int cpart, partEnum = cuGeom.log2CUSize - 2; - /* there are three cost tiers for intra modes: - * pred[0] - mode probable, least cost - * pred[1], pred[2] - less probable, slightly more cost - * non-mpm modes - all cost the same (rbits) */ - uint64_t mpms; - uint32_t preds[3]; - uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); - - // DC - primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16)); - bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; - bmode = mode = DC_IDX; - bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; - bcost = m_rdCost.calcRdSADCost(bsad, bbits); - - pixel *abovePlanar = above; - pixel *leftPlanar = left; - - if (tuSize & (8 | 16 | 32)) + if (m_bChromaSa8d) { - abovePlanar = aboveFiltered; - leftPlanar = leftFiltered; + int cuSize = 1 << cuGeom.log2CUSize; + cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); } - // PLANAR - primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0); - sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; - mode = PLANAR_IDX; - bits = (mpms & ((uint64_t)1 << mode)) ? 
m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; - cost = m_rdCost.calcRdSADCost(sad, bits); - COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); - - // Transpose NxN - primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride); - - primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16)); - - bool modeHor; - pixel *cmp; - intptr_t srcStride; - -#define TRY_ANGLE(angle) \ - modeHor = angle < 18; \ - cmp = modeHor ? bufTrans : fenc; \ - srcStride = modeHor ? scaleTuSize : scaleStride; \ - sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ - bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ - cost = m_rdCost.calcRdSADCost(sad, bits) + bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0]; + bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1]; + MotionData* bestME = bidir2Nx2N.bestME[0]; + int ref0 = bestME[0].ref; + MV mvp0 = bestME[0].mvp; + int mvpIdx0 = bestME[0].mvpIdx; + int ref1 = bestME[1].ref; + MV mvp1 = bestME[1].mvp; + int mvpIdx1 = bestME[1].mvpIdx; + + bidir2Nx2N.initCosts(); + cu.setPartSizeSubParts(SIZE_2Nx2N); + cu.setPredModeSubParts(MODE_INTER); + cu.setPUInterDir(3, 0, 0); + cu.setPURefIdx(0, (int8_t)ref0, 0, 0); + cu.setPURefIdx(1, (int8_t)ref1, 0, 0); + cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0; + cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1; + cu.m_mergeFlag[0] = 0; + + /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */ + cu.setPUMv(0, bestME[0].mv, 0, 0); + cu.m_mvd[0][0] = bestME[0].mv - mvp0; + + cu.setPUMv(1, bestME[1].mv, 0, 0); + cu.m_mvd[1][0] = bestME[1].mv - mvp1; + + prepMotionCompensation(cu, cuGeom, 0); + motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d); + + int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size); + if (m_bChromaSa8d) + { + /* Add in chroma distortion */ + sa8d += 
primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize); + sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize); + } + bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); + bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits); - if (m_param->bEnableFastIntra) + bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero(); + if (bTryZero) + { + /* Do not try zero MV if unidir motion predictors are beyond + * valid search area */ + MV mvmin, mvmax; + int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight); + setSearchRange(cu, mvzero, merange, mvmin, mvmax); + mvmax.y += 2; // there is some pad for subpel refine + mvmin <<= 2; + mvmax <<= 2; + + bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax); + bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax); + } + if (bTryZero) { - int asad = 0; - uint32_t lowmode, highmode, amode = 5, abits = 0; - uint64_t acost = MAX_INT64; + /* Estimate cost of BIDIR using coincident blocks */ + Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; - /* pick the best angle, sampling at distance of 5 */ - for (mode = 5; mode < 35; mode += 5) - { - TRY_ANGLE(mode); - COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); - } + int zsa8d; - /* refine best angle at distance 2, then distance 1 */ - for (uint32_t dist = 2; dist >= 1; dist--) + if (m_bChromaSa8d) { - lowmode = amode - dist; - highmode = amode + dist; + cu.m_mv[0][0] = mvzero; + cu.m_mv[1][0] = mvzero; - X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); - TRY_ANGLE(lowmode); - COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); + prepMotionCompensation(cu, cuGeom, 0); + motionCompensation(tmpPredYuv, true, true); - X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); - 
TRY_ANGLE(highmode); - COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); + zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize); + zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize); } - - if (amode == 33) + else { - TRY_ANGLE(34); - COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); - } + pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx); + pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx); + intptr_t refStride = m_slice->m_mref[0][0].lumaStride; - COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); - } - else // calculate and search all intra prediction angles for lowest cost - { - for (mode = 2; mode < 35; mode++) - { - TRY_ANGLE(mode); - COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32); + zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size); } - } - cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth); - intraMode.initCosts(); - intraMode.totalBits = bbits; - intraMode.distortion = bsad; - intraMode.sa8dCost = bcost; - intraMode.sa8dBits = bbits; -} + uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); + uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); + uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); -void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) -{ - CUData& cu = intraMode.cu; - Yuv* reconYuv = &intraMode.reconYuv; - Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv; + /* 
refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */ + checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost); + checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost); - X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); - X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); + uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); + zcost = zsa8d + m_rdCost.getCost(zbits); - m_quant.setQPforQuant(cu); - - uint32_t tuDepthRange[2]; - cu.getIntraTUQtDepthRange(tuDepthRange, 0); - - m_entropyCoder.load(m_rqt[cuGeom.depth].cur); - - Cost icosts; - codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); - extractIntraResultQT(cu, *reconYuv, 0, 0); - - intraMode.distortion = icosts.distortion; - intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); - - m_entropyCoder.resetBits(); - if (m_slice->m_pps->bTransquantBypassEnabled) - m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); - m_entropyCoder.codeSkipFlag(cu, 0); - m_entropyCoder.codePredMode(cu.m_predMode[0]); - m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); - m_entropyCoder.codePredInfo(cu, 0); - intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits(); + if (zcost < bidir2Nx2N.sa8dCost) + { + bidir2Nx2N.sa8dBits = zbits; + bidir2Nx2N.sa8dCost = zcost; - bool bCodeDQP = m_slice->m_pps->bUseDQP; - m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange); + cu.setPUMv(0, mvzero, 0, 0); + cu.m_mvd[0][0] = mvzero - mvp0; + cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0; - intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); - intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; - if (m_rdCost.m_psyRd) - intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + cu.setPUMv(1, mvzero, 0, 0); + 
cu.m_mvd[1][0] = mvzero - mvp1; + cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1; - m_entropyCoder.store(intraMode.contexts); - updateModeCost(intraMode); + if (m_bChromaSa8d) + /* real MC was already performed */ + bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv); + else + { + prepMotionCompensation(cu, cuGeom, 0); + motionCompensation(bidir2Nx2N.predYuv, true, true); + } + } + else if (m_bChromaSa8d) + { + /* recover overwritten motion vectors */ + cu.m_mv[0][0] = bestME[0].mv; + cu.m_mv[1][0] = bestME[1].mv; + } + } } void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) @@ -1592,9 +1675,9 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) { for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - if (childCuData.flags & CUGeom::PRESENT) - encodeResidue(ctu, childCuData); + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childGeom.flags & CUGeom::PRESENT) + encodeResidue(ctu, childGeom); } return; } @@ -1602,29 +1685,31 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) uint32_t absPartIdx = cuGeom.encodeIdx; int sizeIdx = cuGeom.log2CUSize - 2; - Yuv& fencYuv = m_modeDepth[0].fencYuv; - /* reuse the bestMode data structures at the current depth */ Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode; - Yuv& reconYuv = bestMode->reconYuv; CUData& cu = bestMode->cu; cu.copyFromPic(ctu, cuGeom); m_quant.setQPforQuant(cu); - if (cu.m_predMode[0] == MODE_INTRA) + Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv; + if (cuGeom.depth) + m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx); + X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n"); + + if (cu.isIntra(0)) { uint32_t tuDepthRange[2]; cu.getIntraTUQtDepthRange(tuDepthRange, 0); - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN; - residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange); + uint32_t initTuDepth = 
cu.m_partSize[0] != SIZE_2Nx2N; + residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange); getBestIntraModeChroma(*bestMode, cuGeom); residualQTIntraChroma(*bestMode, cuGeom, 0, 0); } - else if (cu.m_predMode[0] == MODE_INTER) + else // if (cu.isInter(0)) { - X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n"); + X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n"); /* Calculate residual for current CU part into depth sized resiYuv */ @@ -1637,16 +1722,16 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) pixel* predV = predYuv.getCrAddr(absPartIdx); primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size, - fencYuv.getLumaAddr(absPartIdx), predY, + fencYuv.m_buf[0], predY, fencYuv.m_size, predYuv.m_size); primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize, - fencYuv.getCbAddr(absPartIdx), predU, - fencYuv.m_csize, predYuv.m_csize); + fencYuv.m_buf[1], predU, + fencYuv.m_csize, predYuv.m_csize); primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize, - fencYuv.getCrAddr(absPartIdx), predV, - fencYuv.m_csize, predYuv.m_csize); + fencYuv.m_buf[2], predV, + fencYuv.m_csize, predYuv.m_csize); uint32_t tuDepthRange[2]; cu.getInterTUQtDepthRange(tuDepthRange, 0); @@ -1654,57 +1739,38 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange); if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) - cu.setSkipFlagSubParts(true); + cu.setPredModeSubParts(MODE_SKIP); - PicYuv& reconPicYuv = *m_frame->m_reconPicYuv; - if (cu.getQtRootCbf(0)) // TODO: split to each component - { - /* residualTransformQuantInter() wrote transformed residual back into - * resiYuv. Generate the recon pixels by adding it to the prediction */ + /* residualTransformQuantInter() wrote transformed residual back into + * resiYuv. 
Generate the recon pixels by adding it to the prediction */ - primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size, + PicYuv& reconPic = *m_frame->m_reconPic; + if (cu.m_cbf[0][0]) + primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size); - primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize, - predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize); - primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize, - predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize); - - /* copy the reconstructed part to the recon pic for later intra - * predictions */ - reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx); - } else - { - /* copy the prediction pixels to the recon pic for later intra - * predictions */ - - primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride, + primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, predYuv.m_size); - primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC, + + if (cu.m_cbf[1][0]) + primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, + predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize); + else + primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, predYuv.m_csize); - primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC, + + if (cu.m_cbf[2][0]) + primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, + predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize); + else + primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), 
reconPic.m_strideC, predV, predYuv.m_csize); - } } - /* else if (cu.m_predMode[0] == MODE_NONE) {} */ checkDQP(cu, cuGeom); cu.updatePic(cuGeom.depth); } -/* check whether current try is the best with identifying the depth of current try */ -void Analysis::checkBestMode(Mode& mode, uint32_t depth) -{ - ModeDepth& md = m_modeDepth[depth]; - if (md.bestMode) - { - if (mode.rdCost < md.bestMode->rdCost) - md.bestMode = &mode; - } - else - md.bestMode = &mode; -} - void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth) { if (m_param->rdLevel >= 3) @@ -1817,7 +1883,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom * each quantity */ uint32_t depth = cuGeom.depth; - FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData); + FrameData& curEncData = *m_frame->m_encData; FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth]; uint64_t cuCount = cuStat.count[depth]; @@ -1855,7 +1921,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom } // give 60% weight to all CU's and 40% weight to neighbour CU's - if (neighCost + cuCount) + if (neighCount + cuCount) { uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount)); uint64_t curCost = m_param->rdLevel > 1 ? 
bestMode.rdCost : bestMode.sa8dCost; diff --git a/source/encoder/analysis.h b/source/encoder/analysis.h index 404cc90..aff7d66 100644 --- a/source/encoder/analysis.h +++ b/source/encoder/analysis.h @@ -49,6 +49,7 @@ public: PRED_SKIP, PRED_INTRA, PRED_2Nx2N, + PRED_BIDIR, PRED_Nx2N, PRED_2NxN, PRED_SPLIT, @@ -71,11 +72,15 @@ public: ModeDepth m_modeDepth[NUM_CU_DEPTH]; bool m_bTryLossless; + bool m_bChromaSa8d; + /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */ + analysis_intra_data* m_reuseIntraDataCTU; + analysis_inter_data* m_reuseInterDataCTU; Analysis(); bool create(ThreadLocalData* tld); void destroy(); - Search::Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext); + Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext); protected: @@ -83,13 +88,14 @@ protected: int m_totalNumJobs; volatile int m_numAcquiredJobs; volatile int m_numCompletedJobs; + Lock m_pmodeLock; Event m_modeCompletionEvent; bool findJob(int threadId); void parallelModeAnalysis(int threadId, int jobId); void parallelME(int threadId, int meId); /* full analysis for an I-slice CU */ - void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* sdata, uint32_t &zOrder); + void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder); /* full analysis for a P or B slice CU */ void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom); @@ -104,20 +110,36 @@ protected: void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize); void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly); - /* measure intra options */ - void checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom); - void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom); + void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const 
CUGeom& cuGeom); /* encode current bestMode losslessly, pick best RD cost */ void tryLossless(const CUGeom& cuGeom); - void checkDQP(CUData& cu, const CUGeom& cuGeom); + /* add the RD cost of coding a split flag (0 or 1) to the given mode */ void addSplitFlagCost(Mode& mode, uint32_t depth); - void checkBestMode(Mode& mode, uint32_t depth); + + /* update CBF flags and QP values to be internally consistent */ + void checkDQP(CUData& cu, const CUGeom& cuGeom); + + /* work-avoidance heuristics for RD levels < 5 */ uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom); bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode); + /* generate residual and recon pixels for an entire CTU recursively (RD0) */ void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom); + + /* check whether current mode is the new best */ + inline void checkBestMode(Mode& mode, uint32_t depth) + { + ModeDepth& md = m_modeDepth[depth]; + if (md.bestMode) + { + if (mode.rdCost < md.bestMode->rdCost) + md.bestMode = &mode; + } + else + md.bestMode = &mode; + } }; struct ThreadLocalData diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp index 66f8e28..74cee73 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -73,7 +73,11 @@ x265_encoder *x265_encoder_open(x265_param *p) determineLevel(*param, encoder->m_vps); encoder->create(); - encoder->init(); + if (encoder->m_aborted) + { + delete encoder; + return NULL; + } x265_print_params(param); @@ -178,7 +182,6 @@ void x265_encoder_close(x265_encoder *enc) extern "C" void x265_cleanup(void) { - destroyROM(); BitCost::destroy(); } @@ -198,13 +201,12 @@ void x265_picture_init(x265_param *param, x265_picture *pic) pic->forceqp = X265_QP_AUTO; if (param->analysisMode) { - uint32_t numPartitions = 1 << (g_maxFullDepth * 2); uint32_t widthInCU = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; uint32_t heightInCU = (param->sourceHeight + g_maxCUSize - 1) >> 
g_maxLog2CUSize; uint32_t numCUsInFrame = widthInCU * heightInCU; pic->analysisData.numCUsInFrame = numCUsInFrame; - pic->analysisData.numPartitions = numPartitions; + pic->analysisData.numPartitions = NUM_CU_PARTITIONS; } } @@ -213,37 +215,3 @@ void x265_picture_free(x265_picture *p) { return x265_free(p); } - -int x265_alloc_analysis_data(x265_picture* pic) -{ - CHECKED_MALLOC(pic->analysisData.interData, x265_inter_data, pic->analysisData.numCUsInFrame * 85); - CHECKED_MALLOC(pic->analysisData.intraData, x265_intra_data, 1); - pic->analysisData.intraData->cuAddr = NULL; - pic->analysisData.intraData->depth = NULL; - pic->analysisData.intraData->modes = NULL; - pic->analysisData.intraData->partSizes = NULL; - pic->analysisData.intraData->poc = NULL; - CHECKED_MALLOC(pic->analysisData.intraData->depth, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame); - CHECKED_MALLOC(pic->analysisData.intraData->modes, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame); - CHECKED_MALLOC(pic->analysisData.intraData->partSizes, char, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame); - CHECKED_MALLOC(pic->analysisData.intraData->cuAddr, uint32_t, pic->analysisData.numCUsInFrame); - CHECKED_MALLOC(pic->analysisData.intraData->poc, int, pic->analysisData.numCUsInFrame); - return 0; - -fail: - x265_free_analysis_data(pic); - return -1; -} - -void x265_free_analysis_data(x265_picture* pic) -{ - X265_FREE(pic->analysisData.interData); - pic->analysisData.interData = NULL; - X265_FREE(pic->analysisData.intraData->depth); - X265_FREE(pic->analysisData.intraData->modes); - X265_FREE(pic->analysisData.intraData->partSizes); - X265_FREE(pic->analysisData.intraData->cuAddr); - X265_FREE(pic->analysisData.intraData->poc); - X265_FREE(pic->analysisData.intraData); - pic->analysisData.intraData = NULL; -} diff --git a/source/encoder/bitcost.h b/source/encoder/bitcost.h index d28486b..674dffa 100644 --- 
a/source/encoder/bitcost.h +++ b/source/encoder/bitcost.h @@ -35,7 +35,7 @@ class BitCost { public: - BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0) {} + BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0), m_mvp(0) {} void setQP(unsigned int qp); diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp index 1c82a76..9ca1d04 100644 --- a/source/encoder/dpb.cpp +++ b/source/encoder/dpb.cpp @@ -52,8 +52,8 @@ DPB::~DPB() FrameData* next = m_picSymFreeList->m_freeListNext; m_picSymFreeList->destroy(); - m_picSymFreeList->m_reconPicYuv->destroy(); - delete m_picSymFreeList->m_reconPicYuv; + m_picSymFreeList->m_reconPic->destroy(); + delete m_picSymFreeList->m_reconPic; delete m_picSymFreeList; m_picSymFreeList = next; @@ -82,7 +82,7 @@ void DPB::recycleUnreferenced() curFrame->m_encData->m_freeListNext = m_picSymFreeList; m_picSymFreeList = curFrame->m_encData; curFrame->m_encData = NULL; - curFrame->m_reconPicYuv = NULL; + curFrame->m_reconPic = NULL; } } } diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 44e82af..14a2200 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -51,6 +51,8 @@ static const char *summaryCSVHeader = "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), " "Version\n"; +const char* defaultAnalysisFileName = "x265_analysis.dat"; + using namespace x265; Encoder::Encoder() @@ -78,6 +80,7 @@ Encoder::Encoder() m_buOffsetC = NULL; m_threadPool = 0; m_numThreadLocalData = 0; + m_analysisFile = NULL; } void Encoder::create() @@ -131,7 +134,7 @@ void Encoder::create() int cpuCount = getCpuCount(); if (!p->bEnableWavefront) p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2); - else if (cpuCount > 32) + else if (cpuCount >= 32) p->frameNumThreads = 6; // dual-socket 10-core IvyBridge or higher else if (cpuCount >= 16) p->frameNumThreads = 5; // 8 HT cores, or dual socket @@ -194,13 +197,13 @@ void Encoder::create() m_csvfpt = fopen(m_param->csvfn, "r"); if (m_csvfpt) { - // 
file already exists, re-open for append + /* file already exists, re-open for append */ fclose(m_csvfpt); m_csvfpt = fopen(m_param->csvfn, "ab"); } else { - // new CSV file, write header + /* new CSV file, write header */ m_csvfpt = fopen(m_param->csvfn, "wb"); if (m_csvfpt) { @@ -218,7 +221,44 @@ void Encoder::create() } } + if (m_frameEncoder) + { + int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + for (int i = 0; i < m_param->frameNumThreads; i++) + { + if (!m_frameEncoder[i].init(this, numRows, numCols, i)) + { + x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n"); + m_aborted = true; + } + } + } + + if (m_param->bEmitHRDSEI) + m_rateControl->initHRD(&m_sps); + if (!m_rateControl->init(&m_sps)) + m_aborted = true; + + m_lookahead->init(); + + if (m_param->analysisMode) + { + const char* name = m_param->analysisFileName; + if (!name) + name = defaultAnalysisFileName; + const char* mode = m_param->analysisMode == X265_ANALYSIS_LOAD ? 
"rb" : "wb"; + m_analysisFile = fopen(name, mode); + if (!m_analysisFile) + { + x265_log(NULL, X265_LOG_ERROR, "Analysis load/save: failed to open file %s\n", name); + m_aborted = true; + } + } + m_aborted |= parseLambdaFile(m_param); + + m_encodeStartTime = x265_mdate(); } void Encoder::destroy() @@ -270,6 +310,10 @@ void Encoder::destroy() X265_FREE(m_buOffsetY); X265_FREE(m_buOffsetC); + if (m_analysisFile) + fclose(m_analysisFile); + free(m_param->analysisFileName); + free(m_param->csvfn); if (m_csvfpt) fclose(m_csvfpt); free(m_param->rc.statFileName); // alloc'd by strdup @@ -277,29 +321,6 @@ void Encoder::destroy() X265_FREE(m_param); } -void Encoder::init() -{ - if (m_frameEncoder) - { - int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; - int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; - for (int i = 0; i < m_param->frameNumThreads; i++) - { - if (!m_frameEncoder[i].init(this, numRows, numCols, i)) - { - x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n"); - m_aborted = true; - } - } - } - if (m_param->bEmitHRDSEI) - m_rateControl->initHRD(&m_sps); - if (!m_rateControl->init(&m_sps)) - m_aborted = true; - m_lookahead->init(); - m_encodeStartTime = x265_mdate(); -} - void Encoder::updateVbvPlan(RateControl* rc) { for (int i = 0; i < m_param->frameNumThreads; i++) @@ -367,14 +388,14 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) * allocated by this top level encoder */ if (m_cuOffsetY) { - inFrame->m_origPicYuv->m_cuOffsetC = m_cuOffsetC; - inFrame->m_origPicYuv->m_cuOffsetY = m_cuOffsetY; - inFrame->m_origPicYuv->m_buOffsetC = m_buOffsetC; - inFrame->m_origPicYuv->m_buOffsetY = m_buOffsetY; + inFrame->m_fencPic->m_cuOffsetC = m_cuOffsetC; + inFrame->m_fencPic->m_cuOffsetY = m_cuOffsetY; + inFrame->m_fencPic->m_buOffsetC = m_buOffsetC; + inFrame->m_fencPic->m_buOffsetY = m_buOffsetY; } else { - if (!inFrame->m_origPicYuv->createOffsets(m_sps)) + if 
(!inFrame->m_fencPic->createOffsets(m_sps)) { m_aborted = true; x265_log(m_param, X265_LOG_ERROR, "memory allocation failure, aborting encode\n"); @@ -384,10 +405,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) } else { - m_cuOffsetC = inFrame->m_origPicYuv->m_cuOffsetC; - m_cuOffsetY = inFrame->m_origPicYuv->m_cuOffsetY; - m_buOffsetC = inFrame->m_origPicYuv->m_buOffsetC; - m_buOffsetY = inFrame->m_origPicYuv->m_buOffsetY; + m_cuOffsetC = inFrame->m_fencPic->m_cuOffsetC; + m_cuOffsetY = inFrame->m_fencPic->m_cuOffsetY; + m_buOffsetC = inFrame->m_fencPic->m_buOffsetC; + m_buOffsetY = inFrame->m_fencPic->m_buOffsetY; } } } @@ -405,9 +426,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) /* Copy input picture into a Frame and PicYuv, send to lookahead */ inFrame->m_poc = ++m_pocLast; - inFrame->m_origPicYuv->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset); - inFrame->m_intraData = pic_in->analysisData.intraData; - inFrame->m_interData = pic_in->analysisData.interData; + inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset); + inFrame->m_userData = pic_in->userData; inFrame->m_pts = pic_in->pts; inFrame->m_forceqp = pic_in->forceqp; @@ -436,6 +456,23 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) /* Use the frame types from the first pass, if available */ int sliceType = (m_param->rc.bStatRead) ? 
m_rateControl->rateControlSliceType(inFrame->m_poc) : pic_in->sliceType; + + /* In analysisSave mode, x265_analysis_data is allocated in pic_in and inFrame points to this */ + /* Load analysis data before lookahead->addPicture, since sliceType has been decided */ + if (m_param->analysisMode == X265_ANALYSIS_LOAD) + { + x265_picture* inputPic = const_cast<x265_picture*>(pic_in); + /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */ + readAnalysisFile(&inputPic->analysisData, inFrame->m_poc); + inFrame->m_analysisData.poc = inFrame->m_poc; + inFrame->m_analysisData.sliceType = inputPic->analysisData.sliceType; + inFrame->m_analysisData.numCUsInFrame = inputPic->analysisData.numCUsInFrame; + inFrame->m_analysisData.numPartitions = inputPic->analysisData.numPartitions; + inFrame->m_analysisData.interData = inputPic->analysisData.interData; + inFrame->m_analysisData.intraData = inputPic->analysisData.intraData; + sliceType = inputPic->analysisData.sliceType; + } + m_lookahead->addPicture(inFrame, sliceType); m_numDelayedPic++; } @@ -454,9 +491,14 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) if (outFrame) { Slice *slice = outFrame->m_encData->m_slice; + + /* Free up pic_in->analysisData since it has already been used */ + if (m_param->analysisMode == X265_ANALYSIS_LOAD) + freeAnalysis(&outFrame->m_analysisData); + if (pic_out) { - PicYuv *recpic = outFrame->m_reconPicYuv; + PicYuv *recpic = outFrame->m_reconPic; pic_out->poc = slice->m_poc; pic_out->bitDepth = X265_DEPTH; pic_out->userData = outFrame->m_userData; @@ -484,16 +526,20 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel)); pic_out->planes[2] = recpic->m_picOrg[2]; pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel)); - } - if (m_param->analysisMode) - { - pic_out->analysisData.interData = outFrame->m_interData; - pic_out->analysisData.intraData = 
outFrame->m_intraData; - pic_out->analysisData.numCUsInFrame = slice->m_sps->numCUsInFrame; - pic_out->analysisData.numPartitions = slice->m_sps->numPartitions; + /* Dump analysis data from pic_out to file in save mode and free */ + if (m_param->analysisMode == X265_ANALYSIS_SAVE) + { + pic_out->analysisData.poc = pic_out->poc; + pic_out->analysisData.sliceType = pic_out->sliceType; + pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame; + pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions; + pic_out->analysisData.interData = outFrame->m_analysisData.interData; + pic_out->analysisData.intraData = outFrame->m_analysisData.intraData; + writeAnalysisFile(&pic_out->analysisData); + freeAnalysis(&pic_out->analysisData); + } } - if (slice->m_sliceType == P_SLICE) { if (slice->m_weightPredTable[0][0][0].bPresentFlag) @@ -521,8 +567,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) } if (m_aborted) return -1; - finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits); + // Allow this frame to be recycled if no frame encoders are using it for reference if (!pic_out) { @@ -557,10 +603,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) slice->m_pps = &m_pps; slice->m_maxNumMergeCand = m_param->maxNumMergeCand; slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS); - frameEnc->m_reconPicYuv->m_cuOffsetC = m_cuOffsetC; - frameEnc->m_reconPicYuv->m_cuOffsetY = m_cuOffsetY; - frameEnc->m_reconPicYuv->m_buOffsetC = m_buOffsetC; - frameEnc->m_reconPicYuv->m_buOffsetY = m_buOffsetY; + frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC; + frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY; + frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC; + frameEnc->m_reconPic->m_buOffsetY = m_buOffsetY; } curEncoder->m_rce.encodeOrder = m_encodedFrameNum++; if (m_bframeDelay) @@ -573,6 +619,20 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* 
pic_out) } else frameEnc->m_dts = frameEnc->m_reorderedPts; + /* Allocate analysis data before encode in save mode. This is allocated in frameEnc*/ + if (m_param->analysisMode == X265_ANALYSIS_SAVE) + { + x265_analysis_data* analysis = &frameEnc->m_analysisData; + analysis->poc = frameEnc->m_poc; + analysis->sliceType = frameEnc->m_lowres.sliceType; + uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + + uint32_t numCUsInFrame = widthInCU * heightInCU; + analysis->numCUsInFrame = numCUsInFrame; + analysis->numPartitions = NUM_CU_PARTITIONS; + allocAnalysis(analysis); + } // determine references, setup RPS, etc m_dpb->prepareEncode(frameEnc); @@ -965,7 +1025,7 @@ static const char*digestToString(const unsigned char digest[3][16], int numChar) void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64_t bits) { - PicYuv* reconPic = curFrame->m_reconPicYuv; + PicYuv* reconPic = curFrame->m_reconPic; //===== calculate PSNR ===== int width = reconPic->m_picWidth - m_sps.conformanceWindow.rightOffset; @@ -1280,8 +1340,8 @@ void Encoder::initPPS(PPS *pps) pps->maxCuDQPDepth = 0; } - pps->chromaCbQpOffset = m_param->cbQpOffset; - pps->chromaCrQpOffset = m_param->crQpOffset; + pps->chromaQpOffset[0] = m_param->cbQpOffset; + pps->chromaQpOffset[1] = m_param->crQpOffset; pps->bConstrainedIntraPred = m_param->bEnableConstrainedIntra; pps->bUseWeightPred = m_param->bEnableWeightedPred; @@ -1290,13 +1350,10 @@ void Encoder::initPPS(PPS *pps) pps->bTransformSkipEnabled = m_param->bEnableTransformSkip; pps->bSignHideEnabled = m_param->bEnableSignHiding; - /* If offsets are ever configured, enable bDeblockingFilterControlPresent and set - * deblockingFilterBetaOffsetDiv2 / deblockingFilterTcOffsetDiv2 */ - bool bDeblockOffsetInPPS = 0; - pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || bDeblockOffsetInPPS; + 
pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || m_param->deblockingFilterBetaOffset || m_param->deblockingFilterTCOffset; pps->bPicDisableDeblockingFilter = !m_param->bEnableLoopFilter; - pps->deblockingFilterBetaOffsetDiv2 = 0; - pps->deblockingFilterTcOffsetDiv2 = 0; + pps->deblockingFilterBetaOffsetDiv2 = m_param->deblockingFilterBetaOffset; + pps->deblockingFilterTcOffsetDiv2 = m_param->deblockingFilterTCOffset; pps->bEntropyCodingSyncEnabled = m_param->bEnableWavefront; } @@ -1330,6 +1387,12 @@ void Encoder::configure(x265_param *p) p->bBPyramid = 0; /* Disable features which are not supported by the current RD level */ + if (p->rdLevel < 5) + { + if (p->bEnableCbfFastMode) /* impossible */ + x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n"); + p->bEnableCbfFastMode = 0; + } if (p->rdLevel < 4) { if (p->psyRdoq > 0) /* impossible */ @@ -1458,35 +1521,189 @@ void Encoder::configure(x265_param *p) x265_log(p, X265_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s); } - //========= set default display window ================================== + /* initialize the conformance window */ m_conformanceWindow.bEnabled = false; m_conformanceWindow.rightOffset = 0; m_conformanceWindow.topOffset = 0; m_conformanceWindow.bottomOffset = 0; m_conformanceWindow.leftOffset = 0; - //======== set pad size if width is not multiple of the minimum CU size ========= - const uint32_t minCUSize = MIN_CU_SIZE; - if (p->sourceWidth & (minCUSize - 1)) + /* set pad size if width is not multiple of the minimum CU size */ + if (p->sourceWidth & (MIN_CU_SIZE - 1)) { - uint32_t rem = p->sourceWidth & (minCUSize - 1); - uint32_t padsize = minCUSize - rem; + uint32_t rem = p->sourceWidth & (MIN_CU_SIZE - 1); + uint32_t padsize = MIN_CU_SIZE - rem; p->sourceWidth += padsize; - /* set the confirmation window offsets */ m_conformanceWindow.bEnabled = true; m_conformanceWindow.rightOffset = padsize; } - 
//======== set pad size if height is not multiple of the minimum CU size ========= - if (p->sourceHeight & (minCUSize - 1)) + /* set pad size if height is not multiple of the minimum CU size */ + if (p->sourceHeight & (MIN_CU_SIZE - 1)) { - uint32_t rem = p->sourceHeight & (minCUSize - 1); - uint32_t padsize = minCUSize - rem; + uint32_t rem = p->sourceHeight & (MIN_CU_SIZE - 1); + uint32_t padsize = MIN_CU_SIZE - rem; p->sourceHeight += padsize; - /* set the confirmation window offsets */ m_conformanceWindow.bEnabled = true; m_conformanceWindow.bottomOffset = padsize; } + if (p->bDistributeModeAnalysis && p->analysisMode) + { + p->analysisMode = X265_ANALYSIS_OFF; + x265_log(p, X265_LOG_WARNING, "Analysis save and load mode not supported for distributed mode analysis\n"); + } +} + +void Encoder::allocAnalysis(x265_analysis_data* analysis) +{ + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) + { + analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData; + CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1); + CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame); + analysis->intraData = intraData; + } + else + { + analysis_inter_data *interData = (analysis_inter_data*)analysis->interData; + CHECKED_MALLOC(interData, analysis_inter_data, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2); + analysis->interData = interData; + } + return; + +fail: + freeAnalysis(analysis); + m_aborted = true; +} + +void Encoder::freeAnalysis(x265_analysis_data* analysis) +{ + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) + { + X265_FREE(((analysis_intra_data*)analysis->intraData)->depth); + X265_FREE(((analysis_intra_data*)analysis->intraData)->modes); + 
X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes); + X265_FREE(analysis->intraData); + } + else + X265_FREE(analysis->interData); +} + +void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) +{ + +#define X265_FREAD(val, size, readSize, fileOffset)\ + if (fread(val, size, readSize, fileOffset) != readSize)\ + {\ + x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\ + freeAnalysis(analysis);\ + m_aborted = true;\ + return;\ + }\ + + static uint64_t consumedBytes = 0; + static uint64_t totalConsumedBytes = 0; + fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET); + + int poc; uint32_t frameRecordSize; + X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); + X265_FREAD(&poc, sizeof(int), 1, m_analysisFile); + + uint64_t currentOffset = totalConsumedBytes; + + /* Seeking to the right frame Record */ + while (poc != curPoc && !feof(m_analysisFile)) + { + currentOffset += frameRecordSize; + fseeko(m_analysisFile, currentOffset, SEEK_SET); + X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); + X265_FREAD(&poc, sizeof(int), 1, m_analysisFile); + } + + if (poc != curPoc || feof(m_analysisFile)) + { + x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc); + freeAnalysis(analysis); + return; + } + + /* Now arrived at the right frame, read the record */ + analysis->poc = poc; + analysis->frameRecordSize = frameRecordSize; + X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile); + X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile); + X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile); + + /* Memory is allocated for inter and intra analysis data based on the slicetype */ + allocAnalysis(analysis); + + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) + { + X265_FREAD(((analysis_intra_data *)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, 
m_analysisFile); + X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FREAD(((analysis_intra_data *)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + analysis->sliceType = X265_TYPE_I; + consumedBytes += frameRecordSize; + } + else if (analysis->sliceType == X265_TYPE_P) + { + X265_FREAD(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile); + consumedBytes += frameRecordSize; + totalConsumedBytes = consumedBytes; + } + else + { + X265_FREAD(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile); + consumedBytes += frameRecordSize; + } +#undef X265_FREAD +} + +void Encoder::writeAnalysisFile(x265_analysis_data* analysis) +{ + +#define X265_FWRITE(val, size, writeSize, fileOffset)\ + if (fwrite(val, size, writeSize, fileOffset) < writeSize)\ + {\ + x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n");\ + freeAnalysis(analysis);\ + m_aborted = true;\ + return;\ + }\ + + /* calculate frameRecordSize */ + analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) + + sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions); + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) + analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 3; + else if (analysis->sliceType == X265_TYPE_P) + analysis->frameRecordSize += sizeof(analysis_inter_data) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU; + else + analysis->frameRecordSize += sizeof(analysis_inter_data) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2; + + X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); + 
X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile); + X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile); + X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile); + X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFile); + + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) + { + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + } + else if (analysis->sliceType == X265_TYPE_P) + { + X265_FWRITE(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile); + } + else + { + X265_FWRITE(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile); + } +#undef X265_FWRITE } diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h index 8a387c2..27f0736 100644 --- a/source/encoder/encoder.h +++ b/source/encoder/encoder.h @@ -74,7 +74,7 @@ struct ThreadLocalData; class Encoder : public x265_encoder { -private: +public: int m_pocLast; // time index (POC) int m_encodedFrameNum; @@ -113,9 +113,7 @@ private: int m_numChromaWPFrames; // number of P frames with weighted chroma reference int m_numLumaWPBiFrames; // number of B frames with weighted luma reference int m_numChromaWPBiFrames; // number of B frames with weighted chroma reference - -public: - + FILE* m_analysisFile; int m_conformanceMode; VPS m_vps; SPS m_sps; @@ -136,12 +134,10 @@ public: bool m_aborted; // fatal error detected Encoder(); - ~Encoder() {} void create(); void destroy(); - void init(); int 
encode(const x265_picture* pic, x265_picture *pic_out); @@ -163,12 +159,20 @@ public: void updateVbvPlan(RateControl* rc); + void allocAnalysis(x265_analysis_data* analysis); + + void freeAnalysis(x265_analysis_data* analysis); + + void readAnalysisFile(x265_analysis_data* analysis, int poc); + + void writeAnalysisFile(x265_analysis_data* pic); + + void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits); + protected: void initSPS(SPS *sps); void initPPS(PPS *pps); - - void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits); }; } diff --git a/source/encoder/entropy.cpp b/source/encoder/entropy.cpp index 13eaf57..f7eb566 100644 --- a/source/encoder/entropy.cpp +++ b/source/encoder/entropy.cpp @@ -154,8 +154,8 @@ void Entropy::codePPS(const PPS& pps) if (pps.bUseDQP) WRITE_UVLC(pps.maxCuDQPDepth, "diff_cu_qp_delta_depth"); - WRITE_SVLC(pps.chromaCbQpOffset, "pps_cb_qp_offset"); - WRITE_SVLC(pps.chromaCrQpOffset, "pps_cr_qp_offset"); + WRITE_SVLC(pps.chromaQpOffset[0], "pps_cb_qp_offset"); + WRITE_SVLC(pps.chromaQpOffset[1], "pps_cr_qp_offset"); WRITE_FLAG(0, "pps_slice_chroma_qp_offsets_present_flag"); WRITE_FLAG(pps.bUseWeightPred, "weighted_pred_flag"); @@ -397,7 +397,9 @@ void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData) // Ideally this process should not be repeated for each slice in a picture if (slice.isIRAP()) for (int picIdx = 0; picIdx < slice.m_rps.numberOfPictures; picIdx++) + { X265_CHECK(!slice.m_rps.bUsed[picIdx], "pic unused failure\n"); + } #endif WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag"); @@ -515,9 +517,9 @@ void Entropy::encodeCTU(const CUData& ctu, const CUGeom& cuGeom) } /* encode a CU block recursively */ -void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP) +void Entropy::encodeCU(const CUData& ctu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP) { - const Slice* slice = cu.m_slice; + const 
Slice* slice = ctu.m_slice; if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP) bEncodeDQP = true; @@ -527,78 +529,124 @@ void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartI if (!cuUnsplitFlag) { - uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2; - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - if (childCuData.flags & CUGeom::PRESENT) - encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP); + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx); + if (childGeom.flags & CUGeom::PRESENT) + encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP); } return; } // We need to split, so don't try these modes. if (cuSplitFlag) - codeSplitFlag(cu, absPartIdx, depth); + codeSplitFlag(ctu, absPartIdx, depth); - if (depth < cu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth) + if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth) { - uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2; - - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) { - const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); - encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP); + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx); + encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP); } return; } if (slice->m_pps->bTransquantBypassEnabled) - codeCUTransquantBypassFlag(cu.m_tqBypass[absPartIdx]); + codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]); if (!slice->isIntra()) - codeSkipFlag(cu, absPartIdx); - - if (cu.isSkipped(absPartIdx)) { - codeMergeIndex(cu, absPartIdx); - 
finishCU(cu, absPartIdx, depth); - return; + codeSkipFlag(ctu, absPartIdx); + if (ctu.isSkipped(absPartIdx)) + { + codeMergeIndex(ctu, absPartIdx); + finishCU(ctu, absPartIdx, depth); + return; + } + codePredMode(ctu.m_predMode[absPartIdx]); } - if (!slice->isIntra()) - codePredMode(cu.m_predMode[absPartIdx]); - - codePartSize(cu, absPartIdx, depth); + codePartSize(ctu, absPartIdx, depth); // prediction Info ( Intra : direction mode, Inter : Mv, reference idx ) - codePredInfo(cu, absPartIdx); + codePredInfo(ctu, absPartIdx); uint32_t tuDepthRange[2]; - if (cu.isIntra(absPartIdx)) - cu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx); + if (ctu.isIntra(absPartIdx)) + ctu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx); else - cu.getInterTUQtDepthRange(tuDepthRange, absPartIdx); + ctu.getInterTUQtDepthRange(tuDepthRange, absPartIdx); // Encode Coefficients, allow codeCoeff() to modify bEncodeDQP - codeCoeff(cu, absPartIdx, depth, bEncodeDQP, tuDepthRange); + codeCoeff(ctu, absPartIdx, bEncodeDQP, tuDepthRange); // --- write terminating bit --- - finishCU(cu, absPartIdx, depth); + finishCU(ctu, absPartIdx, depth); +} + +/* Return bit count of signaling inter mode */ +uint32_t Entropy::bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const +{ + uint32_t bits; + bits = bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); /* not skip */ + bits += bitsCodeBin(0, m_contextState[OFF_PRED_MODE_CTX]); /* inter */ + PartSize partSize = (PartSize)cu.m_partSize[absPartIdx]; + switch (partSize) + { + case SIZE_2Nx2N: + bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX]); + break; + + case SIZE_2NxN: + case SIZE_2NxnU: + case SIZE_2NxnD: + bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]); + bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 1]); + if (cu.m_slice->m_sps->maxAMPDepth > depth) + { + bits += bitsCodeBin((partSize == SIZE_2NxN) ? 
1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]); + if (partSize != SIZE_2NxN) + bits++; // encodeBinEP((partSize == SIZE_2NxnU ? 0 : 1)); + } + break; + + case SIZE_Nx2N: + case SIZE_nLx2N: + case SIZE_nRx2N: + bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]); + bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]); + if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3)) + bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]); + if (cu.m_slice->m_sps->maxAMPDepth > depth) + { + bits += bitsCodeBin((partSize == SIZE_Nx2N) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]); + if (partSize != SIZE_Nx2N) + bits++; // encodeBinEP((partSize == SIZE_nLx2N ? 0 : 1)); + } + break; + default: + X265_CHECK(0, "invalid CU partition\n"); + break; + } + + return bits; } /* finish encoding a cu and handle end-of-slice conditions */ -void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth) +void Entropy::finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth) { - const Slice* slice = cu.m_slice; - X265_CHECK(cu.m_slice->m_endCUAddr == cu.m_slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n"); + const Slice* slice = ctu.m_slice; uint32_t realEndAddress = slice->m_endCUAddr; - uint32_t cuAddr = cu.getSCUAddr() + absPartIdx; + uint32_t cuAddr = ctu.getSCUAddr() + absPartIdx; + X265_CHECK(realEndAddress == slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n"); uint32_t granularityMask = g_maxCUSize - 1; - uint32_t cuSize = 1 << cu.m_log2CUSize[absPartIdx]; - uint32_t rpelx = cu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize; - uint32_t bpely = cu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize; + uint32_t cuSize = 1 << ctu.m_log2CUSize[absPartIdx]; + uint32_t rpelx = ctu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize; + uint32_t bpely = ctu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize; bool granularityBoundary = (((rpelx & granularityMask) == 0 || (rpelx == 
slice->m_sps->picWidthInLumaSamples )) && ((bpely & granularityMask) == 0 || (bpely == slice->m_sps->picHeightInLumaSamples))); @@ -618,41 +666,18 @@ void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth) } } -void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx, - uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, bool& bCodeDQP, uint32_t depthRange[2]) +void Entropy::encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize, + bool& bCodeDQP, const uint32_t depthRange[2]) { - const bool subdiv = cu.m_tuDepth[absPartIdx] + cu.m_cuDepth[absPartIdx] > (uint8_t)depth; - uint32_t hChromaShift = cu.m_hChromaShift; - uint32_t vChromaShift = cu.m_vChromaShift; - uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, trIdx); - uint32_t cbfU = cu.getCbf(absPartIdx, TEXT_CHROMA_U, trIdx); - uint32_t cbfV = cu.getCbf(absPartIdx, TEXT_CHROMA_V, trIdx); - - if (!trIdx) - state.bakAbsPartIdxCU = absPartIdx; - - if (log2TrSize == 2 && cu.m_chromaFormat != X265_CSP_I444) - { - uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - if (!(absPartIdx & (partNum - 1))) - { - state.bakAbsPartIdx = absPartIdx; - state.bakChromaOffset = offsetChroma; - } - else if ((absPartIdx & (partNum - 1)) == (partNum - 1)) - { - cbfU = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_U, trIdx); - cbfV = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_V, trIdx); - } - } + const bool subdiv = cu.m_tuDepth[absPartIdx] > tuDepth; /* in each of these conditions, the subdiv flag is implied and not signaled, * so we have checks to make sure the implied value matches our intentions */ - if (cu.m_predMode[absPartIdx] == MODE_INTRA && cu.m_partSize[absPartIdx] == SIZE_NxN && depth == cu.m_cuDepth[absPartIdx]) + if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth) { X265_CHECK(subdiv, "intra NxN requires TU depth below CU 
depth\n"); } - else if (cu.m_predMode[absPartIdx] == MODE_INTER && (cu.m_partSize[absPartIdx] != SIZE_2Nx2N) && depth == cu.m_cuDepth[absPartIdx] && + else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1) { X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2TrSize %d, depthRange[0] %d\n", log2TrSize, depthRange[0]); @@ -671,127 +696,111 @@ void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t codeTransformSubdivFlag(subdiv, 5 - log2TrSize); } - const uint32_t trDepthCurr = depth - cu.m_cuDepth[absPartIdx]; - const bool bFirstCbfOfCU = trDepthCurr == 0; - - bool mCodeAll = true; - const uint32_t numPels = 1 << (log2TrSize * 2 - hChromaShift - vChromaShift); - if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) - mCodeAll = false; - - if (bFirstCbfOfCU || mCodeAll) + uint32_t hChromaShift = cu.m_hChromaShift; + uint32_t vChromaShift = cu.m_vChromaShift; + bool bSmallChroma = (log2TrSize - hChromaShift < 2); + if (!tuDepth || !bSmallChroma) { - uint32_t tuSize = 1 << log2TrSize; - if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1)) - codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_U, trDepthCurr, (subdiv == 0)); - if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1)) - codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_V, trDepthCurr, (subdiv == 0)); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1)) + codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1)) + codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv); } else { - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1), "chroma xform size match 
failure\n"); - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1), "chroma xform size match failure\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma xform size match failure\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma xform size match failure\n"); } if (subdiv) { - log2TrSize--; - uint32_t numCoeff = 1 << (log2TrSize * 2); - uint32_t numCoeffC = (numCoeff >> (hChromaShift + vChromaShift)); - trIdx++; - ++depth; - absPartIdxStep >>= 2; - const uint32_t partNum = NUM_CU_PARTITIONS >> (depth << 1); + --log2TrSize; + ++tuDepth; - encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2; - absPartIdx += partNum; - offsetLuma += numCoeff; - offsetChroma += numCoeffC; - encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 0 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 1 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 2 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 3 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); + return; + } - absPartIdx += partNum; - offsetLuma += numCoeff; - offsetChroma += numCoeffC; - encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + uint32_t absPartIdxC = bSmallChroma ? 
absPartIdx & 0xFC : absPartIdx; - absPartIdx += partNum; - offsetLuma += numCoeff; - offsetChroma += numCoeffC; - encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + if (cu.isInter(absPartIdxC) && !tuDepth && !cu.getCbf(absPartIdxC, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdxC, TEXT_CHROMA_V, 0)) + { + X265_CHECK(cu.getCbf(absPartIdxC, TEXT_LUMA, 0), "CBF should have been set\n"); } else + codeQtCbfLuma(cu, absPartIdx, tuDepth); + + uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth); + uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, tuDepth); + uint32_t cbfV = cu.getCbf(absPartIdxC, TEXT_CHROMA_V, tuDepth); + if (!(cbfY || cbfU || cbfV)) + return; + + // dQP: only for CTU once + if (cu.m_slice->m_pps->bUseDQP && bCodeDQP) { - if (cu.m_predMode[absPartIdx] != MODE_INTRA && depth == cu.m_cuDepth[absPartIdx] && !cu.getCbf(absPartIdx, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdx, TEXT_CHROMA_V, 0)) - { - X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n"); - } - else - codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx]; + uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2); + codeDeltaQP(cu, absPartIdxLT); + bCodeDQP = false; + } - if (cbfY || cbfU || cbfV) - { - // dQP: only for CTU once - if (cu.m_slice->m_pps->bUseDQP) - { - if (bCodeDQP) - { - codeDeltaQP(cu, state.bakAbsPartIdxCU); - bCodeDQP = false; - } - } - } - if (cbfY) - codeCoeffNxN(cu, cu.m_trCoeff[0] + offsetLuma, absPartIdx, log2TrSize, TEXT_LUMA); + if (cbfY) + { + uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2); + codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2TrSize, TEXT_LUMA); + if (!(cbfU || cbfV)) + return; + } - int chFmt = cu.m_chromaFormat; - if (log2TrSize == 2 && chFmt != X265_CSP_I444) + if (bSmallChroma) + { + if ((absPartIdx & 3) != 3) + return; + + const 
uint32_t log2TrSizeC = 2; + const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422); + const uint32_t curPartNum = 4; + uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift)); + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { - uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - if ((absPartIdx & (partNum - 1)) == (partNum - 1)) + TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC); + const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; + do { - const uint32_t log2TrSizeC = 2; - const bool splitIntoSubTUs = (chFmt == X265_CSP_I422); - - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs)) { - TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, state.bakAbsPartIdx); - const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; - do - { - uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs); - if (cbf) - { - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - codeCoeffNxN(cu, coeffChroma + state.bakChromaOffset + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); - } - } - while (tuIterator.isNextSection()); + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); } } + while (tuIterator.isNextSection()); } - else + } + else + { + uint32_t log2TrSizeC = log2TrSize - hChromaShift; + const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422); + uint32_t curPartNum = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2; + uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift)); + for 
(uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { - uint32_t log2TrSizeC = log2TrSize - hChromaShift; - const bool splitIntoSubTUs = (chFmt == X265_CSP_I422); - uint32_t curPartNum = NUM_CU_PARTITIONS >> (depth << 1); - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC); + const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; + do { - TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx); - const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; - do + if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs)) { - uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs); - if (cbf) - { - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - codeCoeffNxN(cu, coeffChroma + offsetChroma + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); - } + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); } - while (tuIterator.isNextSection()); } + while (tuIterator.isNextSection()); } } } @@ -808,14 +817,14 @@ void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx) codeIntraDirChroma(cu, absPartIdx, chromaDirMode); - if ((cu.m_chromaFormat == X265_CSP_I444) && (cu.m_partSize[absPartIdx] == SIZE_NxN)) + if (cu.m_chromaFormat == X265_CSP_I444 && cu.m_partSize[absPartIdx] != SIZE_2Nx2N) { - uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2; - for (uint32_t i = 1; i <= 3; i++) + uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 1; qIdx < 4; ++qIdx) { - uint32_t offset = absPartIdx + i * partOffset; - cu.getAllowedChromaDir(offset, chromaDirMode); - 
codeIntraDirChroma(cu, offset, chromaDirMode); + absPartIdx += qNumParts; + cu.getAllowedChromaDir(absPartIdx, chromaDirMode); + codeIntraDirChroma(cu, absPartIdx, chromaDirMode); } } } @@ -867,7 +876,7 @@ void Entropy::codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list) codeRefFrmIdx(cu, absPartIdx, list); } -void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]) +void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]) { if (!cu.isIntra(absPartIdx)) { @@ -877,12 +886,8 @@ void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, b return; } - uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx]; - uint32_t lumaOffset = absPartIdx << (LOG2_UNIT_SIZE * 2); - uint32_t chromaOffset = lumaOffset >> (cu.m_hChromaShift + cu.m_vChromaShift); - uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> (depth << 1); - CoeffCodeState state; - encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange); + uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx]; + encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange); } void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane) @@ -1116,7 +1121,7 @@ void Entropy::writeCoefRemainExGolomb(uint32_t codeNumber, uint32_t absGoRice) if (codeNumber != 0) { unsigned long idx; - CLZ32(idx, codeNumber + 1); + CLZ(idx, codeNumber + 1); length = idx; codeNumber -= (1 << idx) - 1; } @@ -1145,11 +1150,6 @@ void Entropy::copyFrom(const Entropy& src) markValid(); } -void Entropy::codeMVPIdx(uint32_t symbol) -{ - encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); -} - void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth) { PartSize partSize = (PartSize)cu.m_partSize[absPartIdx]; @@ -1200,32 +1200,6 @@ void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth } } -void 
Entropy::codePredMode(int predMode) -{ - encodeBin(predMode == MODE_INTER ? 0 : 1, m_contextState[OFF_PRED_MODE_CTX]); -} - -void Entropy::codeCUTransquantBypassFlag(uint32_t symbol) -{ - encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); -} - -void Entropy::codeSkipFlag(const CUData& cu, uint32_t absPartIdx) -{ - // get context function is here - uint32_t symbol = cu.isSkipped(absPartIdx) ? 1 : 0; - uint32_t ctxSkip = cu.getCtxSkipFlag(absPartIdx); - - encodeBin(symbol, m_contextState[OFF_SKIP_FLAG_CTX + ctxSkip]); -} - -void Entropy::codeMergeFlag(const CUData& cu, uint32_t absPartIdx) -{ - const uint32_t symbol = cu.m_mergeFlag[absPartIdx] ? 1 : 0; - - encodeBin(symbol, m_contextState[OFF_MERGE_FLAG_EXT_CTX]); -} - void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx) { uint32_t numCand = cu.m_slice->m_maxNumMergeCand; @@ -1246,50 +1220,18 @@ void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx) } } -void Entropy::codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) -{ - X265_CHECK(depth < g_maxCUDepth, "invalid depth\n"); - - uint32_t ctx = cu.getCtxSplitFlag(absPartIdx, depth); - uint32_t currSplitFlag = (cu.m_cuDepth[absPartIdx] > depth) ? 
1 : 0; - - X265_CHECK(ctx < 3, "ctx out of range\n"); - encodeBin(currSplitFlag, m_contextState[OFF_SPLIT_FLAG_CTX + ctx]); -} - -void Entropy::codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx) -{ - encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); -} - -uint32_t Entropy::bitsIntraModeNonMPM() const -{ - uint32_t mstate = m_contextState[OFF_ADI_CTX]; - uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 0)) >> 15; - return bits + 5; /* fixed cost for encodeBinsEP() */ -} - -uint32_t Entropy::bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const -{ - X265_CHECK(dir == preds[0] || dir == preds[1] || dir == preds[2], "dir must be a most probable mode\n"); - uint32_t mstate = m_contextState[OFF_ADI_CTX]; - uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 1)) >> 15; - return bits + (dir == preds[0] ? 1 : 2); -} - void Entropy::codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple) { uint32_t dir[4], j; uint32_t preds[4][3]; int predIdx[4]; - PartSize mode = (PartSize)cu.m_partSize[absPartIdx]; - uint32_t partNum = isMultiple ? (mode == SIZE_NxN ? 4 : 1) : 1; - uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2; + uint32_t partNum = isMultiple && cu.m_partSize[absPartIdx] != SIZE_2Nx2N ? 
4 : 1; + uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2; - for (j = 0; j < partNum; j++) + for (j = 0; j < partNum; j++, absPartIdx += qNumParts) { - dir[j] = cu.m_lumaIntraDir[absPartIdx + partOffset * j]; - cu.getIntraDirLumaPredictor(absPartIdx + partOffset * j, preds[j]); + dir[j] = cu.m_lumaIntraDir[absPartIdx]; + cu.getIntraDirLumaPredictor(absPartIdx, preds[j]); predIdx[j] = -1; for (uint32_t i = 0; i < 3; i++) if (dir[j] == preds[j][i]) @@ -1444,46 +1386,25 @@ void Entropy::codeDeltaQP(const CUData& cu, uint32_t absPartIdx) } } -void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel) +void Entropy::codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel) { - uint32_t ctx = ctxCbf[ttype][trDepth]; + uint32_t ctx = tuDepth + 2; - bool canQuadSplit = (width >= (MIN_TU_SIZE * 2)) && (height >= (MIN_TU_SIZE * 2)); - uint32_t lowestTUDepth = trDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF + uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth; + bool canQuadSplit = (log2TrSize - cu.m_hChromaShift > 2); + uint32_t lowestTUDepth = tuDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF - if ((width != height) && (lowestLevel || !canQuadSplit)) // if sub-TUs are present + if (cu.m_chromaFormat == X265_CSP_I422 && (lowestLevel || !canQuadSplit)) // if sub-TUs are present { uint32_t subTUDepth = lowestTUDepth + 1; // if this is the lowest level of the TU-tree, the sub-TUs are directly below. 
// Otherwise, this must be the level above the lowest level (as specified above) - uint32_t partIdxesPerSubTU = absPartIdxStep >> 1; - - for (uint32_t subTU = 0; subTU < 2; subTU++) - { - uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); - uint32_t cbf = cu.getCbf(subTUAbsPartIdx, ttype, subTUDepth); + uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1); - encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); - } + encodeBin(cu.getCbf(absPartIdx , ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]); + encodeBin(cu.getCbf(absPartIdx + tuNumParts, ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]); } else - { - uint32_t cbf = cu.getCbf(absPartIdx, ttype, lowestTUDepth); - - encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); - } -} - -void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth) -{ - uint32_t ctx = ctxCbf[ttype][trDepth]; - uint32_t cbf = cu.getCbf(absPartIdx, ttype, trDepth); - encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); -} - -void Entropy::codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth) -{ - uint32_t ctx = ctxCbf[ttype][trDepth]; - encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); + encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]); } void Entropy::codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype) @@ -1497,26 +1418,6 @@ void Entropy::codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint encodeBin(useTransformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); } -void Entropy::codeQtRootCbf(uint32_t cbf) -{ - encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); -} - -void Entropy::codeQtCbfZero(TextType ttype, uint32_t trDepth) -{ - // this function is only used to estimate the bits when cbf is 0 - // and will never be called when writing the bitsream. 
- uint32_t ctx = ctxCbf[ttype][trDepth]; - encodeBin(0, m_contextState[OFF_QT_CBF_CTX + ctx]); -} - -void Entropy::codeQtRootCbfZero() -{ - // this function is only used to estimate the bits when cbf is 0 - // and will never be called when writing the bistream. - encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); -} - /** Encode (X,Y) position of the last significant coefficient * \param posx X component of last coefficient * \param posy Y component of last coefficient @@ -2006,9 +1907,9 @@ void Entropy::encodeBin(uint32_t binValue, uint8_t &ctxModel) if ((binValue ^ mstate) & 1) { // NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256 - //numBits = g_renormTable[lps >> 3]; + //numBits = g_renormTable[lps >> 3]; unsigned long idx; - CLZ32(idx, lps); + CLZ(idx, lps); X265_CHECK(state != 63 || idx == 1, "state failure\n"); numBits = 8 - idx; diff --git a/source/encoder/entropy.h b/source/encoder/entropy.h index bed06cf..9cd927f 100644 --- a/source/encoder/entropy.h +++ b/source/encoder/entropy.h @@ -27,6 +27,7 @@ #include "common.h" #include "bitstream.h" #include "frame.h" +#include "cudata.h" #include "contexts.h" #include "slice.h" @@ -35,8 +36,6 @@ namespace x265 { struct SaoCtuParam; struct EstBitsSbac; -class CUData; -struct CUGeom; class ScalingList; enum SplitType @@ -154,41 +153,55 @@ public: void finishSlice() { encodeBinTrm(1); finish(); dynamic_cast(m_bitIf)->writeByteAlignment(); } void encodeCTU(const CUData& cu, const CUGeom& cuGeom); - void codeSaoOffset(const SaoCtuParam& ctuParam, int plane); - void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); } - void codeCUTransquantBypassFlag(uint32_t symbol); - void codeSkipFlag(const CUData& cu, uint32_t absPartIdx); - void codeMergeFlag(const CUData& cu, uint32_t absPartIdx); + void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple); + void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t 
*chromaDirMode); + void codeMergeIndex(const CUData& cu, uint32_t absPartIdx); - void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth); - void codeMVPIdx(uint32_t symbol); void codeMvd(const CUData& cu, uint32_t absPartIdx, int list); void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth); - void codePredMode(int predMode); void codePredInfo(const CUData& cu, uint32_t absPartIdx); - void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx); - void codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel); - void codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth); - void codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth); - void codeQtCbfZero(TextType ttype, uint32_t trDepth); - void codeQtRootCbfZero(); - void codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]); + inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); } + + void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel); + void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]); void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype); - uint32_t bitsIntraModeNonMPM() const; - uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const; - void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple); - void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode); + inline void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); } + inline void codeMVPIdx(uint32_t symbol) { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); } + inline 
void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); } + inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); } + inline void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) { encodeBin(cu.m_cuDepth[absPartIdx] > depth, m_contextState[OFF_SPLIT_FLAG_CTX + cu.getCtxSplitFlag(absPartIdx, depth)]); } + inline void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx) { encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); } + inline void codePredMode(int predMode) { encodeBin(predMode == MODE_INTRA ? 1 : 0, m_contextState[OFF_PRED_MODE_CTX]); } + inline void codeCUTransquantBypassFlag(uint32_t symbol) { encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); } + inline void codeQtCbfLuma(uint32_t cbf, uint32_t tuDepth) { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + !tuDepth]); } + inline void codeQtCbfChroma(uint32_t cbf, uint32_t tuDepth) { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + 2 + tuDepth]); } + inline void codeQtRootCbf(uint32_t cbf) { encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); } + + void codeSaoOffset(const SaoCtuParam& ctuParam, int plane); - // RDO functions + /* RDO functions */ void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const; void estCBFBit(EstBitsSbac& estBitsSbac) const; void estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const; void estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const; void estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const; + inline uint32_t bitsIntraModeNonMPM() const { return bitsCodeBin(0, m_contextState[OFF_ADI_CTX]) + 5; } + inline uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const { return bitsCodeBin(1, m_contextState[OFF_ADI_CTX]) + (dir 
== preds[0] ? 1 : 2); } + inline uint32_t estimateCbfBits(uint32_t cbf, TextType ttype, uint32_t tuDepth) const { return bitsCodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctxCbf[ttype][tuDepth]]); } + uint32_t bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const; + uint32_t bitsIntraMode(const CUData& cu, uint32_t absPartIdx) const + { + return bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]) + /* not skip */ + bitsCodeBin(1, m_contextState[OFF_PRED_MODE_CTX]); /* intra */ + } + + /* these functions are only used to estimate the bits when cbf is 0 and will never be called when writing the bistream. */ + inline void codeQtRootCbfZero() { encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); } + private: /* CABAC private methods */ @@ -200,8 +213,15 @@ private: void encodeBinsEP(uint32_t binValues, int numBins); void encodeBinTrm(uint32_t binValue); - void encodeCU(const CUData& cu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP); - void finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth); + /* return the bits of encoding the context bin without updating */ + inline uint32_t bitsCodeBin(uint32_t binValue, uint32_t ctxModel) const + { + uint64_t fracBits = (m_fracBits & 32767) + sbacGetEntropyBits(ctxModel, binValue); + return (uint32_t)(fracBits >> 15); + } + + void encodeCU(const CUData& ctu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP); + void finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth); void writeOut(); @@ -217,7 +237,6 @@ private: void codePredWeightTable(const Slice& slice); void codeInterDir(const CUData& cu, uint32_t absPartIdx); void codePUWise(const CUData& cu, uint32_t absPartIdx); - void codeQtRootCbf(uint32_t cbf); void codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list); void codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list); @@ -227,16 +246,8 @@ private: void 
codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx); void codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype); - struct CoeffCodeState - { - uint32_t bakAbsPartIdx; - uint32_t bakChromaOffset; - uint32_t bakAbsPartIdxCU; - }; - - void encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma, - uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, - bool& bCodeDQP, uint32_t depthRange[2]); + void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize, + bool& bCodeDQP, const uint32_t depthRange[2]); void copyFrom(const Entropy& src); void copyContextsFrom(const Entropy& src); diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp index c6e6915..5f4d2f7 100644 --- a/source/encoder/frameencoder.cpp +++ b/source/encoder/frameencoder.cpp @@ -29,8 +29,6 @@ #include "wavefront.h" #include "param.h" -#include "PPA/ppa.h" - #include "encoder.h" #include "frameencoder.h" #include "common.h" @@ -126,23 +124,24 @@ bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id) ok &= m_rce.picTimingSEI && m_rce.hrdTiming; } - if (m_param->noiseReduction) + if (m_param->noiseReductionIntra || m_param->noiseReductionInter) m_nr = X265_MALLOC(NoiseReduction, 1); if (m_nr) memset(m_nr, 0, sizeof(NoiseReduction)); else - m_param->noiseReduction = 0; + m_param->noiseReductionIntra = m_param->noiseReductionInter = 0; start(); return ok; } /* Generate a complete list of unique geom sets for the current picture dimensions */ -bool FrameEncoder::initializeGeoms(const FrameData& encData) +bool FrameEncoder::initializeGeoms() { /* Geoms only vary between CTUs in the presence of picture edges */ - int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1); - int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1); + int 
maxCUSize = m_param->maxCUSize; + int heightRem = m_param->sourceHeight & (maxCUSize - 1); + int widthRem = m_param->sourceWidth & (maxCUSize - 1); int allocGeoms = 1; // body if (heightRem && widthRem) allocGeoms = 4; // body, right, bottom, corner @@ -154,33 +153,45 @@ bool FrameEncoder::initializeGeoms(const FrameData& encData) if (!m_cuGeoms || !m_ctuGeomMap) return false; - CUGeom cuLocalData[CUGeom::MAX_GEOMS]; - memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp + // body + CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms); + memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols); + if (allocGeoms == 1) + return true; - int countGeoms = 0; - for (uint32_t ctuAddr = 0; ctuAddr < m_numRows * m_numCols; ctuAddr++) + int countGeoms = 1; + if (widthRem) { - /* TODO: detach this logic from TComDataCU */ - encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0); - encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData); - - m_ctuGeomMap[ctuAddr] = MAX_INT; - for (int i = 0; i < countGeoms; i++) + // right + CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); + for (int i = 0; i < m_numRows; i++) { - if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS)) - { - m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS; - break; - } + uint32_t ctuAddr = m_numCols * (i + 1) - 1; + m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; } + countGeoms++; + } + if (heightRem) + { + // bottom + CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); + for (uint32_t i = 0; i < m_numCols; i++) + { + uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i; + m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; + } + countGeoms++; - if (m_ctuGeomMap[ctuAddr] == MAX_INT) + if (widthRem) { - X265_CHECK(countGeoms < allocGeoms, "geometry match check 
failure\n"); + // corner + CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); + + uint32_t ctuAddr = m_numCols * m_numRows - 1; m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; - memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS); countGeoms++; } + X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n"); } return true; @@ -191,11 +202,13 @@ bool FrameEncoder::startCompressFrame(Frame* curFrame) m_frame = curFrame; curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it curFrame->m_encData->m_slice->m_mref = m_mref; + if (!m_cuGeoms) { - if (!initializeGeoms(*curFrame->m_encData)) + if (!initializeGeoms()) return false; } + m_enable.trigger(); return true; } @@ -217,7 +230,7 @@ void FrameEncoder::threadMain() void FrameEncoder::compressFrame() { - PPAScopeEvent(FrameEncoder_compressFrame); + //ProfileScopeEvent(frameThread); int64_t startCompressTime = x265_mdate(); Slice* slice = m_frame->m_encData->m_slice; @@ -252,7 +265,7 @@ void FrameEncoder::compressFrame() WeightParam *w = NULL; if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag) w = slice->m_weightPredTable[l][ref]; - m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w); + m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param); } } @@ -481,7 +494,7 @@ void FrameEncoder::compressFrame() for (int i = 0; i < m_top->m_numThreadLocalData; i++) { NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; - memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); + memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES); memset(nr->residualSum, 0, sizeof(uint32_t) * 
MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); } @@ -569,7 +582,6 @@ void FrameEncoder::encodeSlice() void FrameEncoder::compressCTURows() { - PPAScopeEvent(FrameEncoder_compressRows); Slice* slice = m_frame->m_encData->m_slice; m_bAllRowsStop = false; @@ -643,12 +655,12 @@ void FrameEncoder::compressCTURows() } } - processRow(i * 2 + 0, -1); + processRowEncoder(i, *m_tld); } // Filter if (i >= m_filterRowDelay) - processRow((i - m_filterRowDelay) * 2 + 1, -1); + m_frameFilter.processRow(i - m_filterRowDelay); } } m_frameTime = (double)m_totalTime / 1000000; @@ -666,7 +678,7 @@ void FrameEncoder::processRow(int row, int threadId) processRowEncoder(realRow, tld); else { - processRowFilter(realRow); + m_frameFilter.processRow(realRow); // NOTE: Active next row if (realRow != m_numRows - 1) @@ -679,8 +691,6 @@ void FrameEncoder::processRow(int row, int threadId) // Called by worker threads void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) { - PPAScopeEvent(Thread_ProcessRow); - CTURow& curRow = m_rows[row]; { @@ -707,9 +717,6 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) Entropy& rowCoder = m_param->bEnableWavefront ? 
m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder; FrameData& curEncData = *m_frame->m_encData; Slice *slice = curEncData.m_slice; - PicYuv* fencPic = m_frame->m_origPicYuv; - - tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride); int64_t startTime = x265_mdate(); const uint32_t numCols = m_numCols; @@ -718,6 +725,8 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) while (curRow.completed < numCols) { + ProfileScopeEvent(encodeCTU); + int col = curRow.completed; const uint32_t cuAddr = lineStartCUAddr + col; CUData* ctu = curEncData.getPicCTU(cuAddr); @@ -744,7 +753,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp); tld.analysis.setQP(*slice, qp); qp = Clip3(QP_MIN, QP_MAX_SPEC, qp); - ctu->setQPSubParts((char)qp, 0, 0); + ctu->setQPSubParts((int8_t)qp, 0, 0); curEncData.m_rowStat[row].sumQpAq += qp; } else @@ -758,7 +767,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) } // Does all the CU analysis, returns best top level mode decision - Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); + Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); /* advance top-level row coder to include the context of this CTU. 
* if SAO is disabled, rowCoder writes the final CTU bitstream */ @@ -839,9 +848,13 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) if (dequeueRow(r * 2)) stopRow.active = false; else + { + /* we must release the row lock to allow the thread to exit */ + stopRow.lock.release(); GIVE_UP_TIME(); + stopRow.lock.acquire(); + } } - stopRow.lock.release(); bool bRowBusy = true; @@ -937,19 +950,22 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) m_top->m_rateControl->rateControlUpdateStats(&m_rce); } - // trigger row-wise loop filters - if (row >= m_filterRowDelay) + if (m_param->bEnableWavefront) { - enableRowFilter(row - m_filterRowDelay); + /* trigger row-wise loop filters */ + if (row >= m_filterRowDelay) + { + enableRowFilter(row - m_filterRowDelay); - // NOTE: Active Filter to first row (row 0) - if (row == m_filterRowDelay) - enqueueRowFilter(0); - } - if (row == m_numRows - 1) - { - for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++) - enableRowFilter(i); + /* NOTE: Activate filter if first row (row 0) */ + if (row == m_filterRowDelay) + enqueueRowFilter(0); + } + if (row == m_numRows - 1) + { + for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++) + enableRowFilter(i); + } } m_totalTime += x265_mdate() - startTime; @@ -971,13 +987,13 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu) log->cntIntra[depth]++; log->qTreeIntraCnt[depth]++; - if (ctu.m_partSize[absPartIdx] == SIZE_NONE) + if (ctu.m_predMode[absPartIdx] == MODE_NONE) { log->totalCu--; log->cntIntra[depth]--; log->qTreeIntraCnt[depth]--; } - else if (ctu.m_partSize[absPartIdx] == SIZE_NxN) + else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) { /* TODO: log intra modes at absPartIdx +0 to +3 */ X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); @@ -1000,7 +1016,7 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu) log->totalCu++; log->cntTotalCu[depth]++; - if (ctu.m_partSize[absPartIdx] == SIZE_NONE) + if 
(ctu.m_predMode[absPartIdx] == MODE_NONE) { log->totalCu--; log->cntTotalCu[depth]--; @@ -1011,7 +1027,7 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu) log->cntSkipCu[depth]++; log->qTreeSkipCnt[depth]++; } - else if (ctu.m_predMode[absPartIdx] == MODE_INTER) + else if (ctu.isInter(absPartIdx)) { log->cntInter[depth]++; log->qTreeInterCnt[depth]++; @@ -1021,12 +1037,12 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu) else log->cuInterDistribution[depth][AMP_ID]++; } - else if (ctu.m_predMode[absPartIdx] == MODE_INTRA) + else if (ctu.isIntra(absPartIdx)) { log->cntIntra[depth]++; log->qTreeIntraCnt[depth]++; - if (ctu.m_partSize[absPartIdx] == SIZE_NxN) + if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) { X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); log->cntIntraNxN++; @@ -1061,7 +1077,8 @@ void FrameEncoder::noiseReductionUpdate() m_nr->count[cat] >>= 1; } - uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat]; + int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter; + uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat]; for (int i = 0; i < coefCount; i++) { @@ -1091,8 +1108,8 @@ int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp) /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. 
*/ double qp_offset = 0; - uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16; - uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16; + uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16; + uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16; uint32_t noOfBlocks = g_maxCUSize / 16; uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks; uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth; diff --git a/source/encoder/frameencoder.h b/source/encoder/frameencoder.h index 625c025..ec5cf5b 100644 --- a/source/encoder/frameencoder.h +++ b/source/encoder/frameencoder.h @@ -185,7 +185,7 @@ public: protected: - bool initializeGeoms(const FrameData& encData); + bool initializeGeoms(); /* analyze / compress frame, can be run in parallel within reference constraints */ void compressFrame(); @@ -204,7 +204,6 @@ protected: /* Called by WaveFront::findJob() */ void processRow(int row, int threadId); void processRowEncoder(int row, ThreadLocalData& tld); - void processRowFilter(int row) { m_frameFilter.processRow(row); } void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); } void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); } diff --git a/source/encoder/framefilter.cpp b/source/encoder/framefilter.cpp index aee75c6..f6c233d 100644 --- a/source/encoder/framefilter.cpp +++ b/source/encoder/framefilter.cpp @@ -29,7 +29,6 @@ #include "framefilter.h" #include "frameencoder.h" #include "wavefront.h" -#include "PPA/ppa.h" using namespace x265; @@ -84,7 +83,7 @@ void FrameFilter::start(Frame *frame, Entropy& initState, int qp) void FrameFilter::processRow(int row) { - PPAScopeEvent(Thread_filterCU); + ProfileScopeEvent(filterCTURow); if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO) { @@ -100,19 +99,19 @@ void FrameFilter::processRow(int row) for (uint32_t col = 0; col < numCols; 
col++) { uint32_t cuAddr = lineStartCUAddr + col; - CUData* cu = encData.getPicCTU(cuAddr); + const CUData* ctu = encData.getPicCTU(cuAddr); - m_deblock.deblockCTU(cu, Deblock::EDGE_VER); + m_deblock.deblockCTU(ctu, Deblock::EDGE_VER); if (col > 0) { - CUData* cuPrev = encData.getPicCTU(cuAddr - 1); - m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR); + const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1); + m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR); } } - CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1); - m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR); + const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1); + m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR); } // SAO @@ -156,7 +155,7 @@ uint32_t FrameFilter::getCUHeight(int rowNum) const void FrameFilter::processRowPost(int row) { - PicYuv *reconPic = m_frame->m_reconPicYuv; + PicYuv *reconPic = m_frame->m_reconPic; const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth; const uint32_t lineStartCUAddr = row * numCols; const int realH = getCUHeight(row); @@ -209,19 +208,19 @@ void FrameFilter::processRowPost(int row) uint32_t cuAddr = lineStartCUAddr; if (m_param->bEnablePsnr) { - PicYuv* origPic = m_frame->m_origPicYuv; + PicYuv* fencPic = m_frame->m_fencPic; intptr_t stride = reconPic->m_stride; uint32_t width = reconPic->m_picWidth - m_pad[0]; uint32_t height = getCUHeight(row); - uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height); + uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height); height >>= m_vChromaShift; width >>= m_hChromaShift; stride = reconPic->m_strideC; - uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height); - uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height); + uint64_t ssdU = 
computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height); + uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height); m_frameEncoder->m_SSDY += ssdY; m_frameEncoder->m_SSDU += ssdU; @@ -229,10 +228,10 @@ void FrameFilter::processRowPost(int row) } if (m_param->bEnableSsim && m_ssimBuf) { - pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0]; - pixel *org = m_frame->m_origPicYuv->m_picOrg[0]; - intptr_t stride1 = m_frame->m_origPicYuv->m_stride; - intptr_t stride2 = m_frame->m_reconPicYuv->m_stride; + pixel *rec = m_frame->m_reconPic->m_picOrg[0]; + pixel *fenc = m_frame->m_fencPic->m_picOrg[0]; + intptr_t stride1 = m_frame->m_fencPic->m_stride; + intptr_t stride2 = m_frame->m_reconPic->m_stride; uint32_t bEnd = ((row + 1) == (this->m_numRows - 1)); uint32_t bStart = (row == 0); uint32_t minPixY = row * g_maxCUSize - 4 * !bStart; @@ -243,7 +242,7 @@ void FrameFilter::processRowPost(int row) /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right * to avoid alignment of ssim blocks with DCT blocks. */ minPixY += bStart ? 
2 : -6; - m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2, + m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2, m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt); m_frameEncoder->m_ssimCnt += ssim_cnt; } @@ -422,8 +421,8 @@ static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absP uint32_t size = g_maxCUSize >> depth; int part = partitionFromSizes(size, size); - PicYuv* reconPic = frame.m_reconPicYuv; - PicYuv* fencPic = frame.m_origPicYuv; + PicYuv* reconPic = frame.m_reconPic; + PicYuv* fencPic = frame.m_fencPic; pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx); pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx); diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp index f6129ff..61376ac 100644 --- a/source/encoder/motion.cpp +++ b/source/encoder/motion.cpp @@ -34,6 +34,7 @@ using namespace x265; namespace { + struct SubpelWorkload { int hpel_iters; @@ -43,7 +44,7 @@ struct SubpelWorkload bool hpel_satd; }; -SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] = +const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] = { { 1, 4, 0, 4, false }, // 4 SAD HPEL only { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL @@ -54,15 +55,14 @@ SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] = { 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL { 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL }; -} -static int size_scale[NUM_LUMA_PARTITIONS]; -#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum]))) +int sizeScale[NUM_LUMA_PARTITIONS]; +#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum]))) -static void init_scales(void) +void initScales(void) { #define SETUP_SCALE(W, H) \ - size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4; + sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4; SETUP_SCALE(4, 4); SETUP_SCALE(8, 8); SETUP_SCALE(8, 4); @@ -91,51 
+91,18 @@ static void init_scales(void) #undef SETUP_SCALE } -MotionEstimate::MotionEstimate() - : searchMethod(3) - , subpelRefine(5) -{ - if (size_scale[0] == 0) - init_scales(); - - fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE); -} - -MotionEstimate::~MotionEstimate() -{ - X265_FREE(fenc); -} - -void MotionEstimate::setSourcePU(intptr_t offset, int width, int height) -{ - partEnum = partitionFromSizes(width, height); - X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); - sad = primitives.sad[partEnum]; - satd = primitives.satd[partEnum]; - sa8d = primitives.sa8d_inter[partEnum]; - sad_x3 = primitives.sad_x3[partEnum]; - sad_x4 = primitives.sad_x4[partEnum]; - - blockwidth = width; - blockheight = height; - blockOffset = offset; - - /* copy PU block into cache */ - primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride); -} - /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */ -static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; -static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */ -static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; -static const MV hex4[16] = +const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; +const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */ +const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; +const MV hex4[16] = { - MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3), + MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3), MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1), - MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1), + MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1), MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3), }; -static const MV offsets[] = +const MV offsets[] = { MV(-1, 
0), MV(0, -1), MV(-1, -1), MV(1, -1), @@ -147,8 +114,8 @@ static const MV offsets[] = MV(1, 0), MV(0, 1), }; // offsets for Two Point Search -/* sum of absolute differences between MV candidates */ -static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates) +/* sum of absolute differences between MV candidates, used for adaptive ME range */ +inline int predictorDifference(const MV *mvc, intptr_t numCandidates) { int sum = 0; @@ -161,6 +128,77 @@ static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidate return sum; } +} + +MotionEstimate::MotionEstimate() +{ + ctuAddr = -1; + absPartIdx = -1; + searchMethod = X265_HEX_SEARCH; + subpelRefine = 2; + bChromaSATD = false; + chromaSatd = NULL; +} + +void MotionEstimate::init(int method, int refine, int csp) +{ + if (!sizeScale[0]) + initScales(); + + searchMethod = method; + subpelRefine = refine; + fencPUYuv.create(FENC_STRIDE, csp); +} + +MotionEstimate::~MotionEstimate() +{ + fencPUYuv.destroy(); +} + +/* Called by lookahead, luma only, no use of PicYuv */ +void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight) +{ + partEnum = partitionFromSizes(pwidth, pheight); + X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); + sad = primitives.sad[partEnum]; + satd = primitives.satd[partEnum]; + sad_x3 = primitives.sad_x3[partEnum]; + sad_x4 = primitives.sad_x4[partEnum]; + + blockwidth = pwidth; + blockOffset = offset; + absPartIdx = ctuAddr = -1; + + /* copy PU block into cache */ + primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride); + X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n"); +} + +/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */ +void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight) +{ + partEnum = 
partitionFromSizes(pwidth, pheight); + X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); + sad = primitives.sad[partEnum]; + satd = primitives.satd[partEnum]; + sad_x3 = primitives.sad_x3[partEnum]; + sad_x4 = primitives.sad_x4[partEnum]; + chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum]; + + /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size + * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */ + bChromaSATD = subpelRefine > 2 && chromaSatd; + X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n"); + + ctuAddr = _ctuAddr; + absPartIdx = cuPartIdx + puPartIdx; + blockwidth = pwidth; + blockOffset = 0; + + /* copy PU from CU Yuv */ + fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD); +} + #define COST_MV_PT_DIST(mx, my, point, dist) \ do \ { \ @@ -291,8 +329,9 @@ void MotionEstimate::StarPatternSearch(ReferencePlanes *ref, int merange) { ALIGN_VAR_16(int, costs[16]); - pixel *fref = ref->fpelPlane + blockOffset; - size_t stride = ref->lumaStride; + pixel* fenc = fencPUYuv.m_buf[0]; + pixel* fref = ref->fpelPlane[0] + blockOffset; + intptr_t stride = ref->lumaStride; MV omv = bmv; int saved = bcost; @@ -532,8 +571,11 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref, MV & outQMv) { ALIGN_VAR_16(int, costs[16]); - size_t stride = ref->lumaStride; - pixel *fref = ref->fpelPlane + blockOffset; + if (ctuAddr >= 0) + blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0); + intptr_t stride = ref->lumaStride; + pixel* fenc = fencPUYuv.m_buf[0]; + pixel* fref = ref->fpelPlane[0] + blockOffset; setMVP(qmvp); @@ -561,9 +603,7 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref, MV bmv = pmv.roundToFPel(); int bcost = bprecost; if (pmv.isSubpel()) - { bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2); - } 
// measure SAD cost at MV(0) if MVP is not zero if (pmv.notZero()) @@ -577,21 +617,35 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref, } // measure SAD cost at each QPEL motion vector candidate - for (int i = 0; i < numCandidates; i++) + if (ref->isLowres) { - MV m = mvc[i].clipped(qmvmin, qmvmax); - if (m.notZero() && m != pmv && m != bestpre) // check already measured + for (int i = 0; i < numCandidates; i++) { - int cost; - if (ref->isLowres) - cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m); - else - cost = subpelCompare(ref, m, sad) + mvcost(m); - - if (cost < bprecost) + MV m = mvc[i].clipped(qmvmin, qmvmax); + if (m.notZero() && m != pmv && m != bestpre) // check already measured { - bprecost = cost; - bestpre = m; + int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m); + if (cost < bprecost) + { + bprecost = cost; + bestpre = m; + } + } + } + } + else + { + for (int i = 0; i < numCandidates; i++) + { + MV m = mvc[i].clipped(qmvmin, qmvmax); + if (m.notZero() && m != pmv && m != bestpre) // check already measured + { + int cost = subpelCompare(ref, m, sad) + mvcost(m); + if (cost < bprecost) + { + bprecost = cost; + bestpre = m; + } } } } @@ -780,7 +834,7 @@ me_hex2: mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); denom++; } - mvd += x265_predictor_difference(mvc, numCandidates); + mvd += predictorDifference(mvc, numCandidates); } sad_ctx = SAD_THRESH(1000) ? 
0 @@ -1043,7 +1097,7 @@ me_hex2: else bmv = bmv.toQPel(); // promote search bmv to qpel - SubpelWorkload& wl = workload[this->subpelRefine]; + const SubpelWorkload& wl = workload[this->subpelRefine]; if (!bcost) { @@ -1053,11 +1107,11 @@ me_hex2: } else if (ref->isLowres) { - int bdir = 0, cost; + int bdir = 0; for (int i = 1; i <= wl.hpel_dirs; i++) { MV qmv = bmv + square1[i] * 2; - cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); + int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); COPY2_IF_LT(bcost, cost, bdir, i); } @@ -1068,7 +1122,7 @@ me_hex2: for (int i = 1; i <= wl.qpel_dirs; i++) { MV qmv = bmv + square1[i]; - cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); + int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); COPY2_IF_LT(bcost, cost, bdir, i); } @@ -1088,11 +1142,11 @@ me_hex2: for (int iter = 0; iter < wl.hpel_iters; iter++) { - int bdir = 0, cost; + int bdir = 0; for (int i = 1; i <= wl.hpel_dirs; i++) { MV qmv = bmv + square1[i] * 2; - cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); + int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); COPY2_IF_LT(bcost, cost, bdir, i); } @@ -1108,11 +1162,11 @@ me_hex2: for (int iter = 0; iter < wl.qpel_iters; iter++) { - int bdir = 0, cost; + int bdir = 0; for (int i = 1; i <= wl.qpel_dirs; i++) { MV qmv = bmv + square1[i]; - cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); + int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); COPY2_IF_LT(bcost, cost, bdir, i); } @@ -1130,40 +1184,100 @@ me_hex2: int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp) { + intptr_t refStride = ref->lumaStride; + pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride; int xFrac = qmv.x & 0x3; int yFrac = qmv.y & 0x3; + int cost; + intptr_t lclStride = fencPUYuv.m_size; + X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by 
sad_x3 and sad_x4\n"); - if ((yFrac | xFrac) == 0) - { - pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride; - return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride); - } + if (!(yFrac | xFrac)) + cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride); else { - /* We are taking a short-cut here if the reference is weighted. To be + /* we are taking a short-cut here if the reference is weighted. To be * accurate we should be interpolating unweighted pixels and weighting - * the final 16bit values prior to rounding and downshifting. Instead we + * the final 16bit values prior to rounding and down shifting. Instead we * are simply interpolating the weighted full-pel pixels. Not 100% * accurate but good enough for fast qpel ME */ ALIGN_VAR_32(pixel, subpelbuf[64 * 64]); - pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride; - if (yFrac == 0) + if (!yFrac) + primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac); + else if (!xFrac) + primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac); + else { - primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac); + ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]); + + int filterSize = NTAPS_LUMA; + int halfFilterSize = filterSize >> 1; + primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1); + primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac); } - else if (xFrac == 0) + cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride); + } + + if (bChromaSATD) + { + int csp = fencPUYuv.m_csp; + int hshift = fencPUYuv.m_hChromaShift; + int vshift = fencPUYuv.m_vChromaShift; + int shiftHor = (2 + hshift); + int shiftVer = (2 + vshift); + lclStride = fencPUYuv.m_csize; + + intptr_t refStrideC = ref->reconPic->m_strideC; + intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * 
refStrideC; + + const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset; + const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset; + + xFrac = qmv.x & ((1 << shiftHor) - 1); + yFrac = qmv.y & ((1 << shiftVer) - 1); + + if (!(yFrac | xFrac)) { - primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac); + cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC); + cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC); } else { - ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]); + ALIGN_VAR_32(pixel, subpelbuf[64 * 64]); + if (!yFrac) + { + primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift)); + cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride); - int filterSize = NTAPS_LUMA; - int halfFilterSize = filterSize >> 1; - primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1); - primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac); + primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift)); + cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride); + } + else if (!xFrac) + { + primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift)); + cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride); + + primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift)); + cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride); + } + else + { + ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]); + + int extStride = blockwidth >> hshift; + int filterSize = NTAPS_CHROMA; + int halfFilterSize = (filterSize >> 1); + + primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1); + 
primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift)); + cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride); + + primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1); + primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift)); + cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride); + } } - return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE); } + + return cost; } diff --git a/source/encoder/motion.h b/source/encoder/motion.h index 51687f5..a5fddd7 100644 --- a/source/encoder/motion.h +++ b/source/encoder/motion.h @@ -28,6 +28,7 @@ #include "reference.h" #include "mv.h" #include "bitcost.h" +#include "yuv.h" namespace x265 { // private x265 namespace @@ -36,63 +37,59 @@ class MotionEstimate : public BitCost { protected: - /* Aligned copy of original pixels, extra room for manual alignment */ - pixel *fencplane; - intptr_t fencLumaStride; - - pixelcmp_t sad; - pixelcmp_t satd; - pixelcmp_t sa8d; - pixelcmp_x3_t sad_x3; - pixelcmp_x4_t sad_x4; - intptr_t blockOffset; - int partEnum; + + int ctuAddr; + int absPartIdx; // part index of PU, including CU offset within CTU + int searchMethod; int subpelRefine; - /* subpel generation buffers */ int blockwidth; int blockheight; + pixelcmp_t sad; + pixelcmp_x3_t sad_x3; + pixelcmp_x4_t sad_x4; + pixelcmp_t satd; + pixelcmp_t chromaSatd; + MotionEstimate& operator =(const MotionEstimate&); public: static const int COST_MAX = 1 << 28; - pixel *fenc; + Yuv fencPUYuv; + int partEnum; + bool bChromaSATD; MotionEstimate(); - ~MotionEstimate(); - void setSearchMethod(int i) { searchMethod = i; } - - void setSubpelRefine(int i) { subpelRefine = i; } + void init(int method, int refine, int csp); /* Methods called at slice setup */ - void setSourcePlane(pixel *Y, intptr_t 
luma) - { - fencplane = Y; - fencLumaStride = luma; - } - - void setSourcePU(intptr_t offset, int pwidth, int pheight); + void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight); + void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight); /* buf*() and motionEstimate() methods all use cached fenc pixels and thus * require setSourcePU() to be called prior. */ - inline int bufSAD(pixel *fref, intptr_t stride) { return sad(fenc, FENC_STRIDE, fref, stride); } + inline int bufSAD(const pixel* fref, intptr_t stride) { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); } - inline int bufSA8D(pixel *fref, intptr_t stride) { return sa8d(fenc, FENC_STRIDE, fref, stride); } + inline int bufSATD(const pixel* fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); } - inline int bufSATD(pixel *fref, intptr_t stride) { return satd(fenc, FENC_STRIDE, fref, stride); } + inline int bufChromaSATD(const Yuv& refYuv, int puPartIdx) + { + return chromaSatd(refYuv.getCbAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[1], fencPUYuv.m_csize) + + chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize); + } - int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv); + int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv); - int subpelCompare(ReferencePlanes * ref, const MV &qmv, pixelcmp_t); + int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t); protected: diff --git a/source/encoder/nal.cpp b/source/encoder/nal.cpp index c38c651..b0bc4c7 100644 --- a/source/encoder/nal.cpp +++ b/source/encoder/nal.cpp @@ -193,12 +193,10 @@ uint32_t NALList::serializeSubstreams(uint32_t* streamSizeBytes, uint32_t stream { for (uint32_t i = 
0; i < inSize; i++) { - if (bytes > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03) + if (bytes >= 2 && !out[bytes - 2] && !out[bytes - 1] && inBytes[i] <= 0x03) { /* inject 0x03 to prevent emulating a start code */ - out[bytes] = out[bytes - 1]; - out[bytes - 1] = 0x03; - bytes++; + out[bytes++] = 3; } out[bytes++] = inBytes[i]; diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp index f54b101..4f31fe1 100644 --- a/source/encoder/ratecontrol.cpp +++ b/source/encoder/ratecontrol.cpp @@ -40,8 +40,7 @@ using namespace x265; /* Amortize the partial cost of I frames over the next N frames */ -const double RateControl::s_amortizeFraction = 0.85; -const int RateControl::s_amortizeFrames = 75; + const int RateControl::s_slidingWindowFrames = 20; const char *RateControl::s_defaultStatFileName = "x265_2pass.log"; @@ -173,8 +172,8 @@ static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcSt /* Find the total AC energy of each block in all planes */ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y) { - intptr_t stride = curFrame->m_origPicYuv->m_stride; - intptr_t cStride = curFrame->m_origPicYuv->m_strideC; + intptr_t stride = curFrame->m_fencPic->m_stride; + intptr_t cStride = curFrame->m_fencPic->m_strideC; intptr_t blockOffsetLuma = block_x + (block_y * stride); int colorFormat = m_param->internalCsp; int hShift = CHROMA_H_SHIFT(colorFormat); @@ -183,9 +182,9 @@ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t blo uint32_t var; - var = acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat); - var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat); - var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat); + var = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + 
blockOffsetLuma, stride, 0, colorFormat); + var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat); + var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat); x265_emms(); return var; } @@ -193,8 +192,8 @@ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t blo void RateControl::calcAdaptiveQuantFrame(Frame *curFrame) { /* Actual adaptive quantization */ - int maxCol = curFrame->m_origPicYuv->m_picWidth; - int maxRow = curFrame->m_origPicYuv->m_picHeight; + int maxCol = curFrame->m_fencPic->m_picWidth; + int maxRow = curFrame->m_fencPic->m_picHeight; for (int y = 0; y < 3; y++) { @@ -323,6 +322,12 @@ RateControl::RateControl(x265_param *p) m_bTerminated = false; m_finalFrameCount = 0; m_numEntries = 0; + m_amortizeFraction = 0.85; + m_amortizeFrames = 75; + if (m_param->totalFrames <= 2 * m_fps) + { + m_amortizeFraction = m_amortizeFrames = 0; + } if (m_param->rc.rateControlMode == X265_RC_CRF) { m_param->rc.qp = (int)m_param->rc.rfConstant; @@ -494,12 +499,12 @@ bool RateControl::init(const SPS *sps) /* Frame Predictors and Row predictors used in vbv */ for (int i = 0; i < 5; i++) { - m_pred[i].coeff = 2.0; + m_pred[i].coeff = 1.5; m_pred[i].count = 1.0; m_pred[i].decay = 0.5; m_pred[i].offset = 0.0; } - m_predBfromP = m_pred[0]; + m_pred[0].coeff = 1.0; if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead)) { /* If the user hasn't defined the stat filename, use the default value */ @@ -1157,7 +1162,7 @@ int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encode rce->leadingNoBSatd = m_leadingNoBSatd; if (curFrame->m_forceqp) { - m_qp = int32_t(curFrame->m_forceqp + 0.5) - 1; + m_qp = (int32_t)(curFrame->m_forceqp + 0.5) - 1; m_qp = Clip3(QP_MIN, QP_MAX_MAX, m_qp); rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp; } @@ -1343,6 +1348,25 @@ fail: return false; } +double 
RateControl::tuneAbrQScaleFromFeedback(double qScale) +{ + double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate; + if (m_currentSatd) + { + /* use framesDone instead of POC as poc count is not serial with bframes enabled */ + double overflow = 1.0; + double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration; + double wantedBits = timeDone * m_bitrate; + if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames) + { + abrBuffer *= X265_MAX(1, sqrt(timeDone)); + overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer); + qScale *= overflow; + } + } + return qScale; +} + double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce) { double q; @@ -1415,17 +1439,25 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce) q += m_pbOffset / 2; else q += m_pbOffset; - rce->qpNoVbv = q; - double qScale = x265_qp2qScale(q); - if (!m_2pass && m_isVbv) + double qScale = x265_qp2qScale(q); + if (m_isCbr) { - if (m_leadingBframes > 5) + qScale = tuneAbrQScaleFromFeedback(qScale); + if (!m_isAbrReset) { - qScale = clipQscale(curFrame, rce, qScale); - m_lastQScaleFor[m_sliceType] = qScale; + double lmin = m_lastQScaleFor[P_SLICE] / m_lstep; + double lmax = m_lastQScaleFor[P_SLICE] * m_lstep; + qScale = Clip3(lmin, lmax, qScale); } - rce->frameSizePlanned = predictSize(&m_predBfromP, qScale, (double)m_leadingNoBSatd); + q = x265_qScale2qp(qScale); + } + rce->qpNoVbv = q; + if (!m_2pass && m_isVbv) + { + qScale = clipQscale(curFrame, rce, qScale); + m_lastQScaleFor[m_sliceType] = qScale; + rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], qScale, (double)m_currentSatd); } else if (m_2pass && m_isVbv) { @@ -1508,7 +1540,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce) * tradeoff between quality and bitrate precision. But at large * tolerances, the bit distribution approaches that of 2pass. 
*/ - double wantedBits, overflow = 1; + double overflow = 1; m_shortTermCplxSum *= 0.5; m_shortTermCplxCount *= 0.5; @@ -1528,25 +1560,10 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce) { if (!m_param->rc.bStatRead) checkAndResetABR(rce, false); - q = getQScale(rce, m_wantedBitsWindow / m_cplxrSum); - - /* ABR code can potentially be counterproductive in CBR, so just - * don't bother. Don't run it if the frame complexity is zero - * either. */ - if (!m_isCbr && m_currentSatd) - { - /* use framesDone instead of POC as poc count is not serial with bframes enabled */ - double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration; - wantedBits = timeDone * m_bitrate; - if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames) - { - abrBuffer *= X265_MAX(1, sqrt(timeDone)); - overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer); - q *= overflow; - } - } + double initialQScale = getQScale(rce, m_wantedBitsWindow / m_cplxrSum); + q = tuneAbrQScaleFromFeedback(initialQScale); + overflow = q / initialQScale; } - if (m_sliceType == I_SLICE && m_param->keyframeMax > 1 && m_lastNonBPictType != I_SLICE && !m_isAbrReset) { @@ -1574,7 +1591,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce) { q = x265_qp2qScale(CRF_INIT_QP) / fabs(m_param->rc.ipFactor); } - else if (m_framesDone == 0 && !m_isVbv) + else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR) { /* for ABR alone, clip the first I frame qp */ double lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep; @@ -1615,8 +1632,8 @@ void RateControl::rateControlUpdateStats(RateControlEntry* rce) if (m_partialResidualFrames) rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames; - m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax); - m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames); + 
m_partialResidualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax); + m_partialResidualCost = m_partialResidualFrames ? (int)((rce->rowTotalBits * m_amortizeFraction) / m_partialResidualFrames) : 0; /* amortization may be disabled (0 frames); avoid 0/0 -> NaN -> int */ rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames; } else if (m_partialResidualFrames) @@ -1725,13 +1742,11 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q) { double frameQ[3]; double curBits; - if (m_sliceType == B_SLICE) - curBits = predictSize(&m_predBfromP, q, (double)m_currentSatd); - else - curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); double bufferFillCur = m_bufferFill - curBits; double targetFill; - double totalDuration = 0; + double totalDuration = m_frameDuration; + bool isIFramePresent = m_sliceType == I_SLICE ? true : false; frameQ[P_SLICE] = m_sliceType == I_SLICE ? q * m_param->rc.ipFactor : (m_sliceType == B_SLICE ? q / m_param->rc.pbFactor : q); frameQ[B_SLICE] = frameQ[P_SLICE] * m_param->rc.pbFactor; frameQ[I_SLICE] = frameQ[P_SLICE] / m_param->rc.ipFactor; @@ -1747,20 +1762,23 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q) bufferFillCur += wantedFrameSize; int64_t satd = curFrame->m_lowres.plannedSatd[j] >> (X265_DEPTH - 8); type = IS_X265_TYPE_I(type) ? I_SLICE : IS_X265_TYPE_B(type) ? B_SLICE : P_SLICE; + if (type == I_SLICE) + isIFramePresent = true; curBits = predictSize(&m_pred[type], frameQ[type], (double)satd); bufferFillCur -= curBits; } - /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */ - targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * 0.5); + /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */ + double tol = isIFramePresent ? 1 / totalDuration : totalDuration < 0.5 ?
2 : 1; + targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5 , m_bufferSize * (1 - 0.8 * totalDuration * tol)); if (bufferFillCur < targetFill) { q *= 1.01; loopTerminate |= 1; continue; } - /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */ - targetFill = Clip3(m_bufferSize * 0.8, m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5); + /* Try to get the buffer atleast 50% filled, but don't set an impossible goal. */ + targetFill = Clip3(m_bufferSize - (m_bufferSize * totalDuration * 0.5), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5); if (m_isCbr && bufferFillCur > targetFill) { q /= 1.01; @@ -1810,25 +1828,6 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q) if (pbits > rce->frameSizeMaximum) q *= pbits / rce->frameSizeMaximum; - // Check B-frame complexity, and use up any bits that would - // overflow before the next P-frame. - if (m_leadingBframes <= 5 && m_sliceType == P_SLICE && !m_singleFrameVbv) - { - int nb = m_leadingBframes; - double bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); - double bbits = predictSize(&m_predBfromP, q * m_param->rc.pbFactor, (double)m_currentSatd); - double space; - if (bbits > m_bufferRate) - nb = 0; - double pbbits = nb * bbits; - - space = m_bufferFill + (1 + nb) * m_bufferRate - m_bufferSize; - if (pbbits < space) - q *= X265_MAX(pbbits / space, bits / (0.5 * m_bufferSize)); - - q = X265_MAX(q0 / 2, q); - } - if (!m_isCbr || (m_isAbr && m_currentSatd >= rce->movingAvgSum && q <= q0 / 2)) q = X265_MAX(q0, q); @@ -1899,22 +1898,24 @@ double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, d && refQScale > 0 && refRowSatdCost > 0) { - if (abs(int32_t(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2) + if (abs((int32_t)(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2) { double predTotal = refRowBits * 
satdCostForPendingCus / refRowSatdCost * refQScale / qScale; - totalSatdBits += int32_t((pred_s + predTotal) * 0.5); + totalSatdBits += (int32_t)((pred_s + predTotal) * 0.5); continue; } } - totalSatdBits += int32_t(pred_s); + totalSatdBits += (int32_t)pred_s; } - else + else if (picType == P_SLICE) { /* Our QP is lower than the reference! */ double pred_intra = predictSize(rce->rowPred[1], qScale, intraCost); /* Sum: better to overestimate than underestimate by using only one of the two predictors. */ - totalSatdBits += int32_t(pred_intra + pred_s); + totalSatdBits += (int32_t)(pred_intra + pred_s); } + else + totalSatdBits += (int32_t)pred_s; } } @@ -1969,16 +1970,8 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo if (row < sps.numCuInHeight - 1) { - /* B-frames shouldn't use lower QP than their reference frames. */ - if (rce->sliceType == B_SLICE) - { - Frame* refSlice1 = curEncData.m_slice->m_refPicList[0][0]; - Frame* refSlice2 = curEncData.m_slice->m_refPicList[1][0]; - qpMin = X265_MAX(qpMin, X265_MAX(refSlice1->m_encData->m_rowStat[row].diagQp, refSlice2->m_encData->m_rowStat[row].diagQp)); - qpVbv = X265_MAX(qpVbv, qpMin); - } /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. 
*/ - double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_param->rc.rateTolerance; + double rcTol = (bufferLeftPlanned * 0.2) / m_param->frameNumThreads * m_param->rc.rateTolerance; int32_t encodedBitsSoFar = 0; double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar); @@ -1996,7 +1989,7 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo while (qpVbv < qpMax && ((accFrameBits > rce->frameSizePlanned + rcTol) || - (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) || + (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.2) || (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv))) { qpVbv += stepSize; @@ -2085,7 +2078,7 @@ void RateControl::updatePredictor(Predictor *p, double q, double var, double bit { if (var < 10) return; - const double range = 1.5; + const double range = 2; double old_coeff = p->coeff / p->count; double new_coeff = bits * q / var; double new_coeff_clipped = Clip3(old_coeff / range, old_coeff * range, new_coeff); @@ -2220,8 +2213,8 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* /* previous I still had a residual; roll it into the new loan */ if (m_residualFrames) bits += m_residualCost * m_residualFrames; - m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax); - m_residualCost = (int)((bits * s_amortizeFraction) / m_residualFrames); + m_residualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax); + m_residualCost = m_residualFrames ? (int)((bits * m_amortizeFraction) / m_residualFrames) : 0; /* amortization may be disabled (0 frames); avoid 0/0 -> NaN -> int */ bits -= m_residualCost * m_residualFrames; } else if (m_residualFrames) @@ -2257,16 +2250,6 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* if (m_isVbv) { - if (rce->sliceType == B_SLICE) - { - m_bframeBits += actualBits; - if (rce->bLastMiniGopBFrame) - { - if (rce->bframes != 0) - updatePredictor(&m_predBfromP, x265_qp2qScale(rce->qpaRc), (double)rce->leadingNoBSatd, (double)m_bframeBits /
rce->bframes); - m_bframeBits = 0; - } - } updateVbv(actualBits, rce); if (m_param->bEmitHRDSEI) diff --git a/source/encoder/ratecontrol.h b/source/encoder/ratecontrol.h index 5b86147..13e1701 100644 --- a/source/encoder/ratecontrol.h +++ b/source/encoder/ratecontrol.h @@ -233,11 +233,10 @@ public: void initHRD(SPS* sps); int rateControlSliceType(int frameNum); bool cuTreeReadFor2Pass(Frame* curFrame); + double tuneAbrQScaleFromFeedback(double qScale); protected: - static const double s_amortizeFraction; - static const int s_amortizeFrames; static const int s_slidingWindowFrames; static const char *s_defaultStatFileName; @@ -245,6 +244,8 @@ protected: int m_partialResidualFrames; int m_residualCost; int m_partialResidualCost; + int m_amortizeFrames; + double m_amortizeFraction; double getQScale(RateControlEntry *rce, double rateFactor); double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR diff --git a/source/encoder/rdcost.h b/source/encoder/rdcost.h index 10bfff3..8bc7f18 100644 --- a/source/encoder/rdcost.h +++ b/source/encoder/rdcost.h @@ -37,30 +37,37 @@ public: /* all weights and factors stored as FIX8 */ uint64_t m_lambda2; uint64_t m_lambda; - uint64_t m_cbDistortionWeight; - uint64_t m_crDistortionWeight; + uint32_t m_chromaDistWeight[2]; + uint32_t m_psyRdBase; uint32_t m_psyRd; int m_qp; - void setPsyRdScale(double scale) { m_psyRd = (uint32_t)floor(256.0 * scale * 0.33); } - void setCbDistortionWeight(uint16_t weightFix8) { m_cbDistortionWeight = weightFix8; } - void setCrDistortionWeight(uint16_t weightFix8) { m_crDistortionWeight = weightFix8; } + void setPsyRdScale(double scale) { m_psyRdBase = (uint32_t)floor(256.0 * scale * 0.33); } void setQP(const Slice& slice, int qp) { m_qp = qp; + int qpCb, qpCr; + /* Scale PSY RD factor by a slice type factor */ + static const uint32_t psyScaleFix8[3] = { 300, 256, 96 }; /* B, P, I */ + m_psyRd = (m_psyRdBase * psyScaleFix8[slice.m_sliceType]) >> 8; 
setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]); - - int qpCb = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCbQpOffset); + if (slice.m_sps->chromaFormatIdc == X265_CSP_I420) + qpCb = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[0]]); + else + qpCb = X265_MIN(qp + slice.m_pps->chromaQpOffset[0], QP_MAX_SPEC); int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET); uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256; - setCbDistortionWeight(lambdaOffset); + m_chromaDistWeight[0] = lambdaOffset; - int qpCr = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCrQpOffset); + if (slice.m_sps->chromaFormatIdc == X265_CSP_I420) /* Cr uses its own PPS offset, chromaQpOffset[1] */ + qpCr = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[1]]); + else + qpCr = X265_MIN(qp + slice.m_pps->chromaQpOffset[1], QP_MAX_SPEC); chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET); lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256; - setCrDistortionWeight(lambdaOffset); + m_chromaDistWeight[1] = lambdaOffset; } void setLambda(double lambda2, double lambda) @@ -72,18 +79,18 @@ public: inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const { X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2, - "calcRdCost wrap detected dist: %d, bits %d, lambda: %d\n", distortion, bits, (int)m_lambda2); + "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2); return distortion + ((bits * m_lambda2 + 128) >> 8); } /* return the difference in energy between the source block and the recon block */ - inline int psyCost(int size, pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) const + inline int psyCost(int size, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) const { return primitives.psy_cost_pp[size](source, sstride, recon, rstride); } /* return the difference in energy between the
source block and the recon block */ - inline int psyCost(int size, int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) const + inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const { return primitives.psy_cost_ss[size](source, sstride, recon, rstride); } @@ -97,22 +104,15 @@ public: inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const { X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda, - "calcRdSADCost wrap detected dist: %d, bits %d, lambda: "X265_LL"\n", sadCost, bits, m_lambda); + "calcRdSADCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", sadCost, bits, m_lambda); return sadCost + ((bits * m_lambda + 128) >> 8); } - inline uint32_t scaleChromaDistCb(uint32_t dist) const - { - X265_CHECK(dist <= (UINT64_MAX - 128) / m_cbDistortionWeight, - "scaleChromaDistCb wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_cbDistortionWeight); - return (uint32_t)(((dist * m_cbDistortionWeight) + 128) >> 8); - } - - inline uint32_t scaleChromaDistCr(uint32_t dist) const + inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const { - X265_CHECK(dist <= (UINT64_MAX - 128) / m_crDistortionWeight, - "scaleChromaDistCr wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_crDistortionWeight); - return (uint32_t)(((dist * m_crDistortionWeight) + 128) >> 8); + X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1], + "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]); + return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8); } inline uint32_t getCost(uint32_t bits) const diff --git a/source/encoder/reference.cpp b/source/encoder/reference.cpp index 958042b..425174c 100644 --- a/source/encoder/reference.cpp +++ b/source/encoder/reference.cpp @@ -33,86 +33,142 @@ using namespace x265; MotionReference::MotionReference() { - m_weightBuffer = NULL; + weightBuffer[0] = NULL; + weightBuffer[1] = 
NULL; + weightBuffer[2] = NULL; } -int MotionReference::init(PicYuv* recPic, WeightParam *w) +MotionReference::~MotionReference() +{ + X265_FREE(weightBuffer[0]); + X265_FREE(weightBuffer[1]); + X265_FREE(weightBuffer[2]); +} + +int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p) { - m_reconPic = recPic; + reconPic = recPic; + numWeightedRows = 0; lumaStride = recPic->m_stride; - intptr_t startpad = recPic->m_lumaMarginY * lumaStride + recPic->m_lumaMarginX; + chromaStride = recPic->m_strideC; + numInterpPlanes = p.subpelRefine > 2 ? 3 : 1; /* is chroma satd possible? */ - /* directly reference the pre-extended integer pel plane */ - fpelPlane = recPic->m_picBuf[0] + startpad; + /* directly reference the extended integer pel planes */ + fpelPlane[0] = recPic->m_picOrg[0]; + fpelPlane[1] = recPic->m_picOrg[1]; + fpelPlane[2] = recPic->m_picOrg[2]; isWeighted = false; - if (w) + if (wp) { - if (!m_weightBuffer) + uint32_t numCUinHeight = (reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; + + int marginX = reconPic->m_lumaMarginX; + int marginY = reconPic->m_lumaMarginY; + intptr_t stride = reconPic->m_stride; + int cuHeight = g_maxCUSize; + + for (int c = 0; c < numInterpPlanes; c++) { - uint32_t numCUinHeight = (recPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; - size_t padheight = (numCUinHeight * g_maxCUSize) + recPic->m_lumaMarginY * 2; - m_weightBuffer = X265_MALLOC(pixel, lumaStride * padheight); - if (!m_weightBuffer) - return -1; + if (c == 1) + { + marginX = reconPic->m_chromaMarginX; + marginY = reconPic->m_chromaMarginY; + stride = reconPic->m_strideC; + cuHeight >>= reconPic->m_vChromaShift; + } + + if (wp[c].bPresentFlag) + { + if (!weightBuffer[c]) + { + size_t padheight = (numCUinHeight * cuHeight) + marginY * 2; + weightBuffer[c] = X265_MALLOC(pixel, stride * padheight); + if (!weightBuffer[c]) + return -1; + } + + /* use our buffer which will have weighted pixels written to it */ + fpelPlane[c] = weightBuffer[c] 
+ marginY * stride + marginX; + X265_CHECK(recPic->m_picOrg[c] - recPic->m_picBuf[c] == marginY * stride + marginX, "PicYuv pad calculation mismatch\n"); + + w[c].weight = wp[c].inputWeight; + w[c].offset = wp[c].inputOffset * (1 << (X265_DEPTH - 8)); + w[c].shift = wp[c].log2WeightDenom; + w[c].round = w[c].shift ? 1 << (w[c].shift - 1) : 0; + } } isWeighted = true; - weight = w->inputWeight; - offset = w->inputOffset * (1 << (X265_DEPTH - 8)); - shift = w->log2WeightDenom; - round = shift ? 1 << (shift - 1) : 0; - m_numWeightedRows = 0; - - /* use our buffer which will have weighted pixels written to it */ - fpelPlane = m_weightBuffer + startpad; } return 0; } -MotionReference::~MotionReference() -{ - X265_FREE(m_weightBuffer); -} - -void MotionReference::applyWeight(int rows, int numRows) +void MotionReference::applyWeight(int finishedRows, int maxNumRows) { - rows = X265_MIN(rows, numRows); - if (m_numWeightedRows >= rows) + finishedRows = X265_MIN(finishedRows, maxNumRows); + if (numWeightedRows >= finishedRows) return; - int marginX = m_reconPic->m_lumaMarginX; - int marginY = m_reconPic->m_lumaMarginY; - pixel* src = (pixel*)m_reconPic->m_picOrg[0] + (m_numWeightedRows * (int)g_maxCUSize * lumaStride); - pixel* dst = fpelPlane + ((m_numWeightedRows * (int)g_maxCUSize) * lumaStride); - int width = m_reconPic->m_picWidth; - int height = ((rows - m_numWeightedRows) * g_maxCUSize); - if (rows == numRows) - height = ((m_reconPic->m_picHeight % g_maxCUSize) ? 
(m_reconPic->m_picHeight % g_maxCUSize) : g_maxCUSize); - - // Computing weighted CU rows - int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth - int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths - primitives.weight_pp(src, dst, lumaStride, padwidth, height, - weight, round << correction, shift + correction, offset); - - // Extending Left & Right - primitives.extendRowBorder(dst, lumaStride, width, height, marginX); - - // Extending Above - if (m_numWeightedRows == 0) + + int marginX = reconPic->m_lumaMarginX; + int marginY = reconPic->m_lumaMarginY; + intptr_t stride = reconPic->m_stride; + int width = reconPic->m_picWidth; + int height = (finishedRows - numWeightedRows) * g_maxCUSize; + if (finishedRows == maxNumRows && (reconPic->m_picHeight % g_maxCUSize)) { - pixel *pixY = fpelPlane - marginX; - for (int y = 0; y < marginY; y++) - memcpy(pixY - (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel)); + /* the last row may be partial height */ + height -= g_maxCUSize; + height += reconPic->m_picHeight % g_maxCUSize; } + int cuHeight = g_maxCUSize; - // Extending Bottom - if (rows == numRows) + for (int c = 0; c < numInterpPlanes; c++) { - pixel *pixY = fpelPlane - marginX + (m_reconPic->m_picHeight - 1) * lumaStride; - for (int y = 0; y < marginY; y++) - memcpy(pixY + (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel)); + if (c == 1) + { + marginX = reconPic->m_chromaMarginX; + marginY = reconPic->m_chromaMarginY; + stride = reconPic->m_strideC; + width >>= reconPic->m_hChromaShift; + height >>= reconPic->m_vChromaShift; + cuHeight >>= reconPic->m_vChromaShift; + } + + /* Do not generate weighted predictions if using original picture */ + if (fpelPlane[c] == reconPic->m_picOrg[c]) + continue; + + const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride; + pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride; + + // Computing weighted CU rows + int correction = 
IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth + int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths + primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset); + + // Extending Left & Right + primitives.extendRowBorder(dst, stride, width, height, marginX); + + // Extending Above + if (numWeightedRows == 0) + { + pixel *pixY = fpelPlane[c] - marginX; + for (int y = 0; y < marginY; y++) + memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel)); + } + + // Extending Bottom + if (finishedRows == maxNumRows) + { + int picHeight = reconPic->m_picHeight; + if (c) picHeight >>= reconPic->m_vChromaShift; + pixel *pixY = fpelPlane[c] - marginX + (picHeight - 1) * stride; + for (int y = 0; y < marginY; y++) + memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel)); + } } - m_numWeightedRows = rows; + + numWeightedRows = finishedRows; } diff --git a/source/encoder/reference.h b/source/encoder/reference.h index 3fb9afd..6b33499 100644 --- a/source/encoder/reference.h +++ b/source/encoder/reference.h @@ -25,13 +25,13 @@ #define X265_REFERENCE_H #include "primitives.h" +#include "picyuv.h" #include "lowres.h" #include "mv.h" namespace x265 { // private x265 namespace -class PicYuv; struct WeightParam; class MotionReference : public ReferencePlanes @@ -40,12 +40,12 @@ public: MotionReference(); ~MotionReference(); - int init(PicYuv*, WeightParam* w = NULL); + int init(PicYuv*, WeightParam* wp, const x265_param& p); void applyWeight(int rows, int numRows); - PicYuv* m_reconPic; - pixel* m_weightBuffer; - int m_numWeightedRows; + pixel* weightBuffer[3]; + int numInterpPlanes; + int numWeightedRows; protected: diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp index 1179fe0..9836aa7 100644 --- a/source/encoder/sao.cpp +++ b/source/encoder/sao.cpp @@ -176,8 +176,11 @@ void SAO::allocSaoParam(SAOParam* saoParam) const void 
SAO::startSlice(Frame* frame, Entropy& initState, int qp) { Slice* slice = frame->m_encData->m_slice; - - int qpCb = Clip3(0, QP_MAX_MAX, qp + slice->m_pps->chromaCbQpOffset); + int qpCb = qp; + if (m_param->internalCsp == X265_CSP_I420) + qpCb = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]); + else + qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC); m_lumaLambda = x265_lambda2_tab[qp]; m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma m_frame = frame; @@ -225,8 +228,8 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane) { int x, y; const CUData* cu = m_frame->m_encData->getPicCTU(addr); - pixel* rec = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr); - intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride; + pixel* rec = m_frame->m_reconPic->getPlaneAddr(plane, addr); + intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; uint32_t picHeight = m_param->sourceHeight; int ctuWidth = g_maxCUSize; @@ -436,7 +439,7 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane) /* Process SAO all units */ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane) { - intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride; + intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; int ctuWidth = g_maxCUSize; int ctuHeight = g_maxCUSize; @@ -449,12 +452,12 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane) if (!idxY) { - pixel* rec = m_frame->m_reconPicYuv->m_picOrg[plane]; + pixel* rec = m_frame->m_reconPic->m_picOrg[plane]; memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth); } int addr = idxY * m_numCuInWidth; - pixel* rec = plane ? 
m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr); + pixel* rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr); for (int i = 0; i < ctuHeight + 1; i++) { @@ -506,7 +509,7 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane) } else if (idxX != (m_numCuInWidth - 1)) { - rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr); + rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr); for (int i = 0; i < ctuHeight + 1; i++) { @@ -543,12 +546,12 @@ void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc) void SAO::calcSaoStatsCu(int addr, int plane) { int x, y; - CUData* cu = m_frame->m_encData->getPicCTU(addr); - const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr); - const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr); + const CUData* cu = m_frame->m_encData->getPicCTU(addr); + const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr); + const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr); const pixel* fenc; const pixel* rec; - intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride; + intptr_t stride = plane ? 
m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; uint32_t picHeight = m_param->sourceHeight; int ctuWidth = g_maxCUSize; @@ -789,10 +792,10 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY) int addr = idxX + m_numCuInWidth * idxY; int x, y; - CUData* cu = frame->m_encData->getPicCTU(addr); + const CUData* cu = frame->m_encData->getPicCTU(addr); const pixel* fenc; const pixel* rec; - intptr_t stride = m_frame->m_reconPicYuv->m_stride; + intptr_t stride = m_frame->m_reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; uint32_t picHeight = m_param->sourceHeight; int ctuWidth = g_maxCUSize; @@ -826,7 +829,7 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY) { if (plane == 1) { - stride = frame->m_reconPicYuv->m_strideC; + stride = frame->m_reconPic->m_strideC; picWidth >>= m_hChromaShift; picHeight >>= m_vChromaShift; ctuWidth >>= m_hChromaShift; @@ -845,8 +848,8 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY) stats = m_offsetOrgPreDblk[addr][plane][SAO_BO]; count = m_countPreDblk[addr][plane][SAO_BO]; - const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr); - const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr); + const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr); + const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr); fenc = fenc0; rec = rec0; diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp index cd86318..bc0dc94 100644 --- a/source/encoder/search.cpp +++ b/source/encoder/search.cpp @@ -37,6 +37,8 @@ using namespace x265; #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data) #endif +#define MVP_IDX_BITS 1 + ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 }; ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 }; @@ -66,11 +68,10 @@ bool Search::initSearch(const x265_param& param, 
ScalingList& scalingList) m_numLayers = g_log2Size[param.maxCUSize] - 2; m_rdCost.setPsyRdScale(param.psyRd); - m_me.setSearchMethod(param.searchMethod); - m_me.setSubpelRefine(param.subpelRefine); + m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp); bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder); - if (m_param->noiseReduction) + if (m_param->noiseReductionIntra || m_param->noiseReductionInter) ok &= m_quant.allocNoiseReduction(param); ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */ @@ -163,70 +164,55 @@ void Search::invalidateContexts(int fromDepth) void Search::invalidateContexts(int) {} #endif -void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height) +void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx) { - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; - uint32_t subdiv = tuDepthL > trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx]; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - bool mCodeAll = true; - const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift); - if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) - mCodeAll = false; - - if (mCodeAll) + if (!(log2TrSize - m_hChromaShift < 2)) { - if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv); - - if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1)) + 
m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv); } if (subdiv) { - absPartIdxStep >>= 2; - width >>= 1; - height >>= 1; - - uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t part = 0; part < 4; part++) - codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx); } } -void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype) +void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype) { - if (!cu.getCbf(absPartIdx, ttype, trDepth)) + if (!cu.getCbf(absPartIdx, ttype, tuDepth)) return; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepthL > trDepth) + if (tuDepth < cu.m_tuDepth[absPartIdx]) { - uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t part = 0; part < 4; part++) - codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype); return; } - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - - uint32_t trDepthC = trDepth; + uint32_t tuDepthC = tuDepth; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - - if (log2TrSizeC == 1) - { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n"); - 
trDepthC--; - log2TrSizeC++; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); - bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); - if (!bFirstQ) + + if (log2TrSizeC < 2) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + if (absPartIdx & 3) return; + log2TrSizeC = 2; + tuDepthC--; } uint32_t qtLayer = log2TrSize - 2; @@ -243,17 +229,17 @@ void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absP uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1); coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset; uint32_t subTUSize = 1 << (log2TrSizeC * 2); - uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1); - if (cu.getCbf(absPartIdx, ttype, trDepth + 1)) + uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2); + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype); - if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype); + if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype); } } -void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2]) +void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2]) { - uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t qtLayer = log2TrSize - 2; uint32_t sizeIdx = log2TrSize - 2; @@ -280,20 +266,20 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t 
trDepth, if (mightSplit) m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); - pixel* fenc = const_cast(mode.fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx); pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); uint32_t stride = mode.fencYuv->m_size; // init availability pattern uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; - initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode); // get prediction signal predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; @@ -312,9 +298,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, } else // no coded residual, recon = pred - primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride); + primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride); - bCBF = !!numSig << trDepth; + bCBF = !!numSig << tuDepth; cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride); @@ -338,21 +324,21 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, } else { - uint32_t qtNumParts = cuGeom.numPartitions >> 2; - if (!trDepth) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (!tuDepth) { - for (uint32_t part = 0; part < 4; part++) - m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false); + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx) + m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false); } - else if (!(absPartIdx & (qtNumParts - 1))) + else if 
(!(absPartIdx & (qNumParts - 1))) m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); } if (log2TrSize != depthRange[0]) m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); - m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth); - if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth)) + if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA); fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); @@ -380,26 +366,25 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, } // code split block - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - uint32_t absPartIdxSub = absPartIdx; + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; if (m_param->bEnableTSkipFast) - checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN; + checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N; Cost splitCost; uint32_t cbf = 0; - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { if (checkTransformSkip) - codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost); + codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost); else - codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange); + codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange); - cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1); + cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) - cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth); + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) + 
cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth); if (mightNotSplit && log2TrSize != depthRange[0]) { @@ -428,16 +413,16 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, m_entropyCoder.load(m_rqt[fullDepth].rqtTest); // recover transform index and Cbf values - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); } } // set reconstruction for next intra prediction blocks if full TU prediction won - pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = m_frame->m_reconPicYuv->m_stride; - primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPic->m_stride; + primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); outCost.rdcost += fullCost.rdcost; outCost.distortion += fullCost.distortion; @@ -445,9 +430,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, outCost.energy += fullCost.energy; } -void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost) +void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost) { - uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t tuSize = 1 << log2TrSize; @@ -462,7 +447,7 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep int bTSkip = 0; uint32_t bCBF = 0; - pixel* fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = 
fencYuv->getLumaAddr(absPartIdx); pixel* pred = predYuv->getLumaAddr(absPartIdx); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); uint32_t stride = fencYuv->m_size; @@ -470,12 +455,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep // init availability pattern uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; - initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode); // get prediction signal predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); uint32_t qtLayer = log2TrSize - 2; uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); @@ -518,12 +503,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep } else // no residual coded, recon = pred - primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride); + primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride); uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride); cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth); - cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); if (useTSkip) m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); @@ -548,20 +533,20 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep } else { - uint32_t qtNumParts = cuGeom.numPartitions >> 2; - if (!trDepth) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (!tuDepth) { - for (uint32_t part = 0; part < 4; part++) - m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false); + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx) + m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false); } - else if (!(absPartIdx & (qtNumParts - 1))) + 
else if (!(absPartIdx & (qNumParts - 1))) m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); } m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); - m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth); - if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth)) + if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA); uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits(); @@ -591,19 +576,19 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep if (bTSkip) { memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2)); - primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize); + primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize); } else if (checkTransformSkip) { cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); - cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); m_entropyCoder.load(m_rqt[fullDepth].rqtTemp); } // set reconstruction for next intra prediction blocks - pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = m_frame->m_reconPicYuv->m_stride; - primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPic->m_stride; + primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); outCost.rdcost += fullCost.rdcost; outCost.distortion += fullCost.distortion; @@ -612,11 +597,11 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep } /* fast luma intra residual generation. 
Only perform the minimum number of TU splits required by the CU size */ -void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]) +void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; bool bCheckFull = log2TrSize <= depthRange[1]; @@ -629,22 +614,22 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3 if (bCheckFull) { - pixel* fenc = const_cast(mode.fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx); pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); - pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = m_frame->m_reconPicYuv->m_stride; + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPic->m_stride; uint32_t stride = mode.fencYuv->m_size; uint32_t sizeIdx = log2TrSize - 2; uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY; - initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode); predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n"); - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); primitives.calcresidual[sizeIdx](fenc, pred, residual, stride); 
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false); @@ -652,11 +637,11 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3 { m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig); primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride); - cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); } else { - primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride); + primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride); cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); } } @@ -665,26 +650,25 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3 X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n"); /* code split block */ - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t cbf = 0; - for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange); - cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1); + residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange); + cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) - cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth); + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) + cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth); } } -void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx) +void 
Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx) { - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepth == trDepth) + if (tuDepth == cu.m_tuDepth[absPartIdx]) { - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t qtLayer = log2TrSize - 2; // copy transform coefficients @@ -698,88 +682,80 @@ void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, u } else { - uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) - extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx); } } +inline void offsetCBFs(uint8_t subTUCBF[2]) +{ + uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1]; + subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF; + subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF; +} + /* 4:2:2 post-TU split processing */ -void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx) +void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx) { uint32_t depth = cu.m_cuDepth[0]; - uint32_t fullDepth = depth + trDepth; + uint32_t fullDepth = depth + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - uint32_t trDepthC = trDepth; if (log2TrSize == 2) { - X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n"); - trDepthC--; + X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + ++log2TrSize; } - uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1; + uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1); 
// move the CBFs down a level and set the parent CBF uint8_t subTUCBF[2]; - uint8_t combinedSubTUCBF = 0; - - for (uint32_t subTU = 0; subTU < 2; subTU++) - { - const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); + subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth); + subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth); + offsetCBFs(subTUCBF); - subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth); - combinedSubTUCBF |= subTUCBF[subTU]; - } - - for (uint32_t subTU = 0; subTU < 2; subTU++) - { - const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); - const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF; - - cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU); - } + cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts); + cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts); } /* returns distortion */ -uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy) +uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepthL > trDepth) + if (tuDepth < cu.m_tuDepth[absPartIdx]) { - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0; - for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, 
absPartIdxSub, psyEnergy); - splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1); - splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1); + outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy); + splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) { - cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth); - cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth); + cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth); + cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth); } return outDist; } - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - uint32_t trDepthC = trDepth; - if (log2TrSizeC == 1) + uint32_t tuDepthC = tuDepth; + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n"); - trDepthC--; - log2TrSizeC++; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); - bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); - if (!bFirstQ) + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + if (absPartIdx & 3) return 0; + log2TrSizeC = 2; + tuDepthC--; } if (m_bEnableRDOQ) @@ -788,13 +764,13 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]); if (checkTransformSkip) - return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy); + return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, 
psyEnergy); uint32_t qtLayer = log2TrSize - 2; uint32_t tuSize = 1 << log2TrSizeC; uint32_t outDist = 0; - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) @@ -806,7 +782,7 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - pixel* fenc = const_cast(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC); + const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t stride = mode.fencYuv->m_csize; @@ -817,11 +793,11 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; - pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); - intptr_t picStride = m_frame->m_reconPicYuv->m_strideC; + pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + intptr_t picStride = m_frame->m_reconPic->m_strideC; // init availability pattern - initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId); pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; @@ -837,44 +813,42 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride); uint32_t numSig = 
m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); - uint32_t tmpDist; if (numSig) { m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig); primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride); - cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); } else { // no coded residual, recon = pred - primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride); cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); } - tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride); - outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist); + outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride)); if (m_rdCost.m_psyRd) psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride); - primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride); + primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride); } while (tuIterator.isNextSection()); if (splitType == VERTICAL_SPLIT) - offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx); + offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx); } return outDist; } /* returns distortion */ -uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy) +uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; 
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - uint32_t log2TrSizeC = 2; + const uint32_t log2TrSizeC = 2; uint32_t tuSize = 4; uint32_t qtLayer = log2TrSize - 2; uint32_t outDist = 0; @@ -887,7 +861,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]); - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) @@ -899,11 +873,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - pixel* fenc = const_cast(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC); + const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t stride = mode.fencYuv->m_csize; - uint32_t sizeIdxC = log2TrSizeC - 2; + const uint32_t sizeIdxC = log2TrSizeC - 2; uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; @@ -911,7 +885,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; // init availability pattern - initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId); pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; @@ -943,7 +917,7 @@ uint32_t 
Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t { m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig); primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride); - cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); } else if (useTSkip) { @@ -952,11 +926,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t } else { - primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride); cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); } uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride); - tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist); + tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist); cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); @@ -991,15 +965,15 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t if (bTSkip) { memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2)); - primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE); + primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE); } - cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); - pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); - intptr_t picStride = m_frame->m_reconPicYuv->m_strideC; - primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, 
reconQtStride); + pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + intptr_t picStride = m_frame->m_reconPic->m_strideC; + primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride); outDist += bDist; psyEnergy += bEnergy; @@ -1007,34 +981,27 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t while (tuIterator.isNextSection()); if (splitType == VERTICAL_SPLIT) - offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx); + offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx); } m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); return outDist; } -void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad) +void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth) { - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - if (tuDepthL == trDepth) + if (tuDepthL == tuDepth || log2TrSizeC == 2) { - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - - if (tuQuad) - { - log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */ - trDepth--; /* also adjust the number of coeff read */ - } - // copy transform coefficients uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); - uint32_t qtLayer = log2TrSize - 2; + uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth); coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; @@ -1047,38 +1014,29 @@ void Search::extractIntraResultChromaQT(CUData& 
cu, Yuv& reconYuv, uint32_t absP } else { - if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444) - /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */ - extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true); - else - { - uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) - extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false); - } + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1); } } -void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx) +void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepthL == trDepth) + if (tuDepth == cu.m_tuDepth[absPartIdx]) { - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - uint32_t trDepthC = trDepth; - if (log2TrSizeC == 1) + uint32_t tuDepthC = tuDepth; + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n"); - trDepthC--; - log2TrSizeC++; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); - bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); - if (!bFirstQ) + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + if (absPartIdx & 3) return; + log2TrSizeC = 2; + tuDepthC--; } ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; @@ -1086,7 +1044,7 @@ void Search::residualQTIntraChroma(Mode& mode, const 
CUGeom& cuGeom, uint32_t tr uint32_t stride = mode.fencYuv->m_csize; const int sizeIdxC = log2TrSizeC - 2; - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) @@ -1098,20 +1056,20 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - pixel* fenc = const_cast(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC)); + const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed? uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC; - pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); - uint32_t picStride = m_frame->m_reconPicYuv->m_strideC; + pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + uint32_t picStride = m_frame->m_reconPic->m_strideC; uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; if (chromaPredMode == DM_CHROMA_IDX) chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; chromaPredMode = (m_csp == X265_CSP_I422) ? 
g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode; - initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId); pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp); @@ -1124,36 +1082,36 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr { m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig); primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride); - primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride); - cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride); + cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); } else { - primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride); - primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride); cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); } } while (tuIterator.isNextSection()); if (splitType == VERTICAL_SPLIT) - offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx); + offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx); } } else { - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t splitCbfU = 0, splitCbfV = 0; - for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC); - splitCbfU |= 
cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1); - splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1); + residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx); + splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) { - cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth); - cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth); + cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth); + cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth); } } } @@ -1188,7 +1146,7 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits(); bool bCodeDQP = m_slice->m_pps->bUseDQP; - m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange); + m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); m_entropyCoder.store(intraMode.contexts); intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; @@ -1198,7 +1156,224 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize updateModeCost(intraMode); } -uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes) +/* Note that this function does not save the best intra prediction, it must + * be generated later. 
It records the best mode in the cu */ +void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + uint32_t depth = cu.m_cuDepth[0]; + + cu.setPartSizeSubParts(SIZE_2Nx2N); + cu.setPredModeSubParts(MODE_INTRA); + + const uint32_t initTuDepth = 0; + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth; + uint32_t tuSize = 1 << log2TrSize; + const uint32_t absPartIdx = 0; + + // Reference sample smoothing + initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX); + + const pixel* fenc = intraMode.fencYuv->m_buf[0]; + uint32_t stride = intraMode.fencYuv->m_size; + + pixel* above = m_refAbove + tuSize - 1; + pixel* aboveFiltered = m_refAboveFlt + tuSize - 1; + pixel* left = m_refLeft + tuSize - 1; + pixel* leftFiltered = m_refLeftFlt + tuSize - 1; + int sad, bsad; + uint32_t bits, bbits, mode, bmode; + uint64_t cost, bcost; + + // 33 Angle modes once + ALIGN_VAR_32(pixel, bufScale[32 * 32]); + ALIGN_VAR_32(pixel, bufTrans[32 * 32]); + ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); + int scaleTuSize = tuSize; + int scaleStride = stride; + int costShift = 0; + int sizeIdx = log2TrSize - 2; + + if (tuSize > 32) + { + // origin is 64x64, we scale to 32x32 and setup required parameters + primitives.scale2D_64to32(bufScale, fenc, stride); + fenc = bufScale; + + // reserve space in case primitives need to store data in above + // or left buffers + pixel _above[4 * 32 + 1]; + pixel _left[4 * 32 + 1]; + pixel* aboveScale = _above + 2 * 32; + pixel* leftScale = _left + 2 * 32; + aboveScale[0] = leftScale[0] = above[0]; + primitives.scale1D_128to64(aboveScale + 1, above + 1, 0); + primitives.scale1D_128to64(leftScale + 1, left + 1, 0); + + scaleTuSize = 32; + scaleStride = 32; + costShift = 2; + sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 + + // Filtered and Unfiltered refAbove and refLeft pointing to above and left. 
+ above = aboveScale; + left = leftScale; + aboveFiltered = aboveScale; + leftFiltered = leftScale; + } + + pixelcmp_t sa8d = primitives.sa8d[sizeIdx]; + int predsize = scaleTuSize * scaleTuSize; + + m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + + /* there are three cost tiers for intra modes: + * pred[0] - mode probable, least cost + * pred[1], pred[2] - less probable, slightly more cost + * non-mpm modes - all cost the same (rbits) */ + uint64_t mpms; + uint32_t preds[3]; + uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); + + // DC + primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16)); + bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + bmode = mode = DC_IDX; + bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + bcost = m_rdCost.calcRdSADCost(bsad, bbits); + + pixel* abovePlanar = above; + pixel* leftPlanar = left; + + if (tuSize & (8 | 16 | 32)) + { + abovePlanar = aboveFiltered; + leftPlanar = leftFiltered; + } + + // PLANAR + primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0); + sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + mode = PLANAR_IDX; + bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + cost = m_rdCost.calcRdSADCost(sad, bits); + COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + + // Transpose NxN + primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride); + + primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16)); + + bool modeHor; + const pixel* cmp; + intptr_t srcStride; + +#define TRY_ANGLE(angle) \ + modeHor = angle < 18; \ + cmp = modeHor ? bufTrans : fenc; \ + srcStride = modeHor ? scaleTuSize : scaleStride; \ + sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ + bits = (mpms & ((uint64_t)1 << angle)) ? 
m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ + cost = m_rdCost.calcRdSADCost(sad, bits) + + if (m_param->bEnableFastIntra) + { + int asad = 0; + uint32_t lowmode, highmode, amode = 5, abits = 0; + uint64_t acost = MAX_INT64; + + /* pick the best angle, sampling at distance of 5 */ + for (mode = 5; mode < 35; mode += 5) + { + TRY_ANGLE(mode); + COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); + } + + /* refine best angle at distance 2, then distance 1 */ + for (uint32_t dist = 2; dist >= 1; dist--) + { + lowmode = amode - dist; + highmode = amode + dist; + + X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); + TRY_ANGLE(lowmode); + COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); + + X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); + TRY_ANGLE(highmode); + COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); + } + + if (amode == 33) + { + TRY_ANGLE(34); + COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); + } + + COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); + } + else // calculate and search all intra prediction angles for lowest cost + { + for (mode = 2; mode < 35; mode++) + { + TRY_ANGLE(mode); + COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + } + } + + cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth); + intraMode.initCosts(); + intraMode.totalBits = bbits; + intraMode.distortion = bsad; + intraMode.sa8dCost = bcost; + intraMode.sa8dBits = bbits; +} + +void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + Yuv* reconYuv = &intraMode.reconYuv; + const Yuv* fencYuv = intraMode.fencYuv; + + X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); + X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); + + m_quant.setQPforQuant(cu); + + uint32_t 
tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + m_entropyCoder.load(m_rqt[cuGeom.depth].cur); + + Cost icosts; + codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); + extractIntraResultQT(cu, *reconYuv, 0, 0); + + intraMode.distortion = icosts.distortion; + intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); + + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); + m_entropyCoder.codePredInfo(cu, 0); + intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits(); + + bool bCodeDQP = m_slice->m_pps->bUseDQP; + m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); + + intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); + intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; + if (m_rdCost.m_psyRd) + intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + + m_entropyCoder.store(intraMode.contexts); + updateModeCost(intraMode); +} + +uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes) { CUData& cu = intraMode.cu; Yuv* reconYuv = &intraMode.reconYuv; @@ -1206,37 +1381,37 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t const Yuv* fencYuv = intraMode.fencYuv; uint32_t depth = cu.m_cuDepth[0]; - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 
0 : 1; - uint32_t numPU = 1 << (2 * initTrDepth); - uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; + uint32_t numPU = 1 << (2 * initTuDepth); + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth; uint32_t tuSize = 1 << log2TrSize; uint32_t qNumParts = cuGeom.numPartitions >> 2; uint32_t sizeIdx = log2TrSize - 2; uint32_t absPartIdx = 0; uint32_t totalDistortion = 0; - int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN; + int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N; // loop over partitions - for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts) + for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts) { uint32_t bmode = 0; if (sharedModes) - bmode = sharedModes[pu]; + bmode = sharedModes[puIdx]; else { // Reference sample smoothing - initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX); + initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX); // determine set of modes to be tested (using prediction signal only) - pixel* fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); uint32_t stride = predYuv->m_size; - pixel *above = m_refAbove + tuSize - 1; - pixel *aboveFiltered = m_refAboveFlt + tuSize - 1; - pixel *left = m_refLeft + tuSize - 1; - pixel *leftFiltered = m_refLeftFlt + tuSize - 1; + pixel* above = m_refAbove + tuSize - 1; + pixel* aboveFiltered = m_refAboveFlt + tuSize - 1; + pixel* left = m_refLeft + tuSize - 1; + pixel* leftFiltered = m_refLeftFlt + tuSize - 1; // 33 Angle modes once ALIGN_VAR_32(pixel, buf_trans[32 * 32]); @@ -1250,8 +1425,8 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t if (tuSize > 32) { - pixel *aboveScale = _above + 2 * 32; - pixel *leftScale = _left + 2 * 32; + pixel* aboveScale = 
_above + 2 * 32; + pixel* leftScale = _left + 2 * 32; // origin is 64x64, we scale to 32x32 and setup required parameters primitives.scale2D_64to32(bufScale, fenc, stride); @@ -1296,8 +1471,8 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); // PLANAR - pixel *abovePlanar = above; - pixel *leftPlanar = left; + pixel* abovePlanar = above; + pixel* leftPlanar = left; if (tuSize >= 8 && tuSize <= 32) { abovePlanar = aboveFiltered; @@ -1316,7 +1491,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t for (int mode = 2; mode < 35; mode++) { bool modeHor = (mode < 18); - pixel *cmp = (modeHor ? buf_trans : fenc); + const pixel* cmp = (modeHor ? buf_trans : fenc); intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride); bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; @@ -1330,7 +1505,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t * levels and at higher depths */ uint64_t candCostList[MAX_RD_INTRA_MODES]; uint32_t rdModeList[MAX_RD_INTRA_MODES]; - int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1); + int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1); for (int i = 0; i < maxCandCount; i++) candCostList[i] = MAX_INT64; @@ -1346,51 +1521,50 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t if (candCostList[i] == MAX_INT64) break; m_entropyCoder.load(m_rqt[depth].cur); - cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth); + cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth); Cost icosts; if (checkTransformSkip) - codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts); + codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, 
absPartIdx, icosts); else - codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange); + codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange); COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]); } } /* remeasure best mode, allowing TU splits */ - cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth); + cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth); m_entropyCoder.load(m_rqt[depth].cur); Cost icosts; if (checkTransformSkip) - codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts); + codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts); else - codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange); + codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange); totalDistortion += icosts.distortion; - extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx); + extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx); // set reconstruction for next intra prediction blocks - if (pu != numPU - 1) + if (puIdx != numPU - 1) { /* This has important implications for parallelism and RDO. It is writing intermediate results into the * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. 
I think * that the contexts should be tracked through each PU */ - pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - uint32_t dststride = m_frame->m_reconPicYuv->m_stride; - pixel* src = reconYuv->getLumaAddr(absPartIdx); + pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + uint32_t dststride = m_frame->m_reconPic->m_stride; + const pixel* src = reconYuv->getLumaAddr(absPartIdx); uint32_t srcstride = reconYuv->m_size; - primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride); + primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride); } } if (numPU > 1) { uint32_t combCbfY = 0; - uint32_t partIdx = 0; - for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts) - combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1); + for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) + combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1); for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) cu.m_cbf[0][offs] |= combCbfY; @@ -1415,17 +1589,19 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift; uint32_t tuSize = 1 << log2TrSizeC; int32_t scaleTuSize = tuSize; + uint32_t tuDepth = 0; int32_t costShift = 0; if (tuSize > 32) { scaleTuSize = 32; + tuDepth = 1; costShift = 2; log2TrSizeC = 5; } - Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1); - Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2); + Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1); + Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2); cu.getAllowedChromaDir(0, modeList); // check chroma modes @@ -1440,7 +1616,7 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) uint64_t cost = 0; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { - pixel* fenc = fencYuv->m_buf[chromaId]; + const pixel* fenc = fencYuv->m_buf[chromaId]; pixel* 
pred = predYuv->m_buf[chromaId]; pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize); @@ -1465,19 +1641,18 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) Yuv& reconYuv = intraMode.reconYuv; uint32_t depth = cu.m_cuDepth[0]; - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444; - uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444; + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth; uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1)); uint32_t totalDistortion = 0; int part = partitionFromLog2Size(log2TrSize); - TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); + TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); do { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - int cuSize = 1 << cu.m_log2CUSize[absPartIdxC]; uint32_t bestMode = 0; uint32_t bestDist = 0; @@ -1496,9 +1671,9 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) // restore context models m_entropyCoder.load(m_rqt[depth].cur); - cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth); + cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth); uint32_t psyEnergy = 0; - uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy); + uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy); if (m_slice->m_pps->bTransformSkipEnabled) m_entropyCoder.load(m_rqt[depth].cur); @@ -1512,14 +1687,14 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) } else { - uint32_t qtNumParts = cuGeom.numPartitions >> 2; - if (!(absPartIdxC & (qtNumParts - 1))) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (!(absPartIdxC & (qNumParts - 1))) m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); } - 
codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize); - codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U); - codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V); + codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC); + codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U); + codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V); uint32_t bits = m_entropyCoder.getNumberOfWrittenBits(); uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits); @@ -1528,7 +1703,7 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) bestCost = cost; bestDist = dist; bestMode = modeList[mode]; - extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false); + extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth); memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); @@ -1539,14 +1714,15 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) if (!tuIterator.isLastSection()) { uint32_t zorder = cuGeom.encodeIdx + absPartIdxC; - uint32_t dststride = m_frame->m_reconPicYuv->m_strideC; - pixel *src, *dst; + uint32_t dststride = m_frame->m_reconPic->m_strideC; + const pixel* src; + pixel* dst; - dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder); + dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder); src = reconYuv.getCbAddr(absPartIdxC); primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize); - dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder); + dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder); src = reconYuv.getCrAddr(absPartIdxC); 
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize); } @@ -1555,23 +1731,23 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); - cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth); + cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth); totalDistortion += bestDist; } while (tuIterator.isNextSection()); - if (initTrDepth != 0) + if (initTuDepth != 0) { uint32_t combCbfU = 0; uint32_t combCbfV = 0; - uint32_t partIdx = 0; - for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep) + uint32_t qNumParts = tuIterator.absPartIdxStep; + for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1); - combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1); + combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1); + combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1); } - for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++) + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) { cu.m_cbf[1][offs] |= combCbfU; cu.m_cbf[2][offs] |= combCbfV; @@ -1615,13 +1791,17 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me continue; cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv; - cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx; + cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx; cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv; - cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx; + cu.m_refIdx[1][m.absPartIdx] = 
(int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx; prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(tempYuv, true, false); + motionCompensation(tempYuv, true, m_me.bChromaSATD); + uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size); + if (m_me.bChromaSATD) + costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx); + uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand); costCand = costCand + m_rdCost.getCost(bitsCand); if (costCand < outCost) @@ -1642,41 +1822,45 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me /* this function assumes the caller has configured its MotionEstimation engine with the * correct source plane and source PU, and has called prepMotionCompensation() to set * m_puAbsPartIdx, m_puWidth, and m_puHeight */ -void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref) +void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref) { uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS; bits += getTUBits(ref, m_slice->m_numRefIdx[list]); - MV amvpCand[AMVP_NUM_CANDS]; MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; - int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc); + int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc); - uint32_t bestCost = MAX_INT; int mvpIdx = 0; int merange = m_param->searchRange; - for (int i = 0; i < AMVP_NUM_CANDS; i++) + MotionData* bestME = interMode.bestME[part]; + + if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1]) { - MV mvCand = amvpCand[i]; + uint32_t bestCost = MAX_INT; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = interMode.amvpCand[list][ref][i]; - // NOTE: skip mvCand if Y is > merange and -FN>1 - if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) - continue; + // NOTE: skip mvCand if Y is > 
merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; - cu.clipMv(mvCand); + interMode.cu.clipMv(mvCand); - Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; - predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; + predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); - if (bestCost > cost) - { - bestCost = cost; - mvpIdx = i; + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } } } - MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx]; - setSearchRange(cu, mvp, merange, mvmin, mvmax); + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx]; + setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax); int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); @@ -1685,34 +1869,32 @@ void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGe uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ - checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost); + checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost); /* tie goes to the smallest ref ID, just like --no-pme */ - ScopedLock _lock(master.m_outputLock); - if (cost < master.m_bestME[list].cost || - (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref)) + ScopedLock _lock(master.m_meLock); + if (cost < bestME[list].cost || + (cost == bestME[list].cost && ref < bestME[list].ref)) { - master.m_bestME[list].mv = outmv; - master.m_bestME[list].mvp = mvp; - master.m_bestME[list].mvpIdx = mvpIdx; - master.m_bestME[list].ref = ref; - master.m_bestME[list].cost = cost; - 
master.m_bestME[list].bits = bits; + bestME[list].mv = outmv; + bestME[list].mvp = mvp; + bestME[list].mvpIdx = mvpIdx; + bestME[list].ref = ref; + bestME[list].cost = cost; + bestME[list].bits = bits; } } /* search of the best candidate for inter prediction * returns true if predYuv was filled with a motion compensated prediction */ -bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma) +bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D) { CUData& cu = interMode.cu; Yuv* predYuv = &interMode.predYuv; - MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS]; MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; const Slice *slice = m_slice; - PicYuv* fencPic = m_frame->m_origPicYuv; int numPart = cu.getNumPartInter(); int numPredDir = slice->isInterP() ? 1 : 2; const int* numRefIdx = slice->m_numRefIdx; @@ -1727,23 +1909,24 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO for (int puIdx = 0; puIdx < numPart; puIdx++) { + MotionData* bestME = interMode.bestME[puIdx]; + /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */ initMotionCompensation(cu, cuGeom, puIdx); - pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); - m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight); + m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); uint32_t mrgCost = MAX_UINT; - /* find best cost merge candidate */ - if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N) + /* find best cost merge candidate. 
note: 2Nx2N merge and bidir are handled as separate modes */ + if (cu.m_partSize[0] != SIZE_2Nx2N) { merge.absPartIdx = m_puAbsPartIdx; merge.width = m_puWidth; merge.height = m_puHeight; mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge); - if (bMergeOnly && cu.m_log2CUSize[0] > 3) + if (bMergeOnly) { if (mrgCost == MAX_UINT) { @@ -1762,33 +1945,88 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO totalmebits += merge.bits; prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(*predYuv, true, bChroma); + motionCompensation(*predYuv, true, bChromaSA8D); continue; } } - MotionData bidir[2]; - uint32_t bidirCost = MAX_UINT; - int bidirBits = 0; - - m_bestME[0].cost = MAX_UINT; - m_bestME[1].cost = MAX_UINT; + bestME[0].cost = MAX_UINT; + bestME[1].cost = MAX_UINT; getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits); - if (bDistributed) + /* Uni-directional prediction */ + if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0) { - m_curMECu = &cu; - m_curGeom = &cuGeom; + for (int l = 0; l < numPredDir; l++) + { + int ref = bestME[l].ref; + uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; + bits += getTUBits(ref, numRefIdx[l]); + + int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc); + + // Pick the best possible MVP from AMVP candidates based on least residual + int mvpIdx = 0; + int merange = m_param->searchRange; + + if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1]) + { + uint32_t bestCost = MAX_INT; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = interMode.amvpCand[l][ref][i]; + + // NOTE: skip mvCand if Y is > merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; + + cu.clipMv(mvCand); + predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); - /* 
this worker might already be enqueued for pmode, so other threads - * might be looking at the ME job counts at any time, do these sets - * in a safe order */ + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } + } + } + + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx]; + + int satdCost; + setSearchRange(cu, mvp, merange, mvmin, mvmax); + satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); + + /* Get total cost of partition, but only include MV bit cost once */ + bits += m_me.bitcost(outmv); + uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); + + /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ + checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + + if (cost < bestME[l].cost) + { + bestME[l].mv = outmv; + bestME[l].mvp = mvp; + bestME[l].mvpIdx = mvpIdx; + bestME[l].cost = cost; + bestME[l].bits = bits; + } + } + } + else if (bDistributed) + { + m_meLock.acquire(); + m_curInterMode = &interMode; + m_curGeom = &cuGeom; m_curPart = puIdx; m_totalNumME = 0; m_numAcquiredME = 1; m_numCompletedME = 0; m_totalNumME = numRefIdx[0] + numRefIdx[1]; + m_meLock.release(); if (!m_bJobsQueued) JobProvider::enqueue(); @@ -1796,34 +2034,43 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO for (int i = 1; i < m_totalNumME; i++) m_pool->pokeIdleThread(); - while (m_totalNumME > m_numAcquiredME) + do { - int id = ATOMIC_INC(&m_numAcquiredME); - if (m_totalNumME >= id) + m_meLock.acquire(); + if (m_totalNumME > m_numAcquiredME) { - id -= 1; + int id = m_numAcquiredME++; + m_meLock.release(); + if (id < numRefIdx[0]) - singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id); + singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id); else - singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]); + singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]); - if 
(ATOMIC_INC(&m_numCompletedME) == m_totalNumME) - m_meCompletionEvent.trigger(); + m_meLock.acquire(); + m_numCompletedME++; + m_meLock.release(); } + else + m_meLock.release(); } + while (m_totalNumME > m_numAcquiredME); + if (!m_bJobsQueued) JobProvider::dequeue(); /* we saved L0-0 for ourselves */ - singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0); - if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME) + singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0); + + m_meLock.acquire(); + if (++m_numCompletedME == m_totalNumME) m_meCompletionEvent.trigger(); + m_meLock.release(); m_meCompletionEvent.wait(); } else { - // Uni-directional prediction for (int l = 0; l < numPredDir; l++) { for (int ref = 0; ref < numRefIdx[l]; ref++) @@ -1831,33 +2078,36 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; bits += getTUBits(ref, numRefIdx[l]); - int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc); + int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc); // Pick the best possible MVP from AMVP candidates based on least residual - uint32_t bestCost = MAX_INT; int mvpIdx = 0; int merange = m_param->searchRange; - for (int i = 0; i < AMVP_NUM_CANDS; i++) + if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1]) { - MV mvCand = amvpCand[l][ref][i]; + uint32_t bestCost = MAX_INT; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = interMode.amvpCand[l][ref][i]; - // NOTE: skip mvCand if Y is > merange and -FN>1 - if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) - continue; + // NOTE: skip mvCand if Y is > merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; - cu.clipMv(mvCand); - predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + 
cu.clipMv(mvCand); + predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); - if (bestCost > cost) - { - bestCost = cost; - mvpIdx = i; + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } } } - MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx]; + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx]; setSearchRange(cu, mvp, merange, mvmin, mvmax); int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); @@ -1867,45 +2117,67 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ - checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); - if (cost < m_bestME[l].cost) + if (cost < bestME[l].cost) { - m_bestME[l].mv = outmv; - m_bestME[l].mvp = mvp; - m_bestME[l].mvpIdx = mvpIdx; - m_bestME[l].ref = ref; - m_bestME[l].cost = cost; - m_bestME[l].bits = bits; + bestME[l].mv = outmv; + bestME[l].mvp = mvp; + bestME[l].mvpIdx = mvpIdx; + bestME[l].ref = ref; + bestME[l].cost = cost; + bestME[l].bits = bits; } } } } /* Bi-directional prediction */ - if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT) + MotionData bidir[2]; + uint32_t bidirCost = MAX_UINT; + int bidirBits = 0; + + if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */ + cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */ + bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT) { - bidir[0] = m_bestME[0]; - bidir[1] = m_bestME[1]; + bidir[0] = bestME[0]; + bidir[1] = bestME[1]; + + int satdCost; - /* Generate reference subpels 
*/ - PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv; - PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv; - Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; - predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv); - predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv); + if (m_me.bChromaSATD) + { + cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv; + cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref; + cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv; + cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref; - pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx); - pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx); + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(tmpPredYuv, true, true); - int partEnum = partitionFromSizes(m_puWidth, m_puHeight); - primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32); - int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) + + m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx); + } + else + { + PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic; + PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic; + Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; - bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); + /* Generate reference subpels */ + predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv); + predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv); + + primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size, + bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32); + satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + } + + bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - 
(m_listSelBits[0] + m_listSelBits[1]); bidirCost = satdCost + m_rdCost.getCost(bidirBits); - bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero(); + bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero(); if (bTryZero) { /* Do not try zero MV if unidir motion predictors are beyond @@ -1917,38 +2189,48 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO mvmin <<= 2; mvmax <<= 2; - bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax); - bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax); + bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax); + bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax); } if (bTryZero) { - // coincident blocks of the two reference pictures - pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]); - pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]); - intptr_t refStride = slice->m_mref[0][0].lumaStride; + /* coincident blocks of the two reference pictures */ + if (m_me.bChromaSATD) + { + cu.m_mv[0][m_puAbsPartIdx] = mvzero; + cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref; + cu.m_mv[1][m_puAbsPartIdx] = mvzero; + cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref; - primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); - satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(tmpPredYuv, true, true); - MV mvp0 = m_bestME[0].mvp; - int mvpIdx0 = m_bestME[0].mvpIdx; - uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); + satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) + + m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx); + } + else + { + const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); + const pixel* ref1 = 
m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); + intptr_t refStride = slice->m_mref[0][0].lumaStride; - MV mvp1 = m_bestME[1].mvp; - int mvpIdx1 = m_bestME[1].mvpIdx; - uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); + primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); + satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + } - uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); + MV mvp0 = bestME[0].mvp; + int mvpIdx0 = bestME[0].mvpIdx; + uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); - if (bDistributed) - { - cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc); - cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc); - } + MV mvp1 = bestME[1].mvp; + int mvpIdx1 = bestME[1].mvpIdx; + uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); + + uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */ - checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost); - checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost); + checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost); + checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost); if (cost < bidirCost) { @@ -1965,7 +2247,7 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO } /* select best option and store into CU */ - if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost) + if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) { 
cu.m_mergeFlag[m_puAbsPartIdx] = true; cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx @@ -1977,39 +2259,39 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO totalmebits += merge.bits; } - else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost) + else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost) { lastMode = 2; cu.m_mergeFlag[m_puAbsPartIdx] = false; cu.setPUInterDir(3, m_puAbsPartIdx, puIdx); cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx); cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp; cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx; cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx); cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp; cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx; totalmebits += bidirBits; } - else if (m_bestME[0].cost <= m_bestME[1].cost) + else if (bestME[0].cost <= bestME[1].cost) { lastMode = 0; cu.m_mergeFlag[m_puAbsPartIdx] = false; cu.setPUInterDir(1, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp; - cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx; + cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp; + cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx; cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx); cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx); - totalmebits += m_bestME[0].bits; + totalmebits += bestME[0].bits; } else { @@ -2017,19 +2299,19 @@ bool Search::predInterSearch(Mode& 
interMode, const CUGeom& cuGeom, bool bMergeO cu.m_mergeFlag[m_puAbsPartIdx] = false; cu.setPUInterDir(2, m_puAbsPartIdx, puIdx); - cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp; - cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx; + cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp; + cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx; cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx); cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx); - totalmebits += m_bestME[1].bits; + totalmebits += bestME[1].bits; } prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(*predYuv, true, bChroma); + motionCompensation(*predYuv, true, bChromaSA8D); } interMode.sa8dBits += totalmebits; @@ -2147,7 +2429,7 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode) // No residual coding : SKIP mode - cu.setSkipFlagSubParts(true); + cu.setPredModeSubParts(MODE_SKIP); cu.clearCbf(); cu.setTUDepthSubParts(0, 0, depth); @@ -2158,8 +2440,8 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode) interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); // Chroma part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); - interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); - interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); + interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); + interMode.distortion += m_rdCost.scaleChromaDist(2, 
primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); m_entropyCoder.load(m_rqt[depth].cur); m_entropyCoder.resetBits(); @@ -2212,8 +2494,8 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) if (!cu.m_tqBypass[0]) { uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); - cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); - cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); + cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); + cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); /* Consider the RD cost of not signaling any residual */ m_entropyCoder.load(m_rqt[depth].cur); @@ -2247,7 +2529,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) uint32_t coeffBits, bits; if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) { - cu.setSkipFlagSubParts(true); + cu.setPredModeSubParts(MODE_SKIP); /* Merge/Skip */ m_entropyCoder.resetBits(); @@ -2270,7 +2552,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits(); bool bCodeDQP = m_slice->m_pps->bUseDQP; - m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange); + m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); bits = m_entropyCoder.getNumberOfWrittenBits(); coeffBits = bits - mvBits; @@ -2285,8 +2567,8 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) // update with clipped distortion and cost (qp estimation loop uses unclipped values) 
uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); - bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); - bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); + bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); + bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); if (m_rdCost.m_psyRd) interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); @@ -2297,41 +2579,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) updateModeCost(interMode); } -void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom) -{ - CUData& cu = mode.cu; - - m_quant.setQPforQuant(mode.cu); - - if (cu.m_predMode[0] == MODE_INTER) - { - uint32_t tuDepthRange[2]; - cu.getInterTUQtDepthRange(tuDepthRange, 0); - - residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange); - if (cu.getQtRootCbf(0)) - mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]); - else - { - mode.reconYuv.copyFromYuv(mode.predYuv); - if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) - cu.setSkipFlagSubParts(true); - } - } - else if (cu.m_predMode[0] == MODE_INTRA) - { - uint32_t tuDepthRange[2]; - cu.getIntraTUQtDepthRange(tuDepthRange, 0); - - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN; - residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange); - getBestIntraModeChroma(mode, cuGeom); - residualQTIntraChroma(mode, cuGeom, 0, 0); - mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, 
cuGeom.encodeIdx); // TODO: - } -} - -void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]) +void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]) { CUData& cu = mode.cu; X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n"); @@ -2340,7 +2588,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 uint32_t tuDepth = depth - cu.m_cuDepth[0]; bool bCheckFull = log2TrSize <= depthRange[1]; - if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0]) + if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0]) bCheckFull = false; if (bCheckFull) @@ -2349,13 +2597,12 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; bool bCodeChroma = true; uint32_t tuDepthC = tuDepth; - if (log2TrSizeC == 1) + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n"); - log2TrSizeC++; + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + log2TrSizeC = 2; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); @@ -2372,10 +2619,10 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; const Yuv* fencYuv = mode.fencYuv; - int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx); + int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx); uint32_t strideResiY = resiYuv.m_size; - pixel *fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc 
= fencYuv->getLumaAddr(absPartIdx); uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); if (numSigY) @@ -2409,7 +2656,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC); - pixel* fencCb = const_cast(fencYuv->getCbAddr(absPartIdxC)); + const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC); uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false); if (numSigU) { @@ -2423,7 +2670,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 } int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC); - pixel* fencCr = const_cast(fencYuv->getCrAddr(absPartIdxC)); + const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC); uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false); if (numSigV) { @@ -2449,16 +2696,16 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 { X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n"); - const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t ycbf = 0, ucbf = 0, vcbf = 0; - for (uint32_t i = 0; i < 4; i++) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange); - ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1); - ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1); - vcbf |= 
cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1); + residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange); + ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); + ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++) + for (uint32_t i = 0; i < 4 * qNumParts; i++) { cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth; cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth; @@ -2467,15 +2714,26 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 } } -void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2]) +uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId) +{ + uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth); + + if (m_rdCost.m_psyRd) + return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy); + else + return m_rdCost.calcRdCost(dist, nullBits); +} + +void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2]) { CUData& cu = mode.cu; uint32_t log2TrSize = g_maxLog2CUSize - depth; bool bCheckSplit = log2TrSize > depthRange[0]; bool bCheckFull = log2TrSize <= depthRange[1]; + bool bSplitPresentFlag = bCheckSplit && bCheckFull; - if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit) + if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit) bCheckFull = false; X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n"); @@ -2485,12 +2743,12 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; bool bCodeChroma = 
true; uint32_t tuDepthC = tuDepth; - if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444)) + if (log2TrSizeC < 2) { - log2TrSizeC++; + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + log2TrSizeC = 2; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } // code full block @@ -2499,9 +2757,9 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; - uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; - uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; - uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, 
MAX_INT64}, {MAX_INT64, MAX_INT64} }; @@ -2532,57 +2790,25 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_bEnableRDOQ) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); - pixel *fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); - int16_t *resi = resiYuv.getLumaAddr(absPartIdx); + const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); + int16_t* resi = resiYuv.getLumaAddr(absPartIdx); numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0]; m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); - if (cbfFlag[TEXT_LUMA][0]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); - singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits(); - - uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0]; - - if (bCodeChroma) - { - uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) - { - coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; - TURecurse tuIterator(splitIntoSubTUs ? 
VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); - - do - { - uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - - cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); - if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) - m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); - - fenc = const_cast(fencYuv->getChromaAddr(chromaId, absPartIdxC)); - resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); - numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); - cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; - - m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); - if (cbfFlag[chromaId][tuIterator.section]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); - - uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits(); - singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev; + if (bSplitPresentFlag && log2TrSize > depthRange[0]) + m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); + fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); - singleBitsPrev = newBits; - } - while (tuIterator.isNextSection()); - } - } + // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth. + // So it is valid if we encode coefficients and then cbfs at least for analysis. 
+// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth); + if (cbfFlag[TEXT_LUMA][0]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); - const uint32_t numCoeffY = 1 << (log2TrSize * 2); - const uint32_t numCoeffC = 1 << (log2TrSizeC * 2); + uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits(); + singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits; X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n"); uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size); @@ -2590,156 +2816,168 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_rdCost.m_psyRd) psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0); - int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); + int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size; if (cbfFlag[TEXT_LUMA][0]) { m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only + // non-zero cost calculation for luma - This is an approximation + // finally we have to encode correct cbf after comparing with null cost const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY); - uint32_t nonZeroPsyEnergyY = 0; + uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); + uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0; if (m_rdCost.m_psyRd) + { nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY); + singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY); + } + else + singleCostY = m_rdCost.calcRdCost(nonZeroDistY, 
nzCbfBitsY + singleBits[TEXT_LUMA][0]); if (cu.m_tqBypass[0]) { - distY = nonZeroDistY; - psyEnergyY = nonZeroPsyEnergyY; + singleDist[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY; } else { - uint64_t singleCostY = 0; - if (m_rdCost.m_psyRd) - singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY); - else - singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]); - m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth); - const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits(); - uint64_t nullCostY = 0; - if (m_rdCost.m_psyRd) - nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY); - else - nullCostY = m_rdCost.calcRdCost(distY, nullBitsY); + // zero-cost calculation for luma. This is an approximation + // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf. + // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma. 
+ uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA); + if (nullCostY < singleCostY) { cbfFlag[TEXT_LUMA][0] = 0; + singleBits[TEXT_LUMA][0] = 0; + primitives.blockfill_s[partSize](curResiY, strideResiY, 0); #if CHECKED_BUILD || _DEBUG + uint32_t numCoeffY = 1 << (log2TrSize << 1); memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY); #endif if (checkTransformSkipY) minCost[TEXT_LUMA][0] = nullCostY; + singleDist[TEXT_LUMA][0] = distY; + singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY; } else { - distY = nonZeroDistY; - psyEnergyY = nonZeroPsyEnergyY; if (checkTransformSkipY) minCost[TEXT_LUMA][0] = singleCostY; + singleDist[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY; } } } - else if (checkTransformSkipY) + else { - m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth); - const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits(); - if (m_rdCost.m_psyRd) - minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY); - else - minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY); + if (checkTransformSkipY) + minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA); + primitives.blockfill_s[partSize](curResiY, strideResiY, 0); + singleDist[TEXT_LUMA][0] = distY; + singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY; } - singleDistComp[TEXT_LUMA][0] = distY; - singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY; - if (!cbfFlag[TEXT_LUMA][0]) - primitives.blockfill_s[partSize](curResiY, strideResiY, 0); cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); if (bCodeChroma) { - uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { uint32_t distC = 0, psyEnergyC = 0; coeff_t* coeffCurC = 
m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); - do - { - uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); - distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize)); + if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); - if (cbfFlag[chromaId][tuIterator.section]) - { - m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset, - log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); - uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); - const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist); - uint32_t nonZeroPsyEnergyC = 0; - if (m_rdCost.m_psyRd) - nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); - - if (cu.m_tqBypass[0]) - { - distC = nonZeroDistC; - psyEnergyC = nonZeroPsyEnergyC; - } - else + fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); + resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); + numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); + cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; + + //Coding cbf flags has been 
removed from here +// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth); + if (cbfFlag[chromaId][tuIterator.section]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); + uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits(); + singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev; + singleBitsPrev = newBits; + + int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize)); + + if (cbfFlag[chromaId][tuIterator.section]) { - uint64_t singleCostC = 0; + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset, + log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); + + // non-zero cost calculation for luma, same as luma - This is an approximation + // finally we have to encode correct cbf after comparing with null cost + uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); + uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); + uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist); + uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0; if (m_rdCost.m_psyRd) - singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC); + { + nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); + singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC); + } else - singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]); - 
m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth); - const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits(); - uint64_t nullCostC = 0; - if (m_rdCost.m_psyRd) - nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC); + singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]); + + if (cu.m_tqBypass[0]) + { + singleDist[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC; + } else - nullCostC = m_rdCost.calcRdCost(distC, nullBitsC); - if (nullCostC < singleCostC) { - cbfFlag[chromaId][tuIterator.section] = 0; + //zero-cost calculation for chroma. This is an approximation + uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId); + + if (nullCostC < singleCostC) + { + cbfFlag[chromaId][tuIterator.section] = 0; + singleBits[chromaId][tuIterator.section] = 0; + primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0); #if CHECKED_BUILD || _DEBUG + uint32_t numCoeffC = 1 << (log2TrSizeC << 1); memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC); #endif if (checkTransformSkipC) minCost[chromaId][tuIterator.section] = nullCostC; + singleDist[chromaId][tuIterator.section] = distC; + singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC; } else { - distC = nonZeroDistC; - psyEnergyC = nonZeroPsyEnergyC; if (checkTransformSkipC) minCost[chromaId][tuIterator.section] = singleCostC; + singleDist[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC; } } } - else if (checkTransformSkipC) + else { - m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC); - const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits(); - if (m_rdCost.m_psyRd) - minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC); - else - 
minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC); - } - - singleDistComp[chromaId][tuIterator.section] = distC; - singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC; - - if (!cbfFlag[chromaId][tuIterator.section]) + if (checkTransformSkipC) + minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId); primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0); + singleDist[chromaId][tuIterator.section] = distC; + singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC; + } cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); } @@ -2763,14 +3001,14 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_bEnableRDOQ) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); - fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + fenc = fencYuv->getLumaAddr(absPartIdx); resi = resiYuv.getLumaAddr(absPartIdx); uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true); if (numSigTSkipY) { m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth); + m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth); m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA); const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits(); @@ -2791,12 +3029,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); else { - singleDistComp[TEXT_LUMA][0] = nonZeroDistY; - singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY; + singleDist[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY; cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY; bestTransformMode[TEXT_LUMA][0] = 1; + uint32_t numCoeffY = 1 << (log2TrSize << 
1); memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY); - primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize); + primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize); } cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); @@ -2821,7 +3060,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]); @@ -2831,42 +3070,43 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); - fenc = const_cast(fencYuv->getChromaAddr(chromaId, absPartIdxC)); + fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); m_entropyCoder.resetBits(); - singleBitsComp[chromaId][tuIterator.section] = 0; + singleBits[chromaId][tuIterator.section] = 0; if (numSigTSkipC) { - m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth); + m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth); m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId); - singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); + singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, 
tsCoeffC, log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC); uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); - nonZeroDistC = m_rdCost.scaleChromaDistCb(dist); + nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist); if (m_rdCost.m_psyRd) { nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); - singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC); + singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC); } else - singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]); + singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]); } if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC) cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); else { - singleDistComp[chromaId][tuIterator.section] = nonZeroDistC; - singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC; + singleDist[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC; cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC; bestTransformMode[chromaId][tuIterator.section] = 1; + uint32_t numCoeffC = 1 << (log2TrSizeC << 1); memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC); - primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC); + primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC); } cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); @@ -2875,66 +3115,55 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa } } + // Here we were encoding cbfs and 
coefficients, after calculating distortion above. + // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected + // bits required for coefficients and added with number of cbf bits. As I tested the order does not + // make any difference. But bit confused whether I should load the original context as below. m_entropyCoder.load(m_rqt[depth].rqtRoot); - m_entropyCoder.resetBits(); - if (log2TrSize > depthRange[0]) - m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); - + //Encode cbf flags if (bCodeChroma) { for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { if (!splitIntoSubTUs) - m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth); + m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth); else { offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx); - for (uint32_t subTU = 0; subTU < 2; subTU++) - m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth); + m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth); + m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth); } } } - m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); - if (cbfFlag[TEXT_LUMA][0]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth); - if (bCodeChroma) - { - uint32_t subTUSize = 1 << (log2TrSizeC * 2); - uint32_t partIdxesPerSubTU = absPartIdxStep >> 1; - uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits(); - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) - { - coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; - if (!splitIntoSubTUs) - { - if (cbfFlag[chromaId][0]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId); - } - else - { - for (uint32_t subTU = 0; subTU < 
2; subTU++) - { - if (cbfFlag[chromaId][subTU]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId); - } - } - } + uint32_t coeffBits = 0; + coeffBits = singleBits[TEXT_LUMA][0]; + for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) + { + coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex]; + coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex]; } - fullCost.distortion += singleDistComp[TEXT_LUMA][0]; - fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also + // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma. + // In case of chroma, if any one of the splitted block's cbf is 1, then we need to encode cbf 1, and then for + // four splitted block's individual cbf value. This is not known before analysis of four splitted blocks. + // For that reason, I am collecting individual coefficient bits only. + fullCost.bits = bSplitPresentFlag ? 
cbfBits + coeffBits : coeffBits; + + fullCost.distortion += singleDist[TEXT_LUMA][0]; + fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) { - fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex]; - fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex]; + fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex]; + fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex]; } - fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); if (m_rdCost.m_psyRd) fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); else @@ -2951,31 +3180,40 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa } Cost splitCost; - const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) + { + // Subdiv flag can be encoded at the start of anlysis of splitted blocks. 
+ m_entropyCoder.resetBits(); + m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); + splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + } + + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t ycbf = 0, ucbf = 0, vcbf = 0; - for (uint32_t i = 0; i < 4; ++i) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange); - ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1); - ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1); - vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1); + estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange); + ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); + ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i) + for (uint32_t i = 0; i < 4 * qNumParts; ++i) { cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth; cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth; cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth; } + // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits + // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma. + // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context + // at depth 0 (for example). 
m_entropyCoder.load(m_rqt[depth].rqtRoot); m_entropyCoder.resetBits(); - encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange); - encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange); - encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange); - encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange); - - splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange); + uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits(); + splitCost.bits += splitCbfBits; if (m_rdCost.m_psyRd) splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); @@ -2999,15 +3237,18 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth); if (bCodeChroma) { - const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1; - - uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 
1 : 0); - for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++) + if (!splitIntoSubTUs) { - const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU); - - cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU); - cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU); + cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth); + cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth); + } + else + { + uint32_t tuNumParts = absPartIdxStep >> 1; + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); } } X265_CHECK(bCheckFull, "check-full must be set\n"); @@ -3019,23 +3260,21 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (bCodeChroma) { - uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1; - uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 
1 : 0); - - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + if (!splitIntoSubTUs) { - for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++) - { - const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU); + cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth); + cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth); + } + else + { + uint32_t tuNumParts = absPartIdxStep >> 1; - if (splitIntoSubTUs) - { - uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1]; - cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU); - } - else - cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU); - } + offsetCBFs(cbfFlag[TEXT_CHROMA_U]); + offsetCBFs(cbfFlag[TEXT_CHROMA_V]); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); } } @@ -3045,51 +3284,65 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa outCosts.energy += fullCost.energy; } -void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]) +void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]) { X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); - X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n"); + 
X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n"); - const uint32_t curTuDepth = depth - cu.m_cuDepth[0]; - const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; - const bool bSubdiv = curTuDepth != tuDepth; + const uint32_t tuDepth = depth - cu.m_cuDepth[0]; + const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx]; const uint32_t log2TrSize = g_maxLog2CUSize - depth; - uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - - const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + if (!(log2TrSize - m_hChromaShift < 2)) + { + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv); + } + else + { + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n"); + } - if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]) - m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize); + if (!bSubdiv) + { + m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth); + } + else + { + uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange); + } +} - bool mCodeAll = true; - uint32_t trWidthC = 1 << log2TrSizeC; - uint32_t trHeightC = splitIntoSubTUs ? 
(trWidthC << 1) : trWidthC; +void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2]) +{ + X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); + X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n"); - const uint32_t numPels = trWidthC * trHeightC; - if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) - mCodeAll = false; + const uint32_t curTuDepth = depth - cu.m_cuDepth[0]; + const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + const bool bSubdiv = curTuDepth != tuDepth; + const uint32_t log2TrSize = g_maxLog2CUSize - depth; - if (bSubdivAndCbf) + if (bSubdiv) { - const bool bFirstCbfOfCU = curTuDepth == 0; - if (bFirstCbfOfCU || mCodeAll) - { - uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1); - if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv); - if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv); - } - else + if (cu.getCbf(absPartIdx, ttype, curTuDepth)) { - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n"); - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n"); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange); } + return; } - - if (!bSubdiv) + else { + const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + // Luma const uint32_t qtLayer = 
log2TrSize - 2; uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); @@ -3098,65 +3351,51 @@ void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t de // Chroma bool bCodeChroma = true; uint32_t tuDepthC = tuDepth; - if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444)) + if (log2TrSize == 2 && m_csp != X265_CSP_I444) { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); log2TrSizeC++; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } - if (bSubdivAndCbf) - m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth); - else + if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + + if (bCodeChroma) { - if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; + coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; - if (bCodeChroma) + if (!splitIntoSubTUs) { - uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); - coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; - coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; - - if (!splitIntoSubTUs) + if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); + if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + } + else + { + uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2); + uint32_t subTUSize = 1 << (log2TrSizeC * 2); + if (ttype == 
TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) { - if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); - if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U); } - else + if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) { - uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1); - uint32_t subTUSize = 1 << (log2TrSizeC * 2); - if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) - { - if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); - if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U); - } - if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) - { - if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); - if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V); - } + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V); } } } } - else - 
{ - if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth)) - { - const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); - for (uint32_t i = 0; i < 4; ++i) - encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange); - } - } } void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth) @@ -3164,28 +3403,27 @@ void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartI X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); const uint32_t curTrMode = depth - cu.m_cuDepth[0]; const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + const uint32_t log2TrSize = g_maxLog2CUSize - depth; if (curTrMode < tuDepth) { - uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); - for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv) + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1); return; } - const uint32_t log2TrSize = g_maxLog2CUSize - depth; const uint32_t qtLayer = log2TrSize - 2; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; bool bCodeChroma = true; uint32_t tuDepthC = tuDepth; - if (log2TrSizeC == 1) + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n"); - log2TrSizeC++; + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + log2TrSizeC = 2; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize); diff --git a/source/encoder/search.h b/source/encoder/search.h index 79ed94a..a59fbf3 100644 --- a/source/encoder/search.h +++ b/source/encoder/search.h @@ -35,9 +35,6 @@ #include 
"entropy.h" #include "motion.h" -#define MVP_IDX_BITS 1 -#define NUM_LAYERS 4 - namespace x265 { // private namespace @@ -68,6 +65,64 @@ struct RQTData Yuv bidirPredYuv[2]; }; +struct MotionData +{ + MV mv; + MV mvp; + int mvpIdx; + int ref; + uint32_t cost; + int bits; + bool costZero; +}; + +struct Mode +{ + CUData cu; + const Yuv* fencYuv; + Yuv predYuv; + Yuv reconYuv; + Entropy contexts; + + enum { MAX_INTER_PARTS = 2 }; + + MotionData bestME[MAX_INTER_PARTS][2]; + MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS]; + + uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits) + uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits) + uint32_t sa8dBits; // signal bits used in sa8dCost calculation + uint32_t psyEnergy; // sum of partition psycho-visual energy difference + uint32_t distortion; // sum of partition SSE distortion + uint32_t totalBits; // sum of partition bits (mv + coeff) + uint32_t mvBits; // Mv bits + Ref + block type (or intra mode) + uint32_t coeffBits; // Texture bits (DCT Coeffs) + + void initCosts() + { + rdCost = 0; + sa8dCost = 0; + sa8dBits = 0; + psyEnergy = 0; + distortion = 0; + totalBits = 0; + mvBits = 0; + coeffBits = 0; + } + + void addSubCosts(const Mode& subMode) + { + rdCost += subMode.rdCost; + sa8dCost += subMode.sa8dCost; + sa8dBits += subMode.sa8dBits; + psyEnergy += subMode.psyEnergy; + distortion += subMode.distortion; + totalBits += subMode.totalBits; + mvBits += subMode.mvBits; + coeffBits += subMode.coeffBits; + } +}; + inline int getTUBits(int idx, int numIdx) { return idx + (idx < numIdx - 1); @@ -98,58 +153,6 @@ public: uint32_t m_numLayers; uint32_t m_refLagPixels; - struct Mode - { - CUData cu; - const Yuv* fencYuv; - Yuv predYuv; - Yuv reconYuv; - Entropy contexts; - - uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits) - uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * 
bits) - uint32_t sa8dBits; // signal bits used in sa8dCost calculation - uint32_t psyEnergy; // sum of partition psycho-visual energy difference - uint32_t distortion; // sum of partition SSE distortion - uint32_t totalBits; // sum of partition bits (mv + coeff) - uint32_t mvBits; // Mv bits + Ref + block type (or intra mode) - uint32_t coeffBits; // Texture bits (DCT Coeffs) - - void initCosts() - { - rdCost = 0; - sa8dCost = 0; - sa8dBits = 0; - psyEnergy = 0; - distortion = 0; - totalBits = 0; - mvBits = 0; - coeffBits = 0; - } - - void addSubCosts(const Mode& subMode) - { - rdCost += subMode.rdCost; - sa8dCost += subMode.sa8dCost; - sa8dBits += subMode.sa8dBits; - psyEnergy += subMode.psyEnergy; - distortion += subMode.distortion; - totalBits += subMode.totalBits; - mvBits += subMode.mvBits; - coeffBits += subMode.coeffBits; - } - }; - - struct MotionData - { - MV mv; - MV mvp; - int mvpIdx; - int ref; - uint32_t cost; - int bits; - }; - Search(); ~Search(); @@ -162,6 +165,11 @@ public: // full RD search of intra modes. 
if sharedModes is not NULL, it directly uses them void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes); + // select best intra mode using only sa8d costs, cannot measure NxN intra + void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom); + // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode + void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom); + // estimation inter prediction (non-skip) bool predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma); @@ -169,38 +177,41 @@ public: void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom); void encodeResAndCalcRdSkipCU(Mode& interMode); - void generateCoeffRecon(Mode& mode, const CUGeom& cuGeom); - void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]); + // encode residual without rd-cost + void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]); + void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]); + void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx); - uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const; + // pick be chroma mode from available using just sa8d costs + void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom); protected: /* motion estimation distribution */ ThreadLocalData* m_tld; - CUData* m_curMECu; + Mode* m_curInterMode; const CUGeom* m_curGeom; int m_curPart; - MotionData m_bestME[2]; uint32_t m_listSelBits[3]; int m_totalNumME; volatile int m_numAcquiredME; volatile int m_numCompletedME; Event m_meCompletionEvent; - Lock m_outputLock; + Lock m_meLock; bool m_bJobsQueued; - void singleMotionEstimation(Search& master, const 
CUData& cu, const CUGeom& cuGeom, int part, int list, int ref); + void singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref); void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth); // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned - uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes); + uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes); // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom); - void codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height); - void codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype); + void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx); + void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]); + void codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype); struct Cost { @@ -211,24 +222,24 @@ protected: Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; } }; - void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]); + uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId); + void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]); - void encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]); + // estimate bit cost 
of residual QT + void encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, TextType ttype, const uint32_t depthRange[2]); // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits - void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, uint32_t depthRange[2]); - void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& costs); - void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx); + void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]); + void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs); + void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx); // generate chroma prediction, generate residual and recon - uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy); - uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy); - void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad); - - void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]); - void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx); + uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy); + uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy); + void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t 
tuDepth); - void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx); + // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks + void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx); struct MergeData { @@ -258,7 +269,9 @@ protected: /* intra helper functions */ enum { MAX_RD_INTRA_MODES = 16 }; static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList); - void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom); + + // get most probable luma modes for CU part, and bit cost of all non mpm modes + uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const; void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); } }; diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index cc70c20..a792760 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -111,7 +111,7 @@ void Lookahead::destroy() /* Called by API thread */ void Lookahead::addPicture(Frame *curFrame, int sliceType) { - PicYuv *orig = curFrame->m_origPicYuv; + PicYuv *orig = curFrame->m_fencPic; curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType); @@ -192,7 +192,7 @@ Frame* Lookahead::getDecidedPicture() /* Called by pool worker threads */ bool Lookahead::findJob(int) { - if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1) + if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0) { m_inputQueueLock.acquire(); slicetypeDecide(); @@ -290,6 +290,8 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame) /* called by API thread or worker thread with inputQueueLock acquired */ void Lookahead::slicetypeDecide() { + ProfileScopeEvent(slicetypeDecideEV); + ScopedLock lock(m_decideLock); Lowres *frames[X265_LOOKAHEAD_MAX]; @@ -417,7 +419,6 @@ void 
Lookahead::slicetypeDecide() list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF; brefs++; } - /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */ if (m_param->rc.rateControlMode != X265_RC_CQP) { @@ -524,14 +525,12 @@ void Lookahead::slicetypeDecide() void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) { int prevNonB = 0, curNonB = 1, idx = 0; - bool isNextNonB = false; - while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B) curNonB++; - int nextNonB = keyframe ? prevNonB : curNonB; - int nextB = keyframe ? prevNonB + 1 : curNonB + 1; - + int nextB = prevNonB + 1; + int nextBRef = 0; + int miniGopEnd = keyframe ? prevNonB : curNonB; while (curNonB < numFrames + !keyframe) { /* P/I cost: This shouldn't include the cost of nextNonB */ @@ -540,38 +539,53 @@ void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB); frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType; + /* Save the nextNonB Cost in each B frame of the current miniGop */ + if (curNonB > miniGopEnd) + { + for (int j = nextB; j < miniGopEnd; j++) + { + frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx]; + frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx]; + + } + } idx++; } /* Handle the B-frames: coded order */ - for (int i = prevNonB + 1; i < curNonB; i++, idx++) - { - frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i); - frames[nextNonB]->plannedType[idx] = X265_TYPE_B; - } + if (m_param->bBPyramid && curNonB - prevNonB > 1) + nextBRef = (prevNonB + curNonB + 1) / 2; - for (int i = nextB; i <= curNonB; i++) + for (int i = prevNonB + 1; i < curNonB; i++, idx++) { - for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++) + int64_t 
satdCost = 0; int type = X265_TYPE_B; + if (nextBRef) { - if (j == curNonB) + if (i == nextBRef) { - if (isNextNonB) - { - int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; - frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB); - frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType; - } + satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef); + type = X265_TYPE_BREF; } + else if (i < nextBRef) + satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i); else - { - frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j); - frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B; - } + satdCost = vbvFrameCost(frames, nextBRef, curNonB, i); } - if (i == curNonB && !isNextNonB) - isNextNonB = true; - } + else + satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i); + frames[nextNonB]->plannedSatd[idx] = satdCost; + frames[nextNonB]->plannedType[idx] = type; + /* Save the nextB Cost in each B frame of the current miniGop */ + for (int j = nextB; j < miniGopEnd; j++) + { + if (nextBRef && i == nextBRef) + break; + if (j >= i && j !=nextBRef) + continue; + frames[j]->plannedSatd[frames[j]->indB] = satdCost; + frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B; + } + } prevNonB = curNonB; curNonB++; while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B) @@ -1238,7 +1252,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame) if (m_param->bEnableWeightedPred) { - PicYuv *orig = curFrame->m_origPicYuv; + PicYuv *orig = curFrame->m_fencPic; m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY; intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX; @@ -1249,7 +1263,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame) m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset; } - m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0]; + m_weightedRef.fpelPlane[0] = 
m_weightedRef.lowresPlane[0]; m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride; m_weightedRef.isLowres = true; m_weightedRef.isWeighted = false; @@ -1290,7 +1304,6 @@ int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, for (int i = 0; i < m_heightInCU; i++) { m_rows[i].init(); - m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride); if (!fenc->bIntraCalculated) fenc->rowSatds[0][0][i] = 0; fenc->rowSatds[b - p0][p1 - b][i] = 0; @@ -1351,7 +1364,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara { Lowres *fenc = frames[b]; Lowres *ref = frames[p0]; - pixel *src = ref->fpelPlane; + pixel *src = ref->fpelPlane[0]; intptr_t stride = fenc->lumaStride; if (wp) @@ -1365,7 +1378,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines, scale, round << correction, denom + correction, offset); - src = m_weightedRef.fpelPlane; + src = m_weightedRef.fpelPlane[0]; } uint32_t cost = 0; @@ -1376,7 +1389,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara { for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8) { - int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride); + int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride); cost += X265_MIN(satd, fenc->intraCost[mb]); } } @@ -1469,6 +1482,8 @@ void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0) void CostEstimate::processRow(int row, int /*threadId*/) { + ProfileScopeEvent(costEstimateRow); + int realrow = m_heightInCU - 1 - row; Lowres **frames = m_curframes; ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? 
&m_weightedRef : frames[m_curp0]; @@ -1531,7 +1546,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 && cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2; - m_me.setSourcePU(pelOffset, cuSize, cuSize); + m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize); /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */ int lowresPenalty = 4; @@ -1592,12 +1607,13 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c } if (bBidir) { - pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; + ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); + ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0); pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1); - pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; + ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); COPY2_IF_LT(bcost, bicost, listused, 3); @@ -1626,9 +1642,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c // Copy Left for (int i = 0; i < cuSize + 1; i++) - { left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride]; - } for (int i = 0; i < cuSize; i++) { @@ -1652,22 +1666,22 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c // generate 35 intra predictions into m_predictions pixelcmp_t satd = 
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)]; - int icost = m_me.COST_MAX, cost; + int icost = m_me.COST_MAX; primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16)); - cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize); + int cost = m_me.bufSATD(m_predictions, cuSize); if (cost < icost) icost = cost; pixel *above = (cuSize >= 8) ? above1 : above0; pixel *left = (cuSize >= 8) ? left1 : left0; primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0); - cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize); + cost = m_me.bufSATD(m_predictions, cuSize); if (cost < icost) icost = cost; primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16)); // calculate satd costs, keep least cost ALIGN_VAR_32(pixel, buf_trans[32 * 32]); - primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE); + primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE); int acost = m_me.COST_MAX; uint32_t mode, lowmode = 4; @@ -1676,7 +1690,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c if (mode < 18) cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); else - cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); + cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); COPY2_IF_LT(acost, cost, lowmode, mode); } for (uint32_t dist = 2; dist >= 1; dist--) @@ -1685,14 +1699,14 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c if (mode < 18) cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); else - cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); + cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); COPY2_IF_LT(acost, cost, lowmode, mode); mode = lowmode + dist; if (mode < 18) cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], 
cuSize); else - cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); + cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); COPY2_IF_LT(acost, cost, lowmode, mode); } if (acost < icost) @@ -1701,6 +1715,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c const int intraPenalty = 5 * m_lookAheadLambda; icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ fenc->intraCost[cuXY] = icost; + fenc->intraMode[cuXY] = (uint8_t)lowmode; int icostAq = icost; if (bFrameScoreCU) { diff --git a/source/encoder/slicetype.h b/source/encoder/slicetype.h index 8805e90..2b6e265 100644 --- a/source/encoder/slicetype.h +++ b/source/encoder/slicetype.h @@ -73,8 +73,7 @@ public: EstimateRow() { m_me.setQP(X265_LOOKAHEAD_QP); - m_me.setSearchMethod(X265_HEX_SEARCH); - m_me.setSubpelRefine(1); + m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400); m_predictions = X265_MALLOC(pixel, 35 * 8 * 8); m_merange = 16; m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP]; diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp index 3bf5a45..cd6f4f7 100644 --- a/source/encoder/weightPrediction.cpp +++ b/source/encoder/weightPrediction.cpp @@ -219,7 +219,7 @@ namespace x265 { void weightAnalyse(Slice& slice, Frame& frame, x265_param& param) { WeightParam wp[2][MAX_NUM_REF][3]; - PicYuv *fencPic = frame.m_origPicYuv; + PicYuv *fencPic = frame.m_fencPic; Lowres& fenc = frame.m_lowres; Cache cache; @@ -329,7 +329,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param) if (!refFrame->m_bChromaExtended) { refFrame->m_bChromaExtended = true; - PicYuv *refPic = refFrame->m_origPicYuv; + PicYuv *refPic = refFrame->m_fencPic; int width = refPic->m_picWidth >> cache.hshift; int height = refPic->m_picHeight >> cache.vshift; extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY); @@ -363,7 +363,7 @@ void 
weightAnalyse(Slice& slice, Frame& frame, x265_param& param) case 1: orig = fencPic->m_picOrg[1]; stride = fencPic->m_strideC; - fref = refFrame->m_origPicYuv->m_picOrg[1]; + fref = refFrame->m_fencPic->m_picOrg[1]; /* Clamp the chroma dimensions to the nearest multiple of * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres @@ -381,7 +381,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param) break; case 2: - fref = refFrame->m_origPicYuv->m_picOrg[2]; + fref = refFrame->m_fencPic->m_picOrg[2]; orig = fencPic->m_picOrg[2]; stride = fencPic->m_strideC; width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift; diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt index ff3312f..ff1e141 100644 --- a/source/test/CMakeLists.txt +++ b/source/test/CMakeLists.txt @@ -23,6 +23,3 @@ add_executable(TestBench ${YASM_SRC} ipfilterharness.cpp ipfilterharness.h intrapredharness.cpp intrapredharness.h) target_link_libraries(TestBench x265-static ${PLATFORM_LIBS}) - -add_executable(PoolTest testpool.cpp) -target_link_libraries(PoolTest x265-static ${PLATFORM_LIBS}) diff --git a/source/test/intrapredharness.cpp b/source/test/intrapredharness.cpp index 97eff94..38c8cf4 100644 --- a/source/test/intrapredharness.cpp +++ b/source/test/intrapredharness.cpp @@ -31,8 +31,6 @@ IntraPredHarness::IntraPredHarness() { for (int i = 0; i < INPUT_SIZE; i++) pixel_buff[i] = rand() % PIXEL_MAX; - - initROM(); } bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width) diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp index f23e84b..1bcee04 100644 --- a/source/test/ipfilterharness.cpp +++ b/source/test/ipfilterharness.cpp @@ -586,9 +586,9 @@ bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const Encode for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++) { - if (opt.chroma_p2s[csp]) + if (opt.chroma[csp].p2s) { - if (!check_IPFilter_primitive(ref.chroma_p2s[csp], 
opt.chroma_p2s[csp], 1, csp)) + if (!check_IPFilter_primitive(ref.chroma[csp].p2s, opt.chroma[csp].p2s, 1, csp)) { printf("chroma_p2s[%s]", x265_source_csp_names[csp]); return false; @@ -725,10 +725,10 @@ void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPr for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++) { printf("= Color Space %s =\n", x265_source_csp_names[csp]); - if (opt.chroma_p2s[csp]) + if (opt.chroma[csp].p2s) { printf("chroma_p2s\t"); - REPORT_SPEEDUP(opt.chroma_p2s[csp], ref.chroma_p2s[csp], + REPORT_SPEEDUP(opt.chroma[csp].p2s, ref.chroma[csp].p2s, pixel_buff, srcStride, IPF_vec_output_s, width, height); } for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++) diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp index 88e4676..f9e16d6 100644 --- a/source/test/mbdstharness.cpp +++ b/source/test/mbdstharness.cpp @@ -65,17 +65,17 @@ MBDstHarness::MBDstHarness() short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX); int_test_buff[0][i] = rand() % PIXEL_MAX; int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX; - int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX); + short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX); short_test_buff[1][i] = -PIXEL_MAX; int_test_buff[1][i] = -PIXEL_MAX; int_idct_test_buff[1][i] = SHORT_MIN; - int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX; + short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX; short_test_buff[2][i] = PIXEL_MAX; int_test_buff[2][i] = PIXEL_MAX; int_idct_test_buff[2][i] = SHORT_MAX; - int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX; + short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX; mbuf1[i] = rand() & PIXEL_MAX; mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & 
PIXEL_MAX); @@ -96,16 +96,16 @@ MBDstHarness::MBDstHarness() bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width) { int j = 0; - intptr_t cmp_size = sizeof(int) * width * width; + intptr_t cmp_size = sizeof(short) * width * width; for (int i = 0; i < ITERS; i++) { int index = rand() % TEST_CASES; - ref(short_test_buff[index] + j, mintbuf3, width); - checked(opt, short_test_buff[index] + j, mintbuf4, width); + ref(short_test_buff[index] + j, mshortbuf2, width); + checked(opt, short_test_buff[index] + j, mshortbuf3, width); - if (memcmp(mintbuf3, mintbuf4, cmp_size)) + if (memcmp(mshortbuf2, mshortbuf3, cmp_size)) return false; reportfail(); @@ -124,8 +124,8 @@ bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, intptr_t width) { int index = rand() % TEST_CASES; - ref(int_idct_test_buff[index] + j, mshortbuf2, width); - checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width); + ref(short_test_buff[index] + j, mshortbuf2, width); + checked(opt, short_test_buff[index] + j, mshortbuf3, width); if (memcmp(mshortbuf2, mshortbuf3, cmp_size)) return false; @@ -156,10 +156,10 @@ bool MBDstHarness::check_dequant_primitive(dequant_normal_t ref, dequant_normal_ int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; - ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift); - checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift); + ref(short_test_buff[index] + j, mshortbuf2, width * height, scale, shift); + checked(opt, short_test_buff[index] + j, mshortbuf3, width * height, scale, shift); - if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width)) + if (memcmp(mshortbuf2, mshortbuf3, sizeof(int16_t) * height * width)) return false; reportfail(); @@ -175,6 +175,10 @@ bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scalin for (int i = 0; i < ITERS; i++) { + + memset(mshortbuf2, 0, 
MAX_TU_SIZE * sizeof(int16_t)); + memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t)); + int log2TrSize = (rand() % 4) + 2; int width = (1 << log2TrSize); @@ -185,13 +189,13 @@ bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scalin int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; - int cmp_size = sizeof(int) * height * width; + int cmp_size = sizeof(int16_t) * height * width; int index1 = rand() % TEST_CASES; - ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift); - checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift); + ref(short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf2, width * height, per, shift); + checked(opt, short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf3, width * height, per, shift); - if (memcmp(mintbuf1, mintbuf2, cmp_size)) + if (memcmp(mshortbuf2, mshortbuf3, cmp_size)) return false; reportfail(); @@ -222,8 +226,8 @@ bool MBDstHarness::check_quant_primitive(quant_t ref, quant_t opt) int index1 = rand() % TEST_CASES; int index2 = rand() % TEST_CASES; - refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff); - optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff); + refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff); + optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff); if (memcmp(mintbuf1, mintbuf3, cmp_size)) return false; @@ -261,8 +265,8 @@ bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt) int index1 = rand() % TEST_CASES; int index2 = rand() % TEST_CASES; - refReturnValue = 
ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff); - optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff); + refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff); + optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff); if (memcmp(mshortbuf2, mshortbuf3, cmp_size)) return false; @@ -324,6 +328,7 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op int log2TrSize = s + 2; int num = 1 << (log2TrSize * 2); int cmp_size = sizeof(int) * num; + int cmp_short = sizeof(short) * num; for (int i = 0; i < ITERS; i++) { @@ -336,10 +341,10 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op int index = rand() % TEST_CASES; - ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num); - checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num); + ref(short_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num); + checked(opt, short_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num); - if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, cmp_size)) + if (memcmp(short_denoise_test_buff1[index] + j, short_denoise_test_buff2[index] + j, cmp_short)) return false; if (memcmp(mubuf1, mubuf2, cmp_size)) @@ -454,7 +459,7 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi if (opt.dct[value]) { printf("%s\t", dctInfo[value].name); - REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width); + REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mshortbuf2, dctInfo[value].width); } } @@ -463,32 +468,32 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi if (opt.idct[value]) { printf("%s\t", 
idctInfo[value].name); - REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width); + REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mshortbuf3, mshortbuf2, idctInfo[value].width); } } if (opt.dequant_normal) { printf("dequant_normal\t"); - REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1); + REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1); } if (opt.dequant_scaling) { printf("dequant_scaling\t"); - REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1); + REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mshortbuf2, 32 * 32, 5, 1); } if (opt.quant) { printf("quant\t\t"); - REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32); + REPORT_SPEEDUP(opt.quant, ref.quant, short_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32); } if (opt.nquant) { printf("nquant\t\t"); - REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32); + REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32); } if (opt.count_nonzero) @@ -503,7 +508,7 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi if (opt.denoiseDct) { printf("denoiseDct\t"); - REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32); + REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, short_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32); } } diff --git a/source/test/mbdstharness.h b/source/test/mbdstharness.h index a8b4de2..284892a 100644 --- a/source/test/mbdstharness.h +++ b/source/test/mbdstharness.h @@ -60,8 +60,8 @@ protected: uint32_t mubuf2[MAX_TU_SIZE]; uint16_t mushortbuf1[MAX_TU_SIZE]; - int 
int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE]; - int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE]; + int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE]; + int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE]; bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt); bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt); diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp index bb6e0e6..5c91bdc 100644 --- a/source/test/pixelharness.cpp +++ b/source/test/pixelharness.cpp @@ -344,7 +344,7 @@ bool PixelHarness::check_downscale_t(downscale_t ref, downscale_t opt) return true; } -bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt) +bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt) { ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); @@ -359,8 +359,8 @@ bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t op int shift = (rand() % 7 + 1); int index = i % TEST_CASES; - checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE); - ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE); + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift); + ref(ref_dest, short_test_buff[index] + j, stride, shift); if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) return false; @@ -372,60 +372,7 @@ bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t op return true; } -bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt) -{ - ALIGN_VAR_16(int32_t, ref_dest[64 * 64]); - ALIGN_VAR_16(int32_t, opt_dest[64 * 64]); - - int j = 0; - intptr_t stride = STRIDE; - for (int i = 0; i < ITERS; i++) - { - int shift = (rand() % 7 + 1); - - int index = i % TEST_CASES; - checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride); - ref(ref_dest, short_test_buff[index] + j, stride, 
shift, (int)stride); - - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t))) - return false; - - reportfail(); - j += INCR; - } - - return true; -} - -bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt) -{ - ALIGN_VAR_16(int32_t, ref_dest[64 * 64]); - ALIGN_VAR_16(int32_t, opt_dest[64 * 64]); - - memset(ref_dest, 0xCD, sizeof(ref_dest)); - memset(opt_dest, 0xCD, sizeof(opt_dest)); - - int j = 0; - intptr_t stride = STRIDE; - for (int i = 0; i < ITERS; i++) - { - int shift = (rand() % 7 + 1); - - int index = i % TEST_CASES; - checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride); - ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride); - - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t))) - return false; - - reportfail(); - j += INCR; - } - - return true; -} - -bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt) +bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt) { ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); @@ -440,8 +387,8 @@ bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t op int shift = (rand() % 7 + 1); int index = i % TEST_CASES; - checked(opt, opt_dest, int_test_buff[index] + j, stride, shift); - ref(ref_dest, int_test_buff[index] + j, stride, shift); + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift); + ref(ref_dest, short_test_buff[index] + j, stride, shift); if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) return false; @@ -479,7 +426,7 @@ bool PixelHarness::check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt) return true; } -bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt) +bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt) { ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); @@ -494,8 +441,8 @@ bool 
PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt) int shift = (rand() % 7 + 1); int index = i % TEST_CASES; - checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE); - ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE); + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift); + ref(ref_dest, short_test_buff[index] + j, stride, shift); if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) return false; @@ -507,7 +454,7 @@ bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt) return true; } -bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt) +bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt) { ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); @@ -1308,50 +1255,40 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr } } - if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i]) + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i]) { - if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i])) + if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i])) { - printf("cvt16to32_shr failed!\n"); + printf("cpy2Dto1D_shl failed!\n"); return false; } } - if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i]) + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i]) { - if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i])) + if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i])) { - printf("cvt32to16_shl failed!\n"); + printf("cpy2Dto1D_shr failed!\n"); return false; } } - if ((i < BLOCK_64x64) && opt.copy_shl[i]) + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i]) { - if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i])) + if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i])) { - printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i); + printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i); return false; } } - } - - if 
(opt.cvt32to16_shr) - { - if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr)) + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i]) { - printf("cvt32to16 failed!\n"); - return false; - } - } - - if (opt.cvt16to32_shl) - { - if (!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl)) - { - printf("cvt16to32_shl failed!\n"); - return false; + if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i])) + { + printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i); + return false; + } } } @@ -1373,9 +1310,9 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr } } - if (opt.frame_init_lowres_core) + if (opt.frameInitLowres) { - if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core)) + if (!check_downscale_t(ref.frameInitLowres, opt.frameInitLowres)) { printf("downscale failed!\n"); return false; @@ -1445,15 +1382,6 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr } } - if (opt.copy_shr) - { - if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr)) - { - printf("copy_shr failed!\n"); - return false; - } - } - return true; } @@ -1674,42 +1602,35 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE); } - if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i]) + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i]) { - HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i); - REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4); + HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2)); } - if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i]) + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i]) { - HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i); - REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3); + HEADER("cpy2Dto1D_shr[%dx%d]", 
4 << i, 4 << i); + REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3); } - if ((i < BLOCK_64x64) && opt.copy_cnt[i]) + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i]) { - HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i); - REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE); + HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64); } - if ((i < BLOCK_64x64) && opt.copy_shl[i]) + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i]) { - HEADER("copy_shl[%dx%d]", 4 << i, 4 << i); - REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64); + HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64); } - } - - if (opt.cvt32to16_shr) - { - HEADER0("cvt32to16_shr"); - REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64); - } - - if (opt.cvt16to32_shl) - { - HEADER0("cvt16to32_shl"); - REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64); + if ((i < BLOCK_64x64) && opt.copy_cnt[i]) + { + HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE); + } } if (opt.weight_pp) @@ -1724,10 +1645,10 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi REPORT_SPEEDUP(opt.weight_sp, ref.weight_sp, (int16_t*)sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100); } - if (opt.frame_init_lowres_core) + if (opt.frameInitLowres) { HEADER0("downscale"); - REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64); + REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64); } if (opt.scale1D_128to64) @@ -1771,11 +1692,4 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi HEADER0("planecopy_cp"); 
REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2); } - - if (opt.copy_shr) - { - HEADER0("copy_shr"); - REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64); - } - } diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h index 1255d99..b35d958 100644 --- a/source/test/pixelharness.h +++ b/source/test/pixelharness.h @@ -80,13 +80,11 @@ protected: bool check_weightp(weightp_pp_t ref, weightp_pp_t opt); bool check_weightp(weightp_sp_t ref, weightp_sp_t opt); bool check_downscale_t(downscale_t ref, downscale_t opt); - bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt); - bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt); - bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt); - bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt); + bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt); + bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt); + bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt); + bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt); bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt); - bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt); - bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt); bool check_pixel_var(var_t ref, var_t opt); bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt); bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt); diff --git a/source/test/testpool.cpp b/source/test/testpool.cpp deleted file mode 100644 index 01f037b..0000000 --- a/source/test/testpool.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2013 x265 project - * - * Authors: Steve Borho - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free 
Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com - *****************************************************************************/ - -#include "common.h" -#include "threadpool.h" -#include "wavefront.h" -#include "threading.h" -#include "md5.h" -#include "PPA/ppa.h" - -#include -#include - -using namespace x265; - -struct CUData -{ - CUData() - { - memset(digest, 0, sizeof(digest)); - } - - unsigned char digest[16]; -}; - -struct RowData -{ - RowData() : active(false), curCol(0) {} - - Lock lock; - volatile bool active; - volatile int curCol; -}; - -// Create a fake frame class with manufactured data in each CU block. We -// need to create an MD5 hash such that each CU's hash includes the hashes -// of the blocks that would have HEVC data dependencies (left, top-left, -// top, top-right). This will give us one deterministic output hash. We -// then generate the same hash using the thread pool and wave-front parallelism -// to verify the thread-pool behavior and the wave-front schedule data -// structures. 
-class MD5Frame : public WaveFront -{ -private: - - CUData *cu; - RowData *row; - int numrows; - int numcols; - Event complete; - -public: - - MD5Frame(ThreadPool *pool) : WaveFront(pool), cu(0), row(0) {} - - virtual ~MD5Frame() - { - // ensure no threads are lingering on FindJob() before allowing - // this object's vtable to be destroyed - JobProvider::flush(); - - delete[] this->cu; - delete[] this->row; - } - - void initialize(int cols, int rows); - - void encode(); - - void processRow(int row, int threadid); -}; - -void MD5Frame::initialize(int cols, int rows) -{ - this->cu = new CUData[rows * cols]; - this->row = new RowData[rows]; - this->numrows = rows; - this->numcols = cols; - - if (!this->WaveFront::init(rows)) - { - assert(!"Unable to initialize job queue"); - } -} - -void MD5Frame::encode() -{ - this->JobProvider::enqueue(); - - this->WaveFront::enqueueRow(0); - - // NOTE: When EnableRow after enqueueRow at first row, we'd better call pokeIdleThread, it will release a thread to do job - this->WaveFront::enableRow(0); - this->m_pool->pokeIdleThread(); - - this->complete.wait(); - - this->JobProvider::dequeue(); - - unsigned int *outdigest = (unsigned int*)this->cu[this->numrows * this->numcols - 1].digest; - - std::stringstream ss; - - for (int i = 0; i < 4; i++) - { - ss << std::hex << outdigest[i]; - } - - if (ss.str().compare("da667b741a7a9d0ee862158da2dd1882")) - std::cout << "Bad hash: " << ss.str() << std::endl; -} - -void MD5Frame::processRow(int rownum, int) -{ - // Called by worker thread - RowData &curRow = this->row[rownum]; - - assert(rownum < this->numrows && rownum >= 0); - assert(curRow.curCol < this->numcols); - - while (curRow.curCol < this->numcols) - { - int id = rownum * this->numcols + curRow.curCol; - CUData &curCTU = this->cu[id]; - MD5 hash; - - // * Fake CTU processing * - PPAStartCpuEventFunc(encode_block); - memset(curCTU.digest, id, sizeof(curCTU.digest)); - hash.update(curCTU.digest, sizeof(curCTU.digest)); - if 
(curRow.curCol > 0) - hash.update(this->cu[id - 1].digest, sizeof(curCTU.digest)); - - if (rownum > 0) - { - if (curRow.curCol > 0) - hash.update(this->cu[id - this->numcols - 1].digest, sizeof(curCTU.digest)); - - hash.update(this->cu[id - this->numcols].digest, sizeof(curCTU.digest)); - if (curRow.curCol < this->numcols - 1) - hash.update(this->cu[id - this->numcols + 1].digest, sizeof(curCTU.digest)); - } - - hash.finalize(curCTU.digest); - PPAStopCpuEventFunc(encode_block); - - curRow.curCol++; - - if (curRow.curCol >= 2 && rownum < this->numrows - 1) - { - ScopedLock below(this->row[rownum + 1].lock); - - if (this->row[rownum + 1].active == false && - this->row[rownum + 1].curCol + 2 <= curRow.curCol) - { - // set active indicator so row is only enqueued once - // row stays marked active until blocked or done - this->row[rownum + 1].active = true; - this->WaveFront::enqueueRow(rownum + 1); - this->WaveFront::enableRow(rownum + 1); - } - } - - ScopedLock self(curRow.lock); - - if (rownum > 0 && - curRow.curCol < this->numcols - 1 && - this->row[rownum - 1].curCol < curRow.curCol + 2) - { - // row is blocked, quit job - curRow.active = false; - return; - } - } - - // * Row completed * - - if (rownum == this->numrows - 1) - this->complete.trigger(); -} - -int main(int, char **) -{ - ThreadPool *pool; - - PPA_INIT(); - - pool = ThreadPool::allocThreadPool(1); - { - MD5Frame frame(pool); - frame.initialize(60, 40); - frame.encode(); - } - pool->release(); - pool = ThreadPool::allocThreadPool(2); - { - MD5Frame frame(pool); - frame.initialize(60, 40); - frame.encode(); - } - pool->release(); - pool = ThreadPool::allocThreadPool(4); - { - MD5Frame frame(pool); - frame.initialize(60, 40); - frame.encode(); - } - pool->release(); - pool = ThreadPool::allocThreadPool(8); - { - MD5Frame frame(pool); - frame.initialize(60, 40); - frame.encode(); - } - pool->release(); - - return 0; -} diff --git a/source/x265.cpp b/source/x265.cpp index 474cea9..ab7c93f 100644 --- 
a/source/x265.cpp +++ b/source/x265.cpp @@ -37,7 +37,6 @@ /* Visual Leak Detector */ #include #endif -#include "PPA/ppa.h" #include #include @@ -155,6 +154,11 @@ static const struct option long_options[] = { "aq-strength", required_argument, NULL, 0 }, { "ipratio", required_argument, NULL, 0 }, { "pbratio", required_argument, NULL, 0 }, + { "qcomp", required_argument, NULL, 0 }, + { "qpstep", required_argument, NULL, 0 }, + { "ratetol", required_argument, NULL, 0 }, + { "cplxblur", required_argument, NULL, 0 }, + { "qblur", required_argument, NULL, 0 }, { "cbqpoffs", required_argument, NULL, 0 }, { "crqpoffs", required_argument, NULL, 0 }, { "rd", required_argument, NULL, 0 }, @@ -165,8 +169,10 @@ static const struct option long_options[] = { "no-lossless", no_argument, NULL, 0 }, { "no-signhide", no_argument, NULL, 0 }, { "signhide", no_argument, NULL, 0 }, - { "no-lft", no_argument, NULL, 0 }, - { "lft", no_argument, NULL, 0 }, + { "no-lft", no_argument, NULL, 0 }, /* DEPRECATED */ + { "lft", no_argument, NULL, 0 }, /* DEPRECATED */ + { "no-deblock", no_argument, NULL, 0 }, + { "deblock", required_argument, NULL, 0 }, { "no-sao", no_argument, NULL, 0 }, { "sao", no_argument, NULL, 0 }, { "no-sao-non-deblock", no_argument, NULL, 0 }, @@ -203,7 +209,8 @@ static const struct option long_options[] = { "lambda-file", required_argument, NULL, 0 }, { "b-intra", no_argument, NULL, 0 }, { "no-b-intra", no_argument, NULL, 0 }, - { "nr", required_argument, NULL, 0 }, + { "nr-intra", required_argument, NULL, 0 }, + { "nr-inter", required_argument, NULL, 0 }, { "stats", required_argument, NULL, 0 }, { "pass", required_argument, NULL, 0 }, { "slow-firstpass", no_argument, NULL, 0 }, @@ -268,8 +275,6 @@ struct CLIOptions void showHelp(x265_param *param); bool parse(int argc, char **argv, x265_param* param); bool parseQPFile(x265_picture &pic_org); - void readAnalysisFile(x265_picture* pic, x265_param*); - void writeAnalysisFile(x265_picture* pic, x265_param*); bool 
validateFanout(x265_param*); }; @@ -291,7 +296,7 @@ void CLIOptions::destroy() void CLIOptions::writeNALs(const x265_nal* nal, uint32_t nalcount) { - PPAScopeEvent(bitstream_write); + ProfileScopeEvent(bitstreamWrite); for (uint32_t i = 0; i < nalcount; i++) { bitstreamFile.write((const char*)nal->payload, nal->sizeBytes); @@ -335,11 +340,14 @@ void CLIOptions::printVersion(x265_param *param) void CLIOptions::showHelp(x265_param *param) { + int level = param->logLevel; x265_param_default(param); printVersion(param); -#define H0 printf #define OPT(value) (value ? "enabled" : "disabled") +#define H0 printf +#define H1 if (level >= X265_LOG_DEBUG) printf + H0("\nSyntax: x265 [options] infile [-o] outfile\n"); H0(" infile can be YUV or Y4M\n"); H0(" outfile is raw HEVC bitstream\n"); @@ -351,18 +359,18 @@ void CLIOptions::showHelp(x265_param *param) H0(" --log-level Logging level: none error warning info debug full. Default %s\n", logLevelNames[param->logLevel + 1]); H0(" --no-progress Disable CLI progress reports\n"); H0(" --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats)); - H0(" --csv Comma separated log file, log level >= 3 frame log, else one line per run\n"); + H1(" --csv Comma separated log file, log level >= 3 frame log, else one line per run\n"); H0("\nInput Options:\n"); H0(" --input Raw YUV or Y4M input file name. `-` for stdin\n"); - H0(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n"); + H1(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n"); H0(" --fps Source frame rate (float or num/denom), auto-detected if Y4M\n"); H0(" --input-res WxH Source picture size [w x h], auto-detected if Y4M\n"); - H0(" --input-depth Bit-depth of input file. Default 8\n"); - H0(" --input-csp Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n"); + H1(" --input-depth Bit-depth of input file. 
Default 8\n"); + H1(" --input-csp Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n"); H0("-f/--frames Maximum number of frames to encode. Default all\n"); H0(" --seek First frame to encode\n"); - H0(" --[no-]interlace Indicate input pictures are interlace fields in temporal order. Default progressive\n"); - H0(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n"); + H1(" --[no-]interlace Indicate input pictures are interlace fields in temporal order. Default progressive\n"); + H1(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n"); H0("\nQuality reporting metrics:\n"); H0(" --[no-]ssim Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim)); H0(" --[no-]psnr Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr)); @@ -381,33 +389,34 @@ void CLIOptions::showHelp(x265_param *param) H0("-p/--preset Trade off performance for compression efficiency. Default medium\n"); H0(" ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n"); H0("-t/--tune Tune the settings for a particular type of source or situation:\n"); - H0(" psnr, ssim, zerolatency, or fastdecode\n"); + H0(" psnr, ssim, grain, zerolatency, fastdecode or cbr\n"); H0("\nQuad-Tree size and depth:\n"); - H0("-s/--ctu <64|32|16> Maximum CU size (default: 64x64). Default %d\n", param->maxCUSize); + H0("-s/--ctu <64|32|16> Maximum CU size (WxH). Default %d\n", param->maxCUSize); H0(" --tu-intra-depth Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth); H0(" --tu-inter-depth Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth); - H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter)); - H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. 
Default %s\n", OPT(param->bEnableAMP)); H0("\nAnalysis:\n"); H0(" --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel); - H0(" --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %f\n", param->psyRd); - H0(" --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %f\n", param->psyRdoq); - H0(" --nr An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled\n"); - H0(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast)); + H0(" --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd); + H0(" --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %.1f\n", param->psyRdoq); H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip)); - H0(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode)); + H1(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode)); + H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast)); + H1(" --nr-intra An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n"); + H1(" --nr-inter An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n"); H0("\nCoding tools:\n"); H0("-w/--[no-]weightp Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred)); H0(" --[no-]weightb Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred)); H0(" --[no-]cu-lossless Consider lossless mode in CU RDO decisions. 
Default %s\n", OPT(param->bCULossless)); H0(" --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding)); - H0(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip)); + H1(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip)); H0("\nTemporal / motion search options:\n"); H0(" --me Motion search method dia hex umh star full. Default %d\n", param->searchMethod); H0("-m/--subme Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine); H0(" --merange Motion search range. Default %d\n", param->searchRange); H0(" --max-merge <1..5> Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand); - H0(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp)); + H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter)); + H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP)); + H1(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp)); H0("\nSpatial / intra options:\n"); H0(" --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing)); H0(" --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra)); @@ -422,57 +431,59 @@ void CLIOptions::showHelp(x265_param *param) H0(" --scenecut How aggressively to insert extra I-frames. 
Default %d\n", param->scenecutThreshold); H0(" --rc-lookahead Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth); H0(" --bframes Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes); - H0(" --bframe-bias Bias towards B frame decisions. Default %d\n", param->bFrameBias); + H1(" --bframe-bias Bias towards B frame decisions. Default %d\n", param->bFrameBias); H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive); H0(" --[no-]b-pyramid Use B-frames as references. Default %s\n", OPT(param->bBPyramid)); H0(" --ref max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences); - H0(" --qpfile Force frametypes and QPs for some or all frames\n"); - H0(" Format of each line: framenumber frametype QP\n"); - H0(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n"); - H0(" QPs are restricted by qpmin/qpmax.\n"); - H0("\nRate control, Quantization:\n"); + H1(" --qpfile Force frametypes and QPs for some or all frames\n"); + H1(" Format of each line: framenumber frametype QP\n"); + H1(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n"); + H1(" QPs are restricted by qpmin/qpmax.\n"); + H0("\nRate control, Adaptive Quantization:\n"); H0(" --bitrate Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate); - H0("-q/--qp QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n"); - H0(" --crf Quality-based VBR (0-51). Default %f\n", param->rc.rfConstant); - H0(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless)); - H0(" --crf-max With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax); - H0(" May cause VBV underflows!\n"); - H0(" --crf-min With CRF+VBV, limit RF to this value. 
Default %f\n", param->rc.rfConstantMin); - H0(" this specifies a minimum rate factor value for encode!\n"); + H1("-q/--qp QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n"); + H0(" --crf Quality-based VBR (0-51). Default %.1f\n", param->rc.rfConstant); + H1(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless)); + H1(" --crf-max With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax); + H1(" May cause VBV underflows!\n"); + H1(" --crf-min With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin); + H1(" this specifies a minimum rate factor value for encode!\n"); H0(" --vbv-maxrate Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate); H0(" --vbv-bufsize Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize); - H0(" --vbv-init Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %f\n", param->rc.vbvBufferInit); - H0(" --aq-mode Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode); - H0(" --aq-strength Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default %f\n", param->rc.aqStrength); - H0(" --[no-]cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree)); - H0(" --ipratio QP factor between I and P. Default %f\n", param->rc.ipFactor); - H0(" --pbratio QP factor between P and B. Default %f\n", param->rc.pbFactor); - H0(" --cbqpoffs Chroma Cb QP Offset. Default %d\n", param->cbQpOffset); - H0(" --crqpoffs Chroma Cr QP Offset. Default %d\n", param->crQpOffset); - H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n"); + H0(" --vbv-init Initial VBV buffer occupancy (fraction of bufsize or in kbits). 
Default %.2f\n", param->rc.vbvBufferInit); H0(" --pass Multi pass rate control.\n" " - 1 : First pass, creates stats file\n" " - 2 : Last pass, does not overwrite stats file\n" " - 3 : Nth pass, overwrites stats file\n"); + H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n"); H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass)); H0(" --analysis-mode save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode); H0(" --analysis-file Specify file name used for either dumping or reading analysis data.\n"); - H0(" --scaling-list Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n"); - H0(" --lambda-file Specify a file containing replacement values for the lambda tables\n"); - H0(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n"); - H0(" Blank lines and lines starting with hash(#) are ignored\n"); - H0(" Comma is considered to be white-space\n"); + H0(" --aq-mode Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode); + H0(" --aq-strength Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength); + H0(" --[no-]cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree)); + H1(" --ipratio QP factor between I and P. Default %.2f\n", param->rc.ipFactor); + H1(" --pbratio QP factor between P and B. Default %.2f\n", param->rc.pbFactor); + H1(" --qcomp Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress); + H1(" --ratetol Degree of rate fluctuation that can be tolerated. Default %.2f\n", param->rc.rateTolerance); + H1(" --cbqpoffs Chroma Cb QP Offset. Default %d\n", param->cbQpOffset); + H1(" --crqpoffs Chroma Cr QP Offset. 
Default %d\n", param->crQpOffset); + H1(" --scaling-list Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n"); + H1(" --lambda-file Specify a file containing replacement values for the lambda tables\n"); + H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n"); + H1(" Blank lines and lines starting with hash(#) are ignored\n"); + H1(" Comma is considered to be white-space\n"); H0("\nLoop filters (deblock and SAO):\n"); - H0(" --[no-]lft Enable Deblocking Loop Filter. Default %s\n", OPT(param->bEnableLoopFilter)); + H0(" --[no-]deblock Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter)); H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO)); - H0(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked)); + H1(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked)); H0("\nVUI options:\n"); H0(" --sar Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n"); H0(" Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n"); H0(" 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n"); H0(" 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of . Default %d\n", param->vui.aspectRatioIdc); - H0(" --crop-rect Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n"); - H0(" --overscan Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n"); + H1(" --crop-rect Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n"); + H1(" --overscan Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. 
Default undef\n"); H0(" --videoformat Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n"); H0(" --range Specify black level and range of luma and chroma signals as full or limited Default limited\n"); H0(" --colorprim Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n"); @@ -480,21 +491,25 @@ void CLIOptions::showHelp(x265_param *param) H0(" --transfer Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n"); H0(" smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n"); H0(" bt2020-10, bt2020-12. Default undef\n"); - H0(" --colormatrix Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n"); - H0(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n"); - H0(" --chromaloc Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField); + H1(" --colormatrix Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n"); + H1(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n"); + H1(" --chromaloc Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField); H0("\nBitstream options:\n"); H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI)); H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters)); - H0(" --[no-]hrd Enable HRD parameters signalling. Default %s\n", OPT(param->bEmitHRDSEI)); + H0(" --[no-]hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI)); H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders)); - H0(" --hash Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. 
Default %d\n", param->decodedPictureHashSEI); - H0("\nReconstructed video options (debugging):\n"); - H0("-r/--recon Reconstructed raw image YUV or Y4M output file name\n"); - H0(" --recon-depth Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n"); + H1(" --hash Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI); + H1("\nReconstructed video options (debugging):\n"); + H1("-r/--recon Reconstructed raw image YUV or Y4M output file name\n"); + H1(" --recon-depth Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n"); #undef OPT #undef H0 - printf("\n\nFull documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n"); +#undef H1 + + if (level < X265_LOG_DEBUG) + printf("\nUse --log-level full --help for a full listing\n"); + printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n"); exit(0); } @@ -510,7 +525,6 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param) const char *preset = NULL; const char *tune = NULL; const char *profile = NULL; - const char *analysisfn = "x265_analysis.dat"; if (argc <= 1) { @@ -603,7 +617,6 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param) OPT("profile") profile = optarg; /* handled last */ OPT("preset") /* handled above */; OPT("tune") /* handled above */; - OPT("analysis-file") analysisfn = optarg; OPT("qpfile") { this->qpfile = fopen(optarg, "rb"); @@ -751,163 +764,9 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param) x265_log(NULL, X265_LOG_ERROR, "failed to open bitstream file <%s> for writing\n", bitstreamfn); return true; } - - if (param->analysisMode) - { - const char *mode = param->analysisMode == X265_ANALYSIS_SAVE ? 
"wb" : "rb"; - this->analysisFile = fopen(analysisfn, mode); - if (!this->analysisFile) - { - x265_log(NULL, X265_LOG_ERROR, "failed to open analysis file %s\n", analysisfn); - return true; - } - } - return false; } -bool CLIOptions::validateFanout(x265_param *param) -{ -#define CMP_OPT_FANOUT(opt, param_val)\ - {\ - bErr = 0;\ - p = strstr(paramBuf, opt "=");\ - char* q = strstr(paramBuf, "no-"opt);\ - if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\ - bErr = 1;\ - else if (!param_val && !q)\ - bErr = 1;\ - else if (param_val && (q || !strstr(paramBuf, opt)))\ - bErr = 1;\ - if (bErr)\ - {\ - x265_log(param, X265_LOG_ERROR, "different " opt " setting than given in analysis file (%d vs %d)\n", param_val, i);\ - X265_FREE(paramBuf);\ - return false;\ - }\ - } - - char *p = NULL, *paramBuf; - int i, j; - uint32_t k , l; - bool bErr = false; - - paramBuf = X265_MALLOC(char, MAXPARAMSIZE); - if (!paramBuf) - return false; - - fread(paramBuf, 1, MAXPARAMSIZE, this->analysisFile); - - /* check whether fanout options are compatible */ - if (strncmp(paramBuf, "#options:", 9)) - { - x265_log(param, X265_LOG_ERROR, "options list in analysis file is not valid\n"); - X265_FREE(paramBuf); - return false; - } - - char* buf = strchr(paramBuf, '\n'); - if (!buf) - { - x265_log(param, X265_LOG_ERROR, "Malformed analysis file\n"); - X265_FREE(paramBuf); - return false; - } - *buf = '\0'; - fseek(this->analysisFile, (int)strlen(paramBuf) + 1, SEEK_SET); - - if (sscanf(paramBuf, "#options: %dx%d", &i, &j) != 2) - { - x265_log(param, X265_LOG_ERROR, "Resolution specified in analysis file is not valid\n"); - X265_FREE(paramBuf); - return false; - } - if ((p = strstr(paramBuf, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2) - { - x265_log(param, X265_LOG_ERROR, "fps specified in analysis file is not valid\n"); - X265_FREE(paramBuf); - return false; - } - if (k != param->fpsNum || l != param->fpsDenom) - { - x265_log(param, X265_LOG_ERROR, "fps mismatch than given in 
analysis file (%u/%u vs %u/%u)\n", - param->fpsNum, param->fpsDenom, k, l); - X265_FREE(paramBuf); - return false; - } - - CMP_OPT_FANOUT("bitdepth", param->internalBitDepth); - CMP_OPT_FANOUT("weightp", param->bEnableWeightedPred); - CMP_OPT_FANOUT("bframes", param->bframes); - CMP_OPT_FANOUT("b-pyramid", param->bBPyramid); - CMP_OPT_FANOUT("b-adapt", param->bFrameAdaptive); - CMP_OPT_FANOUT("open-gop", param->bOpenGOP); - CMP_OPT_FANOUT("keyint", param->keyframeMax); - CMP_OPT_FANOUT("min-keyint", param->keyframeMin); - CMP_OPT_FANOUT("scenecut", param->scenecutThreshold); - CMP_OPT_FANOUT("ctu", (int)param->maxCUSize); - CMP_OPT_FANOUT("ref", param->maxNumReferences); - CMP_OPT_FANOUT("rc-lookahead", param->lookaheadDepth); - -#undef CMP_OPT_FANOUT - - X265_FREE(paramBuf); - return true; -} - -void CLIOptions::readAnalysisFile(x265_picture* pic, x265_param* p) -{ - int poc, width, height; - uint32_t numPart, numCU; - fread(&width, sizeof(int), 1, this->analysisFile); - fread(&height, sizeof(int), 1, this->analysisFile); - fread(&poc, sizeof(int), 1, this->analysisFile); - fread(&pic->sliceType, sizeof(int), 1, this->analysisFile); - fread(&numCU, sizeof(int), 1, this->analysisFile); - fread(&numPart, sizeof(int), 1, this->analysisFile); - - if (poc != pic->poc || width != p->sourceWidth || height != p->sourceHeight) - { - x265_log(NULL, X265_LOG_WARNING, "Error in reading intra-inter data.\n"); - x265_free_analysis_data(pic); - return; - } - - fread(pic->analysisData.intraData->depth, - sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); - fread(pic->analysisData.intraData->modes, - sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); - fread(pic->analysisData.intraData->partSizes, - sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); - fread(pic->analysisData.intraData->poc, - sizeof(int), 
pic->analysisData.numCUsInFrame, this->analysisFile); - fread(pic->analysisData.intraData->cuAddr, - sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile); - fread(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile); -} - -void CLIOptions::writeAnalysisFile(x265_picture* pic, x265_param *p) -{ - uint64_t seekTo = pic->poc * this->analysisRecordSize + this->analysisHeaderSize; - fseeko(this->analysisFile, seekTo, SEEK_SET); - fwrite(&p->sourceWidth, sizeof(int), 1, this->analysisFile); - fwrite(&p->sourceHeight, sizeof(int), 1, this->analysisFile); - fwrite(&pic->poc, sizeof(int), 1, this->analysisFile); - fwrite(&pic->sliceType, sizeof(int), 1, this->analysisFile); - fwrite(&pic->analysisData.numCUsInFrame, sizeof(int), 1, this->analysisFile); - fwrite(&pic->analysisData.numPartitions, sizeof(int), 1, this->analysisFile); - - fwrite(pic->analysisData.intraData->depth, - sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); - fwrite(pic->analysisData.intraData->modes, - sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); - fwrite(pic->analysisData.intraData->partSizes, - sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); - fwrite(pic->analysisData.intraData->poc, sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile); - fwrite(pic->analysisData.intraData->cuAddr, sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile); - fwrite(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile); -} - bool CLIOptions::parseQPFile(x265_picture &pic_org) { int32_t num = -1, qp, ret; @@ -948,7 +807,7 @@ int main(int argc, char **argv) // This uses Microsoft's proprietary WCHAR type, but this only builds on Windows to start with VLDSetReportOptions(VLD_OPT_REPORT_TO_DEBUGGER 
| VLD_OPT_REPORT_TO_FILE, L"x265_leaks.txt"); #endif - PPA_INIT(); + PROFILE_INIT(); x265_param *param = x265_param_alloc(); CLIOptions cliopt; @@ -979,7 +838,8 @@ int main(int argc, char **argv) x265_picture pic_orig, pic_out; x265_picture *pic_in = &pic_orig; - x265_picture *pic_recon = cliopt.recon ? &pic_out : NULL; + /* Allocate recon picture if analysisMode is enabled */ + x265_picture *pic_recon = (cliopt.recon || !!param->analysisMode) ? &pic_out : NULL; uint32_t inFrameCount = 0; uint32_t outFrameCount = 0; x265_nal *p_nal; @@ -1000,35 +860,12 @@ int main(int argc, char **argv) x265_picture_init(param, pic_in); - if (param->analysisMode && !pic_recon) - { - x265_log(NULL, X265_LOG_ERROR, "Must specify recon with analysis-mode option.\n"); - goto fail; - } if (param->analysisMode) { - if (param->analysisMode == X265_ANALYSIS_SAVE) + if (param->bDistributeModeAnalysis || param->bDistributeMotionEstimation) { - char *p = x265_param2string(param); - if (!p) - { - x265_log(NULL, X265_LOG_ERROR, "analysis: buffer allocation failure, aborting"); - goto fail; - } - uint32_t numCU = pic_in->analysisData.numCUsInFrame; - uint32_t numPart = pic_in->analysisData.numPartitions; - - cliopt.analysisRecordSize = ((sizeof(int) * 4 + sizeof(uint32_t) * 2) + sizeof(x265_inter_data) * numCU * 85 + - sizeof(uint8_t) * 2 * numPart * numCU + sizeof(char) * numPart * numCU + sizeof(int) * numCU + sizeof(uint32_t) * numCU); - - fprintf(cliopt.analysisFile, "#options: %s\n", p); - cliopt.analysisHeaderSize = ftell(cliopt.analysisFile); - X265_FREE(p); - } - else - { - if (!cliopt.validateFanout(param)) - goto fail; + x265_log(NULL, X265_LOG_ERROR, "Analysis load/save options incompatible with pmode/pme"); + goto fail; } } @@ -1069,13 +906,6 @@ int main(int argc, char **argv) ditherImage(*pic_in, param->sourceWidth, param->sourceHeight, errorBuf, X265_DEPTH); pic_in->bitDepth = X265_DEPTH; } - if (param->analysisMode) - { - x265_alloc_analysis_data(pic_in); - - if 
(param->analysisMode == X265_ANALYSIS_LOAD) - cliopt.readAnalysisFile(pic_in, param); - } } int numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, pic_in, pic_recon); @@ -1085,15 +915,9 @@ int main(int argc, char **argv) break; } outFrameCount += numEncoded; - if (numEncoded && pic_recon) - { - cliopt.recon->writePicture(pic_out); - if (param->analysisMode == X265_ANALYSIS_SAVE) - cliopt.writeAnalysisFile(pic_recon, param); - if (param->analysisMode) - x265_free_analysis_data(pic_recon); - } + if (numEncoded && pic_recon && cliopt.recon) + cliopt.recon->writePicture(pic_out); if (nal) cliopt.writeNALs(p_nal, nal); @@ -1106,15 +930,8 @@ int main(int argc, char **argv) { uint32_t numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon); outFrameCount += numEncoded; - if (numEncoded && pic_recon) - { + if (numEncoded && pic_recon && cliopt.recon) cliopt.recon->writePicture(pic_out); - if (param->analysisMode == X265_ANALYSIS_SAVE) - cliopt.writeAnalysisFile(pic_recon, param); - if (param->analysisMode) - x265_free_analysis_data(pic_recon); - } - if (nal) cliopt.writeNALs(p_nal, nal); diff --git a/source/x265.def.in b/source/x265.def.in index e78bfc1..9e964f6 100644 --- a/source/x265.def.in +++ b/source/x265.def.in @@ -9,8 +9,6 @@ x265_param_free x265_picture_init x265_picture_alloc x265_picture_free -x265_alloc_analysis_data -x265_free_analysis_data x265_param_apply_profile x265_max_bit_depth x265_version_str diff --git a/source/x265.h b/source/x265.h index e5474b7..f7c2360 100644 --- a/source/x265.h +++ b/source/x265.h @@ -88,36 +88,16 @@ typedef struct x265_nal uint8_t* payload; } x265_nal; -/* Stores inter (motion estimation) analysis data for a single frame */ -typedef struct x265_inter_data -{ - uint32_t zOrder; - int ref[2]; - int costZero[2]; - int16_t mvx[2]; - int16_t mvy[2]; - uint32_t depth; - int poc; - uint32_t cuAddr; -} x265_inter_data; - -/* Stores intra (motion estimation) analysis data for a single frame */ -typedef struct 
x265_intra_data -{ - uint8_t* depth; - uint8_t* modes; - char* partSizes; - int* poc; - uint32_t* cuAddr; -} x265_intra_data; - /* Stores all analysis data for a single frame */ typedef struct x265_analysis_data { - x265_inter_data* interData; - x265_intra_data* intraData; + uint32_t frameRecordSize; + int32_t poc; + int32_t sliceType; uint32_t numCUsInFrame; uint32_t numPartitions; + void* interData; + void* intraData; } x265_analysis_data; /* Used to pass pictures into the encoder, and to get picture data back out of @@ -290,7 +270,6 @@ typedef enum #define X265_ANALYSIS_OFF 0 #define X265_ANALYSIS_SAVE 1 #define X265_ANALYSIS_LOAD 2 - typedef struct { int planes; @@ -419,7 +398,7 @@ typedef struct x265_param * per-slice statistics to this log file in encode order. Otherwise the * encoder will emit per-stream statistics into the log file when * x265_encoder_log is called (presumably at the end of the encode) */ - const char *csvfn; + char *csvfn; /* Enable the generation of SEI messages for each encoded frame containing * the hashes of the three reconstructed picture planes. Most decoders will @@ -504,13 +483,13 @@ typedef struct x265_param /* The additional depth the residual quadtree is allowed to recurse beyond * the coding quadtree, for inter coded blocks. This must be between 1 and - * 3. The higher the value the more efficiently the residual can be + * 4. The higher the value the more efficiently the residual can be * compressed by the DCT transforms, at the expense of much more compute */ uint32_t tuQTMaxInterDepth; /* The additional depth the residual quadtree is allowed to recurse beyond * the coding quadtree, for intra coded blocks. This must be between 1 and - * 3. The higher the value the more efficiently the residual can be + * 4. 
The higher the value the more efficiently the residual can be * compressed by the DCT transforms, at the expense of much more compute */ uint32_t tuQTMaxIntraDepth; @@ -664,7 +643,7 @@ typedef struct x265_param /* Enable the use of `coded block flags` (flags set to true when a residual * has been coded for a given block) to avoid intra analysis in likely skip - * blocks. Default is disabled */ + * blocks. Only applicable in RD levels 5 and 6. Default is disabled */ int bEnableCbfFastMode; /* Enable early skip decisions to avoid intra and inter analysis in likely @@ -712,9 +691,10 @@ typedef struct x265_param * buffer and use this analysis information to reduce the amount of work * the encoder must perform. Default X265_ANALYSIS_OFF */ int analysisMode; + /* Filename for analysisMode save/load. Default name is "x265_analysis.dat" */ + char* analysisFileName; /*== Coding tools ==*/ - /* Enable the implicit signaling of the sign bit of the last coefficient of * each transform unit. This saves one bit per TU at the expense of figuring * out which coefficient can be toggled with the least distortion. @@ -735,9 +715,17 @@ typedef struct x265_param /* Enable the deblocking loop filter, which improves visual quality by * reducing blocking effects at block edges, particularly at lower bitrates * or higher QP. When enabled it adds another CU row of reference lag, - * reducing frame parallelism effectiveness. Default is enabled */ + * reducing frame parallelism effectiveness. Default is enabled */ int bEnableLoopFilter; + /* deblocking filter tC offset [-6, 6] -6 light filter, 6 strong. 
+ * This is the coded div2 value, actual offset is doubled at use */ + int deblockingFilterTCOffset; + + /* deblocking filter Beta offset [-6, 6] -6 light filter, 6 strong + * This is the coded div2 value, actual offset is doubled at use */ + int deblockingFilterBetaOffset; + /* Enable the Sample Adaptive Offset loop filter, which reduces distortion * effects by adjusting reconstructed sample values based on histogram * analysis to better approximate the original samples. When enabled it adds @@ -769,9 +757,13 @@ typedef struct x265_param * regardless of this setting. */ int bIntraInBFrames; - /* An integer value in range of 100 to 1000, which denotes strength of noise - * reduction */ - int noiseReduction; + /* An integer value in range of 0 to 2000, which denotes strength of noise + * reduction in intra CUs. 0 means disabled */ + int noiseReductionIntra; + + /* An integer value in range of 0 to 2000, which denotes strength of noise + * reduction in inter CUs. 0 means disabled */ + int noiseReductionInter; /* The lossless flag enables true lossless coding, by bypassing scaling, * transform, quantization and in-loop filter processes. This is used for @@ -802,7 +794,7 @@ typedef struct x265_param int bitrate; /* The degree of rate fluctuation that x265 tolerates. Rate tolerance is used - * alongwith overflow (difference between actual and target bitrate), to adjust + * along with overflow (difference between actual and target bitrate), to adjust * qp. Default is 1.0 */ double rateTolerance; @@ -824,12 +816,12 @@ typedef struct x265_param double rfConstant; /* Enable adaptive quantization. This mode distributes available bits between all - * macroblocks of a frame, assigning more bits to low complexity areas. Turning + * CTUs of a frame, assigning more bits to low complexity areas. Turning * this ON will usually affect PSNR negatively, however SSIM and visual quality - * generally improves. Default: X265_AQ_AUTO_VARIANCE */ + * generally improves. 
Default: X265_AQ_VARIANCE */ int aqMode; - /* Sets the strength of AQ bias towards low detail macroblocks. Valid only if + /* Sets the strength of AQ bias towards low detail CTUs. Valid only if * AQ is enabled. Default value: 1.0. Acceptable values between 0.0 and 3.0 */ double aqStrength; @@ -856,14 +848,15 @@ typedef struct x265_param /* In CRF mode, minimum CRF as caused by VBV */ double rfConstantMin; - /* Two pass (INCOMPLETE) */ + /* Multi-pass encoding */ /* Enable writing the stats in a multipass encode to the stat output file */ int bStatWrite; /* Enable loading data from the stat input file in a multi pass encode */ int bStatRead; - /* Filename of the 2pass output/input stats file */ + /* Filename of the 2pass output/input stats file, if unspecified the + * encoder will default to using x265_2pass.log */ char* statFileName; /* temporally blur quants */ @@ -991,7 +984,7 @@ void x265_setup_primitives(x265_param *param, int cpu); * special in any way, but using this method together with x265_param_free() * and x265_param_parse() to set values by name allows the application to treat * x265_param as an opaque data struct for version safety */ -x265_param *x265_param_alloc(); +x265_param *x265_param_alloc(void); /* x265_param_free: * Use x265_param_free() to release storage for an x265_param instance @@ -1039,7 +1032,7 @@ static const char * const x265_preset_names[] = { "ultrafast", "superfast", "ver * 100 times faster than placebo! * * Currently available tunings are: */ -static const char * const x265_tune_names[] = { "psnr", "ssim", "zerolatency", "fastdecode", 0 }; +static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "cbr", 0 }; /* returns 0 on success, negative on failure (e.g. invalid preset/tune name). 
*/ int x265_param_default_preset(x265_param *, const char *preset, const char *tune); @@ -1049,22 +1042,12 @@ int x265_param_default_preset(x265_param *, const char *preset, const char *tune * special in any way, but using this method together with x265_picture_free() * and x265_picture_init() allows some version safety. New picture fields will * always be added to the end of x265_picture */ -x265_picture *x265_picture_alloc(); +x265_picture *x265_picture_alloc(void); /* x265_picture_free: * Use x265_picture_free() to release storage for an x265_picture instance * allocated by x265_picture_alloc() */ void x265_picture_free(x265_picture *); - -/* x265_alloc_analysis_data: - * Allocate memory to hold analysis data, returns 0 on success else negative */ -int x265_alloc_analysis_data(x265_picture*); - -/* x265_free_analysis_data: - * Use x265_free_analysis_data to release storage of members allocated by - * x265_alloc_analysis_data */ -void x265_free_analysis_data(x265_picture*); - /*** * Initialize an x265_picture structure to default values. It sets the pixel * depth and color space to the encoder's internal values and sets the slice