+++ /dev/null
-doc/uncrustify/uncrustify.exe
-build/
-*.rej
-*.orig
-*.hevc
-*.yuv
-*.y4m
-*.out
-*.swp
-.DS_Store
-.pc
+++ /dev/null
-repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 5e604833c5aa605d0b6efbe5234492b5e7d8ac61
-branch: stable
-tag: 1.4
+++ /dev/null
-syntax: glob
-doc/uncrustify/uncrustify.exe
-build/
-**.rej
-**.orig
-**.hevc
-**.yuv
-**.y4m
-**.out
-**.swp
-.DS_Store
+++ /dev/null
-681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD1
-3ec4837e6f6c7159f438e1f537dff117c93ee139 LASTKNOWNGOOD2
-9a6800e84295db446fdce2e7f27059ec8ae838a7 LASTKNOWNGOOD
-99fab2ef92be051cd3b3b2d817064cead282b42c 0.1
-b3471d9009f5cd487b23c8c61a6bfff8980e54f2 0.2
-3767fbfa970ff4b2dc2e8647db0274168727147e 0.3
-2ba6ec553f218d2b06ad803b87d6ec751fd639f7 0.4
-93707bc4fccdaa89a1f2da11db8808ca912a691c 0.4.1
-69acb3cb777f977f5edde908069ac565915dd366 0.5
-b970ffbdd696e3ce45c93b315902eb6366ff085e 0.6
-d24e2a8c4326b0cd01bfa6c414c5378481af9018 0.7
-527d03c56d6860dc979ddea1196f7e94d13d3e82 0.8
-82bbd2bf3b49ba086be0f0922f91fe0084896351 0.9
-cea97c4d79456842e00ade6be6fd5ec34610e5f8 1.0
-ae9609aeebdc3271114168ece003679e9b1dca1b 1.1
-d6257335c5370ee54317a0426a12c1f0724b18b9 1.2
-c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3
+2014-12-23 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/rdcost.h, source/encoder/search.cpp:
+ rdcost: unify scaleChromaDist*()
+ [5f9f7194267b] [tip]
+
+2014-12-23 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/encoder/encoder.cpp:
+ encoder: allocate memory for inter and intra analysis data based on
+ slicetype
+ [9fdab427a191]
+
+ * source/encoder/analysis.cpp, source/encoder/analysis.h:
+ analysis: remove redundant argument in compressIntraCU
+ [c4ec3f22846b]
+
+2014-12-20 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/search.cpp:
+ fix 4:4:4 rd<=1
+ [8d2f418829c8]
+
+2014-12-18 David T Yuen <dtyx265@gmail.com>
+
+ * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm,
+ source/common/x86/dct8.h:
+	 asm: idct[8x8] sse2: 12232 -> 3500 over c code, 3550 -> 3500 over
+	 intrinsic
+ [7b816fdb393d]
+
+2014-12-17 Steve Borho <steve@borho.org>
+
+ * source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp:
+ ppa: emit one event per CTU for more clarity, disable frame threads
+ events
+
+ The frame threads are generally uninteresting when WPP is in use
+ [78ae7996a1ce]
+
+ * source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp,
+ source/encoder/framefilter.cpp, source/encoder/slicetype.cpp,
+ source/x265.cpp:
+ ppa: refine event names
+
+ Drop the unused names, remove uninteresting events. Try to cover the
+ main thread pool tasks and the frame encoder times.
+ [6cbd7d26b2a1]
+
+ * source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h:
+ ppa: simplify interfaces, enforce coding style
+ [952a2a361fcb]
+
+ * source/common/common.h, source/encoder/analysis.cpp,
+ source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp,
+ source/x265.cpp:
+ ppa: minimize code foot-print of profiling events
+
+ This will allow us to add support for more profiling systems without
+ littering the code
+ [3315d6c0ced1]
+
+ * doc/reST/cli.rst, source/x265.h:
+ doc: improve documentation for --stats and multi-pass in general
+ [42fb030a4c43]
+
+2014-12-16 Min Chen <chenm003@163.com>
+
+ * source/encoder/nal.cpp:
+ fix: output wrong WppEntryOffset when emulating start code at end of
+ WPP row
+ [295d033cb091]
+
+2014-12-16 Aasaipriya Chandran <aasaipriya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_hpp[16x16] for colorspace i420 in avx2 improve
+ 1540c->969c
+ [775ebb4694ad]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_hpp[32x32] for colorspace i420 in avx2 improve
+ 6189c->3537c
+ [619c0e654f5b]
+
+2014-12-13 Steve Borho <steve@borho.org>
+
+ * source/encoder/api.cpp, source/encoder/encoder.cpp,
+ source/encoder/encoder.h:
+ encoder: combine create() and init() functions
+
+	 They were always called back-to-back and their functionality was
+	 not distinct. It also now checks for abort errors at startup and
+	 returns NULL from the encoder open function (early aborts are
+	 usually malloc failures)
+ [6ba7be7b1697]
+
+ * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake:
+ cmake: eoln and white-space fixes, slight refactor
+ [ee36b6311aaf]
+
+2014-12-12 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.h:
+ analysis: typo
+ [d00a5b93c07e]
+
+ * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake:
+ cmake: allow position independent code to be generally configurable
+ (fixes #91)
+
+ Allow the builder to over-ride the default
+ [afdcb68dace4]
+
+2014-12-11 Steve Borho <steve@borho.org>
+
+ * source/encoder/entropy.cpp, source/encoder/entropy.h:
+ entropy: add methods to estimate CU mode decision costs
+ [e0374c37e745]
+
+2014-12-12 Steve Borho <steve@borho.org>
+
+ * source/common/pixel.cpp:
+ pixel: nits
+ [750839e8e0cf]
+
+ * doc/reST/cli.rst, source/common/param.cpp, source/x265.h:
+ api: change default AQ mode to 1
+
+ We've received a lot of feedback that AQ mode 2 is often
+ problematic, but AQ mode 1 is generally safe and useful.
+ [cbf5cad2e12b]
+
+2014-12-12 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vps[4x4] in avx2: improve 337c->219c
+ [6f770a6b24f0]
+
+2014-12-11 Steve Borho <steve@borho.org>
+
+ * build/README.txt:
+ build: update README to not be so specific about yasm 1.2.0
+ [b1c2ef980dfe]
+
+2014-12-10 Steve Borho <steve@borho.org>
+
+ * source/encoder/reference.cpp:
+ reference: avoid weighting pixels when plane is unweighted
+
+ Just because the luma plane is weighted does not mean either of the
+ chroma planes are also weighted. If the weight parameters for a
+ given plane are not present, then just directly use the un-weighted
+ reference plane.
+ [ae50be4c3a6e]
+
+2014-12-11 Aasaipriya Chandran <aasaipriya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_hpp[4x4] for colorspace i420 in avx2 improve 217c->192c
+ [667e4ea0899f]
+
+2014-12-10 Steve Borho <steve@borho.org>
+
+ * doc/reST/cli.rst:
+ doc: describe what happens when psy-rd is too high for bitrate
+ [9c3b478a60b2]
+
+2014-12-10 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_vpp[32x32] for colorspace i420 in avx2: improve
+ 3881c->1933c
+ [04d145864dd6]
+
+2014-12-10 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.cpp:
+ analysis: avoid redundant MC work
+ [9e244ebe21d2]
+
+ * source/encoder/analysis.cpp:
+ analysis: fix chroma predictions for 2Nx2N bidir at zero mv
+
+ Valgrind discovered that the chroma predictions were not in fact
+ predicted
+ [0dc816f49c01]
+
+ * source/x265.h:
+ api: add some blank lines
+ [ab1e1e0ca75c]
+
+2014-12-09 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h:
+ asm: chroma_vpp[4x4] for colorspace i422 in avx2: improve 228c->184c
+ [5f16dc82652a]
+
+2014-12-10 Steve Borho <steve@borho.org>
+
+ * source/common/lowres.cpp, source/common/lowres.h,
+ source/encoder/frameencoder.cpp, source/encoder/motion.cpp,
+ source/encoder/reference.cpp, source/encoder/reference.h,
+ source/encoder/slicetype.cpp:
+ reference: weight chroma planes of reference pictures if using
+ chroma satd
+ [6c32c8d4e0a1]
+
+2014-12-08 Steve Borho <steve@borho.org>
+
+ * doc/reST/cli.rst, source/encoder/analysis.cpp,
+ source/encoder/frameencoder.cpp, source/encoder/motion.cpp,
+ source/encoder/motion.h, source/encoder/search.cpp,
+ source/encoder/slicetype.cpp:
+ motion: chroma ME [CHANGES OUTPUTS]
+
+ include chroma distortion in satd decisions when --subme > 2 and
+ chroma blocks are multiples of 4x4
+
+ This required making the MotionEstimate class more aware of PicYuv
+ and its indexing scheme so that it could find the correct chroma
+ pixels to interpolate. This allowed me to merge the setSourcePlane()
+ method into the lookahead's version of setSourcePU.
+
+ This requires further work. The Reference class needs to generate
+ weighted chroma planes if subpel refine will use chroma residual
+ cost. Until this is fixed, the chroma subpel steps will use
+ unweighted reference pixels.
+ [afd5620c77a4]
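+
+	 A minimal C++ sketch of the gating described above, with
+	 illustrative names (not x265's own): chroma distortion joins the
+	 satd decision only when --subme > 2 and the chroma block is a
+	 multiple of 4x4.
+
+	     static bool useChromaSatd(int subme, int cWidth, int cHeight)
+	     {
+	         // chroma satd is 4x4 based; non-4x4-multiple blocks skip it
+	         return subme > 2 && !(cWidth & 3) && !(cHeight & 3);
+	     }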
+
+2014-12-09 Steve Borho <steve@borho.org>
+
+ * source/common/pixel.cpp, source/common/primitives.cpp:
+ primitives: use NULL chroma satd func pointers for blocks not
+ capable of satd
+
+ If the block is not a multiple of 4x4, then chroma satd measurements
+ are not possible, so we will disable chroma residual measurements
+ for these block sizes (and thus only measure luma residual)
+ [4c97d85c8488]
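+
+	 A sketch of the dispatch pattern described above (the typedef and
+	 names are illustrative, not the actual x265 primitive tables): the
+	 chroma satd entry stays NULL for blocks that are not 4x4
+	 multiples, and callers fall back to luma-only distortion when it
+	 is NULL.
+
+	     #include <cstdint>
+	     #include <cstddef>
+
+	     typedef int (*satd_t)(const uint8_t*, intptr_t,
+	                           const uint8_t*, intptr_t);
+
+	     static satd_t chromaSatdFor(satd_t satd, int w, int h)
+	     {
+	         // NULL means "no chroma satd possible at this block size"
+	         return (!(w & 3) && !(h & 3)) ? satd : NULL;
+	     }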
+
+ * source/common/primitives.cpp:
+ primitives: use luma satd functions for chroma, where applicable
+
+ The commented lines should be considered TODO items for the assembly
+ team
+ [29489f2fc2c7]
+
+ * source/common/pixel.cpp, source/common/primitives.h:
+ primitives: add a chroma satd table that is indexed by luma
+ partition
+
+ There are a number of chroma partitions that have dimensions of 2 or
+ 6 and those cannot use satd (which is 4x4 based), so we degrade them
+ down to SAD which makes me unhappy.
+ [47c490836fd8]
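+
+	 A compilable sketch of the table construction (partition count and
+	 names are illustrative): entries whose chroma dimensions are not
+	 4x4 multiples (e.g. widths of 2 or 6) degrade to the SAD
+	 primitive.
+
+	     enum { NUM_PARTS = 25 };  // toy stand-in for the partition count
+	     typedef int (*dist_t)(const short*, const short*);
+
+	     static void buildChromaDistTable(dist_t out[NUM_PARTS],
+	                                      const dist_t satd[NUM_PARTS],
+	                                      const dist_t sad[NUM_PARTS],
+	                                      const int cw[NUM_PARTS],
+	                                      const int ch[NUM_PARTS])
+	     {
+	         for (int p = 0; p < NUM_PARTS; p++)
+	             out[p] = (!(cw[p] & 3) && !(ch[p] & 3)) ? satd[p] : sad[p];
+	     }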
+
+2014-12-08 Steve Borho <steve@borho.org>
+
+ * source/common/lowres.h, source/encoder/reference.cpp,
+ source/encoder/reference.h:
+ reference: move reconPic pointer to base class so it is available to
+ ME
+ [dd55fd39745c]
+
+ * source/encoder/motion.cpp:
+ motion: sync argument names between the header and the cpp file
+ [e2b958539e6a]
+
+ * source/common/yuv.cpp:
+ yuv: fix size check in copyFromYuv
+
+ The target buffer needs to be as large as or larger than the source.
+ The fact that this check has never failed tells me all users of this
+ function have equal sized arguments.
+ [15be837edb36]
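+
+	 A sketch of the corrected invariant (names illustrative): the
+	 check becomes "destination at least as large as source" rather
+	 than a stricter comparison.
+
+	     #include <cassert>
+
+	     static void checkCopySizes(int dstSize, int srcSize)
+	     {
+	         // the target buffer must be as large as or larger than the source
+	         assert(dstSize >= srcSize);
+	     }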
+
+ * source/encoder/search.cpp:
+ search: rename index variable to puIdx for consistency
+ [1cab6a4c0ab8]
+
+ * source/common/yuv.cpp, source/common/yuv.h,
+ source/encoder/analysis.cpp, source/encoder/motion.cpp,
+ source/encoder/motion.h, source/encoder/search.cpp:
+ motion: add a version of setSourcePU which can accept fenc from
+ another Yuv
+
+ The analysis code has already gone through the trouble of loading
+ the CU's fenc pixels from the source picture into a much smaller Yuv
+ buffer with small strides. This allows us to avoid accessing the
+ fenc PicYuv in a performance critical portion of the encoder.
+
+ We utilize the Yuv class to copy the PU, since it already has logic
+ for calculating part offsets for luma and chroma
+ [1d1f803a3eec]
+
+ * source/encoder/motion.cpp, source/encoder/motion.h,
+ source/encoder/search.cpp, source/encoder/slicetype.cpp,
+ source/encoder/slicetype.h:
+ motion: use Yuv instance to hold fenc PU pixels (preparing for
+ chroma ME)
+
+ This required making an init function which accepts the encoder
+ color space. We use 4:0:0 for lookahead since it does not keep
+ chroma planes. Note that I explicitly renamed this Yuv instance
+ fencPUYuv to make sure people understand it is not a duplicate of
+ the fencYuv kept by the Analysis structure; it will often be a sub-
+ partition of the CU fenc yuv.
+ [e640c8461495]
+
+ * source/encoder/slicetype.cpp:
+ slicetype: cleanups - use bufSATD method where applicable
+ [b5b05c94ae7c]
+
+ * source/common/yuv.cpp:
+ yuv: plumb in support for mono-chrome YUV buffers
+
+ The need for this will be obvious in the next commit
+ [5a44d694ed9b]
+
+2014-12-09 Aasaipriya Chandran <aasaipriya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_hpp[8x8] for colorspace i420 in avx2 improve 530c->373c
+ [88498ec9b10b]
+
+2014-12-08 Steve Borho <steve@borho.org>
+
+ * source/common/x86/asm-primitives.cpp:
+ asm: fix x86 link errors
+ [b376435b31c1]
+
+2014-12-09 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_vpp[16x16] for colorspace i420 in avx2: improve
+ 998c->978c
+ [d042d1ea2d69]
+
+2014-12-05 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: chroma_vpp[8x8] for colorspace i420 in avx2: improve 338c->269c
+ [fee9fb1f9762]
+
+2014-12-06 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.h, source/encoder/analysis.cpp,
+ source/encoder/entropy.cpp, source/encoder/entropy.h,
+ source/encoder/search.cpp, source/encoder/search.h:
+ refine tuDepth related
+ [53f7efef5ebd]
+
+2014-12-05 Steve Borho <steve@borho.org>
+
+ * source/cmake/version.cmake:
+ cmake: do not use a cache string for version found in hg_archive.txt
+ (refs #84)
+
+ This was not passing the tagged version number to version.cpp
+ [35d086074bb5]
+
+2014-12-04 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/encoder/ratecontrol.cpp:
+	 rc: fix bug in deciding qp for first frame in CRF
+ [1458ad34157c]
+
+ * source/encoder/rdcost.h, source/encoder/sao.cpp:
+ rc: fix chroma qp and chroma lambda derivations.
+
+	 Fix the chroma qp values for the Main10 profile: derive chroma qp
+	 from luma qp values according to the HEVC spec. This improves
+	 quality at high QPs.
+ [a1e76461c0d4]
+
+2014-12-05 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/encoder/analysis.cpp:
+ analysis: comments
+ [4ae9691c1a23]
+
+2014-12-05 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/analysis.cpp:
+ fix chroma distortion for 4:2:2
+ [42df5c8bdb25]
+
+2014-12-04 Steve Borho <steve@borho.org>
+
+ * source/encoder/CMakeLists.txt:
+ cmake: disable idiotic uninitialized local variable warnings from VC
+
+ If the compiler is not going to make any minimal attempt to figure
+ out if a variable was initialized, I am not going to make any
+ attempt to look at their stupid warnings.
+ [c9fd35f97e6d]
+
+2014-12-04 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h:
+ asm: chroma_vpp[4x4] for colorspace i420 in avx2: improve 228c->184c
+ [23e637065aec]
+
+2014-12-04 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.cpp, source/encoder/analysis.h:
+ analysis: cache m_bChromaSa8d and reduce redundant work
+
+ Renamed some 'part' variables to 'puIdx' to avoid variable shadow
+ warnings and for consistency with search.cpp
+ [cc327e846dac]
+
+2014-12-04 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/encoder/analysis.cpp:
+ analysis: add chroma distortion to rdLevels 3 and 4
+
+ At these rdLevels, inter/bidir and merge candidate decisions were
+ being taken based on luma sa8dCost only. This will increase bitrate
+ and lower ssim slightly, with better subjective quality.
+
+ Also fixed some naming nits.
+ [1d2a11f6a33f]
+
+ * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp,
+ source/encoder/frameencoder.cpp, source/encoder/search.cpp,
+ source/x265.cpp, source/x265.h:
+ noiseReduction: allow separate strengths to be specified for intra
+ and inter CUs
+ [ec06f5878e8b]
+
+2014-12-04 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/common/x86/asm-primitives.cpp:
+ primitives: fix build error in refactor of chroma p2s primitive.
+ [511dde5ac1de]
+
+2014-12-03 Steve Borho <steve@borho.org>
+
+ * source/common/ipfilter.cpp, source/common/lowres.cpp,
+ source/common/pixel.cpp, source/common/predict.cpp,
+ source/common/primitives.cpp, source/common/primitives.h,
+ source/common/quant.cpp, source/common/shortyuv.cpp,
+ source/common/x86/asm-primitives.cpp, source/common/yuv.cpp,
+ source/encoder/search.cpp, source/test/ipfilterharness.cpp,
+ source/test/pixelharness.cpp:
+ primitives: cleanup EncoderPrimitives, refactor chroma p2s primitive
+
+ No behavior changes
+ [b1b5f06fe9ce]
+
+ * source/common/pixel.cpp, source/common/primitives.h:
+ primitives: remove unused chroma lowres primitive
+ [bfeee4ac5463]
+
+ * source/encoder/search.cpp:
+ search: avoid AMVP selection if both MVs are the same
+
+ This is a simple work avoidance optimization, should have no effect
+ on outputs
+ [2f66c3284c35]
+
+ * source/common/CMakeLists.txt, source/common/primitives.cpp:
+ cmake: remove buggy workarounds for partial SIMD support (fixes #92)
+
+ In the past, there were a number of primitives written in SIMD
+ intrinsics that could work without compiling with YASM. Most of
+ those are now gone, and we generally require YASM for SIMD support.
+	 This commit removes support for using the few remaining SIMD
+	 intrinsics without having YASM to provide implementations of
+	 x265_emms(), x265_cpu_cpuid(), etc. It fixes a bug in the process.
+ [d7b5e73fc91a]
+
+ * doc/reST/cli.rst:
+ doc: fix typo (closes #83)
+ [7192725cbb0a]
+
+ * doc/reST/cli.rst, source/common/param.cpp, source/x265.cpp,
+ source/x265.h:
+ param: allow NR values from 1..99, clarify docs (closes #87)
+ [21b869f9f706]
+
+ * doc/reST/Makefile, doc/reST/conf.py, doc/reST/x265.rst:
+ doc: add support for reST generated man-pages (closes #89)
+
+ This patch was attached to issue #89 by djcj
+ [ff08fd9b294c]
+
+ * source/common/constants.cpp:
+	 constants: adjust lambda tables for 10bit encodes (fixes #55)
+
+	 Since samples are 10 bits, with two bits of extra resolution added
+	 for more granularity, distortion also carries two extra bits. A
+	 typical resolution for this problem is to down-shift distortion by
+	 2 bits everywhere, before adding lambda * bits to calculate RD
+	 cost. Instead, we multiply lambda by 4 (essentially shifting it up
+	 by two bits) so distortion and lambda * bits are both at the
+	 higher scale.
+
+	 lambda2 uses the square of the up-shifted lambda, so it has the
+	 doubled up-shift, matching the squared distortion values used for
+	 RDO.
+
+	 Example output change:
+	 ./x265 /Volumes/video/sintel_trailer_2k_480p24.y4m o.bin --ssim --no-info
+
+	 Main:          195.67 kb/s, SSIM Mean Y: 0.9833338 (17.782 dB)
+	 Main10 before: 363.49 kb/s, SSIM Mean Y: 0.9888182 (19.515 dB)
+	 Main10 after:  206.54 kb/s, SSIM Mean Y: 0.9855121 (18.390 dB)
+ [014a1e0fb58b]
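+
+	 A minimal sketch of the arithmetic above (variable names assumed):
+	 lambda is up-shifted by the two extra distortion bits, and lambda2
+	 by twice that, matching squared distortion.
+
+	     static void scaleLambdaForMain10(double& lambda, double& lambda2)
+	     {
+	         lambda  *= 4;   /* << 2: matches linear (SAD/SATD) distortion */
+	         lambda2 *= 16;  /* << 4: matches squared (SSE) distortion */
+	     }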
+
+2014-12-03 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/encoder/encoder.cpp:
+ encoder: fix binary mismatch for analysis load vs save with same
+ bitrate
+ [50d2b92ecc89]
+
+2014-12-02 Steve Borho <steve@borho.org>
+
+ * Merge
+ [de54cffaecf2]
+
+2014-11-27 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c
+ [2e055cbc9046]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[12x16] in avx2: improve 1977c->1418c
+ [ef4ca8474f5c]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[24x32] in avx2: improve 5637c->3695c
+ [8aeeaf6950f7]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[48x64] in avx2: improve 21298c->14696c
+ [d97b1c9f5106]
+
+2014-12-02 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/x265.cpp:
+ x265: add ratetol to command line help
+ [f636a0aadd68]
+
+2014-12-01 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/CMakeLists.txt, source/encoder/encoder.cpp, source/x265.h:
+ encoder: free csv file name
+
+ Since strdup is used uniformly for filenames, csvfn cannot be const.
+ [bde1753de250]
+
+2014-11-27 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c
+ [5ee693e4b5fa]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[12x16] in avx2: improve 1977c->1418c
+ [e280ce2e5076]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[24x32] in avx2: improve 5637c->3695c
+ [e1ca311bbb5b]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[48x64] in avx2: improve 21298c->14696c
+ [984271a3aae9]
+
+2014-11-30 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/x265.cpp:
+ x265: remove validateFanout
+ [d9f835ddd112]
+
+2014-11-27 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/pixel.cpp, source/common/primitives.h,
+ source/common/quant.cpp, source/common/x86/asm-primitives.cpp,
+ source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h,
+ source/test/pixelharness.cpp, source/test/pixelharness.h:
+ primitives: refactor tskip related
+ [90401d77a05d]
+
+2014-11-28 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/dct.cpp, source/common/quant.h,
+ source/common/x86/dct8.asm, source/common/x86/dct8.h,
+ source/encoder/search.cpp:
+ nits
+ [e2db5f3c6df8]
+
+2014-11-28 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/common/param.cpp:
+ param: disable b-intra in B frames when tune grain is true.
+ [d32249002258]
+
+2014-11-25 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/encoder/encoder.h:
+ encoder: make all member fields public
+ [af6b68f0feaa]
+
+2014-11-26 Steve Borho <steve@borho.org>
+
+ * doc/reST/cli.rst, doc/reST/presets.rst:
+ doc: restructure documentation with better grouping, improve cross-
+ refs
+ [dfe0803ae6be]
+
+ * doc/reST/introduction.rst:
+ doc: fix a sphinx build warning
+ [f488b394693b]
+
+ * doc/reST/presets.rst:
+ doc: improve readability of film grain section
+ [03bd64057e72]
+
+ * doc/reST/cli.rst, doc/reST/presets.rst:
+ doc: add cbr to the list of tunings, add helpful cross-refs
+ [071dbe651364]
+
+2014-11-27 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/CMakeLists.txt, source/common/param.cpp,
+ source/encoder/ratecontrol.cpp, source/x265.cpp, source/x265.h:
+ rc: introduce cli option to tune for cbr.
+ [8e602ed5ca4c]
+
+2014-11-25 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/encoder/ratecontrol.cpp:
+ rc: improve the frame size planning with respect to vbv buffer
+ occupancy and the lookahead window.
+ [2870269cdd60]
+
+ * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h:
+ rc: adjust qp for B frames from ABR feedback in case of CBR.
+
+ limits the bitrate fluctuation for CBR with respect to the target
+ bitrate set.
+ [576c675adf92]
+
+ * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h:
+ rc: limit bit amortization in ABR to longer sequences
+ [11342c8376dd]
+
+2014-11-26 Steve Borho <steve@borho.org>
+
+ * source/encoder/ratecontrol.cpp:
+ rc: use c-style typecasts
+ [c67b4f3a5e3c]
+
+2014-11-19 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/encoder/ratecontrol.cpp:
+ rc: tune midframe vbv logic for B frames
+ [8f5fa9538e13]
+
+2014-11-21 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/encoder/slicetype.cpp:
+ slicetype: fix vbv lookahead data collection for all frames within
+ the lookahead window.
+ [52246e09727d]
+
+2014-11-26 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_hpp[8x8, 8x16, 8x32] in avx2: improve 623c->523c,
+ 1384c->1083c, 2555c->2058c
+ [01d82aa06285]
+
+2014-11-26 Aasaipriya Chandran <aasaipriya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+	 Luma_hpp[48x64] avx2 asm code: improved 25053c->17882c
+ [bb7303bb00d1]
+
+2014-11-26 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_hpp[8x4] in avx2: improve 357c->261c
+ [a88ddc970748]
+
+2014-11-26 Aasaipriya Chandran <aasaipriya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+	 Luma_hpp[32x8, 32x16, 32x24, 32x32, 32x64] avx2 asm code:
+ improved 2032c->1556c, 4238c->3014c, 6696c->4801c, 8697c->6433c,
+ 16823c->12297c
+ [b0153f354186]
+
+2014-11-26 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[64x16] in avx2: improve 7245c->4910c
+ [5700875b428f]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[64x32, 64x48, 64x64] in avx2: improve 14150c->9810c,
+ 21132c->14684c, 28663c->19616c
+ [db518f7c8474]
+
+2014-11-25 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[32x8] in avx2: improve 2047c->1472c
+ [d57c28a3010b]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[32x24] in avx2: improve 5562c->3899c
+ [dedc5a8589a6]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[32x16] in avx2: improve 3808c->2491c
+ [3db00b06aea6]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[32x32, 32x64] in avx2: improve 7247c->4909c,
+ 14365c->9774c
+ [adf15e303c37]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[16x32, 16x64] in avx2: improve 3875c->2463c,
+ 7499c->4894c
+ [45456cd145d8]
+
+2014-11-25 Aasaipriya Chandran <aasaipriya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: avx2 for Luma_hpp[16x4, 16x8, 16x12, 16x16 , 16x32, 16x64]
+
+ 619c->458c, 1174c->812c, 1694c->1112c, 2291c->1535c, 4846c->3207c,
+ 9294c->6104c
+ [d11d3120361f]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: avx2 for luma_hpp[64x64, 64x48, 64x32, 64x16]
+
+	 33137c->22606c, 24826c->17202c, 16726c->11560c, 7830c->5534c
+ [1e8a0f1e0889]
+
+2014-11-22 Steve Borho <steve@borho.org>
+
+ * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h:
+ frameencoder: do not use bitmaps for framefilter if not WPP
+
+ The non-WPP row loop wants to do frame filter work in between each
+	 row, with an m_filterRowDelay lag. If we use the functions which
+ update the bitmap, it would allow a worker thread to process a
+ filter row before it was ready. In short, the non-WPP path was never
+ intended to work in the presence of a thread pool. This was causing
+ crashes when --no-wpp --pmode||--pme was used.
+ [8011e2a68b88]
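+
+	 A self-contained sketch of the non-WPP pattern described above
+	 (function and variable names are illustrative): each filter row is
+	 invoked directly with the m_filterRowDelay lag, never through the
+	 ready bitmap a pooled worker could act on too early.
+
+	     static void encodeRow(int) {}
+	     static void filterRow(int) {}
+
+	     static void encodeFrameNoWpp(int numRows, int filterRowDelay)
+	     {
+	         for (int row = 0; row < numRows; row++)
+	         {
+	             encodeRow(row);
+	             if (row >= filterRowDelay)
+	                 filterRow(row - filterRowDelay); // direct call, no bitmap
+	         }
+	         int start = numRows > filterRowDelay ? numRows - filterRowDelay : 0;
+	         for (int row = start; row < numRows; row++)
+	             filterRow(row);                      // drain the trailing rows
+	     }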
+
+2014-11-24 Steve Borho <steve@borho.org>
+
+ * source/encoder/frameencoder.cpp:
+ frameencoder: release row lock while waiting during VBV restarts
+
+ This fixes what appears to have been an old deadlock bug that has
+ just recently become very reproducible
+ [82f6e4847d57]
+
+2014-11-21 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[16x4] in avx2: improve 734c->497c
+ [3c6f703f94ea]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[16x8] in avx2: improve 1195c->745c
+ [fc83cf5299ae]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[16x12] in avx2: improve 1644c->1018c
+ [65017182318c]
+
+2014-11-21 Praveen Tiwari <Praveen Tiwari>
+
+ * source/common/dct.cpp:
+ idct32_c: C code optimization
+ [346fccbba4de]
+
+ * source/common/dct.cpp:
+ idct16_c: optimization
+ [388c893d3825]
+
+ * source/common/dct.cpp:
+ idct8_c: optimization
+ [f7d7c480b85d]
+
+ * source/common/dct.cpp:
+ idct4_c: optimization
+ [69a472a77b49]
+
+ * source/common/dct.cpp:
+ dct32_c: optimization
+ [a60dfb900169]
+
+ * source/common/dct.cpp:
+ dct16_c: optimization
+ [7e94ea285179]
+
+ * source/common/dct.cpp:
+ dct8_c: optimization
+ [d426e93e240c]
+
+ * source/common/dct.cpp:
+ dct4_c: C code optimization
+ [d4376e113855]
+
+ * source/common/dct.cpp:
+ idst4_c: optimization
+ [8f373c20bc41]
+
+ * source/common/dct.cpp:
+ dst4_c: optimization
+ [49b66c57972d]
+
+2014-11-21 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/pixel.cpp, source/common/x86/asm-primitives.cpp,
+ source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h:
+ fix copy16to16_shl
+ [5a8da9cb52e8]
+
+2014-11-20 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.cpp, source/encoder/analysis.h,
+ source/encoder/search.cpp, source/encoder/search.h:
+ analysis: explicit locking for pmode and pme parameters
+
+ We've found a repro case involving --no-wpp --pmode --pme --preset
+	 slower where time-starved worker threads get stuck in the findJob()
+	 routine and are pushed off the CPU; in the meantime the master
+	 thread moves on to another CU. This caused very hard to reproduce
+	 crashes.
+ [2f8df4c972b9]
+
+2014-11-20 David T Yuen <dtyx265@gmail.com>
+
+ * source/common/vec/dct-sse3.cpp:
+ Updated intrinsic of idct8 sse3 for new input format
+ [2abf89f5c4f2]
+
+2014-11-20 Divya Manivannan <divya@multicorewareinc.com>
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[16x16] in avx2: improve 2141c->1284c
+ [2a2142982602]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[8x4] in avx2: improve 498c->257c
+ [c2fd1b7d5d99]
+
+ * source/common/x86/asm-primitives.cpp,
+ source/common/x86/ipfilter8.asm:
+ asm: luma_vpp[8x8] in avx2: improve 701c->387c
+ [562c43f738e4]
+
+2014-11-20 Steve Borho <steve@borho.org>
+
+ * source/encoder/encoder.cpp:
+ encoder: nits and alloc zero fix
+
+	 intraData needs to be zeroed on allocation; otherwise, if one of
+	 the later allocs fails, some of the pointers will be uninitialized
+	 when passed to X265_FREE()
+ [80dcd3dfb805]
+
+2014-11-20 Praveen Tiwari <Praveen Tiwari>
+
+ * source/common/dct.cpp:
+ Fix for C code mismatch
+
+	 This patch fixes the binary mismatch in encoded output introduced
+	 during refactorization of the transform/quant path. It essentially
+	 restores the original version of the code to make sure all valid
+	 inputs are copied into the input buffer. It is not fully optimized,
+	 but it is a quick fix for the problem and allows us to optimize one
+	 function at a time.
+ [1d17ec0cb954]
+
+2014-11-20 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/pixel.cpp:
+ fix for old gcc
+ [ed587d360b97]
+
+2014-11-20 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * build/icl32/build-all.bat, build/icl32/make-makefile.bat,
+ build/icl64/build-all.bat, build/icl64/make-makefile.bat:
+ build: remove icl32 and icl64 scripts
+
+ Typical Windows ICL users link with Visual Studio
+ [3649fabf90d3]
+
+2014-11-20 Praveen Tiwari <Praveen Tiwari>
+
+ * source/common/x86/ipfilter8.asm:
+ luma_hpp[4x4]: AVX2 asm code bug fix
+ [4b637cb9b792]
+
+2014-11-20 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/encoder/encoder.cpp:
+ encoder: fix analysis file read
+ [0c25a6eac0ca]
+
+2014-11-20 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/analysis.cpp:
+ fix for rd=0
+ [b33cbe130c63]
+
+ * source/common/cudata.cpp, source/common/cudata.h,
+ source/encoder/analysis.cpp, source/encoder/frameencoder.cpp,
+ source/encoder/search.cpp:
+ replace char to int8_t, where it should be signed char
+ [14a8bb7bbcab]
+
+2014-11-19 Praveen Tiwari <Praveen Tiwari>
+
+ * source/common/x86/asm-primitives.cpp:
+ disable denoiseDct asm code until fixed for Mac OS
+ [f236adb703f5]
+
+2014-11-16 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/dct.cpp, source/common/ipfilter.cpp,
+ source/common/picyuv.h, source/common/pixel.cpp,
+ source/common/predict.cpp, source/common/primitives.h,
+ source/common/quant.cpp, source/common/quant.h,
+ source/common/shortyuv.cpp, source/common/vec/dct-sse3.cpp,
+ source/common/vec/dct-ssse3.cpp, source/common/x86/blockcopy8.h,
+ source/common/x86/dct8.h, source/common/x86/ipfilter8.h,
+ source/common/x86/mc.h, source/common/x86/pixel-util.h,
+ source/common/x86/pixel.h, source/common/yuv.cpp,
+ source/encoder/analysis.cpp, source/encoder/rdcost.h,
+ source/encoder/search.cpp:
+ primitives: clarify constness
+ [99b5cebf8193]
+
+2014-11-18 Steve Borho <steve@borho.org>
+
+ * source/common/dct.cpp:
+ dct: fix gcc warnings
+ [34cb58c53859]
+
+2014-11-18 Praveen Tiwari <Praveen Tiwari>
+
+ * source/common/dct.cpp, source/common/pixel.cpp,
+ source/common/primitives.h, source/common/quant.cpp,
+ source/common/quant.h, source/common/vec/dct-sse3.cpp,
+ source/common/vec/dct-sse41.cpp, source/common/vec/dct-ssse3.cpp,
+ source/common/x86/asm-primitives.cpp,
+ source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h,
+ source/common/x86/dct8.asm, source/common/x86/dct8.h,
+ source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm,
+ source/test/mbdstharness.cpp, source/test/mbdstharness.h,
+ source/test/pixelharness.cpp, source/test/pixelharness.h:
+	 refactorization of the transform/quant path.
+
+	 This patch scales down the DCT/IDCT coefficients from int32_t to
+	 int16_t, as they can be accommodated in int16_t without introducing
+	 any encode error. This allows us to clean up many DCT/IDCT
+	 intermediate buffers, improve encode efficiency for different cli
+	 options (including noise reduction) by reducing data movement
+	 operations, and fit more coefficients into a single register for
+	 SIMD operations. This patch includes all necessary changes for the
+	 transform/quant path, including unit test code.
+ [8bee552a1964]
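+
+	 A sketch of the storage change (the typedef name is assumed for
+	 illustration): 16-bit coefficients halve buffer traffic and double
+	 how many coefficients fit in one SIMD register.
+
+	     #include <cstdint>
+
+	     typedef int16_t coeff_t;      // previously 32-bit storage
+	     static coeff_t coeffs[32 * 32];
+	     static_assert(sizeof(coeffs) == 2 * 32 * 32,
+	                   "2 bytes per coefficient");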
+
+2014-11-19 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/common.h:
+ fseeko for mingw32
+ [cb9bb697fcaa]
+
+2014-11-19 Steve Borho <steve@borho.org>
+
+ * source/common/threading.h:
+	 threading: fixes for VC11 Win32 includes, prune two unused functions
+ [2b830f08d948]
+
+2014-11-18 Steve Borho <steve@borho.org>
+
+ * source/common/wavefront.cpp:
+ wavefront: fix msvc warning
+
+ warning C4800: 'unsigned long' : forcing value to bool 'true' or
+ 'false' (performance warning)
+ [e29c618cd9a7]
+
+ * source/common/param.cpp, source/common/quant.cpp,
+ source/common/threading.h, source/common/threadpool.cpp,
+ source/common/wavefront.cpp, source/common/wavefront.h,
+ source/common/winxp.h, source/encoder/entropy.cpp,
+ source/encoder/slicetype.cpp:
+ threading: use 32bit atomic integer operations exclusively
+
+ The 32bit operations have better portability and have less onerous
+ alignment restrictions.
+ [814b687db30e]
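+
+	 An illustrative modern equivalent of this policy (std::atomic
+	 here, not x265's own macros): shared counters stay 32 bits wide,
+	 which is portable and avoids 64-bit alignment restrictions.
+
+	     #include <atomic>
+	     #include <cstdint>
+
+	     static std::atomic<uint32_t> rowsCompleted(0);
+
+	     static void markRowDone()
+	     {
+	         rowsCompleted.fetch_add(1, std::memory_order_acq_rel);
+	     }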
+
+ * source/common/constants.cpp, source/common/constants.h,
+ source/common/primitives.cpp, source/encoder/api.cpp,
+ source/test/intrapredharness.cpp:
+ constants: remove init/destroyROM functions
+ [d3389bb9efd0]
+
+ * source/x265.h:
+ api: fix range limit docs for RQT limit params
+ [d059cfa88f1a]
+
+ * source/encoder/frameencoder.cpp:
+ frameencoder: white-space nits
+ [29a374b62920]
+
+ * source/encoder/analysis.cpp:
+ analysis: drop MATCH_NON_PMODE macro
+
+	 This was a debugging feature; it's not being tested, which means it
+	 will get broken, so it's best just to keep the code clean
+ [dc61091d5cc4]
+
+ * source/common/threading.cpp:
+ threading: don't use this->
+
+ We don't do this anywhere else
+ [3731d9bc7b88]
+
+ * source/common/threading.cpp, source/common/threading.h,
+ source/common/threadpool.cpp, source/common/threadpool.h:
+ threading: copyright comment format nits
+
+ be consistent with our other files
+ [a7b9b90e1bdd]
+
+ * source/common/param.cpp:
+ param: use strdup() on input strings uniformly
+ [ad532c30bc95]
+
+2014-11-18 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/encoder/encoder.cpp:
+ encoder: init filename to NULL
+ [2f0062f0791b]
+
+2014-11-17 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/common/common.h, source/encoder/analysis.cpp,
+ source/encoder/search.cpp:
+ search: fix binary mismatch and inconsistent crash for share inter
+ information
+ [854fcbb50220]
+
+ * source/encoder/encoder.cpp:
+ encoder: force slicetype using analysis file
+ [05d824463602]
+
+2014-11-17 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.cpp, source/common/lowres.h,
+ source/common/mv.h, source/encoder/bitcost.h,
+ source/encoder/motion.cpp, source/encoder/motion.h,
+ source/encoder/slicetype.cpp:
+ modify MV default constructor to do nothing
+ [7a1ec67bd004]
+
+2014-11-17 Aarthi Thirumalai <Aarthi Thirumalai>
+
+ * source/encoder/ratecontrol.cpp:
+ vbv: tune vbv predictors for better mapping of predicted bits to
+ encoded bits
+ [27d36c4b4a27]
+
+2014-11-16 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/encoder/analysis.cpp, source/encoder/analysis.h:
+ analysis: cleanups, init pointers, variable names are made self-
+ explanatory
+ [ed2ba7a90567]
+
+2014-11-12 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/encoder/analysis.cpp:
+ analysis: fix binary mismatch for share intra save and load mode
+ with same cli
+ [10b8d3fbe408]
+
+2014-11-14 Steve Borho <steve@borho.org>
+
+ * source/x265.cpp:
+ cli: fix analysis filename argument
+
+ This showed up as a GCC warning about an unused variable, but having
+	 the arg handled here prevented it from being passed to
+ x265_param_parse()
+ [8191e0d02455]
+
+ * source/encoder/encoder.cpp:
+ encoder: add prefix to FREAD and FWRITE macros to avoid MacOSX macro
+ conflict
+
+ /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform
+ /Developer/SDKs/MacOSX10.10.sdk/usr/include/sys/fcntl.h:111:9: note:
+ previous definition is here #define FWRITE 0x0002
+ [b617dca5ce12]
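+
+	 A simplified sketch of the fix pattern (the real macro bodies in
+	 encoder.cpp also do error handling): prefixing the helper macros
+	 keeps them from colliding with the FREAD/FWRITE flags that
+	 sys/fcntl.h defines on MacOSX.
+
+	     #include <cstdio>
+
+	     #define X265_FREAD(buf, size, n, f)  fread(buf, size, n, f)
+	     #define X265_FWRITE(buf, size, n, f) fwrite(buf, size, n, f)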
+
+ * source/common/common.h, source/common/frame.h,
+ source/encoder/encoder.h:
+ common: move analysis reuse structs to common.h
+
+ files in common/ shouldn't include encoder.h
+ [72f1222903a3]
+
+2014-11-14 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/analysis.cpp:
+ analysis: encodeResidue() directly write to reconPic
+ [c3096034934f]
+
+2014-11-14 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/CMakeLists.txt, source/common/common.h,
+ source/common/frame.h, source/common/param.cpp,
+ source/encoder/analysis.cpp, source/encoder/analysis.h,
+ source/encoder/api.cpp, source/encoder/encoder.cpp,
+ source/encoder/encoder.h, source/x265.cpp, source/x265.def.in,
+ source/x265.h:
+ analysis save/load: refactor full implementation
+
+	 1. Move analysis inter/intra data into the encoder
+	 2. Encoder allocates and frees memory for x265 analysis; remove api
+	    calls
+	 3. Inter and intra data allocated based on sliceType only
+	 4. Frame record size is now variable
+ [58c2e06c2e4a]
+
+2014-11-13 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/analysis.cpp:
+ analysis: don't add the cost of picture boundary CU to avgCost
+ [CHANGES OUTPUT]
+ [64314f8061f1]
+
+2014-11-13 Steve Borho <steve@borho.org>
+
+ * source/cmake/FindVLD.cmake:
+ cmake: hack to avoid escaping problems in cmake 3.1 parser
+
+ Fix suggested by Mario *LigH* Rohkrämer
+ [17f2fb0996db]
+
+2014-11-13 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.cpp, source/encoder/analysis.cpp,
+ source/encoder/entropy.cpp, source/encoder/entropy.h,
+ source/encoder/sao.cpp:
+ nits
+ [03974d78f241]
+
+2014-11-12 Steve Borho <steve@borho.org>
+
+ * source/encoder/rdcost.h:
+ rdcost: lower the psy-rd scale factor for I slices to 96/256
+
+ Based on Santhoshini's testing, this is better at preventing
+ artifacts
+ [18aefbde72ab]
+
+2014-11-11 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.cpp, source/common/cudata.h,
+ source/encoder/frameencoder.cpp, source/encoder/frameencoder.h:
+ refine initializeGeoms()
+ [98fb658f3229]
+
+2014-11-11 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.cpp:
+ analysis: fix bidir non-determinism in --pmode --rd 5
+ [306ef9782a30]
+
+ * source/common/param.cpp, source/encoder/analysis.cpp,
+ source/encoder/search.cpp, source/encoder/search.h:
+ Merge
+ [fa2fedd97ff2]
+
+2014-11-10 Steve Borho <steve@borho.org>
+
+ * source/common/quant.cpp:
+ quant: allow --nr in all slice types evenly
+ [38fa64a5c51c]
+
+2014-11-06 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/common/common.h, source/common/quant.cpp,
+ source/common/quant.h:
+ noiseReduction: apply only for I and P, move NoiseReduction to
+ quant.h
+
+ This doubles the number of quant nr categories; intra blocks now use
+ the lower half.
+ [ed89e58b44e8]
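+
+	 A sketch of the category split (constants illustrative): intra
+	 blocks index the lower half of the doubled category space, inter
+	 blocks the upper half.
+
+	     enum { NUM_TU_SIZES = 4, NUM_NR_CATS = 2 * NUM_TU_SIZES };
+
+	     static int nrCategory(bool bIntra, int tuSizeIdx)
+	     {
+	         return (bIntra ? 0 : NUM_TU_SIZES) + tuSizeIdx;
+	     }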
+
+2014-11-10 Steve Borho <steve@borho.org>
+
+ * doc/reST/cli.rst, source/common/param.cpp:
+ param: raise --nr limit to 2000
+ [27f293dd9eee]
+
+ * doc/reST/presets.rst, source/common/param.cpp:
+ param: remove --b-intra from --tune grain, document rdoq restriction
+ [64ccc616be33]
+
+2014-11-09 Steve Borho <steve@borho.org>
+
+ * source/encoder/rdcost.h:
+ rdcost: experimental slice-type based psy-rd scale factor
+ [4f3fd7ab8868]
+
+2014-11-08 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.cpp, source/encoder/analysis.h,
+ source/encoder/search.cpp:
+ analysis: RDO based BIDIR decisions
+
+ At RD 0, 1, and 2, this changes 2Nx2N bidir from a SATD decision to
+ an SA8D decision.
+
+ At RD 3 and 4, if the bidir SA8D cost is within 17/16 of the best
+ inter cost, then it makes an RDO decision between bestInter and
+ Bidir (allowing psy-rd to influence the decision, which is the whole
+ point)
+
+ At RD 5 and 6, 2Nx2N BIDIR is yet another RD choice at the same
+ level as 2Nx2N inter and rect and amp. (psy) RDO picks the best mode
+ for each block.
+ [4c6c28cc93d9]
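+
+	 A sketch of the RD 3/4 gate (names illustrative): bidir earns a
+	 full RDO comparison only when its sa8d cost is within 17/16 of the
+	 best inter cost.
+
+	     #include <cstdint>
+
+	     static bool tryBidirRDO(uint64_t bidirSa8d, uint64_t bestInterSa8d)
+	     {
+	         // bidirSa8d <= bestInterSa8d * 17/16, kept in integer math
+	         // (costs assumed small enough that * 17 cannot overflow)
+	         return bidirSa8d * 16 <= bestInterSa8d * 17;
+	     }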
+
+2014-11-11 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/x265.cpp:
+ x265: more meaningful error messages in analysis
+ [838e41fb256b]
+
+ * source/encoder/api.cpp:
+ api: cleanup
+ [3c01e8881946]
+
+ * source/encoder/api.cpp:
+	 api: replace analysis data with predefined constant
+ [b4effa4dd53b]
+
+ * source/x265.cpp:
+ x265: create and initialise recon object if analysis mode is enabled
+ [47b290236ca3]
+
+ * source/common/param.cpp:
+ param: add default value to analysis mode
+ [5c397e744cfd]
+
+2014-11-11 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/encoder/analysis.cpp, source/encoder/api.cpp,
+ source/x265.cpp, source/x265.h:
+ x265: remove redundant variables from intra and inter analysis
+ structure
+ [ad5177c86756]
+
+ * source/encoder/analysis.cpp, source/encoder/search.cpp,
+ source/encoder/search.h, source/x265.h:
+ analysis: Dump best MV statistics and re-use this for analysis load
+ mode
+
+ This patch fixes a bug in inter slices in analysis=load|save mode.
+ Inter data for all partitions is now saved correctly.
+ [c8004323493e]
+
+2014-11-10 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.cpp, source/common/cudata.h,
+ source/common/deblock.cpp, source/encoder/analysis.cpp,
+ source/encoder/entropy.cpp, source/encoder/frameencoder.cpp,
+ source/encoder/search.cpp:
+ cleanup SIZE_NONE. empty CU has MODE_NONE.
+ [32513a4c3bd4]
+
+2014-11-09 Steve Borho <steve@borho.org>
+
+ * source/encoder/search.cpp:
+ search: fixup
+ [1e04e178a349]
+
+2014-11-08 Steve Borho <steve@borho.org>
+
+ * source/encoder/reference.cpp, source/encoder/reference.h,
+ source/encoder/search.cpp:
+ reference: add methods for querying CU/PU pointers
+ [9687a9d1205a]
+
+ * source/encoder/analysis.cpp:
+ analysis: delay initialization of prediction cu in RD 5 and 6
+ [b9147e641ce6]
+
+2014-11-09 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/analysis.cpp:
+ fix typo
+ [3dc9857c59d3]
+
+2014-11-08 Steve Borho <steve@borho.org>
+
+ * source/encoder/analysis.cpp:
+ analysis: delay initialization of prediction cu until just before
+ use
+
+ This avoids initializing CUs that may never be used because of
+ various early-outs
+ [3f2d68368554]
+
+ * source/encoder/search.cpp, source/encoder/search.h:
+ search: keep AMVP candidates in mode structure
+
+ This fixes some work replication in --pme and will also make
+	 handling BIDIR as a separate prediction easier.
+ [6124c837b3ab]
+
+ * source/encoder/motion.h, source/encoder/search.cpp,
+ source/encoder/slicetype.h:
+ motion: remove trivial set methods; make some members public
+ [53c146f7eb9f]
+
+2014-11-07 Steve Borho <steve@borho.org>
+
+ * source/encoder/frameencoder.cpp:
+ nr: fix denoise offset memcopy size
+ [0912563c4ac1]
+
+ * source/encoder/entropy.h:
+ entropy: pass context model (state) to bitsCodeBin as uint32_t
+
+ Should be slightly more efficient
+ [a67b848d6c04]
+
+ * source/encoder/entropy.cpp:
+ entropy: nit
+ [b55799a2f5ad]
+
+ * source/encoder/entropy.cpp:
+ entropy: ensure X265_CHECK() has braces
+ [0fd8e0c5272a]
+
+ * source/encoder/entropy.cpp, source/encoder/entropy.h:
+ entropy: inline methods which mapped to encodeBin() calls
+ [640d2936e699]
+
+ * source/encoder/entropy.cpp, source/encoder/entropy.h:
+ entropy: inline bit counting functions
+ [ca7873cab172]
+
+ * source/encoder/entropy.cpp:
+ entropy: use bitsCodeBin in intra mode bit estimate functions
+ [84fc74874406]
+
+ * source/encoder/entropy.cpp, source/encoder/entropy.h:
+ entropy: rename encodeBinContext to bitsCodeBin, make const
+
+ The function is not modifying the context, so there is no need to
+ pass as a reference, and the function can be const. Also, group the
+ bit counting RDO functions together
+ [a1ee9422183b]
+
+ * source/encoder/entropy.cpp:
+ entropy: white-space nits
+ [429742055057]
+
+2014-11-07 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/search.cpp:
+ fix bug in 522baf03fbbd
+ [f2130a4dc876]
+
+2014-11-07 Deepthi Nandakumar <deepthi@multicorewareinc.com>
+
+ * source/encoder/search.cpp:
+ search: fix warnings
+ [7338b1f1f43d]
+
+2014-11-07 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/encoder/analysis.cpp:
+ fix typo
+ [4f034e3adef8]
+
+2014-11-05 Ashok Kumar Mishra <ashok@multicorewareinc.com>
+
+ * source/encoder/entropy.cpp, source/encoder/entropy.h,
+ source/encoder/search.cpp, source/encoder/search.h:
+	 [REVIEW PATCH/OUTPUT CHANGED] search: removed multiple encode
+	 coefficients from estimateResidualQT()
+
+	 Tried to remove the multiple coefficient encodes from the
+	 estimateResidualQT() function. Coefficients are encoded in three
+	 stages: once for calculation of distortion and twice for split and
+	 unsplit block cost calculation. I have added comments where I
+	 changed the code.
+ [eb5a9eb03dd6]
+
+ * source/encoder/search.cpp, source/encoder/search.h:
+ search: made a function for null cost calculation in
+ xEstimateResidualQT()
+ [522baf03fbbd]
+
+ * source/encoder/search.cpp, source/encoder/search.h:
+ search: made separate functions for encoding cbfs in
+ xEstimateResidualQT()
+ [0b7c709335b2]
+
+2014-11-07 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.cpp, source/common/cudata.h:
+ cudata: remove default argument
+ [bc4f3dab51db]
+
+2014-11-06 Steve Borho <steve@borho.org>
+
+ * doc/reST/presets.rst:
+ doc: fix sub-title depth
+
+ Single dash was already used by a higher section
+ [0ebd0b00bf9b]
+
+ * doc/reST/cli.rst, doc/reST/presets.rst, source/common/param.cpp:
+ param: add --tune grain
+ [ec5588025568]
+
+ * source/encoder/search.cpp:
+ search: ugly bias hack for bidir with psy-rd
+ [e33e09549c0c]
+
+ * doc/reST/cli.rst:
+ docs: document RC params, at least minimally
+ [beac946dac85]
+
+ * source/x265.h:
+ api: cleanup comments
+ [8ceaab303bfa]
+
+ * source/x265.cpp:
+ cli: cleanup CLI help, add 'verbose' tier
+
+ Remove a lot of uncommon features from the initial help output,
+	 requiring --log-level debug --help to see it all
+ [f599a4df57ac]
+
+ * source/common/param.cpp, source/x265.cpp:
+ api: expose rate control params via x265_param_parse() and CLI
+
+ Adds range checks for qCompress, which has documented limits. The
+	 others have very minimal explanations, so I'm not adding them to the
+ CLI help. Users should not touch them unless they know what they are
+ doing.
+
+ Note this commit doesn't bump X265_BUILD since no new params were
+ added.
+ [b37cda5d3092]
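+
+	 A usage sketch against the public API ("qcomp" matches the CLI
+	 option name for qCompress; return semantics per x265.h):
+
+	     #include "x265.h"
+
+	     static int setQcomp(x265_param* p)
+	     {
+	         // returns 0 on success, non-zero for a bad name or value
+	         return x265_param_parse(p, "qcomp", "0.7");
+	     }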
+
+2014-11-05 Steve Borho <steve@borho.org>
+
+ * source/common/deblock.cpp:
+ deblock: fix type conversion warnings
+ [4a3997fd4fc1]
+
+2014-11-05 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/deblock.cpp, source/common/deblock.h,
+ source/common/quant.cpp, source/common/slice.h,
+ source/encoder/encoder.cpp, source/encoder/entropy.cpp,
+ source/encoder/framefilter.cpp, source/encoder/rdcost.h,
+ source/encoder/sao.cpp:
+ refine deblocking filter
+ [65e14d5a5728]
+
+2014-11-04 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/x265.cpp:
+ cli: bug fix for validatefanout param analysis-mode=save and load
+ [2a8f3d5820a6]
+
+2014-11-04 gopi jayaraman <gopi@multicorewareinc.com>
+
+ * source/encoder/encoder.cpp:
+ encoder: use 6 frameNumThreads for cpucount 32 and above
+ [0dcc6a1d8f02]
+
+2014-11-04 Steve Borho <steve@borho.org>
+
+ * source/x265.h:
+ api: add void to functions that take no parameters for -Wstrict-
+ prototypes
+ [0d44fcb269a6]
+
+ * source/common/deblock.cpp, source/common/frame.cpp,
+ source/common/frame.h, source/common/framedata.h,
+ source/common/predict.cpp, source/encoder/analysis.cpp,
+ source/encoder/dpb.cpp, source/encoder/encoder.cpp,
+ source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp,
+ source/encoder/ratecontrol.cpp, source/encoder/sao.cpp,
+ source/encoder/search.cpp, source/encoder/slicetype.cpp,
+ source/encoder/weightPrediction.cpp:
+ frame: rename m_reconPicYuv -> m_reconPic, m_origPicYuv -> m_fencPic
+
+ the fooPicYuv names were potentially confusing, preferred names:
+ PicYuv* fooPic; Yuv* fooYuv;
+ [67bf055c13d5]
+
+ * source/encoder/motion.cpp, source/encoder/motion.h:
+ motion: remove unused sa8d pointer and bufSA8D method
+ [59a08101dfc6]
+
+2014-11-04 Gopu Govindaswamy <gopu@multicorewareinc.com>
+
+ * source/common/cudata.h, source/encoder/analysis.cpp,
+ source/encoder/analysis.h, source/encoder/api.cpp,
+ source/encoder/encoder.cpp, source/encoder/search.cpp,
+ source/encoder/search.h, source/x265.cpp, source/x265.h:
+ search: dump and share the best motion statistics for inter(P&B)
+ slices
+ [d5f6133b99d4]
+
+2014-11-03 Steve Borho <steve@borho.org>
+
+ * doc/reST/cli.rst:
+ docs: fix reST parsing issues
+ [a8ec469d7fb1]
+
+2014-11-03 Min Chen <chenm003@163.com>
+
+ * source/common/primitives.h, source/common/x86/pixel-util.h,
+ source/common/x86/pixel-util8.asm:
+ cleanup: remove unused asm calcrecon
+ [5637b495e2e1]
+
+ * source/common/x86/ipfilter8.asm:
+ asm: fix typo error in interp_8tap_vert_pp_4x4_avx2
+ [ee88b63aced0]
+
+2014-11-03 Satoshi Nakagawa <nakagawa424@oki.com>
+
+ * source/common/cudata.cpp, source/common/cudata.h,
+ source/common/quant.cpp, source/encoder/analysis.cpp,
+ source/encoder/entropy.cpp, source/encoder/frameencoder.cpp,
+ source/encoder/search.cpp:
+ cleanup CUData::m_skipFlag
+ [2e60f3b81981]
+
+2014-10-31 Steve Borho <steve@borho.org>
+
+ * source/encoder/encoder.cpp:
+	 encoder: make it clear that --fast-cbf is ineffective at lower rd
+ levels
+
+ This begs the question of whether the feature should exist, or
+ whether it should be added to the lower RD levels
+ [eebb372eec89]
+
+ * source/common/param.cpp:
+ param: show options using their CLI / param_parse names
+ [c32a733a819b]
+
+2014-10-30 Steve Borho <steve@borho.org>
+
+ * .hgtags:
+ remove dead non-release tags
+
+ anyone interested in archeology can still find them; there's no
+ sense to keep them on the tip since we stopped tracking last known
+ good more than a year ago
+ [75cb2ab1ecec]
+
2014-10-31 Steve Borho <steve@borho.org>
+ * source/encoder/encoder.cpp:
+ Merge with stable
+ [ae8a661acdc4]
+
+ * .hgtags:
+ Added tag 1.4 for changeset 5e604833c5aa
+ [d2db9c1ab44b] <stable>
+
* source/encoder/encoder.cpp:
encoder: emit an Active Parameter Sets SEI in stream headers if
interlaced
1. Yasm 1.2.0 or later, to compile assembly primitives (performance)
- For Windows, download
- http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win32.exe or
- http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win64.exe
- depending on your O/S and copy the EXE into C:\Windows or somewhere else
- in your %PATH% that a 32-bit app (cmake) can find it. If it is not in the
- path, you must manually tell cmake where to find it.
+ For Windows, download the latest yasm executable from
+ http://yasm.tortall.net/Download.html and copy the EXE into
+ C:\Windows or somewhere else in your %PATH% where a 32-bit app
+ (cmake) can find it. If it is not in the path, you must manually
+ tell cmake where to find it. Note: you do not need the vsyasm
+ packages; x265 does not use them. You only need the yasm executable
+ itself.
- For Linux, yasm-1.2.0 is likely too new to be packaged for your system so you
- will need get http://www.tortall.net/projects/yasm/releases/yasm-1.2.0.tar.gz
- compile, and install it.
+ On Linux, the packaged yasm may be older than 1.2, in which case
+ you will need to get the latest source and build it yourself.
Once YASM is properly installed, run cmake to regenerate projects. If you
do not see the below line in the cmake output, YASM is not in the PATH.
- -- Found Yasm 1.2.0 to build assembly primitives
+ -- Found Yasm 1.3.0 to build assembly primitives
- Now build the encoder and run x265 -V. If you see "assembly" on this
- line, you have YASM properly installed:
+ Now build the encoder and run x265 -V:
- x265 [info]: performance primitives: intrinsic assembly
+ x265 [info]: using cpu capabilities: MMX, SSE2, ...
+
+ If the cpu capabilities line says 'none!', then the encoder was
+ built without yasm.
2. VisualLeakDetector (Windows Only)
+++ /dev/null
-@echo off
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  exit 1
-)
-if not exist Makefile (
-  call make-makefile.bat
-)
-if exist Makefile (
-  call "%ICL%\bin\compilervars.bat" ia32
-  nmake
-)
+++ /dev/null
-@echo off
-::
-:: run this batch file to create an Intel C++ 2013 NMake makefile for this project.
-:: See the cmake documentation for other generator targets
-::
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  exit 1
-)
-call "%ICL%\bin\compilervars.bat" ia32
-set CC=icl
-set CXX=icl
-cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source
+++ /dev/null
-@echo off
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  exit 1
-)
-if not exist Makefile (
-  call make-makefile.bat
-)
-if exist Makefile (
-  call "%ICL%\bin\compilervars.bat" intel64
-  nmake
-)
+++ /dev/null
-@echo off
-::
-:: run this batch file to create an Intel C++ 2013 NMake makefile for this project.
-:: See the cmake documentation for other generator targets
-::
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  pause
-  exit 1
-)
-call "%ICL%\bin\compilervars.bat" intel64
-set CC=icl
-set CXX=icl
-cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source
-pause
+++ /dev/null
-x265 (1.4-4~ubuntu1) trusty; urgency=medium
-
- * Support for ARMv7 with NEON extensions
-
- -- Jérôme Benoit <jerome.benoit@piment-noir.org> Tue, 09 Dec 2014 22:19:18 +0100
-
-x265 (1.4-3~trusty) trusty; urgency=low
-
- * Upstream bugfixes:
- * cli: bug fix for validatefanout param analysis-mode=save and load
- * docs: fix reST parsing issues
-
- -- Marshall Banana <djcj@gmx.de> Wed, 05 Nov 2014 01:29:32 +0100
-
-x265 (1.4-2~trusty2) trusty; urgency=low
-
- * Install documentation in x265-doc package to avoid package conflicts
-
- -- Marshall Banana <djcj@gmx.de> Sat, 01 Nov 2014 02:49:51 +0100
-
-x265 (1.4-1~trusty) trusty; urgency=low
-
- * New upstream release
-
- -- Marshall Banana <djcj@gmx.de> Sat, 01 Nov 2014 00:20:42 +0100
-
-x265 (1.3-4~trusty) trusty; urgency=low
-
- * Update manpage
-
- -- Marshall Banana <djcj@gmx.de> Wed, 01 Oct 2014 18:09:33 +0200
-
-x265 (1.3-3~trusty) trusty; urgency=low
-
- * Rename x265-10b to x265-10bit
- * Provide x265-16bit via symbolic link
-
- -- Marshall Banana <djcj@gmx.de> Wed, 01 Oct 2014 17:56:41 +0200
-
-x265 (1.3-2~trusty) trusty; urgency=low
-
- * Add doc-base control file
-
- -- Marshall Banana <djcj@gmx.de> Fri, 05 Sep 2014 04:07:20 +0200
-
-x265 (1.3-1~trusty) trusty; urgency=low
-
- * New upstream release
-
- -- Marshall Banana <djcj@gmx.de> Fri, 22 Aug 2014 20:30:50 +0200
-
-x265 (1.2+510-hg2bdcfcc1bb33-1~trusty) trusty; urgency=low
-
- * Current snapshot
-
- -- Marshall Banana <djcj@gmx.de> Mon, 11 Aug 2014 12:51:05 +0200
-
-x265 (1.2-3~trusty) trusty; urgency=low
-
- * Build static library from different object files
-
- -- Marshall Banana <djcj@gmx.de> Mon, 21 Jul 2014 05:29:20 +0200
-
-x265 (1.2-2~trusty1) trusty; urgency=low
-
- * Provide separate optimized shared libraries for i686
-
- -- Marshall Banana <djcj@gmx.de> Fri, 11 Jul 2014 20:36:37 +0200
-
-x265 (1.2-1~trusty) trusty; urgency=low
-
- * New upstream version
- * Update patch
- * Update man page
- * Install upstream changelog
-
- -- Marshall Banana <djcj@gmx.de> Thu, 10 Jul 2014 19:40:33 +0200
-
-x265 (1.1-4~trusty) trusty; urgency=low
-
- * Install 10bit binary to make usage of 10bit library possible.
-
- -- Marshall Banana <djcj@gmx.de> Tue, 17 Jun 2014 10:53:50 +0200
-
-x265 (1.1-3~trusty) trusty; urgency=low
-
- * Don't rename 10 bit library
-
- -- Marshall Banana <djcj@gmx.de> Fri, 13 Jun 2014 16:43:34 +0200
-
-x265 (1.1-2~trusty) trusty; urgency=low
-
- * Build fewer packages
-
- -- Marshall Banana <djcj@gmx.de> Wed, 11 Jun 2014 03:47:39 +0200
-
-x265 (1.1-1~trusty) trusty; urgency=low
-
- * New upstream version
-
- -- Marshall Banana <djcj@gmx.de> Fri, 13 Jun 2014 16:42:16 +0200
-
-x265 (1.1) unstable; urgency=low
-
- * New upstream version
-
- -- Marshall Banana <djcj@gmx.de> Sat, 07 Jun 2014 17:44:06 +0200
-
-x265 (1.0+5-dcf74ea39e31) unstable; urgency=low
-
- * New upstream version
-
- -- Marshall Banana <djcj@gmx.de> Sun, 04 May 2014 19:07:30 +0100
-
-x265 (0.9+114-c630b0b393ee) unstable; urgency=low
-
- * New upstream version
-
- -- Marshall Banana <djcj@gmx.de> Fri, 04 Apr 2014 01:45:30 +0100
-
-x265 (0.8+52-93861c42b879) unstable; urgency=low
-
- * New upstream version
-
- -- Marshall Banana <djcj@gmx.de> Sat, 08 Mar 2014 10:08:00 +0100
-
-x265 (0.7+216-591ca91f0501) unstable; urgency=low
-
- * Initial upload
-
- -- Marshall Banana <djcj@gmx.de> Wed, 19 Feb 2014 21:30:00 +0100
+++ /dev/null
-libdir := lib/$(DEB_HOST_MULTIARCH)
-
-
-common_confflags := \
- -DCMAKE_INSTALL_PREFIX=/usr \
- -DCMAKE_BUILD_TYPE=RelWithDebInfo \
- -DCMAKE_VERBOSE_MAKEFILE=ON
-
-8bit_confflags := \
- $(common_confflags) \
- -DLIB_INSTALL_DIR=$(libdir)
-
-10bit_confflags := \
- $(common_confflags) \
- -DBIN_INSTALL_DIR=$(libdir)/x265-10bit \
- -DLIB_INSTALL_DIR=$(libdir)/x265-10bit \
- -DHIGH_BIT_DEPTH=ON
-
-
-static_confflags := \
- -DCMAKE_INSTALL_PREFIX=/usr \
- -DCMAKE_BUILD_TYPE=Release \
- -DCMAKE_VERBOSE_MAKEFILE=ON \
- -DENABLE_CLI=OFF \
- -DENABLE_SHARED=OFF
-
-8bit_static_confflags := \
- $(static_confflags) \
- -DLIB_INSTALL_DIR=$(libdir)
-
-10bit_static_confflags := \
- $(static_confflags) \
- -DLIB_INSTALL_DIR=$(libdir)/x265-10bit \
- -DHIGH_BIT_DEPTH=ON
-
-
-# disable assembly on x86 and arm
-ifneq (,$(filter i386 i486 i586 i686 pentium arm,$(DEB_HOST_GNU_CPU)))
-noasm = -DENABLE_ASSEMBLY=OFF -DCMAKE_CXX_FLAGS='-DX86_64=0'
-8bit_confflags += $(noasm)
-10bit_confflags += $(noasm)
-8bit_static_confflags += $(noasm)
-10bit_static_confflags += $(noasm)
-endif
-
+++ /dev/null
-Source: x265
-Section: libs
-Priority: optional
-Maintainer: Marshall Banana <djcj@gmx.de>
-Homepage: https://bitbucket.org/multicoreware/x265/wiki/Home
-Standards-Version: 3.9.5
-Build-Depends:
- debhelper (>= 9),
- cmake (>= 2.8.8),
- python3-sphinx,
- yasm (>= 1.2.0) [any-i386]
-Vcs-Browser: https://bitbucket.org/multicoreware/x265/src
-
-Package: x265
-Architecture: any
-Section: video
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - frontend binary
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-35
-Architecture: any
-Pre-Depends:
- ${misc:Pre-Depends}
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - runtime files
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-dev
-Architecture: any
-Section: libdevel
-Depends:
- ${misc:Depends},
- libx265-35 (= ${binary:Version})
-Description: H.265/HEVC video encoder - development files
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the static library and
- headers used to build programs that use libx265-35.
-
-Package: x265-doc
-Architecture: all
-Section: doc
-Depends:
- ${misc:Depends},
- libjs-jquery (>= 1.4),
- libjs-underscore
-Description: x265 documentation
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the x265 documentation.
-
-Package: x265-dbg
-Architecture: any
-Section: debug
-Priority: extra
-Depends:
- ${misc:Depends},
- x265 (= ${binary:Version}),
- libx265-35 (= ${binary:Version})
-Description: debugging symbols for x265 and libx265
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the debugging symbols for x265.
+++ /dev/null
-Source: x265
-Section: libs
-Priority: optional
-Maintainer: Marshall Banana <djcj@gmx.de>
-Homepage: https://bitbucket.org/multicoreware/x265/wiki/Home
-Standards-Version: 3.9.5
-Build-Depends:
- debhelper (>= 9),
- cmake (>= 2.8.8),
- python3-sphinx,
- yasm (>= 1.2.0) [any-i386]
-Vcs-Browser: https://bitbucket.org/multicoreware/x265/src
-
-Package: x265
-Architecture: any
-Section: video
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - frontend binary
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-@API@
-Architecture: any
-Pre-Depends:
- ${misc:Pre-Depends}
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - runtime files
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-dev
-Architecture: any
-Section: libdevel
-Depends:
- ${misc:Depends},
- libx265-@API@ (= ${binary:Version})
-Description: H.265/HEVC video encoder - development files
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the static library and
- headers used to build programs that use libx265-@API@.
-
-Package: x265-doc
-Architecture: all
-Section: doc
-Depends:
- ${misc:Depends},
- libjs-jquery (>= 1.4),
- libjs-underscore
-Description: x265 documentation
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the x265 documentation.
-
-Package: x265-dbg
-Architecture: any
-Section: debug
-Priority: extra
-Depends:
- ${misc:Depends},
- x265 (= ${binary:Version}),
- libx265-@API@ (= ${binary:Version})
-Description: debugging symbols for x265 and libx265
- x265 is a library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the debugging symbols for x265.
+++ /dev/null
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: x265
-Upstream-Contact: Steve Borho <steve@borho.org>
-Source: https://bitbucket.org/multicoreware/x265/wiki/Home
-
-
-Files: *
-Copyright: 2013-2014 x265 project
-License: GPL-2.0+
-
-Files: source/common/const-a.asm
- source/common/cpu-a.asm
- source/common/intrapred.h
- source/common/mc-a*.asm
- source/common/pixel.h
- source/common/pixel-32.asm
- source/common/pixel-a.asm
- source/common/sad*.asm
- source/common/ssd.asm
- source/common/x86inc.asm
- source/test/checkasm-a.asm
-Copyright: 2003-2014 x264 project
-License: GPL-2.0+
-
-Files: source/common/x86util.asm
-Copyright: 2008-2013 x264 project
-License: ISC
-
-Files: source/compat/getopt/*
-Copyright: 1987-2001 Free Software Foundation, Inc.
-License: LGPL-2.1+
-
-Files: source/Lib/*
-Copyright: 2010-2013 ITU/ISO/IEC
-License: BSD-3-clause
-
-Files: debian/*
-Copyright: 2014 djcj <djcj@gmx.de>
-License: ISC
-
-
-License: GPL-2.0+
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- .
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- .
- On Debian GNU/Linux systems, the complete text of the GNU General Public
- License version 2 can be found in '/usr/share/common-licenses/GPL-2'.
-
-
-License: ISC
- Permission to use, copy, modify, and/or distribute this software for any
- purpose with or without fee is hereby granted, provided that the above
- copyright notice and this permission notice appear in all copies.
- .
- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-
-License: LGPL-2.1+
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- .
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- .
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
- .
- On Debian systems, the complete text of the GNU Lesser General
- Public License version 2.1 can be found in '/usr/share/common-licenses/LGPL-2.1'.
-
-
-License: BSD-3-clause
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- .
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
- be used to endorse or promote products derived from this software without
- specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
- BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- THE POSSIBILITY OF SUCH DAMAGE.
+++ /dev/null
-#!/bin/sh
-egrep 'set\(X265_BUILD ' source/CMakeLists.txt | sed -e 's/set(X265_BUILD //; s/)//g'
+++ /dev/null
-usr/include
-usr/lib/*/*.a
-usr/lib/*/*.so
-usr/lib/*/pkgconfig
-usr/lib/*/x265-10bit/*.a
-usr/lib/*/x265-10bit/*.so
+++ /dev/null
-usr/lib/*/*.so.*
-usr/lib/*/x265-10bit/*.so.*
+++ /dev/null
---- a/source/CMakeLists.txt
-+++ b/source/CMakeLists.txt
-@@ -49,9 +49,13 @@ if("${SYSPROC}" STREQUAL "" OR X86MATCH
- message(STATUS "Detected x86 system processor")
- endif()
- elseif(${SYSPROC} STREQUAL "armv6l")
-- message(STATUS "Detected ARM target processor")
-- set(ARM 1)
-+ message(STATUS "Detected ARMV6 target processor")
-+ set(ARMV6 1)
- add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
-+elseif(${SYSPROC} STREQUAL "armv7l")
-+ message(STATUS "Detected ARMV7 target processor")
-+ set(ARMV7 1)
-+ add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=1)
- else()
- message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
- message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -129,9 +133,12 @@ if(GCC)
- if(X86 AND NOT X64)
- add_definitions(-march=i686)
- endif()
-- if(ARM)
-+ if(ARMV6)
- add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
- endif()
-+ if(ARMV7)
-+ add_definitions(-fPIC -march=armv7 -mfloat-abi=hard -mfpu=neon)
-+ endif()
- check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING)
- check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS)
- if (CC_HAS_NO_ARRAY_BOUNDS)
---- a/source/common/cpu.cpp
-+++ b/source/common/cpu.cpp
-@@ -356,7 +356,7 @@ uint32_t cpu_detect(void)
- // which may result in incorrect detection and the counters stuck enabled.
- // right now Apple does not seem to support performance counters for this test
- #ifndef __MACH__
-- flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
-+ //flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
- #endif
- // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #endif // if HAVE_ARMV6
+++ /dev/null
---- a/source/x265.cpp
-+++ b/source/x265.cpp
-@@ -772,12 +772,9 @@
- {\
- bErr = 0;\
- p = strstr(paramBuf, opt "=");\
-- char* q = strstr(paramBuf, "no-"opt);\
- if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
- bErr = 1;\
-- else if (!param_val && !q)\
-- bErr = 1;\
-- else if (param_val && (q || !strstr(paramBuf, opt)))\
-+ else if (param_val && strstr(paramBuf, "no-"opt))\
- bErr = 1;\
- if (bErr)\
- {\
+++ /dev/null
---- a/source/CMakeLists.txt
-+++ b/source/CMakeLists.txt
-@@ -39,12 +39,14 @@
- set(X86_ALIASES x86 i386 i686 x86_64 amd64)
- list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
- if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
-- message(STATUS "Detected x86 target processor")
- set(X86 1)
- add_definitions(-DX265_ARCH_X86=1)
- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
-+ message(STATUS "Detected x86_64 system processor")
- set(X64 1)
- add_definitions(-DX86_64=1)
-+ else()
-+ message(STATUS "Detected x86 system processor")
- endif()
- elseif(${SYSPROC} STREQUAL "armv6l")
- message(STATUS "Detected ARM target processor")
+++ /dev/null
---- a/doc/reST/cli.rst
-+++ b/doc/reST/cli.rst
-@@ -63,7 +63,7 @@
- is used for WPP and for distributed analysis and motion search:
- :option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
-
-- If :option:`--threads`=1 is specified, then no thread pool is
-+ If :option:`--threads` 1 is specified, then no thread pool is
- created. When no thread pool is created, all the thread pool
- features are implicitly disabled. If all the pool features are
- disabled by the user, then the pool is implicitly disabled.
-@@ -904,8 +904,8 @@
- between 0 and 1, or in kbits. In other words these two option pairs
- are equivalent::
-
-- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900
-- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9
-+ --vbv-bufsize 1000 --vbv-init 900
-+ --vbv-bufsize 1000 --vbv-init 0.9
-
- Default 0.9
-
-@@ -1206,7 +1206,7 @@
- .. option:: --aud, --no-aud
-
- Emit an access unit delimiter NAL at the start of each slice access
-- unit. If option:`--repeat-headers` is not enabled (indicating the
-+ unit. If :option:`--repeat-headers` is not enabled (indicating the
- user will be writing headers manually at the start of the stream)
- the very first AUD will be skipped since it cannot be placed at the
- start of the access unit, where it belongs. Default disabled
+++ /dev/null
---- a/source/encoder/encoder.cpp
-+++ b/source/encoder/encoder.cpp
-@@ -1330,6 +1330,12 @@
- p->bBPyramid = 0;
-
- /* Disable features which are not supported by the current RD level */
-+ if (p->rdLevel < 5)
-+ {
-+ if (p->bEnableCbfFastMode) /* impossible */
-+ x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n");
-+ p->bEnableCbfFastMode = 0;
-+ }
- if (p->rdLevel < 4)
- {
- if (p->psyRdoq > 0) /* impossible */
+++ /dev/null
-version.patch
-cpu-detection.patch
-show-options-using-their-param_parse-names.patch
-make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch
-bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch
-fix-reST-parsing-issues-in-docs.patch
-armv7l-support.patch
+++ /dev/null
---- a/source/common/param.cpp
-+++ b/source/common/param.cpp
-@@ -1152,8 +1152,8 @@
- fprintf(stderr, "psy-rd=%.2lf ", param->psyRd);
- if (param->psyRdoq > 0.)
- fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq);
-- TOOLOPT(param->bEnableEarlySkip, "esd");
-- TOOLOPT(param->bEnableCbfFastMode, "cfm");
-+ TOOLOPT(param->bEnableEarlySkip, "early-skip");
-+ TOOLOPT(param->bEnableCbfFastMode, "fast-cbf");
- if (param->noiseReduction)
- fprintf(stderr, "nr=%d ", param->noiseReduction);
- TOOLOPT(param->bEnableLoopFilter, "lft");
+++ /dev/null
---- a/source/cmake/version.cmake
-+++ b/source/cmake/version.cmake
-@@ -6,8 +6,8 @@
- find_package(Git QUIET) # present in 2.8.8
-
- # defaults, in case everything below fails
--set(X265_VERSION "unknown")
--set(X265_LATEST_TAG "0.0")
-+set(X265_VERSION "1.4")
-+set(X265_LATEST_TAG "1.4")
- set(X265_TAG_DISTANCE "0")
-
- if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
+++ /dev/null
-#!/usr/bin/make -f
-
-DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
-DEB_HOST_GNU_CPU ?= $(shell dpkg-architecture -qDEB_HOST_GNU_CPU)
-DEB_HOST_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
-API ?= $(shell debian/getapi.sh)
-LIB = $(CURDIR)/debian/install/usr/lib/$(DEB_HOST_MULTIARCH)
-
-include debian/confflags
-
-builddir = compiling
-x265-clean = rm -rf $(builddir) && mkdir $(builddir)
-x265-install = $(MAKE) -C $(builddir) install DESTDIR=$(CURDIR)/debian/install
-x265-build = dh_auto_build -D$(builddir)
-test-build = \
- $(builddir)/x265 --pass 1 --bitrate 10 -o /dev/null debian/test.y4m && \
- $(builddir)/x265 --pass 2 --bitrate 10 -o test.hevc debian/test.y4m
-
-
-%:
- dh ${@} --parallel
-
-override_dh_auto_clean:
- dh_auto_clean
- rm -rf $(builddir) doc/reST/build debian/install debian/test.y4m
-
-override_dh_auto_build:
- sed -e 's/@API@/$(API)/g' debian/control.in > debian/control
- cp -f debian/libx265N.install debian/libx265-$(API).install
- unxz -fk debian/test.y4m.xz
-# build x265 8-bit
- mkdir $(builddir)
- cd $(builddir) && cmake $(8bit_confflags) ../source
- $(x265-build)
- $(x265-install)
-# test x265 8-bit
-# $(test-build)
-# build x265 10-bit
- $(x265-clean)
- cd $(builddir) && cmake $(10bit_confflags) ../source
- $(x265-build)
- $(x265-install)
- sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
- debian/x265-10bit.in > $(builddir)/x265-10bit
- install -c -D -m755 $(builddir)/x265-10bit $(CURDIR)/debian/install/usr/bin
-# test x265 10-bit
-# $(test-build)
-# build x265 8-bit static
- $(x265-clean)
- rm -f $(LIB)/libx265.a
- cd $(builddir) && cmake $(8bit_static_confflags) ../source
- $(x265-build)
- install -c -D -m644 $(builddir)/libx265.a $(LIB)
-# build x265 10-bit static
- $(x265-clean)
- rm -f $(LIB)/x265-10bit/libx265.a
- cd $(builddir) && cmake $(10bit_static_confflags) ../source
- $(x265-build)
- install -c -D -m644 $(builddir)/libx265.a $(LIB)/x265-10bit
-# build documentation
- $(MAKE) -C doc/reST pickle html
-
-override_dh_install:
- dh_install --list-missing --sourcedir=$(CURDIR)/debian/install
-
-override_dh_installchangelogs:
- dh_installchangelogs -px265-doc -plibx265-$(API) ChangeLog
-
-override_dh_installdocs:
- dh_installdocs -plibx265-$(API) -px265-doc -X.buildinfo
- dh_installdocs -px265 -px265-dbg -plibx265-dev --link-doc=libx265-$(API)
-
-override_dh_strip:
- dh_strip --dbg-package=x265-dbg
-
-override_dh_builddeb:
- dh_builddeb -- -Zxz -z9
-
+++ /dev/null
-3.0 (quilt)
+++ /dev/null
-debian/test.y4m.xz
+++ /dev/null
-compression = "xz"
-compression-level = 9
+++ /dev/null
-hg log --style changelog > ChangeLog
+++ /dev/null
-version=3
-https://bitbucket.org/multicoreware/x265/get/([0-9.]+)\.tar\.(?:xz|bz2|gz)
\ No newline at end of file
+++ /dev/null
-#!/bin/sh
-LIBX265_PATH=/usr/lib/@DEB_HOST_MULTIARCH@/x265-10bit
-LD_LIBRARY_PATH="$LIBX265_PATH:$LD_LIBRARY_PATH" $LIBX265_PATH/x265 "$@"
+++ /dev/null
-Document: x265
-Title: x265 Documentation
-Author: Steve Borho <steve@borho.org>
-Abstract: This is the official documentation of x265,
- a free H.265/HEVC video encoder.
-Section: Video
-
-Format: HTML
-Index: /usr/share/doc/x265-doc/html/index.html
-Files: /usr/share/doc/x265-doc/html/*.html
+++ /dev/null
-doc/intra
-doc/reST/build/html
+++ /dev/null
-/usr/share/javascript/jquery/jquery.js /usr/share/doc/x265-doc/html/_static/jquery.js
-/usr/share/javascript/underscore/underscore.js /usr/share/doc/x265-doc/html/_static/underscore.js
+++ /dev/null
-
-Syntax: x265 [options] infile [-o] outfile
- infile can be YUV or Y4M
- outfile is raw HEVC bitstream
-
-Executable Options:
--h/--help Show this help text and exit
--V/--version Show version info and exit
-
-Output Options:
--o/--output <filename> Bitstream output file name
- --log-level <string> Logging level: none error warning info debug full. Default info
- --no-progress Disable CLI progress reports
- --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default disabled
- --csv <filename> Comma separated log file, log level >= 3 frame log, else one line per run
-
-Input Options:
- --input <filename> Raw YUV or Y4M input file name. `-` for stdin
- --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension
- --fps <float|rational> Source frame rate (float or num/denom), auto-detected if Y4M
- --input-res WxH Source picture size [w x h], auto-detected if Y4M
- --input-depth <integer> Bit-depth of input file. Default 8
- --input-csp <string> Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420
--f/--frames <integer> Maximum number of frames to encode. Default all
- --seek <integer> First frame to encode
- --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive
- --dither Enable dither if downscaling to 8 bit pixels. Default disabled
-
-Quality reporting metrics:
- --[no-]ssim Enable reporting SSIM metric scores. Default disabled
- --[no-]psnr Enable reporting PSNR metric scores. Default disabled
-
-Profile, Level, Tier:
- --profile <string> Enforce an encode profile: main, main10, mainstillpicture
- --level-idc <integer|float> Force a minimum required decoder level (as '5.0' or '50')
- --[no-]high-tier If a decoder level is specified, this modifier selects High tier of that level
-
-Threading, performance:
- --threads <integer> Number of threads for thread pool (0: detect CPU core count, default)
--F/--frame-threads <integer> Number of concurrently encoded frames. 0: auto-determined by core count
- --[no-]wpp Enable Wavefront Parallel Processing. Default enabled
- --[no-]pmode Parallel mode analysis. Default disabled
- --[no-]pme Parallel motion estimation. Default disabled
- --[no-]asm <bool|int|string> Override CPU detection. Default: auto
-
-Presets:
--p/--preset <string> Trade off performance for compression efficiency. Default medium
- ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo
--t/--tune <string> Tune the settings for a particular type of source or situation:
- psnr, ssim, zerolatency, or fastdecode
-
-Quad-Tree size and depth:
--s/--ctu <64|32|16> Maximum CU size (default: 64x64). Default 64
- --tu-intra-depth <integer> Max TU recursive depth for intra CUs. Default 1
- --tu-inter-depth <integer> Max TU recursive depth for inter CUs. Default 1
- --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default disabled
- --[no-]amp Enable asymmetric motion partitions, requires --rect. Default disabled
-
-Analysis:
- --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default 3
- --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default 0.000000
- --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default 0.000000
- --nr <integer> An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled
- --[no-]tskip-fast Enable fast intra transform skipping. Default disabled
- --[no-]early-skip Enable early SKIP detection. Default disabled
- --[no-]fast-cbf Enable early outs based on whether residual is coded. Default disabled
-
-Coding tools:
--w/--[no-]weightp Enable weighted prediction in P slices. Default enabled
- --[no-]weightb Enable weighted prediction in B slices. Default disabled
- --[no-]cu-lossless Consider lossless mode in CU RDO decisions. Default disabled
- --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default enabled
- --[no-]tskip Enable intra 4x4 transform skipping. Default disabled
-
-Temporal / motion search options:
- --me <string> Motion search method dia hex umh star full. Default 1
--m/--subme <integer> Amount of subpel refinement to perform (0:least .. 7:most). Default 2
- --merange <integer> Motion search range. Default 57
- --max-merge <1..5> Maximum number of merge candidates. Default 2
- --[no-]temporal-mvp Enable temporal MV predictors. Default enabled
-
-Spatial / intra options:
- --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default enabled
- --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default disabled
- --[no-]b-intra Enable intra in B frames in veryslow presets. Default disabled
- --[no-]fast-intra Enable faster search method for angular intra predictions. Default disabled
- --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default 0
-
-Slice decision options:
- --[no-]open-gop Enable open-GOP, allows I slices to be non-IDR. Default enabled
--I/--keyint <integer> Max IDR period in frames. -1 for infinite-gop. Default 250
--i/--min-keyint <integer> Scenecuts closer together than this are coded as I, not IDR. Default: auto
- --no-scenecut Disable adaptive I-frame decision
- --scenecut <integer> How aggressively to insert extra I-frames. Default 40
- --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default 20
- --bframes <integer> Maximum number of consecutive b-frames (now it only enables B GOP structure) Default 4
- --bframe-bias <integer> Bias towards B frame decisions. Default 0
- --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default 2
- --[no-]b-pyramid Use B-frames as references. Default enabled
- --ref <integer> max number of L0 references to be allowed (1 .. 16) Default 3
- --qpfile <string> Force frametypes and QPs for some or all frames
- Format of each line: framenumber frametype QP
- QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.
- QPs are restricted by qpmin/qpmax.
-
-Rate control, Quantization:
- --bitrate <integer> Target bitrate (kbps) for ABR (implied). Default 0
--q/--qp <integer> QP for P slices in CQP mode (implied). --ipratio and --pbratio determine other slice QPs
- --crf <float> Quality-based VBR (0-51). Default 28.000000
- --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default disabled
- --crf-max <float> With CRF+VBV, limit RF to this value. Default 0.000000
- May cause VBV underflows!
- --crf-min <float> With CRF+VBV, limit RF to this value. Default 0.000000
- this specifies a minimum rate factor value for encode!
- --vbv-maxrate <integer> Max local bitrate (kbit/s). Default 0
- --vbv-bufsize <integer> Set size of the VBV buffer (kbit). Default 0
- --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default 0.900000
- --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default 2
- --aq-strength <float> Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default 1.000000
- --[no-]cutree Enable cutree for Adaptive Quantization. Default enabled
- --ipratio <float> QP factor between I and P. Default 1.400000
- --pbratio <float> QP factor between P and B. Default 1.300000
- --cbqpoffs <integer> Chroma Cb QP Offset. Default 0
- --crqpoffs <integer> Chroma Cr QP Offset. Default 0
- --stats Filename for stats file in multi-pass rate control. Default x265_2pass.log
- --pass Multi-pass rate control.
- - 1 : First pass, creates stats file
- - 2 : Last pass, does not overwrite stats file
- - 3 : Nth pass, overwrites stats file
- --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default disabled
- --analysis-mode <string|int> save - Dump analysis info into file, load - Load analysis buffers from the file. Default 0
- --analysis-file <filename> Specify file name used for either dumping or reading analysis data.
- --scaling-list <string> Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off
- --lambda-file <string> Specify a file containing replacement values for the lambda tables
- MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table
- Blank lines and lines starting with hash(#) are ignored
- Comma is considered to be white-space
-
-Loop filters (deblock and SAO):
- --[no-]lft Enable Deblocking Loop Filter. Default enabled
- --[no-]sao Enable Sample Adaptive Offset. Default enabled
- --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default disabled
-
-VUI options:
- --sar <width:height|int> Sample Aspect Ratio, the ratio of width to height of an individual pixel.
- Choose from 0=undef, 1=1:1("square"), 2=12:11, 3=10:11, 4=16:11,
- 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,
- 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default 0
- --crop-rect <string> Add 'left,top,right,bottom' to the bitstream-level cropping rectangle
- --overscan <string> Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef
- --videoformat <string> Specify video format from undef, component, pal, ntsc, secam, mac. Default undef
- --range <string> Specify black level and range of luma and chroma signals as full or limited Default limited
- --colorprim <string> Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,
- smpte240m, film, bt2020. Default undef
- --transfer <string> Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,
- smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,
- bt2020-10, bt2020-12. Default undef
- --colormatrix <string> Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,
- smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef
- --chromaloc <integer> Specify chroma sample location (0 to 5). Default of 0
-
-Bitstream options:
- --[no-]info Emit SEI identifying encoder and parameters. Default enabled
- --[no-]aud Emit access unit delimiters at the start of each access unit. Default disabled
- --[no-]hrd Enable HRD parameters signalling. Default disabled
- --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default disabled
- --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default 0
-
-Reconstructed video options (debugging):
--r/--recon <filename> Reconstructed raw image YUV or Y4M output file name
- --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M
-
-
-Full documentation may be found at http://x265.readthedocs.org/en/default/cli.html
+++ /dev/null
-.TH X265 "1" "AUGUST 2014" "v1.4" "User Commands"
-.SH NAME
-x265 \- H.265/HEVC video encoder
-
-.SH SYNOPSIS
-.B x265 \fR[options] \fIinfile \fR[\-o] \fIoutfile
-.br
-Bit depth: 8
-.PP
-.B x265\-10bit \fR[options] \fIinfile \fR[\-o] \fIoutfile
-.br
-Bit depth: 10
-.PP
-infile can be YUV or Y4M
-.br
-outfile is raw HEVC bitstream
-
-.SH DESCRIPTION
-Increasing demand for high definition and ultra\-high definition video,
-along with an increasing desire for video on demand has led to
-exponential growth in demand for bandwidth and storage requirements.
-These challenges can be met by the new High Efficiency Video Coding
-(HEVC) standard, also known as H.265. The x265 HEVC encoder project was
-launched by MulticoreWare in 2013, aiming to provide the most efficient,
-highest performance HEVC video encoder.
-.SS About HEVC
-The High Efficiency Video Coding (HEVC) standard was developed by the
-ISO/IEC Moving Picture Experts Group (MPEG) and ITU\-T Video Coding
-Experts Group (VCEG), through their Joint Collaborative Team on Video
-Coding (JCT\-VC). HEVC is also known as ISO/IEC 23008\-2 MPEG\-H Part 2
-and ITU\-T H.265. HEVC provides superior video quality and up to twice
-the data compression of the previous standard (H.264/MPEG\-4 AVC). HEVC can
-support 8K Ultra High Definition video, with a picture size up to
-8192x4320 pixels.
-.SS About x265
-The primary objective of x265 is to become the best H.265/HEVC encoder
-available anywhere, offering the highest compression efficiency and the
-highest performance on a wide variety of hardware platforms. The x265
-encoder is available as an open source library, published under the
-GPLv2 license. It is also available under a commercial license, enabling
-commercial companies to utilize and distribute x265 in their solutions
-without being subject to the restrictions of the GPL license.
-
-.SH OPTIONS
-.TP
-\fB\-h/\-\-help
-Show this help text and exit
-.TP
-\fB\-V/\-\-version
-Show version info and exit
-
-.SS "Output Options:"
-.TP
-\fB\-\-output <filename>
-Bitstream output file name
-.TP
-\fB\-\-log\-level <string>
-Logging level: none error warning info debug full. Default info
-.TP
-\fB\-\-no\-progress
-Disable CLI progress reports
-.TP
-\fB\-\-[no\-]cu\-stats
-Enable logging stats about distribution of cu across all modes. Default disabled
-.TP
-\fB\-\-csv <filename>
-Comma separated log file, log level >= 3 frame log, else one line per run
-
-.SS "Input Options:"
-.TP
-\fB\-\-input <filename>
-Raw YUV or Y4M input file name. `\-` for stdin
-.TP
-\fB\-\-y4m
-Force parsing of input stream as YUV4MPEG2 regardless of file extension
-.TP
-\fB\-\-fps <float|rational>
-Source frame rate (float or num/denom), auto\-detected if Y4M
-.TP
-\fB\-\-input\-res WxH
-Source picture size [w x h], auto\-detected if Y4M
-.TP
-\fB\-\-input\-depth <integer>
-Bit\-depth of input file. Default 8
-.TP
-\fB\-\-input\-csp <string>
-Source color space: i420, i444 or i422, auto\-detected if Y4M. Default: i420
-.TP
-\fB\-f/\-\-frames <integer>
-Maximum number of frames to encode. Default all
-.TP
-\fB\-\-seek <integer>
-First frame to encode
-.TP
-\fB\-\-[no\-]interlace <bff|tff>
-Indicate input pictures are interlace fields in temporal order. Default progressive
-.TP
-\fB\-\-dither
-Enable dither if downscaling to 8 bit pixels. Default disabled
-
-.SS "Quality reporting metrics:"
-.TP
-\fB\-\-[no\-]ssim
-Enable reporting SSIM metric scores. Default disabled
-.TP
-\fB\-\-[no\-]psnr
-Enable reporting PSNR metric scores. Default disabled
-
-.SS "Profile, Level, Tier:"
-.TP
-\fB\-\-profile <string>
-Enforce an encode profile: main, main10, mainstillpicture
-.TP
-\fB\-\-level\-idc <integer|float>
-Force a minimum required decoder level (as '5.0' or '50')
-.TP
-\fB\-\-[no\-]high\-tier
-If a decoder level is specified, this modifier selects High tier of that level
-
-.SS "Threading, performance:"
-.TP
-\fB\-\-threads <integer>
-Number of threads for thread pool (0: detect CPU core count, default)
-.TP
-\fB\-F/\-\-frame\-threads <integer>
-Number of concurrently encoded frames. 0: auto\-determined by core count
-.TP
-\fB\-\-[no\-]wpp
-Enable Wavefront Parallel Processing. Default enabled
-.TP
-\fB\-\-[no\-]pmode
-Parallel mode analysis. Default disabled
-.TP
-\fB\-\-[no\-]pme
-Parallel motion estimation. Default disabled
-.TP
-\fB\-\-[no\-]asm <bool|int|string>
-Override CPU detection. Default: auto
-
-.SS Presets:
-.TP
-\fB\-p/\-\-preset <string>
-Trade off performance for compression efficiency. Default medium
-.br
-ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo
-.TP
-\fB\-t/\-\-tune <string>
-Tune the settings for a particular type of source or situation:
-.br
-psnr, ssim, zerolatency, or fastdecode
-
-.SS "Quad-Tree size and depth:"
-.TP
-\fB\-s/\-\-ctu <64|32|16>
-Maximum CU size (default: 64x64). Default 64
-.TP
-\fB\-\-tu\-intra\-depth <integer>
-Max TU recursive depth for intra CUs. Default 1
-.TP
-\fB\-\-tu\-inter\-depth <integer>
-Max TU recursive depth for inter CUs. Default 1
-.TP
-\fB\-\-[no\-]rect
-Enable rectangular motion partitions Nx2N and 2NxN. Default disabled
-.TP
-\fB\-\-[no\-]amp
-Enable asymmetric motion partitions, requires \fB\-\-rect\fR. Default disabled
-.TP
-\fB\-\-rd <0..6>
-Level of RD in mode decision 0:least....6:full RDO. Default 3
-.TP
-\fB\-\-psy\-rd <0..2.0>
-Strength of psycho\-visual rate distortion optimization, 0 to disable. Default 0.000000
-.TP
-\fB\-\-psy\-rdoq <0..50.0>
-Strength of psycho\-visual optimization in quantization, 0 to disable. Default 0.000000
-.TP
-\fB\-\-nr <integer>
-An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled
-.TP
-\fB\-\-[no\-]tskip\-fast
-Enable fast intra transform skipping. Default disabled
-.TP
-\fB\-\-[no\-]early\-skip
-Enable early SKIP detection. Default disabled
-.TP
-\fB\-\-[no\-]fast\-cbf
-Enable early outs based on whether residual is coded. Default disabled
-
-.SS "Coding tools:"
-.TP
-\fB\-w/\-\-[no\-]weightp
-Enable weighted prediction in P slices. Default enabled
-.TP
-\fB\-\-[no\-]weightb
-Enable weighted prediction in B slices. Default disabled
-.TP
-\fB\-\-[no\-]cu\-lossless
-Consider lossless mode in CU RDO decisions. Default disabled
-.TP
-\fB\-\-[no\-]signhide
-Hide sign bit of one coeff per TU (rdo). Default enabled
-.TP
-\fB\-\-[no\-]tskip
-Enable intra 4x4 transform skipping. Default disabled
-
-.SS "Temporal / motion search options:"
-.TP
-\fB\-\-me <string>
-Motion search method dia hex umh star full. Default 1
-.TP
-\fB\-m/\-\-subme <integer>
-Amount of subpel refinement to perform (0:least .. 7:most). Default 2
-.TP
-\fB\-\-merange <integer>
-Motion search range. Default 57
-.TP
-\fB\-\-max\-merge <1..5>
-Maximum number of merge candidates. Default 2
-.TP
-\fB\-\-[no\-]temporal\-mvp
-Enable temporal MV predictors. Default enabled
-
-.SS "Spatial / intra options:"
-.TP
-\fB\-\-[no\-]strong\-intra\-smoothing
-Enable strong intra smoothing for 32x32 blocks. Default enabled
-.TP
-\fB\-\-[no\-]constrained\-intra
-Constrained intra prediction (use only intra coded reference pixels) Default disabled
-.TP
-\fB\-\-[no\-]b\-intra
-Enable intra in B frames in veryslow presets. Default disabled
-.TP
-\fB\-\-[no\-]fast\-intra
-Enable faster search method for angular intra predictions. Default disabled
-.TP
-\fB\-\-rdpenalty <0..2>
-penalty for 32x32 intra TU in non\-I slices. 0:disabled 1:RD\-penalty 2:maximum. Default 0
-
-.SS "Slice decision options:"
-.TP
-\fB\-\-[no\-]open\-gop
-Enable open\-GOP, allows I slices to be non\-IDR. Default enabled
-.TP
-\fB\-I/\-\-keyint <integer>
-Max IDR period in frames. \-1 for infinite\-gop. Default 250
-.TP
-\fB\-i/\-\-min\-keyint <integer>
-Scenecuts closer together than this are coded as I, not IDR. Default: auto
-.TP
-\fB\-\-no\-scenecut
-Disable adaptive I\-frame decision
-.TP
-\fB\-\-scenecut <integer>
-How aggressively to insert extra I\-frames. Default 40
-.TP
-\fB\-\-rc\-lookahead <integer>
-Number of frames for frame\-type lookahead (determines encoder latency) Default 20
-.TP
-\fB\-\-bframes <integer>
-Maximum number of consecutive b\-frames (now it only enables B GOP structure) Default 4
-.TP
-\fB\-\-bframe\-bias <integer>
-Bias towards B frame decisions. Default 0
-.TP
-\fB\-\-b\-adapt <0..2>
-0 \- none, 1 \- fast, 2 \- full (trellis) adaptive B frame scheduling. Default 2
-.TP
-\fB\-\-[no\-]b\-pyramid
-Use B\-frames as references. Default enabled
-.TP
-\fB\-\-ref <integer>
-max number of L0 references to be allowed (1 .. 16) Default 3
-.TP
-\fB\-\-qpfile <string>
-Force frametypes and QPs for some or all frames
-.br
-Format of each line: framenumber frametype QP
-.br
-QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.
-.br
-QPs are restricted by qpmin/qpmax.
-.PP
-
-.SS "Rate control, Quantization:"
-.TP
-\fB\-\-bitrate <integer>
-Target bitrate (kbps) for ABR (implied). Default 0
-.TP
-\fB\-\-crf <float>
-Quality\-based VBR (0\-51). Default 28.000000
-.TP
-\fB\-\-[no\-]lossless
-Enable lossless: bypass transform, quant and loop filters globally. Default disabled
-.TP
-\fB\-\-crf\-max <float>
-With CRF+VBV, limit RF to this value. Default 0.000000
-.br
-May cause VBV underflows!
-.TP
-\fB\-\-crf\-min <float>
-With CRF+VBV, limit RF to this value. Default 0.000000
-.br
-this specifies a minimum rate factor value for encode!
-.TP
-\fB\-\-vbv\-maxrate <integer>
-Max local bitrate (kbit/s). Default 0
-.TP
-\fB\-\-vbv\-bufsize <integer>
-Set size of the VBV buffer (kbit). Default 0
-.TP
-\fB\-\-vbv\-init <float>
-Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default 0.900000
-.TP
-\fB\-\-aq\-mode <integer>
-Mode for Adaptive Quantization \- 0:none 1:uniform AQ 2:auto variance. Default 2
-.TP
-\fB\-\-aq\-strength <float>
-Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default 1.000000
-.TP
-\fB\-\-[no\-]cutree
-Enable cutree for Adaptive Quantization. Default enabled
-.TP
-\fB\-\-ipratio <float>
-QP factor between I and P. Default 1.400000
-.TP
-\fB\-\-pbratio <float>
-QP factor between P and B. Default 1.300000
-.TP
-\fB\-\-cbqpoffs <integer>
-Chroma Cb QP Offset. Default 0
-.TP
-\fB\-\-crqpoffs <integer>
-Chroma Cr QP Offset. Default 0
-.TP
-\fB\-\-stats
-Filename for stats file in multi\-pass rate control. Default x265_2pass.log
-.TP
-\fB\-\-pass
-Multi\-pass rate control.
-.br
- \- 1 : First pass, creates stats file
-.br
- \- 2 : Last pass, does not overwrite stats file
-.br
- \- 3 : Nth pass, overwrites stats file
-.TP
-\fB\-\-[no\-]slow\-firstpass
-Enable a slow first pass in a multipass rate control mode. Default disabled
-.TP
-\fB\-\-analysis\-mode <string|int>
-save \- Dump analysis info into file, load \- Load analysis buffers from the file. Default 0
-.TP
-\fB\-\-analysis\-file <filename>
-Specify file name used for either dumping or reading analysis data.
-.TP
-\fB\-\-scaling\-list <string>
-Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off
-.TP
-\fB\-\-lambda\-file <string>
-Specify a file containing replacement values for the lambda tables
-.br
-MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table
-.br
-Blank lines and lines starting with hash(#) are ignored
-.br
-Comma is considered to be white-space
-
-.SS "Loop filters (deblock and SAO):"
-.TP
-\fB\-\-[no\-]lft
-Enable Deblocking Loop Filter. Default enabled
-.TP
-\fB\-\-[no\-]sao
-Enable Sample Adaptive Offset. Default enabled
-.TP
-\fB\-\-[no\-]sao\-non\-deblock
-Use non\-deblocked pixels, else right/bottom boundary areas skipped. Default disabled
-
-.SS "VUI options:"
-.TP
-\fB\-\-sar <width:height|int>
-Sample Aspect Ratio, the ratio of width to height of an individual pixel.
-.br
-Choose from 0=undef, 1=1:1("square"), 2=12:11, 3=10:11, 4=16:11,
-5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,
-12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default 0
-.TP
-\fB\-\-crop\-rect <string>
-Add 'left,top,right,bottom' to the bitstream\-level cropping rectangle
-.TP
-\fB\-\-overscan <string>
-Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef
-.TP
-\fB\-\-videoformat <string>
-Specify video format from undef, component, pal, ntsc, secam, mac. Default undef
-.TP
-\fB\-\-range <string>
-Specify black level and range of luma and chroma signals as full or limited Default limited
-.TP
-\fB\-\-colorprim <string>
-Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,
-smpte240m, film, bt2020. Default undef
-.TP
-\fB\-\-transfer <string>
-Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,
-smpte240m, linear, log100, log316, iec61966\-2\-4, bt1361e, iec61966\-2\-1,
-bt2020\-10, bt2020\-12. Default undef
-.TP
-\fB\-\-colormatrix <string>
-Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,
-smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef
-.TP
-\fB\-\-chromaloc <integer>
-Specify chroma sample location (0 to 5). Default of 0
-
-.SS "Bitstream options:"
-.TP
-\fB\-\-[no\-]info
-Emit SEI identifying encoder and parameters. Default enabled
-.TP
-\fB\-\-[no\-]aud
-Emit access unit delimiters at the start of each access unit. Default disabled
-.TP
-\fB\-\-[no\-]hrd
-Enable HRD parameters signalling. Default disabled
-.TP
-\fB\-\-[no\-]repeat\-headers
-Emit SPS and PPS headers at each keyframe. Default disabled
-.TP
-\fB\-\-hash <integer>
-Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default 0
-
-.SS "Reconstructed video options (debugging):"
-.TP
-\fB\-r/\-\-recon <filename>
-Reconstructed raw image YUV or Y4M output file name
-.TP
-\fB\-\-recon\-depth <integer>
-Bit\-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M
-.SH COPYRIGHT
-Copyright \(co 2013\-2014 MulticoreWare, Inc.
-.PP
-The x265 software is owned and copyrighted by MulticoreWare, Inc.
-MulticoreWare is committed to offering the x265 software under the GNU
-GPL v2 license. Companies who do not wish to integrate the x265
-Software in their products under the terms of the GPL license can
-contact MulticoreWare (\fIlicense@x265.com\fR) to obtain a commercial
-license agreement. Companies who use x265 under the GPL may also wish
-to work with MulticoreWare to accelerate the development of specific
-features or optimized support for specific hardware or software
-platforms, or to contract for support.
-.PP
-The GNU GPL v2 license or the x265 commercial license agreement govern
-your rights to access the copyrighted x265 software source code, but do
-not cover any patents that may be applicable to the function of binary
-executable software created from the x265 source code. You are
-responsible for understanding the laws in your country, and for
-licensing all applicable patent rights needed for use or distribution of
-software applications created from the x265 source code. A good place
-to start is with the Motion Picture Experts Group \- Licensing Authority
-\- HEVC Licensing Program.
-.PP
-x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is
-a trademark of MulticoreWare, and may only be used with explicit written
-permission. All rights reserved.
-
-.SH "SEE ALSO"
-.TP
-\fIhttp://x265.readthedocs.org/en/default/cli.html\fR
-online documentation
+++ /dev/null
-usr/bin
-usr/lib/*/x265-10bit/x265
+++ /dev/null
-/usr/bin/x265-10bit /usr/bin/x265-16bit
-/usr/share/man/man1/x265.1.gz /usr/share/man/man1/x265-10bit.1.gz
-/usr/share/man/man1/x265.1.gz /usr/share/man/man1/x265-16bit.1.gz
+++ /dev/null
-debian/x265.1
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-.PHONY: help clean html web pickle htmlhelp qthelp qhc latex changes linkcheck
+.PHONY: help clean distclean html web pickle htmlhelp qthelp qhc latex man changes linkcheck
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " qthelp to make HTML files and a qthelp project"
@echo " qhc to make QHC file"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " man to make manpages"
@echo " changes to make an overview over all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
clean:
-rm -rf build/*
+distclean: clean
+ -rmdir build/
+
html:
mkdir -p build/html build/doctrees
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html
@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
"run these through (pdf)latex."
+man:
+ mkdir -p build/man build/doctrees
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) build/man
+ @echo
+ @echo "Build finished; the manpages are in build/man."
+ @echo "Run \`man -l build/man/x265.1' or \`man -l build/man/libx265.3'" \
+ "to view them."
+
changes:
mkdir -p build/changes build/doctrees
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes
x265 which was compiled, and **x265_build_info_str** is a pointer to a
string identifying the compiler and build options.
+.. Note::
+
+ **x265_version_str** is only updated when **cmake** runs. If you are
+ making binaries for others to use, it is recommended to run
+ **cmake** prior to **make** in your build scripts.
+
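+ As one illustrative sketch (the build directory name is an assumed
+ convention, not part of this patch), such a build script can simply
+ re-run **cmake** before every **make**::
+
+   cd build
+   cmake ../source
+   make
+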
x265 will accept input pixels of any depth between 8 and 16 bits
regardless of the depth of its internal pixels (8 or 10). It will shift
and mask input pixels as required to reach the internal depth. If
the user may specify the integer ordinal of the value they desire. ie:
:option:`--log-level` 3 is equivalent to :option:`--log-level` debug.
-Standalone Executable Options
-=============================
+Executable Options
+==================
.. option:: --help, -h
**CLI ONLY**
+Logging/Statistic Options
+=========================
+
+.. option:: --log-level <integer|string>
+
+ Logging level. Debug level enables per-frame QP, metric, and bitrate
+ logging. If a CSV file is being generated, debug level makes the log
+ be per-frame rather than per-encode. Full level enables hash and
+ weight logging. -1 disables all logging, except certain fatal
+ errors, and can be specified by the string "none".
+
+ 0. error
+ 1. warning
+ 2. info **(default)**
+ 3. debug
+ 4. full
+
+.. option:: --no-progress
+
+ Disable periodic progress reports from the CLI
+
+ **CLI ONLY**
+
+.. option:: --csv <filename>
+
+ Writes encoding results to a comma separated value log file. Creates
+ the file if it doesn't already exist, else adds one line per run. If
+ :option:`--log-level` is debug or above, it writes one line per
+ frame. Default none
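+
+ For example, one possible invocation that produces per-frame rows
+ (file names are illustrative)::
+
+   x265 --csv stats.csv --log-level debug -o out.hevc input.y4m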
+
+.. option:: --cu-stats, --no-cu-stats
+
+ Records statistics on how each CU was coded (split depths and other
+ mode decisions) and reports those statistics at the end of the
+ encode. Default disabled
+
+.. option:: --ssim, --no-ssim
+
+ Calculate and report Structural Similarity values. It is
+ recommended to use :option:`--tune` ssim if you are measuring ssim,
+ else the results should not be used for comparison purposes.
+ Default disabled
+
+.. option:: --psnr, --no-psnr
+
+ Calculate and report Peak Signal to Noise Ratio. It is recommended
+ to use :option:`--tune` psnr if you are measuring PSNR, else the
+ results should not be used for comparison purposes. Default
+ disabled
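+
+ For example, one possible way to measure SSIM under settings tuned
+ for it (file names are illustrative)::
+
+   x265 --tune ssim --ssim -o out.hevc input.y4m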
+
+Performance Options
+===================
+
.. option:: --asm <integer:false:string>, --no-asm
x265 will use all detected CPU SIMD architectures by default. You can
One may also directly supply the CPU capability bitmap as an integer.
+.. option:: --frame-threads, -F <integer>
+
+ Number of concurrently encoded frames. Using a single frame thread
+ gives a slight improvement in compression, since the entire reference
+ frames are always available for motion compensation, but it has
+ severe performance implications. Default is an autodetected count
+ based on the number of CPU cores and whether WPP is enabled or not.
+
+ Over-allocation of frame threads will not improve performance, it
+ will generally just increase memory use.
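+
+ For example, a quality-first encode might pin a single frame thread
+ (file names are illustrative)::
+
+   x265 -F 1 -o out.hevc input.y4m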
+
.. option:: --threads <integer>
Number of threads to allocate for the worker thread pool This pool
is used for WPP and for distributed analysis and motion search:
:option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
- If :option:`--threads`=1 is specified, then no thread pool is
+ If :option:`--threads` 1 is specified, then no thread pool is
created. When no thread pool is created, all the thread pool
features are implicitly disabled. If all the pool features are
disabled by the user, then the pool is implicitly disabled.
Default 0, one thread is allocated per detected hardware thread
(logical CPU cores)
+.. option:: --wpp, --no-wpp
+
+ Enable Wavefront Parallel Processing. The encoder may begin encoding
+ a row as soon as the row above it is at least two CTUs ahead in the
+ encode process. This gives a 3-5x gain in parallelism for about 1%
+ overhead in compression efficiency.
+
+ This feature is implicitly disabled when no thread pool is present.
+
+ Default: Enabled
+
.. option:: --pmode, --no-pmode
Parallel mode decision, or distributed mode analysis. When enabled
the encoder will distribute the analysis work of each CU (merge,
inter, intra) across multiple worker threads. Only recommended if
x265 is not already saturating the CPU cores. In RD levels 3 and 4
- it will be most effective if --rect was enabled. At RD levels 5 and
+ it will be most effective if --rect is enabled. At RD levels 5 and
6 there is generally always enough work to distribute to warrant the
overhead, assuming your CPUs are not already saturated.
efficiency. In fact, since the modes are all measured in parallel it
makes certain early-outs impractical and thus you usually get
slightly better compression when it is enabled (at the expense of
- not skipping improbable modes).
+ not skipping improbable modes). This bypassing of early-outs can
+ cause pmode to slow down encodes, especially at faster presets.
This feature is implicitly disabled when no thread pool is present.
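+
+ For example, one plausible pairing of pmode with a deeper RD level
+ (file names are illustrative)::
+
+   x265 --pmode --rd 5 -o out.hevc input.y4m
+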
Sets parameters to preselected values, trading off compression efficiency against
encoding speed. These parameters are applied before all other input parameters are
- applied, and so you can override any parameters that these values control.
+ applied, and so you can override any parameters that these values control. See
+ :ref:`presets <presets>` for more detail.
0. ultrafast
1. superfast
.. option:: --tune, -t <string>
Tune the settings for a particular type of source or situation. The changes will
- be applied after :option:`--preset` but before all other parameters. Default none
-
- **Values:** psnr, ssim, zero-latency, fast-decode.
-
-.. option:: --frame-threads, -F <integer>
-
- Number of concurrently encoded frames. Using a single frame thread
- gives a slight improvement in compression, since the entire reference
- frames are always available for motion compensation, but it has
- severe performance implications. Default is an autodetected count
- based on the number of CPU cores and whether WPP is enabled or not.
-
- Over-allocation of frame threads will not improve performance, it
- will generally just increase memory use.
-
-.. option:: --log-level <integer|string>
-
- Logging level. Debug level enables per-frame QP, metric, and bitrate
- logging. If a CSV file is being generated, debug level makes the log
- be per-frame rather than per-encode. Full level enables hash and
- weight logging. -1 disables all logging, except certain fatal
- errors, and can be specified by the string "none".
-
- 0. error
- 1. warning
- 2. info **(default)**
- 3. debug
- 4. full
-
-.. option:: --csv <filename>
-
- Writes encoding results to a comma separated value log file. Creates
- the file if it doesnt already exist, else adds one line per run. if
- :option:`--log-level` is debug or above, it writes one line per
- frame. Default none
-
-.. option:: --cu-stats, --no-cu-stats
-
- Records statistics on how each CU was coded (split depths and other
- mode decisions) and reports those statistics at the end of the
- encode. Default disabled
-
-.. option:: --output, -o <filename>
-
- Bitstream output file name. If there are two extra CLI options, the
- first is implicitly the input filename and the second is the output
- filename, making the :option:`--output` option optional.
-
- The output file will always contain a raw HEVC bitstream, the CLI
- does not support any container file formats.
-
- **CLI ONLY**
-
-.. option:: --no-progress
+ be applied after :option:`--preset` but before all other parameters. Default none.
+ See :ref:`tunings <tunings>` for more detail.
- Disable CLI periodic progress reports
+ **Values:** psnr, ssim, grain, zero-latency, fast-decode, cbr.
- **CLI ONLY**
-
-Quality reporting metrics
+Input/Output File Options
=========================
-.. option:: --ssim, --no-ssim
-
- Calculate and report Structural Similarity values. It is
- recommended to use :option:`--tune` ssim if you are measuring ssim,
- else the results should not be used for comparison purposes.
- Default disabled
-
-.. option:: --psnr, --no-psnr
-
- Calculate and report Peak Signal to Noise Ratio. It is recommended
- to use :option:`--tune` psnr if you are measuring PSNR, else the
- results should not be used for comparison purposes. Default
- disabled
-
-Input Options
-=============
+These options all describe the input video sequence or, in the case of
+:option:`--dither`, operations that are performed on the sequence prior
+to encoding. All options dealing with files (names, formats, offsets or
+frame counts) are only applicable to the CLI application.
.. option:: --input <filename>
**CLI ONLY**
-.. option:: --nr <integer>
-
- Noise reduction - an adaptive deadzone applied after DCT
- (subtracting from DCT coefficients), before quantization, on inter
- blocks. It does no pixel-level filtering, doesn't cross DCT block
- boundaries, has no overlap, doesn't affect intra blocks. The higher
- the strength value parameter, the more aggressively it will reduce
- noise.
-
- Enabling noise reduction will make outputs diverge between different
- numbers of frame threads. Outputs will be deterministic but the
- outputs of -F2 will no longer match the outputs of -F3, etc.
-
- **Values:** any value in range of 100 to 1000. Default disabled.
-
.. option:: --input-res <wxh>
YUV only: Source picture size [w x h]
.. option:: --interlaceMode <false|tff|bff>, --no-interlaceMode
- **EXPERIMENTAL** Specify interlace type of source pictures.
-
0. progressive pictures **(default)**
1. top field first
2. bottom field first
.. option:: --frames, -f <integer>
- Number of frames to be encoded. Default 0 (all)
+ Number of frames of the input sequence to be encoded. Default 0 (all)
**CLI ONLY**
-.. option:: --qpfile <filename>
-
- Specify a text file which contains frametypes and QPs for some or
- all frames. The format of each line is:
-
- framenumber frametype QP
-
- Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
- **b** is an unreferenced B frame. **I** is a keyframe (random
- access point) while **i** is a I frame that is not a keyframe
- (references are not broken).
-
- Specifying QP (integer) is optional, and if specified they are
- clamped within the encoder to qpmin/qpmax.
-
-.. option:: --scaling-list <filename>
-
- Quantization scaling lists. HEVC supports 6 quantization scaling
- lists to be defined; one each for Y, Cb, Cr for intra prediction and
- one each for inter prediction.
-
- x265 does not use scaling lists by default, but this can also be
- made explicit by :option:`--scaling-list` *off*.
-
- HEVC specifies a default set of scaling lists which may be enabled
- without requiring them to be signaled in the SPS. Those scaling
- lists can be enabled via :option:`--scaling-list` *default*.
-
- All other strings indicate a filename containing custom scaling
- lists in the HM format. The encode will abort if the file is not
- parsed correctly. Custom lists must be signaled in the SPS
+.. option:: --output, -o <filename>
-.. option:: --lambda-file <filename>
+ Bitstream output file name. If there are two extra CLI options, the
+ first is implicitly the input filename and the second is the output
+ filename, making the :option:`--output` option optional.
- Specify a text file containing values for x265_lambda_tab and
- x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
- values.
-
- The text file syntax is simple. Comma is considered to be
- white-space. All white-space is ignored. Lines must be less than 2k
- bytes in length. Content following hash (#) characters are ignored.
- The values read from the file are logged at :option:`--log-level`
- debug.
+ The output file will always contain a raw HEVC bitstream, the CLI
+ does not support any container file formats.
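+
+ For example, these two command lines are equivalent::
+
+   x265 input.y4m output.hevc
+   x265 --input input.y4m --output output.hevc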
- Note that the lambda tables are process-global and so the new values
- affect all encoders running in the same process.
-
- Lambda values affect encoder mode decisions, the lower the lambda
- the more bits it will try to spend on signaling information (motion
- vectors and splits) and less on residual. This feature is intended
- for experimentation.
+ **CLI ONLY**
Profile, Level, Tier
====================
parameters to meet those requirements but it will never raise
them.
-Quad-Tree analysis
-==================
+Mode decision / Analysis
+========================
-.. option:: --wpp, --no-wpp
+.. option:: --rd <0..6>
- Enable Wavefront Parallel Processing. The encoder may begin encoding
- a row as soon as the row above it is at least two CTUs ahead in the
- encode process. This gives a 3-5x gain in parallelism for about 1%
- overhead in compression efficiency. Default: Enabled
+ Level of RDO in mode decision. The higher the value, the more
+ exhaustive the analysis and the more rate distortion optimization is
+ used. The lower the value, the faster the encode; the higher the
+ value, the smaller the bitstream (in general). Default 3
+
+ Note that this table aims for accuracy, but is not necessarily our
+ final target behavior for each mode.
+
+ +-------+---------------------------------------------------------------+
+ | Level | Description |
+ +=======+===============================================================+
+ | 0 | sa8d mode and split decisions, intra w/ source pixels |
+ +-------+---------------------------------------------------------------+
+ | 1 | recon generated (better intra), RDO merge/skip selection |
+ +-------+---------------------------------------------------------------+
+ | 2 | RDO splits and merge/skip selection |
+ +-------+---------------------------------------------------------------+
+ | 3 | RDO mode and split decisions, chroma residual used for sa8d |
+ +-------+---------------------------------------------------------------+
+ | 4 | Adds RDO Quant |
+ +-------+---------------------------------------------------------------+
+ | 5 | Adds RDO prediction decisions |
+ +-------+---------------------------------------------------------------+
+ | 6 | Currently same as 5 |
+ +-------+---------------------------------------------------------------+
+
+ **Range of values:** 0: least .. 6: full RDO analysis
+
+Options which affect the coding unit quad-tree, sometimes referred to as
+the prediction quad-tree.
.. option:: --ctu, -s <64|32|16>
and less frame parallelism as well. Because of this the faster
presets use a CU size of 32. Default: 64
+.. option:: --rect, --no-rect
+
+ Enable analysis of rectangular motion partitions Nx2N and 2NxN
+ (50/50 splits, two directions). Default disabled
+
+.. option:: --amp, --no-amp
+
+ Enable analysis of asymmetric motion partitions (75/25 splits, four
+ directions). At RD levels 0 through 4, AMP partitions are only
+ considered at CU sizes 32x32 and below. At RD levels 5 and 6, it
+ will only consider AMP partitions as merge candidates (no motion
+ search) at 64x64, and as merge or inter candidates below 64x64.
+
+ The AMP partitions which are searched are derived from the current
+ best inter partition. If Nx2N (vertical rectangular) is the best
+ current prediction, then left and right asymmetrical splits will be
+ evaluated. If 2NxN (horizontal rectangular) is the best current
+ prediction, then top and bottom asymmetrical splits will be
+ evaluated. If 2Nx2N is the best prediction, and the block is not a
+ merge/skip, then all four AMP partitions are evaluated.
+
+ This setting has no effect if rectangular partitions are disabled.
+ Default disabled
+
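+ A minimal sketch of the candidate derivation described above, using
+ hypothetical names rather than the actual x265 code::
+
+   enum PartSize { SIZE_2Nx2N, SIZE_2NxN, SIZE_Nx2N };
+
+   /* decide which asymmetric shapes to evaluate, based on the
+    * current best inter partition */
+   void deriveAmpCandidates(PartSize best, bool isMergeOrSkip,
+                            bool &tryTopBottom, bool &tryLeftRight)
+   {
+       tryTopBottom = tryLeftRight = false;
+       if (best == SIZE_2NxN)            /* horizontal rect is best */
+           tryTopBottom = true;          /* evaluate 2NxnU and 2NxnD */
+       else if (best == SIZE_Nx2N)       /* vertical rect is best */
+           tryLeftRight = true;          /* evaluate nLx2N and nRx2N */
+       else if (best == SIZE_2Nx2N && !isMergeOrSkip)
+           tryTopBottom = tryLeftRight = true;  /* all four */
+   }
+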
+.. option:: --early-skip, --no-early-skip
+
+ Measure full CU size (2Nx2N) merge candidates first; if no residual
+ is found, the analysis is short-circuited. Default disabled
+
+.. option:: --fast-cbf, --no-fast-cbf
+
+ Short circuit analysis if a prediction is found that does not set
+ the coded block flag (aka: no residual was encoded). This prevents
+ the encoder from finding other predictions that also have no
+ residual but might require fewer signaling bits or have less distortion.
+ Only applicable for RD levels 5 and 6. Default disabled
+
+.. option:: --fast-intra, --no-fast-intra
+
+ Perform an initial scan of every fifth intra angular mode, then
+ check modes +/- 2 distance from the best mode, then +/- 1 distance
+ from the best mode, effectively performing a gradient descent. When
+ enabled, 10 modes in total are checked. When disabled, all 33 angular
+ modes are checked. Only applicable for :option:`--rd` levels 4 and
+ below (medium preset and faster).
+
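+ A sketch of that coarse-to-fine search, with cost() standing in for
+ the encoder's distortion measure (hypothetical helper, not the
+ actual x265 code)::
+
+   #include <climits>
+
+   extern int cost(int angularMode); /* placeholder satd/sa8d cost */
+
+   int fastIntraSearch() /* HEVC angular modes are 2..34 */
+   {
+       int bestMode = 2, bestCost = INT_MAX;
+       for (int mode = 2; mode <= 34; mode += 5) /* initial scan */
+       {
+           int c = cost(mode);
+           if (c < bestCost) { bestCost = c; bestMode = mode; }
+       }
+       for (int step = 2; step >= 1; step--) /* +/-2, then +/-1 */
+       {
+           int center = bestMode;
+           for (int mode = center - step; mode <= center + step;
+                mode += 2 * step)
+           {
+               if (mode < 2 || mode > 34)
+                   continue;
+               int c = cost(mode);
+               if (c < bestCost) { bestCost = c; bestMode = mode; }
+           }
+       }
+       return bestMode;
+   }
+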
+.. option:: --b-intra, --no-b-intra
+
+ Enables the evaluation of intra modes in B slices. Default disabled.
+
+.. option:: --cu-lossless, --no-cu-lossless
+
+ For each CU, evaluate lossless (transform and quant bypass) encode
+ of the best non-lossless mode option as a potential rate distortion
+ optimization. If the global option :option:`--lossless` has been
+ specified, all CUs will be encoded as lossless unconditionally
+ regardless of whether this option was enabled. Default disabled.
+
+ Only effective at RD levels 3 and above, which perform RDO mode
+ decisions.
+
+.. option:: --tskip, --no-tskip
+
+ Enable evaluation of transform skip (bypass DCT but still use
+ quantization) coding for 4x4 TU coded blocks.
+
+ Only effective at RD levels 3 and above, which perform RDO mode
+ decisions. Default disabled
+
+.. option:: --tskip-fast, --no-tskip-fast
+
+ Only evaluate transform skip for NxN intra predictions (4x4 blocks).
+ Only applicable if transform skip is enabled. For chroma, only
+ evaluate if luma used tskip. Inter block tskip analysis is
+ unmodified. Default disabled
+
+Analysis re-use options, to improve performance when encoding the same
+sequence multiple times (presumably at varying bitrates). The encoder
+will not reuse analysis if the resolution and slice type parameters do
+not match.
+
+.. option:: --analysis-mode <string|int>
+
+ Specify whether analysis information of each frame is output by the
+ encoder or read back in for reuse. By reading the analysis data
+ written by an earlier encode of the same sequence, substantial
+ redundant work may be avoided.
+
+ The following data may be stored and reused:
+
+ * I frames - split decisions and luma intra directions of all CUs.
+ * P/B frames - motion vectors are dumped at each depth for all CUs.
+
+ **Values:** off(0), save(1): dump analysis data, load(2): read analysis data
+
+.. option:: --analysis-file <filename>
+
+ Specify a filename for analysis data (see :option:`--analysis-mode`).
+ If no filename is specified, x265_analysis.dat is used.
+
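+A typical use is a save pass followed by one or more load passes at
+other rates (file name and bitrates here are only illustrative)::
+
+  x265 input.y4m --analysis-mode save --analysis-file a.dat --bitrate 5000 -o high.hevc
+  x265 input.y4m --analysis-mode load --analysis-file a.dat --bitrate 2500 -o low.hevc
+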
+Options which affect the transform unit quad-tree, sometimes referred to
+as the residual quad-tree (RQT).
+
.. option:: --tu-intra-depth <1..4>
The transform unit (residual) quad-tree begins with the same depth
| 5 | 1 | 8 | 1 | 8 | true |
+----+------------+-----------+------------+-----------+-----------+
| 6 | 2 | 8 | 1 | 8 | true |
- +----+------------+-----------+------------+-----------+-----------+
- | 7 | 2 | 8 | 2 | 8 | true |
- +----+------------+-----------+------------+-----------+-----------+
-
-.. option:: --merange <integer>
-
- Motion search range. Default 57
-
- The default is derived from the default CTU size (64) minus the luma
- interpolation half-length (4) minus maximum subpel distance (2)
- minus one extra pixel just in case the hex search method is used. If
- the search range were any larger than this, another CTU row of
- latency would be required for reference frames.
-
- **Range of values:** an integer from 0 to 32768
-
-.. option:: --max-merge <1..5>
-
- Maximum number of neighbor (spatial and temporal) candidate blocks
- that the encoder may consider for merging motion predictions. If a
- merge candidate results in no residual, it is immediately selected
- as a "skip". Otherwise the merge candidates are tested as part of
- motion estimation when searching for the least cost inter option.
- The max candidate number is encoded in the SPS and determines the
- bit cost of signaling merge CUs. Default 2
-
-.. option:: --temporal-mvp, --no-temporal-mvp
-
- Enable temporal motion vector predictors in P and B slices.
- This enables the use of the motion vector from the collocated block
- in the previous frame to be used as a predictor. Default is enabled
-
-Spatial/intra options
-=====================
-
-.. option:: --rdpenalty <0..2>
-
- When set to 1, transform units of size 32x32 are given a 4x bit cost
- penalty compared to smaller transform units, in intra coded CUs in P
- or B slices.
-
- When set to 2, transform units of size 32x32 are not even attempted,
- unless otherwise required by the maximum recursion depth. For this
- option to be effective with 32x32 intra CUs,
- :option:`--tu-intra-depth` must be at least 2. For it to be
- effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
- at least 3.
-
- Note that in HEVC an intra transform unit (a block of the residual
- quad-tree) is also a prediction unit, meaning that the intra
- prediction signal is generated for each TU block, the residual
- subtracted and then coded. The coding unit simply provides the
- prediction modes that will be used when predicting all of the
- transform units within the CU. This means that when you prevent
- 32x32 intra transform units, you are preventing 32x32 intra
- predictions.
-
- Default 0, disabled.
-
- **Values:** 0:disabled 1:4x cost penalty 2:force splits
-
-.. option:: --b-intra, --no-b-intra
-
- Enables the evaluation of intra modes in B slices. Default disabled.
-
-.. option:: --tskip, --no-tskip
-
- Enable evaluation of transform skip (bypass DCT but still use
- quantization) coding for 4x4 TU coded blocks.
-
- Only effective at RD levels 3 and above, which perform RDO mode
- decisions. Default disabled
-
-.. option:: --tskip-fast, --no-tskip-fast
-
- Only evaluate transform skip for NxN intra predictions (4x4 blocks).
- Only applicable if transform skip is enabled. For chroma, only
- evaluate if luma used tskip. Inter block tskip analysis is
- unmodified. Default disabled
-
-.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing
-
- Enable strong intra smoothing for 32x32 intra blocks. Default enabled
-
-.. option:: --constrained-intra, --no-constrained-intra
-
- Constrained intra prediction. When generating intra predictions for
- blocks in inter slices, only intra-coded reference pixels are used.
- Inter-coded reference pixels are replaced with intra-coded neighbor
- pixels or default values. The general idea is to block the
- propagation of reference errors that may have resulted from lossy
- signals. Default disabled
-
-Mode decision / Analysis
-========================
-
-.. option:: --rect, --no-rect
-
- Enable analysis of rectangular motion partitions Nx2N and 2NxN
- (50/50 splits, two directions). Default disabled
-
-.. option:: --amp, --no-amp
-
- Enable analysis of asymmetric motion partitions (75/25 splits, four
- directions). At RD levels 0 through 4, AMP partitions are only
- considered at CU sizes 32x32 and below. At RD levels 5 and 6, it
- will only consider AMP partitions as merge candidates (no motion
- search) at 64x64, and as merge or inter candidates below 64x64.
-
- The AMP partitions which are searched are derived from the current
- best inter partition. If Nx2N (vertical rectangular) is the best
- current prediction, then left and right asymmetrical splits will be
- evaluated. If 2NxN (horizontal rectangular) is the best current
- prediction, then top and bottom asymmetrical splits will be
- evaluated, If 2Nx2N is the best prediction, and the block is not a
- merge/skip, then all four AMP partitions are evaluated.
+ +----+------------+-----------+------------+-----------+-----------+
+ | 7 | 2 | 8 | 2 | 8 | true |
+ +----+------------+-----------+------------+-----------+-----------+
- This setting has no effect if rectangular partitions are disabled.
- Default disabled
+ At --subme values larger than 2, chroma residual cost is included
+ in all subpel refinement steps and chroma residual is included in
+ all motion estimation decisions (selecting the best reference
+ picture in each list, and choosing between merge, uni-directional
+ motion and bi-directional motion). The 'slow' preset is the first
+ preset to enable the use of chroma residual.
-.. option:: --early-skip, --no-early-skip
+.. option:: --merange <integer>
- Measure full CU size (2Nx2N) merge candidates first; if no residual
- is found the analysis is short circuited. Default disabled
+ Motion search range. Default 57
-.. option:: --fast-cbf, --no-fast-cbf
+ The default is derived from the default CTU size (64) minus the luma
+ interpolation half-length (4) minus maximum subpel distance (2)
+ minus one extra pixel just in case the hex search method is used. If
+ the search range were any larger than this, another CTU row of
+ latency would be required for reference frames.
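+ That is, 64 - 4 - 2 - 1 = 57.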
- Short circuit analysis if a prediction is found that does not set
- the coded block flag (aka: no residual was encoded). It prevents
- the encoder from perhaps finding other predictions that also have no
- residual but require less signaling bits or have less distortion.
- Only applicable for RD levels 5 and 6. Default disabled
+ **Range of values:** an integer from 0 to 32768
-.. option:: --fast-intra, --no-fast-intra
+.. option:: --max-merge <1..5>
- Perform an initial scan of every fifth intra angular mode, then
- check modes +/- 2 distance from the best mode, then +/- 1 distance
- from the best mode, effectively performing a gradient descent. When
- enabled 10 modes in total are checked. When disabled all 33 angular
- modes are checked. Only applicable for :option:`--rd` levels 3 and
- below (medium preset and faster).
+ Maximum number of neighbor (spatial and temporal) candidate blocks
+ that the encoder may consider for merging motion predictions. If a
+ merge candidate results in no residual, it is immediately selected
+ as a "skip". Otherwise the merge candidates are tested as part of
+ motion estimation when searching for the least cost inter option.
+ The max candidate number is encoded in the SPS and determines the
+ bit cost of signaling merge CUs. Default 2
+
+.. option:: --temporal-mvp, --no-temporal-mvp
+
+ Enable temporal motion vector predictors in P and B slices.
+ This allows the motion vector from the collocated block in the
+ previous frame to be used as a predictor. Default is enabled
.. option:: --weightp, -w, --no-weightp
Enable weighted prediction in B slices. Default disabled
-.. option:: --rd <0..6>
+Spatial/intra options
+=====================
- Level of RDO in mode decision. The higher the value, the more
- exhaustive the analysis and the more rate distortion optimization is
- used. The lower the value the faster the encode, the higher the
- value the smaller the bitstream (in general). Default 3
+.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing
- Note that this table aims for accuracy, but is not necessarily our
- final target behavior for each mode.
+ Enable strong intra smoothing for 32x32 intra blocks. Default enabled
- +-------+---------------------------------------------------------------+
- | Level | Description |
- +=======+===============================================================+
- | 0 | sa8d mode and split decisions, intra w/ source pixels |
- +-------+---------------------------------------------------------------+
- | 1 | recon generated (better intra), RDO merge/skip selection |
- +-------+---------------------------------------------------------------+
- | 2 | RDO splits and merge/skip selection |
- +-------+---------------------------------------------------------------+
- | 3 | RDO mode and split decisions |
- +-------+---------------------------------------------------------------+
- | 4 | Adds RDO Quant |
- +-------+---------------------------------------------------------------+
- | 5 | Adds RDO prediction decisions |
- +-------+---------------------------------------------------------------+
- | 6 | Currently same as 5 |
- +-------+---------------------------------------------------------------+
+.. option:: --constrained-intra, --no-constrained-intra
- **Range of values:** 0: least .. 6: full RDO analysis
+ Constrained intra prediction. When generating intra predictions for
+ blocks in inter slices, only intra-coded reference pixels are used.
+ Inter-coded reference pixels are replaced with intra-coded neighbor
+ pixels or default values. The general idea is to block the
+ propagation of reference errors that may have resulted from lossy
+ signals. Default disabled
-.. option:: --cu-lossless, --no-cu-lossless
+.. option:: --rdpenalty <0..2>
- For each CU, evaluate lossless (transform and quant bypass) encode
- of the best non-lossless mode option as a potential rate distortion
- optimization. If the global option :option:`--lossless` has been
- specified, all CUs will be encoded as lossless unconditionally
- regardless of whether this option was enabled. Default disabled.
+ When set to 1, transform units of size 32x32 are given a 4x bit cost
+ penalty compared to smaller transform units, in intra coded CUs in P
+ or B slices.
- Only effective at RD levels 3 and above, which perform RDO mode
- decisions.
+ When set to 2, transform units of size 32x32 are not even attempted,
+ unless otherwise required by the maximum recursion depth. For this
+ option to be effective with 32x32 intra CUs,
+ :option:`--tu-intra-depth` must be at least 2. For it to be
+ effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
+ at least 3.
-.. option:: --signhide, --no-signhide
+ Note that in HEVC an intra transform unit (a block of the residual
+ quad-tree) is also a prediction unit, meaning that the intra
+ prediction signal is generated for each TU block, the residual
+ subtracted and then coded. The coding unit simply provides the
+ prediction modes that will be used when predicting all of the
+ transform units within the CU. This means that when you prevent
+ 32x32 intra transform units, you are preventing 32x32 intra
+ predictions.
+
+ Default 0, disabled.
+
+ **Values:** 0:disabled 1:4x cost penalty 2:force splits
- Hide sign bit of one coeff per TU (rdo). The last sign is implied.
- This requires analyzing all the coefficients to determine if a sign
- must be toggled, and then to determine which one can be toggled with
- the least amount of distortion. Default enabled
-
Psycho-visual options
=====================
force rate control to increase global QP. Finding the optimal
psycho-visual parameters for a given video requires experimentation. Our
recommended defaults (1.0 for both) are generally on the low end of the
-spectrum. And generally the lower the bitrate, the lower the optimal
-psycho-visual settings.
+spectrum.
+
+The lower the bitrate, the lower the optimal psycho-visual settings. If
+the bitrate is too low for the psycho-visual settings, you will begin to
+see temporal artifacts (motion judder). This happens when the encoder
+is forced to code skip blocks (no residual) in areas of difficult motion
+because they are the best option psycho-visually (they retain high
+energy at no residual cost). When judder occurs, lower the psy-rd
+settings to allow the encoder to use some blur in these areas of high
+motion.
.. option:: --psy-rd <float>
.. option:: --crf-min <0..51.0>
Specify a lower limit to the rate factor which may be assigned to
- any given frame (ensuring a min QP). This is dangerous when CRF is
- used in combination with VBV as it may result in buffer underruns.
- Default disabled
+ any given frame (ensuring a minimum compression factor).
.. option:: --vbv-bufsize <integer>
between 0 and 1, or in kbits. In other words, these two option pairs
are equivalent::
- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900
- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9
+ --vbv-bufsize 1000 --vbv-init 900
+ --vbv-bufsize 1000 --vbv-init 0.9
Default 0.9
**Range of values:** an integer from 0 to 51
-.. option:: --ipratio <float>
-
- QP ratio factor between I and P slices. This ratio is used in all of
- the rate control modes. Some :option:`--tune` options may change the
- default value. It is not typically manually specified. Default 1.4
-
-.. option:: --pbratio <float>
-
- QP ratio factor between P and B slices. This ratio is used in all of
- the rate control modes. Some :option:`--tune` options may change the
- default value. It is not typically manually specified. Default 1.3
-
.. option:: --lossless, --no-lossless
Enables true lossless coding by bypassing scaling, transform,
and not enough in flat areas.
0. disabled
- 1. AQ enabled
- 2. AQ enabled with auto-variance **(default)**
+ 1. AQ enabled **(default)**
+ 2. AQ enabled with auto-variance
.. option:: --aq-strength <float>
less bits. This tends to improve detail in the backgrounds of video
with less detail in areas of high motion. Default enabled
-.. option:: --cbqpoffs <integer>
-
- Offset of Cb chroma QP from the luma QP selected by rate control.
- This is a general way to spend more or less bits on the chroma
- channel. Default 0
-
- **Range of values:** -12 to 12
+.. option:: --nr-intra <integer>, --nr-inter <integer>
-.. option:: --crqpoffs <integer>
+ Noise reduction - an adaptive deadzone applied after DCT
+ (subtracting from DCT coefficients), before quantization. It does
+ no pixel-level filtering, doesn't cross DCT block boundaries, has no
+ overlap. The higher the strength value parameter, the more
+ aggressively it will reduce noise.
- Offset of Cr chroma QP from the luma QP selected by rate control.
- This is a general way to spend more or less bits on the chroma
- channel. Default 0
+ Enabling noise reduction will make outputs diverge between different
+ numbers of frame threads. Outputs will be deterministic but the
+ outputs of -F2 will no longer match the outputs of -F3, etc.
- **Range of values:** -12 to 12
+ **Values:** any value in range of 0 to 2000. Default 0 (disabled).
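+
+ A minimal sketch of the deadzone idea (illustrative, not the exact
+ x265 quantization kernel)::
+
+   #include <cstdint>
+   #include <cstdlib>
+
+   /* subtract a per-frequency offset from each coefficient magnitude,
+    * clamping at zero, before quantization */
+   void denoiseDct(int16_t *coef, const uint16_t *offset, int numCoeff)
+   {
+       for (int i = 0; i < numCoeff; i++)
+       {
+           int sign = coef[i] < 0 ? -1 : 1;
+           int level = abs(coef[i]) - offset[i];
+           coef[i] = (int16_t)(sign * (level < 0 ? 0 : level));
+       }
+   }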
.. option:: --pass <integer>
- Enable multipass rate control mode. Input is encoded multiple times,
+ Enable multi-pass rate control mode. Input is encoded multiple times,
storing the encoded information of each pass in a stats file from which
each subsequent pass tunes the QP of each frame to improve the quality
of the output. Default disabled
**Range of values:** 1 to 3
+.. option:: --stats <filename>
+
+ Specify the file name of the multi-pass stats file. If unspecified,
+ the encoder will use x265_2pass.log.
+
.. option:: --slow-firstpass, --no-slow-firstpass
- Enable a slow and more detailed first pass encode in Multipass rate
+ Enable a slow and more detailed first pass encode in multi-pass rate
control mode. Speed of the first pass encode is slightly lower and
quality is mildly improved when compared to the default settings in a
- multipass encode. Default disabled (turbo mode enabled)
+ multi-pass encode. Default disabled (turbo mode enabled)
When **turbo** first pass is not disabled, these options are
set on the first pass to improve performance:
* :option:`--subme` = MIN(2, :option:`--subme`)
* :option:`--rd` = MIN(2, :option:`--rd`)
-.. option:: --analysis-mode <string|int>
+.. option:: --cbqpoffs <integer>
- Specify whether analysis information of each frame is output by encoder
- or input for reuse. By reading the analysis data writen by an
- earlier encode of the same sequence, substantial redundant work may
- be avoided.
+ Offset of Cb chroma QP from the luma QP selected by rate control.
+ This is a general way to spend more or less bits on the chroma
+ channel. Default 0
- The following data may be stored and reused:
- I frames - split decisions and luma intra directions of all CUs.
- P/B frames - motion vectors are dumped at each depth for all CUs.
+ **Range of values:** -12 to 12
- **Values:** off(0), save(1): dump analysis data, load(2): read analysis data
+.. option:: --crqpoffs <integer>
-.. option:: --analysis-file <filename>
+ Offset of Cr chroma QP from the luma QP selected by rate control.
+ This is a general way to spend more or less bits on the chroma
+ channel. Default 0
- Specify a filename for analysis data (see :option:`--analysis-mode`)
- If no filename is specified, x265_analysis.dat is used.
+ **Range of values:** -12 to 12
+
+.. option:: --ipratio <float>
+
+ QP ratio factor between I and P slices. This ratio is used in all of
+ the rate control modes. Some :option:`--tune` options may change the
+ default value. It is not typically manually specified. Default 1.4
+
+.. option:: --pbratio <float>
+
+ QP ratio factor between P and B slices. This ratio is used in all of
+ the rate control modes. Some :option:`--tune` options may change the
+ default value. It is not typically manually specified. Default 1.3
+
+.. option:: --qcomp <float>
+
+ qComp sets the quantizer curve compression factor. It weights the
+ frame quantizer based on the complexity of residual (measured by
+ lookahead). Default value is 0.6. Increasing it to 1 will
+ effectively generate CQP.
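+
+ Conceptually, the lookahead scales each frame's quantizer scale
+ roughly as::
+
+   qscale ~ complexity^(1 - qcomp)
+
+ so at qcomp 1 the exponent is zero and every frame gets the same
+ quantizer (CQP-like), while at qcomp 0 the quantizer tracks
+ complexity directly.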
+
+.. option:: --qstep <integer>
+
+ The maximum single adjustment in QP allowed to rate control. Default
+ 4
+
+.. option:: --ratetol <float>
+
+ The degree of rate fluctuation that x265 tolerates. Rate tolerance
+ is used along with overflow (the difference between actual and target
+ bitrate) to adjust qp. Default is 1.0
+
+.. option:: --qblur <float>
+
+ Temporally blur quants. Default 0.5
+
+.. option:: --cplxblur <float>
+
+ Temporally blur complexity. Default 20
+
+Quantization Options
+====================
+
+Note that rate-distortion optimized quantization (RDOQ) is enabled
+implicitly at :option:`--rd` 4, 5, and 6 and disabled implicitly at all
+other levels.
+
+.. option:: --signhide, --no-signhide
+
+ Hide sign bit of one coeff per TU (rdo). The last sign is implied.
+ This requires analyzing all the coefficients to determine if a sign
+ must be toggled, and then to determine which one can be toggled with
+ the least amount of distortion. Default enabled
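+
+ Conceptually the decoder recovers the implied sign from coefficient
+ parity; a minimal sketch of the inference rule (coefficient group
+ selection and eligibility thresholds omitted)::
+
+   #include <cstdint>
+   #include <cstdlib>
+
+   /* the hidden sign is implied by the parity of the sum of absolute
+    * levels in the coefficient group: even means positive */
+   int inferHiddenSign(const int16_t *coef, int numCoeff)
+   {
+       int sum = 0;
+       for (int i = 0; i < numCoeff; i++)
+           sum += abs(coef[i]);
+       return (sum & 1) ? -1 : +1;
+   }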
+
+.. option:: --qpfile <filename>
+
+ Specify a text file which contains frametypes and QPs for some or
+ all frames. The format of each line is:
+
+ framenumber frametype QP
+
+ Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
+ **b** is an unreferenced B frame. **I** is a keyframe (random
+ access point) while **i** is an I frame that is not a keyframe
+ (references are not broken).
+
+ Specifying the QP (integer) is optional, and if specified it is
+ clamped within the encoder to qpmin/qpmax.
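+
+ For example (QP column optional, values illustrative)::
+
+   0 I 22
+   1 P 23
+   2 b
+   3 B 24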
+
+.. option:: --scaling-list <filename>
+
+ Quantization scaling lists. HEVC allows 6 quantization scaling
+ lists to be defined: one each for Y, Cb, Cr for intra prediction and
+ one each for inter prediction.
+
+ x265 does not use scaling lists by default, but this can also be
+ made explicit by :option:`--scaling-list` *off*.
+
+ HEVC specifies a default set of scaling lists which may be enabled
+ without requiring them to be signaled in the SPS. Those scaling
+ lists can be enabled via :option:`--scaling-list` *default*.
+
+ All other strings indicate a filename containing custom scaling
+ lists in the HM format. The encode will abort if the file is not
+ parsed correctly. Custom lists must be signaled in the SPS.
+
+.. option:: --lambda-file <filename>
+
+ Specify a text file containing values for x265_lambda_tab and
+ x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
+ values.
+
+ The text file syntax is simple. Comma is considered to be
+ white-space. All white-space is ignored. Lines must be less than 2k
+ bytes in length. Content following hash (#) characters is ignored.
+ The values read from the file are logged at :option:`--log-level`
+ debug.
+
+ Note that the lambda tables are process-global and so the new values
+ affect all encoders running in the same process.
+
+ Lambda values affect encoder mode decisions; the lower the lambda,
+ the more bits it will try to spend on signaling information (motion
+ vectors and splits) and the less on residual. This feature is intended
+ for experimentation.
Loop filters
============
-.. option:: --lft, --no-lft
+.. option:: --deblock=<int>:<int>, --no-deblock
+
+ Toggle deblocking loop filter, optionally specifying deblocking
+ strength offsets.
+
+ <int>:<int> - parsed as tC offset and Beta offset
+ <int>,<int> - parsed as tC offset and Beta offset
+ <int> - both tC and Beta offsets assigned the same value
+
+ If unspecified, the offsets default to 0. The offsets must be in a
+ range of -6 (lowest strength) to 6 (highest strength).
+
+ To disable the deblocking filter entirely, use --no-deblock or
+ --deblock=false. Default enabled, with both offsets defaulting to 0
- Toggle deblocking loop filter, default enabled
+ If deblocking is disabled, or the offsets are non-zero, these
+ changes from the default configuration are signaled in the PPS.
.. option:: --sao, --no-sao
9. bt2020nc
10. bt2020c
-.. option:: --chromalocs <0..5>
+.. option:: --chromaloc <0..5>
Specify chroma sample location for 4:2:0 inputs. Consult the HEVC
specification for a description of these values. Default undefined
.. option:: --aud, --no-aud
Emit an access unit delimiter NAL at the start of each slice access
- unit. If option:`--repeat-headers` is not enabled (indicating the
+ unit. If :option:`--repeat-headers` is not enabled (indicating the
user will be writing headers manually at the start of the stream)
the very first AUD will be skipped since it cannot be placed at the
start of the access unit, where it belongs. Default disabled
# -- Options for HTML output ---------------------------------------------------
html_theme = "default"
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'libx265', 'Full x265 Documentation',
+ ['MulticoreWare Inc'], 3),
+ ('x265', 'x265', 'x265 CLI Documentation',
+ ['MulticoreWare Inc'], 1)
+]
licensing all applicable patent rights needed for use or distribution of
software applications created from the x265 source code. A good place
to start is with the `Motion Picture Experts Group - Licensing Authority
-- HEVC Licensing Program<http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
+- HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is
a trademark of MulticoreWare, and may only be used with explicit written
Preset Options
--------------
+.. _presets:
+
Presets
=======
-.. _preset-tune-ref:
-
x265 has a number of predefined :option:`--preset` options that make
trade-offs between encode speed (encoded frames per second) and
compression efficiency (quality per bit in the bitstream). The default
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
| rdLevel | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 6 | 6 | 6 |
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| lft | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+| deblock | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
| tu-intra | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 |
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
Placebo mode enables transform-skip prediction evaluation.
+.. _tunings:
+
Tuning
======
+--------------+-----------------------------------------------------+
| ssim | enables adaptive quant auto-mode, disables psy-rd |
+--------------+-----------------------------------------------------+
+| grain | improves retention of film grain. more below |
++--------------+-----------------------------------------------------+
| fastdecode | no loop filters, no weighted pred, no intra in B |
+--------------+-----------------------------------------------------+
| zerolatency | no lookahead, no B frames, no cutree |
+--------------+-----------------------------------------------------+
+| cbr | --pbratio 1.0 --ratetol 0.5 |
++--------------+-----------------------------------------------------+
+
+
+Film Grain Retention
+~~~~~~~~~~~~~~~~~~~~
+
+:option:`--tune` grain tries to improve the retention of film grain in
+the reconstructed output. It helps rate distortion optimizations select
+modes which preserve high frequency noise:
+
+ * :option:`--psy-rd` 0.5
+ * :option:`--psy-rdoq` 30
+
+.. Note::
+
+ --psy-rdoq is only effective when RDOQuant is enabled, which is at
+ RD levels 4, 5, and 6 (presets slow and below).
+
+It lowers the strength of adaptive quantization, so residual energy can
+be more evenly distributed across the (noisy) picture:
+
+ * :option:`--aq-mode` 1
+ * :option:`--aq-strength` 0.3
+
+And it similarly tunes rate control to prevent the slice QP from
+swinging too wildly from frame to frame:
+
+ * :option:`--ipratio` 1.1
+ * :option:`--pbratio` 1.1
+ * :option:`--qcomp` 0.8
+
+And lastly it reduces the strength of deblocking to prevent grain from
+being blurred on block boundaries:
+
+ * :option:`--deblock` -2
+
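+Taken together, tune grain is thus roughly equivalent to specifying::
+
+  --psy-rd 0.5 --psy-rdoq 30 --aq-mode 1 --aq-strength 0.3
+  --ipratio 1.1 --pbratio 1.1 --qcomp 0.8 --deblock -2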
--- /dev/null
+x265 CLI Documentation
+######################
+
+
+SYNOPSIS
+========
+
+**x265** [options] infile [-o] outfile
+
+Bit depth: 8
+
+
+**x265-10bit** [options] infile [-o] outfile
+
+Bit depth: 10
+
+
+infile can be YUV or Y4M
+
+outfile is raw HEVC bitstream
+
+
+DESCRIPTION
+===========
+
+.. toctree::
+ :maxdepth: 2
+
+ introduction
+
+
+OPTIONS
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ cli
+ presets
+ lossless
+
+
+SEE ALSO
+========
+
+**libx265**\(3)
+
+Online documentation: http://x265.readthedocs.org/en/default/cli.html
+
include(CheckCXXCompilerFlag)
# X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 35)
+set(X265_BUILD 40)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
endif()
endif(UNIX)
+if(X64 AND NOT WIN32)
+ option(ENABLE_PIC "Enable Position Independent Code" ON)
+else()
+ option(ENABLE_PIC "Enable Position Independent Code" OFF)
+endif(X64 AND NOT WIN32)
+
# Compiler detection
if(CMAKE_GENERATOR STREQUAL "Xcode")
set(XCODE 1)
if(GCC)
add_definitions(-Wall -Wextra -Wshadow)
add_definitions(-D__STDC_LIMIT_MACROS=1)
- if(X64 AND NOT WIN32)
- add_definitions(-fPIC)
- endif(X64 AND NOT WIN32)
+ if(ENABLE_PIC)
+ add_definitions(-fPIC)
+ endif(ENABLE_PIC)
if(X86 AND NOT X64)
add_definitions(-march=i686)
endif()
typedef void (FUNC_PPALibRelease)(ppa::Base* &);
}
+using namespace ppa;
+
static FUNC_PPALibRelease *_pfuncPpaRelease;
-ppa::Base *ppabase;
+ppa::Base *ppa::ppabase;
static void _ppaReleaseAtExit()
{
* For more information, contact us at license @ x265.com.
*****************************************************************************/
-#ifndef _PPA_H_
-#define _PPA_H_
-
-#if !defined(ENABLE_PPA)
-
-#define PPA_INIT()
-#define PPAStartCpuEventFunc(e)
-#define PPAStopCpuEventFunc(e)
-#define PPAScopeEvent(e)
-
-#else
+#ifndef PPA_H
+#define PPA_H
/* declare enum list of users CPU events */
#define PPA_REGISTER_CPU_EVENT(x) x,
#include "ppaCPUEvents.h"
PPACpuGroupNums
};
-
#undef PPA_REGISTER_CPU_EVENT
-#define PPA_INIT() initializePPA()
-#define PPAStartCpuEventFunc(e) if (ppabase) ppabase->triggerStartEvent(ppabase->getEventId(e))
-#define PPAStopCpuEventFunc(e) if (ppabase) ppabase->triggerEndEvent(ppabase->getEventId(e))
-#define PPAScopeEvent(e) _PPAScope __scope_(e)
-
#include "ppaApi.h"
void initializePPA();
-extern ppa::Base *ppabase;
-
-class _PPAScope
-{
-protected:
-
- ppa::EventID m_id;
-
-public:
-
- _PPAScope(int e) { if (ppabase) { m_id = ppabase->getEventId(e); ppabase->triggerStartEvent(m_id); } else m_id = 0; }
- ~_PPAScope() { if (ppabase) ppabase->triggerEndEvent(m_id); }
-};
-
-#endif // if !defined(ENABLE_PPA)
+#define PPA_INIT() initializePPA()
+#define PPAScopeEvent(e) ppa::ProfileScope ppaScope_(e)
-#endif /* _PPA_H_ */
+#endif /* PPA_H */
virtual void init(const char **pNames, int eventCount) = 0;
};
+
+extern ppa::Base *ppabase;
+
+struct ProfileScope
+{
+ ppa::EventID id;
+
+ ProfileScope(int e) { if (ppabase) { id = ppabase->getEventId(e); ppabase->triggerStartEvent(id); } else id = 0; }
+ ~ProfileScope() { if (ppabase) ppabase->triggerEndEvent(id); }
+};
+
}
#endif //_PPA_API_H_
-PPA_REGISTER_CPU_EVENT(encode_block)
-PPA_REGISTER_CPU_EVENT(bitstream_write)
-PPA_REGISTER_CPU_EVENT(DPB_prepareEncode)
-PPA_REGISTER_CPU_EVENT(FrameEncoder_compressFrame)
-PPA_REGISTER_CPU_EVENT(FrameEncoder_compressRows)
-PPA_REGISTER_CPU_EVENT(CompressCU)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth1)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth2)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth3)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth4)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth1)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth2)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth3)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth4)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth1)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth2)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth3)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth4)
-PPA_REGISTER_CPU_EVENT(CalcRDCostIntra)
-PPA_REGISTER_CPU_EVENT(Thread_ProcessRow)
-PPA_REGISTER_CPU_EVENT(Thread_compressCU)
-PPA_REGISTER_CPU_EVENT(Thread_encodeCU)
-PPA_REGISTER_CPU_EVENT(Thread_filterCU)
+PPA_REGISTER_CPU_EVENT(bitstreamWrite)
+PPA_REGISTER_CPU_EVENT(frameThread)
+PPA_REGISTER_CPU_EVENT(encodeCTU)
+PPA_REGISTER_CPU_EVENT(filterCTURow)
+PPA_REGISTER_CPU_EVENT(slicetypeDecideEV)
+PPA_REGISTER_CPU_EVENT(costEstimateRow)
set(CMAKE_ASM${ASM_DIALECT}_SOURCE_FILE_EXTENSIONS asm)
if(X64)
- list(APPEND ASM_FLAGS -DARCH_X86_64=1 -DPIC)
+ list(APPEND ASM_FLAGS -DARCH_X86_64=1)
+ if(ENABLE_PIC)
+ list(APPEND ASM_FLAGS -DPIC)
+ endif()
if(APPLE)
set(ARGS -f macho64 -m amd64 -DPREFIX)
elseif(UNIX AND NOT CYGWIN)
LIST (APPEND _VLD_POSSIBLE_LIB_SUFFIXES lib/Win64)
ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4)
+SET (PFILES "ProgramFiles")
+SET (PFILES_X86 "ProgramFiles(x86)") # hack to avoid escaping issues in cmake 3.1
+
FIND_PATH (VLD_ROOT_DIR
NAMES include/vld.h
PATHS ENV VLDROOT
- "$ENV{PROGRAMFILES}/Visual Leak Detector"
- "$ENV{PROGRAMFILES(X86)}/Visual Leak Detector"
+ "$ENV{PFILES}/Visual Leak Detector"
+ "$ENV{PFILES_X86}/Visual Leak Detector"
"[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]"
"[HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]"
DOC "VLD root directory")
find_package(Git QUIET) # present in 2.8.8
# defaults, in case everything below fails
-set(X265_VERSION "unknown")
-set(X265_LATEST_TAG "0.0")
-set(X265_TAG_DISTANCE "0")
+set(X265_VERSION "1.4+222+hg5f9f7194267b")
+set(X265_LATEST_TAG "1.4")
+set(X265_TAG_DISTANCE "222")
if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
# read the lines of the archive summary file to extract the version
set(hg_${key} ${value})
endforeach()
if(DEFINED hg_tag)
- set(X265_VERSION ${hg_tag} CACHE STRING "x265 version string.")
+ set(X265_VERSION ${hg_tag})
set(X265_LATEST_TAG ${hg_tag})
- set(X265_TAG_DISTANCE "0")
+ set(X265_TAG_DISTANCE "222")
elseif(DEFINED hg_node)
string(SUBSTRING "${hg_node}" 0 16 hg_id)
set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}")
# vim: syntax=cmake
-set(SSE3 vec/dct-sse3.cpp)
-set(SSSE3 vec/dct-ssse3.cpp)
-set(SSE41 vec/dct-sse41.cpp)
-if(MSVC AND X86)
- set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
- set(WARNDISABLE "/wd4100") # unreferenced formal parameter
- if(INTEL_CXX)
- add_definitions(/Qwd111) # statement is unreachable
- add_definitions(/Qwd128) # loop is unreachable
- add_definitions(/Qwd177) # declared function is unused
- add_definitions(/Qwd185) # dynamic initialization in unreachable code
- add_definitions(/Qwd280) # conditional expression is constant
- endif()
- if(X64)
- set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}")
- else()
- # x64 implies SSE4, so only add /arch:SSE2 if building for Win32
- set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
- endif()
-endif()
-if(GCC AND X86)
- if(CLANG)
- # llvm intrinsic headers cause shadow warnings
- set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
- else()
- set(WARNDISABLE "-Wno-unused-parameter")
- endif()
- if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3))
+if(ENABLE_ASSEMBLY)
+ set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+
+ set(SSE3 vec/dct-sse3.cpp)
+ set(SSSE3 vec/dct-ssse3.cpp)
+ set(SSE41 vec/dct-sse41.cpp)
+
+ if(MSVC AND X86)
set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
- set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3")
- set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3")
- set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1")
+ set(WARNDISABLE "/wd4100") # unreferenced formal parameter
+ if(INTEL_CXX)
+ add_definitions(/Qwd111) # statement is unreachable
+ add_definitions(/Qwd128) # loop is unreachable
+ add_definitions(/Qwd177) # declared function is unused
+ add_definitions(/Qwd185) # dynamic initialization in unreachable code
+ add_definitions(/Qwd280) # conditional expression is constant
+ endif()
+ if(X64)
+ set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}")
+ else()
+ # x64 implies SSE4, so only add /arch:SSE2 if building for Win32
+ set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
+ endif()
endif()
-endif()
-set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
-source_group(Intrinsics FILES ${VEC_PRIMITIVES})
+ if(GCC AND X86)
+ if(CLANG)
+ # llvm intrinsic headers cause shadow warnings
+ set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
+ else()
+ set(WARNDISABLE "-Wno-unused-parameter")
+ endif()
+ if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3))
+ set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
+ set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3")
+ set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3")
+ set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1")
+ endif()
+ endif()
+ set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
+ source_group(Intrinsics FILES ${VEC_PRIMITIVES})
-if(ENABLE_ASSEMBLY)
- set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
mc-a2.asm pixel-util8.asm blockcopy8.asm
#include "x265.h"
+#if ENABLE_PPA
+#include "PPA/ppa.h"
+#define ProfileScopeEvent(x) PPAScopeEvent(x)
+#define PROFILE_INIT() PPA_INIT()
+#else
+#define ProfileScopeEvent(x)
+#define PROFILE_INIT()
+#endif
+
#define FENC_STRIDE 64
#define NUM_INTRA_MODE 35
#define x265_stack_align(func, ...) func(__VA_ARGS__)
#endif
+#if defined(__MINGW32__)
+#define fseeko fseeko64
+#endif
+
#elif defined(_MSC_VER)
#define ALIGN_VAR_8(T, var) __declspec(align(8)) T var
#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
-#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
-#define MAX_NUM_TR_CATEGORIES 8 /* 32, 16, 8, 4 transform categories each for luma and chroma */
-
#define COEF_REMAIN_BIN_REDUCTION 3 // indicates the level at which the VLC
// transitions from Golomb-Rice to TU+EG(k)
#define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422)
#define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
+#define X265_MAX_PRED_MODE_PER_CTU (85 * 2 * 8)
namespace x265 {
enum { SAO_NUM_OFFSET = 4 };
-// NOTE: MUST be alignment to 16 or 32 bytes for asm code
-struct NoiseReduction
-{
- /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
- * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
- uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
- uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
- uint32_t count[MAX_NUM_TR_CATEGORIES];
-};
-
enum SaoMergeMode
{
SAO_MERGE_NONE,
}
};
+/* Stores inter (motion estimation) analysis data for a single frame */
+struct analysis_inter_data
+{
+ int ref;
+};
+
+/* Stores intra analysis data for a single frame. This struct needs better packing */
+struct analysis_intra_data
+{
+ uint8_t* depth;
+ uint8_t* modes;
+ char* partSizes;
+};
+
enum TextType
{
TEXT_LUMA = 0, // luma
namespace x265 {
-static int initialized /* = 0 */;
-
-// initialize ROM variables
-void initROM()
+#if HIGH_BIT_DEPTH
+// lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8));
+double x265_lambda_tab[QP_MAX_MAX + 1] =
{
- if (ATOMIC_CAS32(&initialized, 0, 1) == 1)
- return;
-}
+ 1.0000, 1.1225, 1.2599, 1.4142, 1.5874,
+ 1.7818, 2.0000, 2.2449, 2.5198, 2.8284,
+ 3.1748, 3.5636, 4.0000, 4.4898, 5.0397,
+ 5.6569, 6.3496, 7.1272, 8.0000, 8.9797,
+ 10.0794, 11.3137, 12.6992, 14.2544, 16.0000,
+ 17.9594, 20.1587, 22.6274, 25.3984, 28.5088,
+ 32.0000, 35.9188, 40.3175, 45.2548, 50.7968,
+ 57.0175, 64.0000, 71.8376, 80.6349, 90.5097,
+ 101.5937, 114.0350, 128.0000, 143.6751, 161.2699,
+ 181.0193, 203.1873, 228.0701, 256.0000, 287.3503,
+ 322.5398, 362.0387, 406.3747, 456.1401, 512.0000,
+ 574.7006, 645.0796, 724.0773, 812.7493, 912.2803,
+ 1024.0000, 1149.4011, 1290.1592, 1448.1547, 1625.4987,
+ 1824.5606, 2048.0000, 2298.8023, 2580.3183, 2896.3094,
+};
-void destroyROM()
+// lambda2 = pow(lambda, 2) * scale (0.85);
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
{
- if (ATOMIC_CAS32(&initialized, 1, 0) == 0)
- return;
-}
+ 0.8500, 1.0709, 1.3493, 1.7000, 2.1419,
+ 2.6986, 3.4000, 4.2837, 5.3972, 6.8000,
+ 8.5675, 10.7943, 13.6000, 17.1349, 21.5887,
+ 27.2000, 34.2699, 43.1773, 54.4000, 68.5397,
+ 86.3546, 108.8000, 137.0794, 172.7092, 217.6000,
+ 274.1588, 345.4185, 435.2000, 548.3176, 690.8369,
+ 870.4000, 1096.6353, 1381.6739, 1740.8000, 2193.2706,
+ 2763.3478, 3481.6000, 4386.5411, 5526.6955, 6963.2000,
+ 8773.0823, 11053.3910, 13926.4000, 17546.1645, 22106.7820,
+ 27852.8000, 35092.3290, 44213.5640, 55705.6000, 70184.6580,
+ 88427.1280, 111411.2000, 140369.3161, 176854.2561, 222822.4000,
+ 280738.6321, 353708.5122, 445644.8000, 561477.2643, 707417.0243,
+ 891289.6000, 1122954.5286, 1414834.0486, 1782579.2000, 2245909.0572,
+ 2829668.0973, 3565158.4000, 4491818.1144, 5659336.1946, 7130316.8000,
+};
+#else /* !HIGH_BIT_DEPTH */
// lambda = pow(2, (double)q / 6 - 2);
double x265_lambda_tab[QP_MAX_MAX + 1] =
176854.2222, 222822.4000, 280738.6627, 353708.5368, 445644.7459
};
+#endif
+
const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
16, 20, 25, 32, 40, 50,
namespace x265 {
// private namespace
-void initROM();
-void destroyROM();
-
void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx);
void initRasterToZscan(uint32_t maxFullDepth);
/* Each CU's data is layed out sequentially within the charMemBlock */
uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
- m_qp = (char*)charBuf; charBuf += m_numPartitions;
+ m_qp = (int8_t*)charBuf; charBuf += m_numPartitions;
m_log2CUSize = charBuf; charBuf += m_numPartitions;
- m_partSize = charBuf; charBuf += m_numPartitions;
- m_predMode = charBuf; charBuf += m_numPartitions;
m_lumaIntraDir = charBuf; charBuf += m_numPartitions;
m_tqBypass = charBuf; charBuf += m_numPartitions;
- m_refIdx[0] = (char*)charBuf; charBuf += m_numPartitions;
- m_refIdx[1] = (char*)charBuf; charBuf += m_numPartitions;
+ m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+ m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
m_cuDepth = charBuf; charBuf += m_numPartitions;
- m_skipFlag = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+ m_predMode = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+ m_partSize = charBuf; charBuf += m_numPartitions;
m_mergeFlag = charBuf; charBuf += m_numPartitions;
m_interDir = charBuf; charBuf += m_numPartitions;
m_mvpIdx[0] = charBuf; charBuf += m_numPartitions;
/* sequential memsets */
m_partSet((uint8_t*)m_qp, (uint8_t)qp);
m_partSet(m_log2CUSize, (uint8_t)g_maxLog2CUSize);
- m_partSet(m_partSize, (uint8_t)SIZE_NONE);
- m_partSet(m_predMode, (uint8_t)MODE_NONE);
m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
m_partSet(m_tqBypass, (uint8_t)frame.m_encData->m_param->bLossless);
if (m_slice->m_sliceType != I_SLICE)
X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
/* initialize the remaining CU data in one memset */
- memset(m_cuDepth, 0, (BytesPerPartition - 8) * m_numPartitions);
+ memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
/* sequential memsets */
m_partSet((uint8_t*)m_qp, (uint8_t)ctu.m_qp[0]);
m_partSet(m_log2CUSize, (uint8_t)cuGeom.log2CUSize);
- m_partSet(m_partSize, (uint8_t)SIZE_NONE);
- m_partSet(m_predMode, (uint8_t)MODE_NONE);
m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
m_partSet(m_tqBypass, (uint8_t)m_encData->m_param->bLossless);
m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
m_partSet(m_cuDepth, (uint8_t)cuGeom.depth);
/* initialize the remaining CU data in one memset */
- memset(m_skipFlag, 0, (BytesPerPartition - 9) * m_numPartitions);
+ memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
}
/* Copy the results of a sub-part (split) CU to the parent CU */
m_subPartCopy((uint8_t*)m_qp + offset, (uint8_t*)subCU.m_qp);
m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize);
- m_subPartCopy(m_partSize + offset, subCU.m_partSize);
- m_subPartCopy(m_predMode + offset, subCU.m_predMode);
m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir);
m_subPartCopy(m_tqBypass + offset, subCU.m_tqBypass);
m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]);
m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]);
m_subPartCopy(m_cuDepth + offset, subCU.m_cuDepth);
- m_subPartCopy(m_skipFlag + offset, subCU.m_skipFlag);
+ m_subPartCopy(m_predMode + offset, subCU.m_predMode);
+ m_subPartCopy(m_partSize + offset, subCU.m_partSize);
m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag);
m_subPartCopy(m_interDir + offset, subCU.m_interDir);
m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
m_partSet(m_tqBypass, true);
/* clear residual coding flags */
- m_partSet(m_skipFlag, 0);
+ m_partSet(m_predMode, cu.m_predMode[0] & (MODE_INTRA | MODE_INTER));
m_partSet(m_tuDepth, 0);
m_partSet(m_transformSkip[0], 0);
m_partSet(m_transformSkip[1], 0);
m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp);
m_partCopy(ctu.m_log2CUSize + m_absIdxInCTU, m_log2CUSize);
- m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
- m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
m_partCopy(ctu.m_lumaIntraDir + m_absIdxInCTU, m_lumaIntraDir);
m_partCopy(ctu.m_tqBypass + m_absIdxInCTU, m_tqBypass);
m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]);
m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]);
m_partCopy(ctu.m_cuDepth + m_absIdxInCTU, m_cuDepth);
- m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag);
+ m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
+ m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag);
m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir);
m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]);
/* copy out all prediction info for this part */
m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
m_partCopy(m_log2CUSize, ctu.m_log2CUSize + m_absIdxInCTU);
- m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU);
- m_partCopy(m_predMode, ctu.m_predMode + m_absIdxInCTU);
m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
m_partCopy(m_tqBypass, ctu.m_tqBypass + m_absIdxInCTU);
m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU);
m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU);
m_partCopy(m_cuDepth, ctu.m_cuDepth + m_absIdxInCTU);
+ m_partSet(m_predMode, ctu.m_predMode[m_absIdxInCTU] & (MODE_INTRA | MODE_INTER)); /* clear skip flag */
+ m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU);
m_partCopy(m_mergeFlag, ctu.m_mergeFlag + m_absIdxInCTU);
m_partCopy(m_interDir, ctu.m_interDir + m_absIdxInCTU);
m_partCopy(m_mvpIdx[0], ctu.m_mvpIdx[0] + m_absIdxInCTU);
memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
/* clear residual coding flags */
- m_partSet(m_skipFlag, 0);
m_partSet(m_tuDepth, 0);
m_partSet(m_transformSkip[0], 0);
m_partSet(m_transformSkip[1], 0);
m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]);
m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
- m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag);
+ m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth);
m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]);
m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
return m_cuLeft;
}
-const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary) const
+const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const
{
uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
if (isEqualRow(absPartIdx, absZorderCUIdx, s_numPartInCUSize))
return m_encData->getPicCTU(m_cuAddr);
else
- {
aPartUnitIdx -= m_absIdxInCTU;
- return this;
- }
+ return this;
}
- if (planarAtCTUBoundary)
- return NULL;
-
aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize];
return m_cuAbove;
}
}
/* Get reference QP from left QpMinCu or latest coded QP */
-char CUData::getRefQP(uint32_t curAbsIdxInCTU) const
+int8_t CUData::getRefQP(uint32_t curAbsIdxInCTU) const
{
uint32_t lPartIdx = 0, aPartIdx = 0;
const CUData* cULeft = getQpMinCuLeft(lPartIdx, m_absIdxInCTU + curAbsIdxInCTU);
return lastValidPartIdx;
}
-char CUData::getLastCodedQP(uint32_t absPartIdx) const
+int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const
{
uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask);
else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth)))
return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS);
else
- return (char)m_slice->m_sliceQp;
+ return (int8_t)m_slice->m_sliceQp;
}
}
leftIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX;
// Get intra direction of above PU
- tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx, true);
+ tempCU = g_zscanToPelY[m_absIdxInCTU + absPartIdx] > 0 ? getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx) : NULL;
aboveIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX;
void CUData::getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const
{
uint32_t log2CUSize = m_log2CUSize[absPartIdx];
- uint32_t splitFlag = m_partSize[absPartIdx] == SIZE_NxN;
+ uint32_t splitFlag = m_partSize[absPartIdx] != SIZE_2Nx2N;
tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
return ctx;
}
-bool CUData::setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth)
+bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth)
{
uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1);
uint32_t curPartNumQ = curPartNumb >> 2;
setAllPU(m_mv[list], mv, absPartIdx, puIdx);
}
-void CUData::setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx)
+void CUData::setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx)
{
setAllPU(m_refIdx[list], refIdx, absPartIdx, puIdx);
}
else
{
// OUT OF BOUNDARY
- outMvField.mv.word = 0;
+ outMvField.mv = 0;
outMvField.refIdx = REF_NOT_VALID;
}
}
for (uint32_t i = 0; i < maxNumMergeCand; ++i)
{
+ mvFieldNeighbours[i][0].mv = 0;
+ mvFieldNeighbours[i][1].mv = 0;
mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID;
}
bool isAvailableA1 = cuLeft &&
cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) &&
!(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == SIZE_nRx2N)) &&
- !cuLeft->isIntra(leftPartIdx);
+ cuLeft->isInter(leftPartIdx);
if (isAvailableA1)
{
// get Inter Dir
bool isAvailableB1 = cuAbove &&
cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) &&
!(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) &&
- !cuAbove->isIntra(abovePartIdx);
+ cuAbove->isInter(abovePartIdx);
if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx)))
{
// get Inter Dir
const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT);
bool isAvailableB0 = cuAboveRight &&
cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) &&
- !cuAboveRight->isIntra(aboveRightPartIdx);
+ cuAboveRight->isInter(aboveRightPartIdx);
if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx)))
{
// get Inter Dir
const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB);
bool isAvailableA0 = cuLeftBottom &&
cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) &&
- !cuLeftBottom->isIntra(leftBottomPartIdx);
+ cuLeftBottom->isInter(leftBottomPartIdx);
if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx)))
{
// get Inter Dir
const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr);
bool isAvailableB2 = cuAboveLeft &&
cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) &&
- !cuAboveLeft->isIntra(aboveLeftPartIdx);
+ cuAboveLeft->isInter(aboveLeftPartIdx);
if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAboveLeft, aboveLeftPartIdx))
&& (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx)))
{
while (count < maxNumMergeCand)
{
interDirNeighbours[count] = 1;
- mvFieldNeighbours[count][0].mv.word = 0;
+ mvFieldNeighbours[count][0].mv = 0;
mvFieldNeighbours[count][0].refIdx = r;
if (isInterB)
bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const
{
- uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
-
- int colRefPicList;
- int colPOC, colRefPOC, curPOC, curRefPOC;
- MV colmv;
-
- // use coldir.
- Frame *colPic = m_slice->m_refPicList[m_slice->isInterB() ? 1 - m_slice->m_colFromL0Flag : 0][m_slice->m_colRefIdx];
- CUData *colCU = colPic->m_encData->getPicCTU(cuAddr);
+ const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+ const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
- if (colCU->m_partSize[partUnitIdx] == SIZE_NONE)
+ if (colCU->m_predMode[partUnitIdx] == MODE_NONE)
return false;
- curPOC = m_slice->m_poc;
- colPOC = colCU->m_slice->m_poc;
+ uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
if (colCU->isIntra(absPartAddr))
return false;
- colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag;
+ int colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag;
int colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr];
}
// Scale the vector
- colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx];
- colmv = colCU->m_mv[colRefPicList][absPartAddr];
- curRefPOC = m_slice->m_refPOCList[picList][outRefIdx];
+ int colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx];
+ int colPOC = colCU->m_slice->m_poc;
+ MV colmv = colCU->m_mv[colRefPicList][absPartAddr];
+
+ int curRefPOC = m_slice->m_refPOCList[picList][outRefIdx];
+ int curPOC = m_slice->m_poc;
scaleMvByPOCDist(outMV, colmv, curPOC, curRefPOC, colPOC, colRefPOC);
return true;
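
scaleMvByPOCDist() stretches the co-located MV by the ratio of the two POC distances gathered above. A sketch in the spirit of that helper, following the standard HEVC scaling arithmetic (the encoder's version may differ in clipping details):

// Sketch of POC-distance MV scaling; td is nonzero for any valid reference.
#include <cstdint>
#include <cstdlib>

struct MV { int16_t x, y; };

static int clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }

static void scaleMvByPOCDistSketch(MV& out, const MV& colmv,
                                   int curPOC, int curRefPOC,
                                   int colPOC, int colRefPOC)
{
    int tb = clip3(-128, 127, curPOC - curRefPOC); // current-picture distance
    int td = clip3(-128, 127, colPOC - colRefPOC); // co-located distance
    if (tb == td)
    {
        out = colmv;                               // equal distance: copy as-is
        return;
    }
    int x     = (0x4000 + abs(td / 2)) / td;
    int scale = clip3(-4096, 4095, (tb * x + 32) >> 6);
    int mvx   = scale * colmv.x;
    int mvy   = scale * colmv.y;
    out.x = (int16_t)clip3(-32768, 32767, (mvx + 127 + (mvx < 0)) >> 8);
    out.y = (int16_t)clip3(-32768, 32767, (mvy + 127 + (mvy < 0)) >> 8);
}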
#define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag))
-void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const
+void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS])
{
// Initialize the coding blocks inside the CTB
for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--)
uint32_t depthIdx = g_depthScanIdx[sbY][sbX];
uint32_t cuIdx = rangeCUIdx + depthIdx;
uint32_t childIdx = rangeCUIdx + sbWidth * sbWidth + (depthIdx << 2);
- uint32_t px = m_cuPelX + sbX * blockSize;
- uint32_t py = m_cuPelY + sbY * blockSize;
- int32_t presentFlag = px < picWidth && py < picHeight;
- int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > picWidth || py + blockSize > picHeight);
+ uint32_t px = sbX * blockSize;
+ uint32_t py = sbY * blockSize;
+ int32_t presentFlag = px < ctuWidth && py < ctuHeight;
+ int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > ctuWidth || py + blockSize > ctuHeight);
/* Offset of the luma CU in the X, Y direction in terms of pixels from the CTU origin */
uint32_t xOffset = (sbX * blockSize) >> 3;
SIZE_2NxnD, // asymmetric motion partition, 2Nx(3N/2) + 2Nx( N/2)
SIZE_nLx2N, // asymmetric motion partition, ( N/2)x2N + (3N/2)x2N
SIZE_nRx2N, // asymmetric motion partition, (3N/2)x2N + ( N/2)x2N
- SIZE_NONE = 15
+ NUM_SIZES
};
enum PredMode
{
- MODE_INTER,
- MODE_INTRA,
- MODE_NONE = 15
+ MODE_NONE = 0,
+ MODE_INTER = (1 << 0),
+ MODE_INTRA = (1 << 1),
+ MODE_SKIP = (1 << 2) | MODE_INTER
};
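
Folding the skip flag into PredMode as a bit means one byte per partition answers all three mode queries, and masking with (MODE_INTRA | MODE_INTER) clears the skip bit without disturbing the mode, which is exactly what the copy paths above do. A self-contained sketch of the predicates, mirroring the accessors declared later in this patch:

// Sketch: skip folded into PredMode as a bit flag.
#include <cassert>
#include <cstdint>

enum PredModeSketch
{
    MODE_NONE  = 0,
    MODE_INTER = 1 << 0,
    MODE_INTRA = 1 << 1,
    MODE_SKIP  = (1 << 2) | MODE_INTER  // skip implies inter
};

static bool isIntra(uint8_t m)   { return m == MODE_INTRA; }
static bool isInter(uint8_t m)   { return (m & MODE_INTER) != 0; }
static bool isSkipped(uint8_t m) { return m == MODE_SKIP; }

int main()
{
    assert(isInter(MODE_SKIP));   // a skipped CU is still inter
    assert(!isIntra(MODE_SKIP));
    uint8_t cleared = MODE_SKIP & (MODE_INTRA | MODE_INTER); // clears skip bit
    assert(cleared == MODE_INTER && !isSkipped(cleared));
    return 0;
}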
// motion vector predictor direction used in AMVP
int m_vChromaShift;
/* Per-part data, stored contiguously */
- char* m_qp; // array of QP values
+ int8_t* m_qp; // array of QP values
uint8_t* m_log2CUSize; // array of cu log2Size TODO: seems redundant to depth
- uint8_t* m_partSize; // array of partition sizes
- uint8_t* m_predMode; // array of prediction modes
uint8_t* m_lumaIntraDir; // array of intra directions (luma)
uint8_t* m_tqBypass; // array of CU lossless flags
- char* m_refIdx[2]; // array of motion reference indices per list
+ int8_t* m_refIdx[2]; // array of motion reference indices per list
uint8_t* m_cuDepth; // array of depths
- uint8_t* m_skipFlag; // array of skip flags
+ uint8_t* m_predMode; // array of prediction modes
+ uint8_t* m_partSize; // array of partition sizes
uint8_t* m_mergeFlag; // array of merge flags
uint8_t* m_interDir; // array of inter directions
uint8_t* m_mvpIdx[2]; // array of motion vector predictor candidates or merge candidate indices [0]
uint8_t* m_transformSkip[3]; // array of transform skipping flags per plane
uint8_t* m_cbf[3]; // array of coded block flags (CBF) per plane
uint8_t* m_chromaIntraDir; // array of intra directions (chroma)
- enum { BytesPerPartition = 22 }; // combined sizeof() of all per-part data
+ enum { BytesPerPartition = 21 }; // combined sizeof() of all per-part data
coeff_t* m_trCoeff[3]; // transformed coefficient buffer per plane
CUData();
void initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
- void calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const;
+ static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
void initCTU(const Frame& frame, uint32_t cuAddr, int qp);
void initSubCU(const CUData& ctu, const CUGeom& cuGeom);
void updatePic(uint32_t depth) const;
void setPartSizeSubParts(PartSize size) { m_partSet(m_partSize, (uint8_t)size); }
- void setSkipFlagSubParts(uint8_t skipFlag) { m_partSet(m_skipFlag, skipFlag); }
void setPredModeSubParts(PredMode mode) { m_partSet(m_predMode, (uint8_t)mode); }
void clearCbf() { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
/* these functions all take depth as an absolute depth from the CTU; it is used to calculate the number of parts to copy */
- void setQPSubParts(char qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
+ void setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
void setTUDepthSubParts(uint8_t tuDepth, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_tuDepth + absPartIdx, tuDepth); }
void setLumaIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_lumaIntraDir + absPartIdx, dir); }
void setChromIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_chromaIntraDir + absPartIdx, dir); }
void setTransformSkipSubParts(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_transformSkip[ttype] + absPartIdx, tskip); }
void setTransformSkipPartRange(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t coveredPartIdxes) { memset(m_transformSkip[ttype] + absPartIdx, tskip, coveredPartIdxes); }
- bool setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth);
+ bool setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth);
void setPUInterDir(uint8_t dir, uint32_t absPartIdx, uint32_t puIdx);
void setPUMv(int list, const MV& mv, int absPartIdx, int puIdx);
- void setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx);
+ void setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
- uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t trDepth) const { return (m_cbf[ttype][absPartIdx] >> trDepth) & 0x1; }
+ uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
uint8_t getQtRootCbf(uint32_t absPartIdx) const { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
- char getRefQP(uint32_t currAbsIdxInCTU) const;
+ int8_t getRefQP(uint32_t currAbsIdxInCTU) const;
uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const;
void clipMv(MV& outMV) const;
int fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const;
uint32_t getNumPartInter() const { return nbPartsTable[(int)m_partSize[0]]; }
bool isIntra(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_INTRA; }
- bool isSkipped(uint32_t absPartIdx) const { return !!m_skipFlag[absPartIdx]; }
+ bool isInter(uint32_t absPartIdx) const { return !!(m_predMode[absPartIdx] & MODE_INTER); }
+ bool isSkipped(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_SKIP; }
bool isBipredRestriction() const { return m_log2CUSize[0] == 3 && m_partSize[0] != SIZE_2Nx2N; }
void getPartIndexAndSize(uint32_t puIdx, uint32_t& absPartIdx, int& puWidth, int& puHeight) const;
void getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const;
const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const;
- const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary = false) const;
+ const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const;
const CUData* getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx) const;
const CUData* getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx) const;
const CUData* getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx) const;
template<typename T>
void setAllPU(T *p, const T& val, int absPartIdx, int puIdx);
- char getLastCodedQP(uint32_t absPartIdx) const;
+ int8_t getLastCodedQP(uint32_t absPartIdx) const;
int getLastValidPartIdx(int absPartIdx) const;
bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const;
// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
// give identical results
-void fastForwardDst(int16_t *block, int16_t *coeff, int shift) // input block, output coeff
+void fastForwardDst(const int16_t* block, int16_t* coeff, int shift) // input block, output coeff
{
int c[4];
int rnd_factor = 1 << (shift - 1);
}
}
-void inversedst(int16_t *tmp, int16_t *block, int shift) // input tmp, output block
+void inversedst(const int16_t* tmp, int16_t* block, int shift) // input tmp, output block
{
int i, c[4];
int rnd_factor = 1 << (shift - 1);
}
}
-void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[8], O[8];
}
}
-void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[16], O[16];
}
}
-void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[4], O[4];
}
}
-void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
{
int j;
int E[2], O[2];
}
}
-void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[4], O[4];
}
}
-void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[8], O[8];
}
}
-void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[16], O[16];
}
}
-void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
{
int j;
int E[2], O[2];
}
}
-void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 1 + X265_DEPTH - 8;
const int shift_2nd = 8;
for (int i = 0; i < 4; i++)
{
- memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+ memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
}
fastForwardDst(block, coef, shift_1st);
- fastForwardDst(coef, block, shift_2nd);
-
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ fastForwardDst(coef, dst, shift_2nd);
}
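
dst4_c above, and the dct*_c functions below, now emit int16_t coefficients, so the second butterfly pass writes straight into dst and the old widening copy loops disappear. A sketch of the shared two-pass pattern (names hypothetical):

// Sketch of the separable two-pass forward transform: pass 1 transforms rows
// into a temp buffer, pass 2 transforms columns directly into the output.
#include <cstdint>
#include <cstring>

typedef void (*butterfly_t)(const int16_t* src, int16_t* dst, int shift, int line);

template<int N>
void forward2D(butterfly_t pass, const int16_t* src, int16_t* dst,
               intptr_t srcStride, int shift1, int shift2)
{
    int16_t block[N * N];
    int16_t coef[N * N];
    for (int i = 0; i < N; i++)                  // gather the residual block
        memcpy(&block[i * N], &src[i * srcStride], N * sizeof(int16_t));
    pass(block, coef, shift1, N);                // horizontal pass
    pass(coef, dst, shift2, N);                  // vertical pass, straight to dst
}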
-void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 1 + X265_DEPTH - 8;
const int shift_2nd = 8;
for (int i = 0; i < 4; i++)
{
- memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+ memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
}
partialButterfly4(block, coef, shift_1st, 4);
- partialButterfly4(coef, block, shift_2nd, 4);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly4(coef, dst, shift_2nd, 4);
}
-void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 2 + X265_DEPTH - 8;
const int shift_2nd = 9;
for (int i = 0; i < 8; i++)
{
- memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
+ memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
}
partialButterfly8(block, coef, shift_1st, 8);
- partialButterfly8(coef, block, shift_2nd, 8);
-
-#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly8(coef, dst, shift_2nd, 8);
}
-void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 3 + X265_DEPTH - 8;
const int shift_2nd = 10;
for (int i = 0; i < 16; i++)
{
- memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
+ memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
}
partialButterfly16(block, coef, shift_1st, 16);
- partialButterfly16(coef, block, shift_2nd, 16);
-
-#define N (16)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly16(coef, dst, shift_2nd, 16);
}
-void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 4 + X265_DEPTH - 8;
const int shift_2nd = 11;
for (int i = 0; i < 32; i++)
{
- memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
+ memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
}
partialButterfly32(block, coef, shift_1st, 32);
- partialButterfly32(coef, block, shift_2nd, 32);
-
-#define N (32)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly32(coef, dst, shift_2nd, 32);
}
-void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
ALIGN_VAR_32(int16_t, coef[4 * 4]);
ALIGN_VAR_32(int16_t, block[4 * 4]);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
+ inversedst(src, coef, shift_1st); // inverse DST by fast algorithm: src input, coef output
inversedst(coef, block, shift_2nd); // inverse DST by fast algorithm: coef input, block output
for (int i = 0; i < 4; i++)
{
- memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
}
}
-void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
ALIGN_VAR_32(int16_t, coef[4 * 4]);
ALIGN_VAR_32(int16_t, block[4 * 4]);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
+ partialButterflyInverse4(src, coef, shift_1st, 4); // inverse DCT pass 1: src input, coef output
partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse DCT pass 2: coef input, block output
for (int i = 0; i < 4; i++)
{
- memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
}
}
-void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
ALIGN_VAR_32(int16_t, coef[8 * 8]);
ALIGN_VAR_32(int16_t, block[8 * 8]);
-#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse8(block, coef, shift_1st, 8);
+ partialButterflyInverse8(src, coef, shift_1st, 8);
partialButterflyInverse8(coef, block, shift_2nd, 8);
+
for (int i = 0; i < 8; i++)
{
- memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
}
}
-void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
ALIGN_VAR_32(int16_t, coef[16 * 16]);
ALIGN_VAR_32(int16_t, block[16 * 16]);
-#define N (16)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse16(block, coef, shift_1st, 16);
+ partialButterflyInverse16(src, coef, shift_1st, 16);
partialButterflyInverse16(coef, block, shift_2nd, 16);
+
for (int i = 0; i < 16; i++)
{
- memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
}
}
-void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
ALIGN_VAR_32(int16_t, coef[32 * 32]);
ALIGN_VAR_32(int16_t, block[32 * 32]);
-#define N (32)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse32(block, coef, shift_1st, 32);
+ partialButterflyInverse32(src, coef, shift_1st, 32);
partialButterflyInverse32(coef, block, shift_2nd, 32);
for (int i = 0; i < 32; i++)
{
- memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
}
}
-void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
for (int n = 0; n < num; n++)
{
coeffQ = (quantCoef[n] * scale + add) >> shift;
- coef[n] = Clip3(-32768, 32767, coeffQ);
+ coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
}
}
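
dequant reverses quantization as coef = (level * scale + add) >> shift and then clamps into the int16_t coefficient range. A tiny worked example (all numbers illustrative):

// Sketch of the scalar dequant rule above; numbers are illustrative only.
#include <cassert>

static int clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }

int main()
{
    int scale = 40, shift = 6, add = 1 << (shift - 1);
    int level = 7;                                // quantized coefficient
    int coeffQ = (level * scale + add) >> shift;  // (280 + 32) >> 6 = 4
    assert(clip3(-32768, 32767, coeffQ) == 4);    // already fits int16_t
    return 0;
}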
-void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
for (int n = 0; n < num; n++)
{
coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
- coef[n] = Clip3(-32768, 32767, coeffQ);
+ coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
}
}
else
for (int n = 0; n < num; n++)
{
coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
- coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
+ coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ << (per - shift));
}
}
}
-uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK(qBits >= 8, "qBits less than 8\n");
X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
return numSig;
}
-uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
return numSig;
}
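
Both quantizers apply the same scalar rule, level = (|coef| * quantCoeff + add) >> qBits with the sign restored afterwards; nquant_c simply omits the deltaU side-output that RDOQ uses. A standalone sketch of that rule:

// Sketch of the scalar quantization rule shared by quant_c/nquant_c above;
// 'add' carries the rounding (dead-zone) offset.
#include <cstdint>
#include <cstdlib>

static int16_t quantOne(int16_t coef, int32_t quantCoeff, int qBits, int add)
{
    int sign  = coef < 0 ? -1 : 1;
    int64_t t = (int64_t)abs(coef) * quantCoeff;  // scale by the quant table
    int level = (int)((t + add) >> qBits);        // shift with rounding
    return (int16_t)(sign * level);               // restore the sign
}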
-int count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
+int count_nonzero_c(const int16_t* quantCoeff, int numCoeff)
{
X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
}
template<int trSize>
-uint32_t copy_count(int16_t* coeff, int16_t* residual, intptr_t stride)
+uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
uint32_t numSig = 0;
for (int k = 0; k < trSize; k++)
{
for (int j = 0; j < trSize; j++)
{
- coeff[k * trSize + j] = residual[k * stride + j];
- numSig += (residual[k * stride + j] != 0);
+ coeff[k * trSize + j] = residual[k * resiStride + j];
+ numSig += (residual[k * resiStride + j] != 0);
}
}
return numSig;
}
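
copy_count fuses the residual gather with the significance count so callers can skip quantization of all-zero blocks. A hypothetical wrapper for a 4x4 TU (assumes the template above is visible in the same translation unit):

// Hypothetical use of copy_count for a 4x4 TU.
static bool tuHasCoeffs4x4(int16_t coeff[4 * 4], const int16_t* residual, intptr_t resiStride)
{
    return copy_count<4>(coeff, residual, resiStride) != 0; // 0 => all-zero block
}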
-void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
+void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
for (int i = 0; i < numCoeff; i++)
{
level = (level + sign) ^ sign;
resSum[i] += level;
level -= offset[i];
- dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
+ dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign);
}
}
#define DEBLOCK_SMALLEST_BLOCK 8
#define DEFAULT_INTRA_TC_OFFSET 2
-void Deblock::deblockCTU(CUData* cu, int32_t dir)
+void Deblock::deblockCTU(const CUData* ctu, int32_t dir)
{
- uint8_t blockingStrength[MAX_NUM_PARTITIONS];
+ uint8_t blockStrength[MAX_NUM_PARTITIONS];
- memset(blockingStrength, 0, sizeof(uint8_t) * m_numPartitions);
+ memset(blockStrength, 0, sizeof(uint8_t) * m_numPartitions);
- deblockCU(cu, 0, 0, dir, blockingStrength);
+ deblockCU(ctu, 0, 0, dir, blockStrength);
+}
+
+static inline uint8_t bsCuEdge(const CUData* cu, uint32_t absPartIdx, int32_t dir)
+{
+ if (dir == Deblock::EDGE_VER)
+ {
+ if (cu->m_cuPelX + g_zscanToPelX[absPartIdx] > 0)
+ {
+ uint32_t tempPartIdx;
+ const CUData* tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
+ return tempCU ? 2 : 0;
+ }
+ }
+ else
+ {
+ if (cu->m_cuPelY + g_zscanToPelY[absPartIdx] > 0)
+ {
+ uint32_t tempPartIdx;
+ const CUData* tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
+ return tempCU ? 2 : 0;
+ }
+ }
+
+ return 0;
}
/* Deblocking filter process on a CU basis (same behavior as the conventional function)
* \param dir direction of the edge at the block boundary (horizontal/vertical) */
-void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockingStrength[])
+void Deblock::deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[])
{
- if (cu->m_partSize[absPartIdx] == SIZE_NONE)
+ if (cu->m_predMode[absPartIdx] == MODE_NONE)
return;
uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY;
for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
- deblockCU(cu, absPartIdx, depth + 1, dir, blockingStrength);
+ deblockCU(cu, absPartIdx, depth + 1, dir, blockStrength);
return;
}
- const uint32_t widthInBaseUnits = sps.numPartInCUSize >> depth;
- Param params;
- setLoopfilterParam(cu, absPartIdx, ¶ms);
- setEdgefilterPU(cu, absPartIdx, dir, blockingStrength, widthInBaseUnits);
- setEdgefilterTU(cu, absPartIdx, depth, dir, blockingStrength);
- setEdgefilterMultiple(cu, absPartIdx, dir, 0, (dir == EDGE_VER ? params.leftEdge : params.topEdge), blockingStrength, widthInBaseUnits);
+ const uint32_t numUnits = sps.numPartInCUSize >> depth;
+ setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits);
+ setEdgefilterTU(cu, absPartIdx, depth, dir, blockStrength);
+ setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++)
{
uint32_t bsCheck = !(partIdx & (1 << dir));
- if (bsCheck && blockingStrength[partIdx])
- getBoundaryStrengthSingle(cu, dir, partIdx, blockingStrength);
+ if (bsCheck && blockStrength[partIdx])
+ blockStrength[partIdx] = getBoundaryStrength(cu, dir, partIdx, blockStrength);
}
const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE;
for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
{
- edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockingStrength);
+ edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
if (!((e0 + e) & chromaMask))
- edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockingStrength);
+ edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
}
}
-static inline uint32_t calcBsIdx(CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
+static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
{
- uint32_t ctuWidthInBaseUnits = cu->m_slice->m_sps->numPartInCUSize;
+ uint32_t numPartInCUSize = cu->m_slice->m_sps->numPartInCUSize;
if (dir)
- return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * ctuWidthInBaseUnits + baseUnitIdx];
+ return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numPartInCUSize + baseUnitIdx];
else
- return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * ctuWidthInBaseUnits + edgeIdx];
+ return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numPartInCUSize + edgeIdx];
}
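
calcBsIdx walks an edge in raster order while the per-part arrays are stored in z-scan order, hence the double table lookup. A minimal sketch of how such raster/z-scan tables correspond (Morton order; the 8x8-unit grid size is an assumption):

// Sketch: building raster <-> z-scan lookup tables for an N x N unit grid.
#include <cstdint>

enum { N = 8 };                      // units per CTU row (assumption)
static uint32_t zscanToRaster[N * N];
static uint32_t rasterToZscan[N * N];

static uint32_t interleave(uint32_t x, uint32_t y)  // Morton/z-order index
{
    uint32_t z = 0;
    for (uint32_t b = 0; b < 16; b++)
        z |= ((x >> b) & 1) << (2 * b) | ((y >> b) & 1) << (2 * b + 1);
    return z;
}

static void initScanTables()
{
    for (uint32_t y = 0; y < N; y++)
        for (uint32_t x = 0; x < N; x++)
        {
            uint32_t raster = y * N + x;
            uint32_t z = interleave(x, y);
            zscanToRaster[z] = raster;
            rasterToZscan[raster] = z;
        }
}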
-void Deblock::setEdgefilterMultiple(CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
+void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
{
- const uint32_t numElem = widthInBaseUnits;
- X265_CHECK(numElem > 0, "numElem edge filter check\n");
- for (uint32_t i = 0; i < numElem; i++)
+ X265_CHECK(numUnits > 0, "numUnits edge filter check\n");
+ for (uint32_t i = 0; i < numUnits; i++)
{
const uint32_t bsidx = calcBsIdx(cu, scanIdx, dir, edgeIdx, i);
- blockingStrength[bsidx] = value;
+ blockStrength[bsidx] = value;
}
}
-void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[])
+void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[])
{
if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth)
{
const uint32_t qNumParts = curNumParts >> 2;
for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
- setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockingStrength);
+ setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockStrength);
return;
}
- uint32_t widthInBaseUnits = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
- setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockingStrength, widthInBaseUnits);
+ uint32_t numUnits = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
+ setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits);
}
-void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
+void Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits)
{
- const uint32_t hWidthInBaseUnits = widthInBaseUnits >> 1;
- const uint32_t qWidthInBaseUnits = widthInBaseUnits >> 2;
+ const uint32_t hNumUnits = numUnits >> 1;
+ const uint32_t qNumUnits = numUnits >> 2;
switch (cu->m_partSize[absPartIdx])
{
case SIZE_2NxN:
if (EDGE_HOR == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_Nx2N:
if (EDGE_VER == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_NxN:
- setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_2NxnU:
if (EDGE_HOR == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_nLx2N:
if (EDGE_VER == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_2NxnD:
if (EDGE_HOR == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_nRx2N:
if (EDGE_VER == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_2Nx2N:
}
}
-void Deblock::setLoopfilterParam(CUData* cu, uint32_t absPartIdx, Param *params)
+uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[])
{
- uint32_t x = cu->m_cuPelX + g_zscanToPelX[absPartIdx];
- uint32_t y = cu->m_cuPelY + g_zscanToPelY[absPartIdx];
-
- const CUData* tempCU;
- uint32_t tempPartIdx;
+ // Derive neighboring PU index
+ uint32_t partP;
+ const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
- if (!x)
- params->leftEdge = 0;
- else
- {
- tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
- if (tempCU)
- params->leftEdge = 2;
- else
- params->leftEdge = 0;
- }
+ // Set BS for intra blocks: BS = 2
+ if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
+ return 2;
- if (!y)
- params->topEdge = 0;
- else
- {
- tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
- if (tempCU)
- params->topEdge = 2;
- else
- params->topEdge = 0;
- }
-}
+ // Set BS for non-intra blocks: BS = 1 or 0
+ if (blockStrength[partQ] > 1 &&
+ (cuQ->getCbf(partQ, TEXT_LUMA, cuQ->m_tuDepth[partQ]) ||
+ cuP->getCbf(partP, TEXT_LUMA, cuP->m_tuDepth[partP])))
+ return 1;
-void Deblock::getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t absPartIdx, uint8_t blockingStrength[])
-{
- const Slice* const slice = cu->m_slice;
- const uint32_t partQ = absPartIdx;
- CUData* const cuQ = cu;
+ static const MV zeroMv(0, 0);
+ const Slice* const sliceQ = cuQ->m_slice;
+ const Slice* const sliceP = cuP->m_slice;
- uint32_t partP;
- const CUData* cuP;
- uint8_t bs = 0;
+ const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
+ const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
+ const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
+ const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
- // Calculate block index
- if (dir == EDGE_VER)
- cuP = cuQ->getPULeft(partP, partQ);
- else // (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
+ if (sliceQ->isInterP() && sliceP->isInterP())
+ {
+ return ((refP0 != refQ0) ||
+ (abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4)) ? 1 : 0;
+ }
- // Set BS for Intra MB : BS = 4 or 3
- if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
- bs = 2;
+ // (sliceQ->isInterB() || sliceP->isInterB())
+ const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
+ const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
+ const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
+ const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
- // Set BS for not Intra MB : BS = 2 or 1 or 0
- if (!cuP->isIntra(partP) && !cuQ->isIntra(partQ))
+ if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
{
- uint32_t nsPartQ = partQ;
- uint32_t nsPartP = partP;
-
- if (blockingStrength[absPartIdx] > 1 &&
- (cuQ->getCbf(nsPartQ, TEXT_LUMA, cuQ->m_tuDepth[nsPartQ]) ||
- cuP->getCbf(nsPartP, TEXT_LUMA, cuP->m_tuDepth[nsPartP])))
- bs = 1;
- else
+ if (refP0 != refP1) // Different L0 & L1
{
- if (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
-
- if (slice->isInterB() || cuP->m_slice->isInterB())
- {
- int32_t refIdx;
- Frame *refP0, *refP1, *refQ0, *refQ1;
- refIdx = cuP->m_refIdx[0][partP];
- refP0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
- refIdx = cuP->m_refIdx[1][partP];
- refP1 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[1][refIdx];
- refIdx = cuQ->m_refIdx[0][partQ];
- refQ0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
- refIdx = cuQ->m_refIdx[1][partQ];
- refQ1 = (refIdx < 0) ? NULL : slice->m_refPicList[1][refIdx];
-
- MV mvp0 = cuP->m_mv[0][partP];
- MV mvp1 = cuP->m_mv[1][partP];
- MV mvq0 = cuQ->m_mv[0][partQ];
- MV mvq1 = cuQ->m_mv[1][partQ];
-
- if (!refP0) mvp0 = 0;
- if (!refP1) mvp1 = 0;
- if (!refQ0) mvq0 = 0;
- if (!refQ1) mvq1 = 0;
-
- if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
- {
- if (refP0 != refP1) // Different L0 & L1
- {
- if (refP0 == refQ0)
- {
- bs = ((abs(mvq0.x - mvp0.x) >= 4) ||
- (abs(mvq0.y - mvp0.y) >= 4) ||
- (abs(mvq1.x - mvp1.x) >= 4) ||
- (abs(mvq1.y - mvp1.y) >= 4)) ? 1 : 0;
- }
- else
- {
- bs = ((abs(mvq1.x - mvp0.x) >= 4) ||
- (abs(mvq1.y - mvp0.y) >= 4) ||
- (abs(mvq0.x - mvp1.x) >= 4) ||
- (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
- }
- }
- else // Same L0 & L1
- {
- bs = ((abs(mvq0.x - mvp0.x) >= 4) ||
- (abs(mvq0.y - mvp0.y) >= 4) ||
- (abs(mvq1.x - mvp1.x) >= 4) ||
- (abs(mvq1.y - mvp1.y) >= 4)) &&
- ((abs(mvq1.x - mvp0.x) >= 4) ||
- (abs(mvq1.y - mvp0.y) >= 4) ||
- (abs(mvq0.x - mvp1.x) >= 4) ||
- (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
- }
- }
- else // for all different Ref_Idx
- bs = 1;
- }
- else // slice->isInterP()
- {
- int32_t refIdx;
- Frame *refp0, *refq0;
- refIdx = cuP->m_refIdx[0][partP];
- refp0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
- refIdx = cuQ->m_refIdx[0][partQ];
- refq0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
- MV mvp0 = cuP->m_mv[0][partP];
- MV mvq0 = cuQ->m_mv[0][partQ];
-
- if (!refp0) mvp0 = 0;
- if (!refq0) mvq0 = 0;
-
- bs = ((refp0 != refq0) ||
- (abs(mvq0.x - mvp0.x) >= 4) ||
- (abs(mvq0.y - mvp0.y) >= 4)) ? 1 : 0;
- }
+ if (refP0 == refQ0)
+ return ((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) ||
+ (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) ? 1 : 0;
+ else
+ return ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) ||
+ (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4)) ? 1 : 0;
+ }
+ else // Same L0 & L1
+ {
+ return (((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) ||
+ (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) &&
+ ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) ||
+ (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4))) ? 1 : 0;
}
}
-
- blockingStrength[absPartIdx] = bs;
+
+ // remaining case: all reference pictures differ
+ return 1;
}
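
The function above implements the HEVC boundary-strength ladder: 2 for intra edges, 1 when residual is coded across the TU edge, the references differ, or the motion differs by at least one luma sample (4 quarter-pel units), else 0. A simplified single-list sketch:

// Sketch of the boundary-strength ladder (P = neighbor side, Q = current
// side); simplified to a single reference list for illustration.
#include <cstdint>
#include <cstdlib>

struct MV { int16_t x, y; };

static bool mvFar(const MV& a, const MV& b)
{
    return abs(a.x - b.x) >= 4 || abs(a.y - b.y) >= 4; // quarter-pel units
}

static uint8_t bsLadder(bool intraP, bool intraQ, bool cbfEdge,
                        bool sameRefs, const MV& mvP, const MV& mvQ)
{
    if (intraP || intraQ)
        return 2;                   // strongest: intra on either side
    if (cbfEdge)
        return 1;                   // coded residual across the TU edge
    if (!sameRefs || mvFar(mvP, mvQ))
        return 1;                   // motion discontinuity
    return 0;                       // smooth: no filtering
}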
static inline int32_t calcDP(pixel* src, intptr_t offset)
}
/* Deblocking for the luminance component with strong or weak filter
- * \param src pointer to picture data
- * \param offset offset value for picture data
- * \param tc tc value
- * \param partPNoFilter indicator to disable filtering on partP
- * \param partQNoFilter indicator to disable filtering on partQ
- * \param filterSecondP decision weak filter/no filter for partP
- * \param filterSecondQ decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
+ * \param src pointer to picture data
+ * \param offset offset value for picture data
+ * \param tc tc value
+ * \param maskP indicator to enable filtering on partP
+ * \param maskQ indicator to enable filtering on partQ
+ * \param maskP1 decision weak filter/no filter for partP
+ * \param maskQ1 decision weak filter/no filter for partQ */
+static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
{
+ int32_t tc2 = 2 * tc;
+ int32_t tcP = (tc2 & maskP);
+ int32_t tcQ = (tc2 & maskQ);
for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
{
int16_t m4 = (int16_t)src[0];
int16_t m3 = (int16_t)src[-offset];
int16_t m5 = (int16_t)src[offset];
int16_t m2 = (int16_t)src[-offset * 2];
- int32_t tc2 = 2 * tc;
- if (!partPNoFilter)
- {
- int16_t m1 = (int16_t)src[-offset * 3];
- int16_t m0 = (int16_t)src[-offset * 4];
- src[-offset * 3] = (pixel)(Clip3(-tc2, tc2, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
- src[-offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
- src[-offset] = (pixel)(Clip3(-tc2, tc2, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
- }
- if (!partQNoFilter)
- {
- int16_t m6 = (int16_t)src[offset * 2];
- int16_t m7 = (int16_t)src[offset * 3];
- src[0] = (pixel)(Clip3(-tc2, tc2, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
- src[offset] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
- src[offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
- }
+ int16_t m6 = (int16_t)src[offset * 2];
+ int16_t m1 = (int16_t)src[-offset * 3];
+ int16_t m7 = (int16_t)src[offset * 3];
+ int16_t m0 = (int16_t)src[-offset * 4];
+ src[-offset * 3] = (pixel)(Clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+ src[-offset * 2] = (pixel)(Clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
+ src[-offset] = (pixel)(Clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+ src[0] = (pixel)(Clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+ src[offset] = (pixel)(Clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+ src[offset * 2] = (pixel)(Clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
}
}
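
The bool no-filter flags become all-ones/all-zero masks so the lossless-CU case costs no branch in the pixel loop: with maskP == 0 the clamp range collapses to [0, 0] and each P-side pixel is rewritten with its own value. A standalone demonstration of the trick:

// Demonstration: clamping the correction to [-(2*tc & mask), +(2*tc & mask)]
// makes mask == 0 a value-preserving write.
#include <cassert>

static int clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }

int main()
{
    const int tc = 6, pixelVal = 100, correction = 9;
    const int masks[2] = { -1, 0 };        // -1: filter this side, 0: lossless CU
    for (int k = 0; k < 2; k++)
    {
        int tcP = (2 * tc) & masks[k];     // 12 when filtering, 0 when not
        int out = clip3(-tcP, tcP, correction) + pixelVal;
        assert(out == (masks[k] ? 109 : pixelVal)); // mask 0 leaves pixel unchanged
    }
    return 0;
}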
/* Weak filter */
-static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter,
- bool filterSecondP, bool filterSecondQ)
+static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
+ int32_t maskP1, int32_t maskQ1)
{
int32_t thrCut = tc * 10;
+ int32_t tc2 = tc >> 1;
+ maskP1 &= maskP;
+ maskQ1 &= maskQ;
for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
{
{
delta = Clip3(-tc, tc, delta);
- int32_t tc2 = tc >> 1;
- if (!partPNoFilter)
+ src[-offset] = Clip(m3 + (delta & maskP));
+ src[0] = Clip(m4 - (delta & maskQ));
+ if (maskP1)
{
- src[-offset] = Clip(m3 + delta);
- if (filterSecondP)
- {
- int16_t m1 = (int16_t)src[-offset * 3];
- int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
- src[-offset * 2] = Clip(m2 + delta1);
- }
+ int16_t m1 = (int16_t)src[-offset * 3];
+ int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
+ src[-offset * 2] = Clip(m2 + delta1);
}
- if (!partQNoFilter)
+ if (maskQ1)
{
- src[0] = Clip(m4 - delta);
- if (filterSecondQ)
- {
- int16_t m6 = (int16_t)src[offset * 2];
- int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
- src[offset] = Clip(m5 + delta2);
- }
+ int16_t m6 = (int16_t)src[offset * 2];
+ int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
+ src[offset] = Clip(m5 + delta2);
}
}
}
}
/* Deblocking of one line/column for the chrominance component
- * \param src pointer to picture data
- * \param offset offset value for picture data
- * \param tc tc value
- * \param partPNoFilter indicator to disable filtering on partP
- * \param partQNoFilter indicator to disable filtering on partQ */
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
+ * \param src pointer to picture data
+ * \param offset offset value for picture data
+ * \param tc tc value
+ * \param maskP indicator to enable filtering on partP
+ * \param maskQ indicator to enable filtering on partQ */
+static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
{
for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
{
int16_t m2 = (int16_t)src[-offset * 2];
int32_t delta = Clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
- if (!partPNoFilter)
- src[-offset] = Clip(m3 + delta);
- if (!partQNoFilter)
- src[0] = Clip(m4 - delta);
+ src[-offset] = Clip(m3 + (delta & maskP));
+ src[0] = Clip(m4 - (delta & maskQ));
}
}
-void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
+void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
{
- PicYuv* reconYuv = cu->m_encData->m_reconPicYuv;
- pixel* src = reconYuv->getLumaAddr(cu->m_cuAddr, absPartIdx);
-
- intptr_t stride = reconYuv->m_stride;
- uint32_t numParts = cu->m_slice->m_sps->numPartInCUSize >> depth;
+ PicYuv* reconPic = cuQ->m_encData->m_reconPic;
+ pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
+ intptr_t stride = reconPic->m_stride;
+ const PPS* pps = cuQ->m_slice->m_pps;
intptr_t offset, srcStep;
- bool partPNoFilter = false;
- bool partQNoFilter = false;
- uint32_t partP = 0;
- uint32_t partQ = 0;
- const CUData* cuP = cu;
- const CUData* cuQ = cu;
- int32_t betaOffset = cuQ->m_slice->m_pps->deblockingFilterBetaOffsetDiv2 << 1;
- int32_t tcOffset = cuQ->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;
+ int32_t maskP = -1;
+ int32_t maskQ = -1;
+ int32_t betaOffset = pps->deblockingFilterBetaOffsetDiv2 << 1;
+ int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1;
+ bool bCheckNoFilter = pps->bTransquantBypassEnabled;
if (dir == EDGE_VER)
{
src += (edge << LOG2_UNIT_SIZE) * stride;
}
- for (uint32_t idx = 0; idx < numParts; idx++)
+ uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> depth;
+ for (uint32_t idx = 0; idx < numUnits; idx++)
{
- uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
- uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx);
- uint32_t bs = blockingStrength[bsAbsIdx];
- if (bs)
- {
- int32_t qpQ = cu->m_qp[bsAbsIdx];
- partQ = bsAbsIdx;
+ uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx);
+ uint32_t bs = blockStrength[partQ];
- // Derive neighboring PU index
- if (dir == EDGE_VER)
- cuP = cuQ->getPULeft(partP, partQ);
- else // (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
+ if (!bs)
+ continue;
- int32_t qpP = cuP->m_qp[partP];
- int32_t qp = (qpP + qpQ + 1) >> 1;
+ int32_t qpQ = cuQ->m_qp[partQ];
- int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
+ // Derive neighboring PU index
+ uint32_t partP;
+ const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
- const int32_t bitdepthShift = X265_DEPTH - 8;
- int32_t beta = s_betaTable[indexB] << bitdepthShift;
+ int32_t qpP = cuP->m_qp[partP];
+ int32_t qp = (qpP + qpQ + 1) >> 1;
- int32_t dp0 = calcDP(src + srcStep * (unitOffset + 0), offset);
- int32_t dq0 = calcDQ(src + srcStep * (unitOffset + 0), offset);
- int32_t dp3 = calcDP(src + srcStep * (unitOffset + 3), offset);
- int32_t dq3 = calcDQ(src + srcStep * (unitOffset + 3), offset);
- int32_t d0 = dp0 + dq0;
- int32_t d3 = dp3 + dq3;
+ int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
- int32_t d = d0 + d3;
+ const int32_t bitdepthShift = X265_DEPTH - 8;
+ int32_t beta = s_betaTable[indexB] << bitdepthShift;
- if (d < beta)
- {
- if (cu->m_slice->m_pps->bTransquantBypassEnabled)
- {
- // check if each of PUs is lossless coded
- partPNoFilter = !!cuP->m_tqBypass[partP];
- partQNoFilter = !!cuQ->m_tqBypass[partQ];
- }
-
- int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
- int32_t tc = s_tcTable[indexTC] << bitdepthShift;
-
- bool sw = (2 * d0 < (beta >> 2) &&
- 2 * d3 < (beta >> 2) &&
- useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 0)) &&
- useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 3)));
-
- if (sw)
- pelFilterLumaStrong(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
- else
- {
- int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
- int32_t dp = dp0 + dp3;
- int32_t dq = dq0 + dq3;
- bool filterP = (dp < sideThreshold);
- bool filterQ = (dq < sideThreshold);
-
- pelFilterLuma(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter, filterP, filterQ);
- }
- }
+ intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE;
+ int32_t dp0 = calcDP(src + unitOffset , offset);
+ int32_t dq0 = calcDQ(src + unitOffset , offset);
+ int32_t dp3 = calcDP(src + unitOffset + srcStep * 3, offset);
+ int32_t dq3 = calcDQ(src + unitOffset + srcStep * 3, offset);
+ int32_t d0 = dp0 + dq0;
+ int32_t d3 = dp3 + dq3;
+
+ int32_t d = d0 + d3;
+
+ if (d >= beta)
+ continue;
+
+ if (bCheckNoFilter)
+ {
+ // check whether each PU is lossless coded
+ maskP = (cuP->m_tqBypass[partP] ? 0 : -1);
+ maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1);
+ }
+
+ int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
+ int32_t tc = s_tcTable[indexTC] << bitdepthShift;
+
+ bool sw = (2 * d0 < (beta >> 2) &&
+ 2 * d3 < (beta >> 2) &&
+ useStrongFiltering(offset, beta, tc, src + unitOffset ) &&
+ useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
+
+ if (sw)
+ pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+ else
+ {
+ int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
+ int32_t dp = dp0 + dp3;
+ int32_t dq = dq0 + dq3;
+ int32_t maskP1 = (dp < sideThreshold ? -1 : 0);
+ int32_t maskQ1 = (dq < sideThreshold ? -1 : 0);
+
+ pelFilterLuma(src + unitOffset, srcStep, offset, tc, maskP, maskQ, maskP1, maskQ1);
}
}
}
-void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
+void Deblock::edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
{
- int32_t chFmt = cu->m_chromaFormat, chromaShift;
+ int32_t chFmt = cuQ->m_chromaFormat, chromaShift;
intptr_t offset, srcStep;
+ const PPS* pps = cuQ->m_slice->m_pps;
- bool partPNoFilter = false;
- bool partQNoFilter = false;
- uint32_t partP;
- uint32_t partQ;
- const CUData* cuP;
- const CUData* cuQ = cu;
- int32_t tcOffset = cu->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;
+ int32_t maskP = -1;
+ int32_t maskQ = -1;
+ int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1;
X265_CHECK(((dir == EDGE_VER)
- ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cu->m_hChromaShift)
- : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cu->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
+ ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_hChromaShift)
+ : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
"invalid edge\n");
- PicYuv* reconPic = cu->m_encData->m_reconPicYuv;
+ PicYuv* reconPic = cuQ->m_encData->m_reconPic;
intptr_t stride = reconPic->m_strideC;
- intptr_t srcOffset = reconPic->getChromaAddrOffset(cu->m_cuAddr, absPartIdx);
+ intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
+ bool bCheckNoFilter = pps->bTransquantBypassEnabled;
if (dir == EDGE_VER)
{
- chromaShift = cu->m_vChromaShift;
- srcOffset += (edge << (LOG2_UNIT_SIZE - cu->m_hChromaShift));
+ chromaShift = cuQ->m_vChromaShift;
+ srcOffset += (edge << (LOG2_UNIT_SIZE - cuQ->m_hChromaShift));
offset = 1;
srcStep = stride;
}
else // (dir == EDGE_HOR)
{
- chromaShift = cu->m_hChromaShift;
- srcOffset += edge * stride << (LOG2_UNIT_SIZE - cu->m_vChromaShift);
+ chromaShift = cuQ->m_hChromaShift;
+ srcOffset += edge * stride << (LOG2_UNIT_SIZE - cuQ->m_vChromaShift);
offset = stride;
srcStep = 1;
}
srcChroma[0] = reconPic->m_picOrg[1] + srcOffset;
srcChroma[1] = reconPic->m_picOrg[2] + srcOffset;
- uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
+ uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
for (uint32_t idx = 0; idx < numUnits; idx++)
{
- uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
- uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx << chromaShift);
- uint32_t bs = blockingStrength[bsAbsIdx];
+ uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift);
+ uint32_t bs = blockStrength[partQ];
- if (bs > 1)
- {
- int32_t qpQ = cu->m_qp[bsAbsIdx];
- partQ = bsAbsIdx;
+ if (bs <= 1)
+ continue;
- // Derive neighboring PU index
- if (dir == EDGE_VER)
- cuP = cuQ->getPULeft(partP, partQ);
- else // (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
+ int32_t qpQ = cuQ->m_qp[partQ];
- int32_t qpP = cuP->m_qp[partP];
+ // Derive neighboring PU index
+ uint32_t partP;
+ const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
- if (cu->m_slice->m_pps->bTransquantBypassEnabled)
- {
- // check if each of PUs is lossless coded
- partPNoFilter = !!cuP->m_tqBypass[partP];
- partQNoFilter = !!cuQ->m_tqBypass[partQ];
- }
+ int32_t qpP = cuP->m_qp[partP];
- for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
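+ /* masks: -1 (all bits set) filters that side, 0 leaves lossless partitions untouched */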
+ if (bCheckNoFilter)
+ {
+ // check whether each PU is coded losslessly (transquant bypass)
+ maskP = (cuP->m_tqBypass[partP] ? 0 : -1);
+ maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1);
+ }
+
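+ /* offset of this 4-sample unit along the edge, pre-scaled by srcStep */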
+ intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE;
+ for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
+ {
+ int32_t chromaQPOffset = pps->chromaQpOffset[chromaIdx];
+ int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset;
+ if (qp >= 30)
{
- int32_t chromaQPOffset = !chromaIdx ? cu->m_slice->m_pps->chromaCbQpOffset : cu->m_slice->m_pps->chromaCrQpOffset;
- int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset;
- if (qp >= 30)
- {
- if (chFmt == X265_CSP_I420)
- qp = g_chromaScale[qp];
- else
- qp = X265_MIN(qp, 51);
- }
-
- int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
- const int32_t bitdepthShift = X265_DEPTH - 8;
- int32_t tc = s_tcTable[indexTC] << bitdepthShift;
- pixel* srcC = srcChroma[chromaIdx];
-
- pelFilterChroma(srcC + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
+ if (chFmt == X265_CSP_I420)
+ qp = g_chromaScale[qp];
+ else
+ qp = X265_MIN(qp, 51);
}
+
+ int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
+ const int32_t bitdepthShift = X265_DEPTH - 8;
+ int32_t tc = s_tcTable[indexTC] << bitdepthShift;
+ pixel* srcC = srcChroma[chromaIdx];
+
+ pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
}
}
}
void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); }
- void deblockCTU(CUData* cu, int32_t dir);
+ void deblockCTU(const CUData* ctu, int32_t dir);
protected:
// CU-level deblocking function
- void deblockCU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, const int32_t Edge, uint8_t blockingStrength[]);
-
- struct Param
- {
- uint8_t leftEdge;
- uint8_t topEdge;
- };
+ void deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[]);
// set filtering functions
- void setLoopfilterParam(CUData* cu, uint32_t absZOrderIdx, Param *params);
- void setEdgefilterTU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[]);
- void setEdgefilterPU(CUData* cu, uint32_t absZOrderIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits);
- void setEdgefilterMultiple(CUData* cu, uint32_t absZOrderIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits);
+ void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[]);
+ void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
+ void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
// get filtering functions
- void getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t partIdx, uint8_t blockingStrength[]);
+ uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
// filter luma/chroma functions
- void edgeFilterLuma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]);
- void edgeFilterChroma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]);
+ void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+ void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
static const uint8_t s_tcTable[54];
static const uint8_t s_betaTable[52];
m_reconRowCount.set(0);
m_countRefEncoders = 0;
m_encData = NULL;
- m_reconPicYuv = NULL;
+ m_reconPic = NULL;
m_next = NULL;
m_prev = NULL;
memset(&m_lowres, 0, sizeof(m_lowres));
bool Frame::create(x265_param *param)
{
- m_origPicYuv = new PicYuv;
+ m_fencPic = new PicYuv;
- return m_origPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
- m_lowres.create(m_origPicYuv, param->bframes, !!param->rc.aqMode);
+ return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
+ m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
}
bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
{
m_encData = new FrameData;
- m_reconPicYuv = new PicYuv;
- m_encData->m_reconPicYuv = m_reconPicYuv;
- bool ok = m_encData->create(param, sps) && m_reconPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+ m_reconPic = new PicYuv;
+ m_encData->m_reconPic = m_reconPic;
+ bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
if (ok)
{
- /* initialize right border of m_reconpicYuv as SAO may read beyond the
- * end of the picture accessing uninitialized pixels */
+ /* initialize right border of m_reconPic as SAO may read beyond the
+ * end of the picture, accessing uninitialized pixels */
int maxHeight = sps.numCuInHeight * g_maxCUSize;
- memset(m_reconPicYuv->m_picOrg[0], 0, m_reconPicYuv->m_stride * maxHeight);
- memset(m_reconPicYuv->m_picOrg[1], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift));
- memset(m_reconPicYuv->m_picOrg[2], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift));
+ memset(m_reconPic->m_picOrg[0], 0, m_reconPic->m_stride * maxHeight);
+ memset(m_reconPic->m_picOrg[1], 0, m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+ memset(m_reconPic->m_picOrg[2], 0, m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
}
return ok;
}
void Frame::reinit(const SPS& sps)
{
m_bChromaExtended = false;
- m_reconPicYuv = m_encData->m_reconPicYuv;
+ m_reconPic = m_encData->m_reconPic;
m_encData->reinit(sps);
}
m_encData = NULL;
}
- if (m_origPicYuv)
+ if (m_fencPic)
{
- m_origPicYuv->destroy();
- delete m_origPicYuv;
- m_origPicYuv = NULL;
+ m_fencPic->destroy();
+ delete m_fencPic;
+ m_fencPic = NULL;
}
- if (m_reconPicYuv)
+ if (m_reconPic)
{
- m_reconPicYuv->destroy();
- delete m_reconPicYuv;
- m_reconPicYuv = NULL;
+ m_reconPic->destroy();
+ delete m_reconPic;
+ m_reconPic = NULL;
}
m_lowres.destroy();
/* These two items will be NULL until the Frame begins to be encoded, at which point
* it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
- FrameData* m_encData;
- PicYuv* m_reconPicYuv;
+ FrameData* m_encData;
+ PicYuv* m_reconPic;
/* Data associated with x265_picture */
- PicYuv* m_origPicYuv;
- int m_poc;
- int64_t m_pts; // user provided presentation time stamp
- int64_t m_reorderedPts;
- int64_t m_dts;
- int32_t m_forceqp; // Force to use the qp specified in qp file
- x265_intra_data* m_intraData;
- x265_inter_data* m_interData;
- void* m_userData; // user provided pointer passed in with this picture
+ PicYuv* m_fencPic;
+ int m_poc;
+ int64_t m_pts; // user provided presentation time stamp
+ int64_t m_reorderedPts;
+ int64_t m_dts;
+ int32_t m_forceqp; // Force to use the qp specified in qp file
+ void* m_userData; // user provided pointer passed in with this picture
- Lowres m_lowres;
- bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis
+ Lowres m_lowres;
+ bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis
/* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
- ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference
- volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount
+ ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference
+ volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount
- Frame* m_next; // PicList doubly linked list pointers
- Frame* m_prev;
+ Frame* m_next; // PicList doubly linked list pointers
+ Frame* m_prev;
+ x265_analysis_data m_analysisData;
Frame();
bool create(x265_param *param);
x265_param* m_param;
FrameData* m_freeListNext;
- PicYuv* m_reconPicYuv;
+ PicYuv* m_reconPic;
bool m_bHasReferences; /* used during DPB/RPS updates */
int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */
namespace {
template<int dstStride>
-void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
{
int shift = IF_INTERNAL_PREC - X265_DEPTH;
int row, col;
}
template<int N, int width, int height>
-void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
- int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_FILTER_PREC;
int offset = (1 << (headRoom - 1));
uint16_t maxVal = (1 << X265_DEPTH) - 1;
}
template<int N, int width, int height>
-void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
{
- int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC - headRoom;
int offset = -IF_INTERNAL_OFFS << shift;
}
template<int N, int width, int height>
-void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
- int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int shift = IF_FILTER_PREC;
int offset = 1 << (shift - 1);
uint16_t maxVal = (1 << X265_DEPTH) - 1;
}
template<int N, int width, int height>
-void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
{
- int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC - headRoom;
int offset = -IF_INTERNAL_OFFS << shift;
}
template<int N, int width, int height>
-void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC + headRoom;
int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
uint16_t maxVal = (1 << X265_DEPTH) - 1;
- const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+ const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
src -= (N / 2 - 1) * srcStride;
}
template<int N, int width, int height>
-void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
{
- const int16_t *const c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+ const int16_t* c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
int shift = IF_FILTER_PREC;
int row, col;
}
template<int N>
-void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
+void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int coeffIdx)
{
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC + headRoom;
int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
uint16_t maxVal = (1 << X265_DEPTH) - 1;
- const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+ const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
src -= (N / 2 - 1) * srcStride;
}
template<int N, int width, int height>
-void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
{
short immedVals[(64 + 8) * (64 + 8)];
CHROMA_444(16, 64);
p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
- p.chroma_p2s[X265_CSP_I444] = filterConvertPelToShort_c<MAX_CU_SIZE>;
- p.chroma_p2s[X265_CSP_I420] = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
- p.chroma_p2s[X265_CSP_I422] = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+ p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
+ p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+ p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
p.extendRowBorder = extendCURowColBorder;
}
lowresPlane[3] = buffer[3] + padoffset;
CHECKED_MALLOC(intraCost, int32_t, cuCount);
+ CHECKED_MALLOC(intraMode, uint8_t, cuCount);
for (int i = 0; i < bframes + 2; i++)
{
X265_FREE(buffer[i]);
X265_FREE(intraCost);
+ X265_FREE(intraMode);
for (int i = 0; i < bframes + 2; i++)
{
intraMbs[i] = 0;
/* downscale and generate 4 hpel planes for lookahead */
- primitives.frame_init_lowres_core(origPic->m_picOrg[0],
+ primitives.frameInitLowres(origPic->m_picOrg[0],
lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
origPic->m_stride, lumaStride, width, lines);
extendPicBorder(lowresPlane[1], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
extendPicBorder(lowresPlane[2], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
extendPicBorder(lowresPlane[3], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
- fpelPlane = lowresPlane[0];
+ fpelPlane[0] = lowresPlane[0];
}
#include "primitives.h"
#include "common.h"
+#include "picyuv.h"
#include "mv.h"
namespace x265 {
// private namespace
-class PicYuv;
-
struct ReferencePlanes
{
ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
- pixel* fpelPlane;
+ pixel* fpelPlane[3];
pixel* lowresPlane[4];
+ PicYuv* reconPic;
bool isWeighted;
bool isLowres;
+
intptr_t lumaStride;
- int weight;
- int offset;
- int shift;
- int round;
+ intptr_t chromaStride;
+
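+ /* weighted-prediction factors, one set per plane (Y, Cb, Cr) */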
+ struct {
+ int weight;
+ int offset;
+ int shift;
+ int round;
+ } w[3];
+
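+ /* CTU/part addressing into the weighted full-pel planes, mirroring the PicYuv helpers */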
+ pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[0] + reconPic->m_cuOffsetY[ctuAddr] + reconPic->m_buOffsetY[absPartIdx]; }
+ pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[1] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; }
+ pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[2] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; }
/* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels
* in case QPEL is required. Else it returns a pointer to the HPEL pixels */
{
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
-
- MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
- int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
-
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
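+ /* round odd quarter-pel coordinates up to the next half-pel so frefA and frefB bracket the true position */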
+ int qmvx = qmv.x + (qmv.x & 1);
+ int qmvy = qmv.y + (qmv.y & 1);
+ int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
+ pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
return buf;
}
ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
- MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
- int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
+ int qmvx = qmv.x + (qmv.x & 1);
+ int qmvy = qmv.y + (qmv.y & 1);
+ int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
+ pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
return comp(fenc, FENC_STRIDE, subpelbuf, 8);
}
int32_t* rowSatds[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2];
int intraMbs[X265_BFRAME_MAX + 2];
int32_t* intraCost;
+ uint8_t* intraMode;
int64_t satdCost;
uint16_t* lowresCostForRc;
uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]);
int32_t word;
};
- MV() : word(0) {}
-
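+ // default construction leaves x/y uninitialized; callers must assign before use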
+ MV() {}
+ MV(int32_t w) : word(w) {}
MV(int16_t _x, int16_t _y) : x(_x), y(_y) {}
- const MV& operator =(uint32_t w) { word = w; return *this; }
+ MV& operator =(uint32_t w) { word = w; return *this; }
- const MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; }
+ MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; }
- const MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; }
+ MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; }
- const MV& operator >>=(int i) { x >>= i; y >>= i; return *this; }
+ MV& operator >>=(int i) { x >>= i; y >>= i; return *this; }
- const MV& operator <<=(int i) { x <<= i; y <<= i; return *this; }
+ MV& operator <<=(int i) { x <<= i; y <<= i; return *this; }
MV operator >>(int i) const { return MV(x >> i, y >> i); }
MV operator *(int16_t i) const { return MV(x * i, y * i); }
- const MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); }
+ MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); }
- const MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); }
+ MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); }
bool operator ==(const MV& other) const { return word == other.word; }
bool operator !=(const MV& other) const { return word != other.word; }
+ bool operator !() const { return !word; }
+
// Scale down a QPEL mv to FPEL mv, rounding up by one HPEL offset
- MV roundToFPel() const { return MV(x + 2, y + 2) >> 2; }
+ MV roundToFPel() const { return MV((x + 2) >> 2, (y + 2) >> 2); }
// Scale up an FPEL mv to QPEL by shifting up two bits
MV toQPel() const { return *this << 2; }
param->rdPenalty = 0;
param->psyRd = 0.0;
param->psyRdoq = 0.0;
+ param->analysisMode = 0;
+ param->analysisFileName = NULL;
param->bIntraInBFrames = 0;
param->bLossless = 0;
param->bCULossless = 0;
param->rc.qpStep = 4;
param->rc.rateControlMode = X265_RC_CRF;
param->rc.qp = 32;
- param->rc.aqMode = X265_AQ_AUTO_VARIANCE;
+ param->rc.aqMode = X265_AQ_VARIANCE;
param->rc.aqStrength = 1.0;
param->rc.cuTree = 1;
param->rc.rfConstantMax = 0;
param->scenecutThreshold = 0;
param->rc.cuTree = 0;
}
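+ /* tune grain: lighter deblocking, weaker AQ, flatter QP, and strong psy-rdoq to retain film grain */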
+ else if (!strcmp(tune, "grain"))
+ {
+ param->deblockingFilterBetaOffset = -2;
+ param->deblockingFilterTCOffset = -2;
+ param->bIntraInBFrames = 0;
+ param->psyRdoq = 30;
+ param->psyRd = 0.5;
+ param->rc.ipFactor = 1.1;
+ param->rc.pbFactor = 1.1;
+ param->rc.aqMode = X265_AQ_VARIANCE;
+ param->rc.aqStrength = 0.3;
+ param->rc.qCompress = 0.8;
+ }
+ else if (!strcmp(tune, "cbr"))
+ {
+ param->rc.pbFactor = 1.0;
+ param->rc.rateTolerance = 0.5;
+ }
else
return -1;
}
}
}
}
- OPT("csv") p->csvfn = value;
- OPT("scaling-list") p->scalingLists = value;
- OPT("lambda-file") p->rc.lambdaFileName = value;
OPT("threads") p->poolNumThreads = atoi(value);
OPT("frame-threads") p->frameNumThreads = atoi(value);
OPT("pmode") p->bDistributeModeAnalysis = atobool(value);
OPT("psy-rdoq") p->psyRdoq = atof(value);
OPT("signhide") p->bEnableSignHiding = atobool(value);
OPT("b-intra") p->bIntraInBFrames = atobool(value);
- OPT("lft") p->bEnableLoopFilter = atobool(value);
+ OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */
+ OPT("deblock")
+ {
+ if (2 == sscanf(value, "%d:%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset) ||
+ 2 == sscanf(value, "%d,%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset))
+ {
+ p->bEnableLoopFilter = 1;
+ }
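+ /* a single value applies the same offset to both tC and Beta */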
+ else if (sscanf(value, "%d", &p->deblockingFilterTCOffset) == 1)
+ {
+ p->bEnableLoopFilter = 1;
+ p->deblockingFilterBetaOffset = p->deblockingFilterTCOffset;
+ }
+ else
+ p->bEnableLoopFilter = atobool(value);
+ }
OPT("sao") p->bEnableSAO = atobool(value);
OPT("sao-non-deblock") p->bSaoNonDeblocked = atobool(value);
OPT("ssim") p->bEnableSsim = atobool(value);
OPT("hrd") p->bEmitHRDSEI = atobool(value);
OPT2("ipratio", "ip-factor") p->rc.ipFactor = atof(value);
OPT2("pbratio", "pb-factor") p->rc.pbFactor = atof(value);
+ OPT("qcomp") p->rc.qCompress = atof(value);
+ OPT("qpstep") p->rc.qpStep = atoi(value);
+ OPT("ratetol") p->rc.rateTolerance = atof(value);
+ OPT("cplxblur") p->rc.complexityBlur = atof(value);
+ OPT("qblur") p->rc.qblur = atof(value);
OPT("aq-mode") p->rc.aqMode = atoi(value);
OPT("aq-strength") p->rc.aqStrength = atof(value);
OPT("vbv-maxrate") p->rc.vbvMaxBitrate = atoi(value);
&p->vui.defDispWinRightOffset,
&p->vui.defDispWinBottomOffset) != 4;
}
- OPT("nr") p->noiseReduction = atoi(value);
+ OPT("nr-intra") p->noiseReductionIntra = atoi(value);
+ OPT("nr-inter") p->noiseReductionInter = atoi(value);
OPT("pass")
{
int pass = Clip3(0, 3, atoi(value));
p->rc.bStatRead = pass & 2;
}
OPT("stats") p->rc.statFileName = strdup(value);
+ OPT("csv") p->csvfn = strdup(value);
+ OPT("scaling-list") p->scalingLists = strdup(value);
+ OPT("lambda-file") p->rc.lambdaFileName = strdup(value);
+ OPT("analysis-file") p->analysisFileName = strdup(value);
else
return X265_PARAM_BAD_NAME;
#undef OPT
"Aq-Mode is out of range");
CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
"Aq-Strength is out of range");
+ CHECK(param->deblockingFilterTCOffset < -6 || param->deblockingFilterTCOffset > 6,
+ "deblocking filter tC offset must be in the range of -6 to +6");
+ CHECK(param->deblockingFilterBetaOffset < -6 || param->deblockingFilterBetaOffset > 6,
+ "deblocking filter Beta offset must be in the range of -6 to +6");
CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0");
CHECK(param->psyRdoq < 0 || 50.0 < param->psyRdoq, "Psy-rdoq strength must be between 0 and 50.0");
CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative");
"Valid initial VBV buffer occupancy must be a fraction 0 - 1, or size in kbits");
CHECK(param->rc.bitrate < 0,
"Target bitrate can not be less than zero");
- if (param->noiseReduction)
- CHECK(100 > param->noiseReduction || param->noiseReduction > 1000, "Valid noise reduction range 100 - 1000");
+ CHECK(param->rc.qCompress < 0.5 || param->rc.qCompress > 1.0,
+ "qCompress must be between 0.5 and 1.0");
+ if (param->noiseReductionIntra)
+ CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
+ if (param->noiseReductionInter)
+ CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead,
"Constant rate-factor is incompatible with 2pass");
CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
{
static int once /* = 0 */;
- if (ATOMIC_CAS32(&once, 0, 1) == 1)
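+ /* ATOMIC_INC returns the incremented value, so every caller after the first takes this branch */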
+ if (ATOMIC_INC(&once) > 1)
{
if (param->maxCUSize != g_maxCUSize)
{
fprintf(stderr, "psy-rd=%.2lf ", param->psyRd);
if (param->psyRdoq > 0.)
fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq);
- TOOLOPT(param->bEnableEarlySkip, "esd");
- TOOLOPT(param->bEnableCbfFastMode, "cfm");
- if (param->noiseReduction)
- fprintf(stderr, "nr=%d ", param->noiseReduction);
- TOOLOPT(param->bEnableLoopFilter, "lft");
+ TOOLOPT(param->bEnableEarlySkip, "early-skip");
+ TOOLOPT(param->bEnableCbfFastMode, "fast-cbf");
+ if (param->noiseReductionIntra)
+ fprintf(stderr, "nr-intra=%d ", param->noiseReductionIntra);
+ if (param->noiseReductionInter)
+ fprintf(stderr, "nr-inter=%d ", param->noiseReductionInter);
+ if (param->bEnableLoopFilter)
+ {
+ if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset)
+ fprintf(stderr, "deblock(tC=%d:B=%d) ", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset);
+ else
+ TOOLOPT(param->bEnableLoopFilter, "deblock");
+ }
if (param->bEnableSAO)
fprintf(stderr, "sao%s ", param->bSaoNonDeblocked ? "-non-deblock" : "");
TOOLOPT(param->bEnableSignHiding, "signhide");
pixel* getCrAddr(uint32_t ctuAddr) { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; }
pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; }
pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); }
+ const pixel* getLumaAddr(uint32_t ctuAddr) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr]; }
+ const pixel* getCbAddr(uint32_t ctuAddr) const { return m_picOrg[1] + m_cuOffsetC[ctuAddr]; }
+ const pixel* getCrAddr(uint32_t ctuAddr) const { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; }
+ const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; }
+ const pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) const { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); }
/* get pointer to CU start address */
pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; }
pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+ const pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; }
+ const pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+ const pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+ const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
};
void updateChecksum(const pixel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, intptr_t stride, int row, uint32_t cuHeight);
using namespace x265;
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
- p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+ p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
// place functions in anonymous namespace (file static)
template<int lx, int ly>
-int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
sum += abs(pix1[x] - pix2[x]);
- }
pix1 += stride_pix1;
pix2 += stride_pix2;
}
template<int lx, int ly>
-int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
sum += abs(pix1[x] - pix2[x]);
- }
pix1 += stride_pix1;
pix2 += stride_pix2;
}
template<int lx, int ly>
-void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
+void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
}
template<int lx, int ly>
-void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
+void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
}
template<int lx, int ly, class T1, class T2>
-int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
+int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
int sum = 0;
- int iTemp;
+ int tmp;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
{
- iTemp = pix1[x] - pix2[x];
- sum += (iTemp * iTemp);
+ tmp = pix1[x] - pix2[x];
+ sum += (tmp * tmp);
}
pix1 += stride_pix1;
return (a + s) ^ s;
}
-int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][2];
sum2_t a0, a1, a2, a3, b0, b1;
return (int)(sum >> 1);
}
-int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
ssum2_t tmp[4][2];
ssum2_t a0, a1, a2, a3, b0, b1;
}
// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
-int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][4];
sum2_t a0, a1, a2, a3;
template<int w, int h>
// calculate satd in blocks of 4x4
-int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
- {
for (int col = 0; col < w; col += 4)
- {
satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
- }
- }
return satd;
}
template<int w, int h>
// calculate satd in blocks of 8x4
-int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
- {
for (int col = 0; col < w; col += 8)
- {
satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
- }
- }
return satd;
}
-inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
sum2_t tmp[8][4];
sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
return (int)sum;
}
-int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
-inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
ssum2_t tmp[8][4];
ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
return (int)sum;
}
-int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
-int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
+ _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
template<int w, int h>
// Calculate sa8d in blocks of 8x8
-int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 8)
- {
for (int x = 0; x < w; x += 8)
- {
cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
- }
- }
return cost;
}
template<int w, int h>
// Calculate sa8d in blocks of 16x16
-int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 16)
- {
for (int x = 0; x < w; x += 16)
- {
cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
- }
- }
return cost;
}
template<int size>
-int pixel_ssd_s_c(short *a, intptr_t dstride)
+int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
int sum = 0;
for (int y = 0; y < size; y++)
{
for (int x = 0; x < size; x++)
- {
sum += a[x] * a[x];
- }
+
a += dstride;
}
return sum;
}
template<int size>
-void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
+void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
for (int y = 0; y < size; y++)
- {
for (int x = 0; x < size; x++)
- {
dst[y * dstride + x] = val;
- }
- }
-}
-
-void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
-{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j]) << shift;
- }
- }
}
template<int size>
-void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
- }
- }
-}
-
-void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
-{
- int round = 1 << (shift - 1);
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
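+ /* pack rows of the strided 2D block into contiguous 1D order, left-shifted */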
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = src[j] << shift;
- src += size;
- dst += stride;
+ src += srcStride;
+ dst += size;
}
}
-void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- int round = 1 << (shift - 1);
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = (src[j] + round) >> shift;
- src += size;
- dst += stride;
+ src += srcStride;
+ dst += size;
}
}
template<int size>
-void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
+
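+ /* unpack contiguous 1D coefficients back into a strided 2D block */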
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = ((int16_t)src[j] << shift);
- }
+ dst[j] = src[j] << shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
template<int size>
-void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (src[j] << shift);
- }
+ dst[j] = (src[j] + round) >> shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
template<int blockSize>
-void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
+void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
for (int y = 0; y < blockSize; y++)
{
for (int x = 0; x < blockSize; x++)
- {
residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
- }
fenc += stride;
residual += stride;
}
template<int blockSize>
-void transpose(pixel* dst, pixel* src, intptr_t stride)
+void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
for (int k = 0; k < blockSize; k++)
- {
for (int l = 0; l < blockSize; l++)
- {
dst[k * blockSize + l] = src[l * stride + k];
- }
- }
}
-void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
int x, y;
}
}
-void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
+void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
int x, y;
}
template<int lx, int ly>
-void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
+void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
dst[x] = (src0[x] + src1[x] + 1) >> 1;
- }
src0 += sstride0;
src1 += sstride1;
}
}
-void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
int x;
}
}
-void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
- int x, y;
+ uint32_t x, y;
for (y = 0; y < 64; y += 2)
{
}
}
-void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
for (int y = 0; y < height; y++)
{
- pixel *src1 = src0 + src_stride;
- pixel *src2 = src1 + src_stride;
+ const pixel* src1 = src0 + src_stride;
+ const pixel* src2 = src1 + src_stride;
for (int x = 0; x < width; x++)
{
// slower than naive bilinear, but matches asm
}
/* structural similarity metric */
-void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
+void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
for (int z = 0; z < 2; z++)
{
}
template<int size>
-uint64_t pixel_var(pixel *pix, intptr_t i_stride)
+uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
uint32_t sum = 0, sqr = 0;
#endif
template<int size>
-int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
+int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
static pixel zeroBuf[8] /* = { 0 } */;
}
template<int size>
-int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
+int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
static int16_t zeroBuf[8] /* = { 0 } */;
}
}
-void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
- pixel *src, intptr_t srcStride, int w, int h)
-{
- for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
- {
- for (int x = 0; x < w; x++)
- {
- dstu[x] = src[2 * x];
- dstv[x] = src[2 * x + 1];
- }
- }
-}
-
template<int bx, int by>
-void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = b[x];
- }
a += stridea;
b += strideb;
}
template<int bx, int by>
-void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = b[x];
- }
a += stridea;
b += strideb;
}
template<int bx, int by>
-void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
}
template<int bx, int by>
-void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = (int16_t)b[x];
- }
a += stridea;
b += strideb;
}
template<int bx, int by>
-void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = (int16_t)(b0[x] - b1[x]);
- }
b0 += sstride0;
b1 += sstride1;
}
template<int bx, int by>
-void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = Clip(b0[x] + b1[x]);
- }
b0 += sstride0;
b1 += sstride1;
}
template<int bx, int by>
-void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
int shiftNum, offset;
}
}
-void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
- {
dst[c] = ((pixel)src[c]) << shift;
- }
dst += dstStride;
src += srcStride;
}
}
-void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
- {
dst[c] = (pixel)((src[c] >> shift) & mask);
- }
dst += dstStride;
src += srcStride;
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given CU. */
-void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
- int32_t *invQscales, double *fpsFactor, int len)
+void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
+ const int32_t* invQscales, const double* fpsFactor, int len)
{
double fps = *fpsFactor / 256;
primitives.extendRowBorder(pic, stride, width, height, marginX);
/* copy top row to create above margin */
- pixel *top = pic - marginX;
+ pixel* top = pic - marginX;
for (int y = 0; y < marginY; y++)
memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
/* copy bottom row to create below margin */
- pixel *bot = pic - marginX + (height - 1) * stride;
+ pixel* bot = pic - marginX + (height - 1) * stride;
for (int y = 0; y < marginY; y++)
memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}
p.satd[LUMA_64x16] = satd8<64, 16>;
p.satd[LUMA_16x64] = satd8<16, 64>;
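+ /* chroma satd is built from 4x4 and 8x4 kernels; partitions with a dimension of 2 or 6 have no kernel and stay NULL */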
+ p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
+
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
+
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;
+
#define CHROMA_420(W, H) \
p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define CHROMA_422(W, H) \
- p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
+ p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define CHROMA_444(W, H) \
+ p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-
-
LUMA(4, 4);
LUMA(8, 8);
CHROMA_420(4, 4);
CHROMA_444(64, 16);
CHROMA_444(16, 64);
- SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
- p.cvt16to32_shl = convert16to32_shl;
- p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
- p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
- p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
- p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
- p.cvt32to16_shr = convert32to16_shr;
- p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
- p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
- p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
- p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
- p.copy_shr = copy_shr;
- p.copy_shl[BLOCK_4x4] = copy_shl<4>;
- p.copy_shl[BLOCK_8x8] = copy_shl<8>;
- p.copy_shl[BLOCK_16x16] = copy_shl<16>;
- p.copy_shl[BLOCK_32x32] = copy_shl<32>;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
p.sa8d[BLOCK_4x4] = satd_4x4;
p.sa8d[BLOCK_8x8] = sa8d_8x8;
p.scale1D_128to64 = scale1D_128to64;
p.scale2D_64to32 = scale2D_64to32;
- p.frame_init_lowres_core = frame_init_lowres_core;
+ p.frameInitLowres = frame_init_lowres_core;
p.ssim_4x4x2_core = ssim_4x4x2_core;
p.ssim_end_4 = ssim_end_4;
p.var[BLOCK_16x16] = pixel_var<16>;
p.var[BLOCK_32x32] = pixel_var<32>;
p.var[BLOCK_64x64] = pixel_var<64>;
- p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
p.planecopy_cp = planecopy_cp_c;
p.planecopy_sp = planecopy_sp_c;
p.propagateCost = estimateCUPropagateCost;
{
int tuSize = 1 << log2TrSize;
- pixel *refLft, *refAbv;
+ pixel* refLft;
+ pixel* refAbv;
if (!(g_intraFilterFlags[dirMode] & tuSize))
{
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
- predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
- predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
}
}
else
if (bLuma)
{
- predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
- predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+ predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
}
if (bChroma)
{
- predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
- predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+ predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
}
if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
- predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
- predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
}
}
else
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
- predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
if (bChroma)
- predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
- predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
if (bChroma)
- predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
}
}
}
void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
{
- pixel *dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
+ pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
intptr_t dstStride = dstYuv.m_size;
intptr_t srcStride = refPic.m_stride;
intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
- pixel* src = const_cast<PicYuv&>(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+ const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
int xFrac = mv.x & 0x3;
int yFrac = mv.y & 0x3;
void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
{
- int16_t *dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
+ int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
int dstStride = dstSYuv.m_size;
intptr_t srcStride = refPic.m_stride;
intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
- pixel *src = const_cast<PicYuv&>(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+ const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
int xFrac = mv.x & 0x3;
int yFrac = mv.y & 0x3;
intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
- pixel* refCb = const_cast<PicYuv&>(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
- pixel* refCr = const_cast<PicYuv&>(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx);
pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx);
intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
- pixel* refCb = const_cast<PicYuv&>(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
- pixel* refCr = const_cast<PicYuv&>(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx);
int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx);
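/* full-pel chroma motion vector: no interpolation is required, the reference
 * pixels are only converted to 16-bit (p2s) for the later weighting and
 * averaging stages */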
if (!(yFrac | xFrac))
{
- primitives.chroma_p2s[m_csp](refCb, refStride, dstCb, cxWidth, cxHeight);
- primitives.chroma_p2s[m_csp](refCr, refStride, dstCr, cxWidth, cxHeight);
+ primitives.chroma[m_csp].p2s(refCb, refStride, dstCb, cxWidth, cxHeight);
+ primitives.chroma[m_csp].p2s(refCr, refStride, dstCr, cxWidth, cxHeight);
}
else if (!yFrac)
{
int w0, w1, offset, shiftNum, shift, round;
uint32_t src0Stride, src1Stride, dststride;
- pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
- pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
- pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-
- const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
- const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
- const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
-
- const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
- const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
- const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
-
if (bLuma)
{
+ pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
+ const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
+ const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
+
// Luma
w0 = wp0[0].w;
offset = wp0[0].o + wp1[0].o;
if (bChroma)
{
+ pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
+ pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
+ const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
+ const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
+ const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
+ const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
+
// Chroma U
w0 = wp0[1].w;
offset = wp0[1].o + wp1[1].o;
/* weighted averaging for uni-pred */
void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
{
- pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
- pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
- pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-
- const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
- const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
- const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
-
int w0, offset, shiftNum, shift, round;
uint32_t srcStride, dstStride;
if (bLuma)
{
+ pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
+ const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
+
// Luma
w0 = wp[0].w;
offset = wp[0].offset;
srcStride = srcYuv.m_size;
dstStride = predYuv.m_size;
- primitives.weight_sp(const_cast<int16_t*>(srcY0), dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
+ primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
}
if (bChroma)
{
+ pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
+ pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
+ const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
+ const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
+
// Chroma U
w0 = wp[1].w;
offset = wp[1].offset;
uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift;
uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift;
- primitives.weight_sp(const_cast<int16_t*>(srcU0), dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
+ primitives.weight_sp(srcU0, dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
// Chroma V
w0 = wp[2].w;
shift = wp[2].shift + shiftNum;
round = shift ? (1 << (shift - 1)) : 0;
- primitives.weight_sp(const_cast<int16_t*>(srcV0), dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
+ primitives.weight_sp(srcV0, dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
}
}
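/* Per plane, weight_sp applies roughly
 *   dst = clip(((src * w0 + round) >> shift) + offset)
 * with src in internal (shifted 16-bit) precision; the exact internal-offset
 * compensation and clipping live in the primitive itself. */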
int tuSize = intraNeighbors.tuSize;
int tuSize2 = tuSize << 1;
- pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = cu.m_encData->m_reconPicYuv->m_stride;
+ pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
uint32_t tuSize = intraNeighbors.tuSize;
- const pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = cu.m_encData->m_reconPicYuv->m_strideC;
+ const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
pixel* adiRef = getAdiChromaBuf(chromaId, tuSize);
fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
}
int numIntraNeighbor = 0;
- bool *bNeighborFlags = intraNeighbors->bNeighborFlags;
+ bool* bNeighborFlags = intraNeighbors->bNeighborFlags;
uint32_t partIdxLT, partIdxRT, partIdxLB;
}
else // reference samples are partially available
{
- const bool *bNeighborFlags = intraNeighbors.bNeighborFlags;
- const bool *pNeighborFlags;
+ const bool* bNeighborFlags = intraNeighbors.bNeighborFlags;
+ const bool* pNeighborFlags;
int aboveUnits = intraNeighbors.aboveUnits;
int leftUnits = intraNeighbors.leftUnits;
int unitWidth = intraNeighbors.unitWidth;
int unitHeight = intraNeighbors.unitHeight;
int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth);
pixel adiLineBuffer[5 * MAX_CU_SIZE];
- pixel *adi;
+ pixel* adi;
// Initialize
for (int i = 0; i < totalSamples; i++)
while (next < totalUnits && !bNeighborFlags[next])
next++;
- pixel *pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
+ pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
const pixel refSample = *pAdiLineNext;
// Pad unavailable samples with new value
int nextOrTop = X265_MIN(next, leftUnits);
return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
}
-int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags)
+int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
{
const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1;
const uint32_t idxStep = 1;
- bool *validFlagPtr = bValidFlags;
+ bool* validFlagPtr = bValidFlags;
int numIntra = 0;
for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
return numIntra;
}
-int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags)
+int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
{
const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1;
const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
- bool *validFlagPtr = bValidFlags;
+ bool* validFlagPtr = bValidFlags;
int numIntra = 0;
for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
return numIntra;
}
-int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags)
+int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
{
const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1;
- bool *validFlagPtr = bValidFlags;
+ bool* validFlagPtr = bValidFlags;
int numIntra = 0;
for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
return numIntra;
}
-int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags)
+int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
{
const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1;
- bool *validFlagPtr = bValidFlags;
+ bool* validFlagPtr = bValidFlags;
int numIntra = 0;
for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
- p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
+ p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
+ p.chroma[X265_CSP_I444].satd[i] = p.satd[i];
}
for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
}
- for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
- {
- int partL = partitionFromLog2Size(i + 2);
- p.square_copy_pp[i] = p.luma_copy_pp[partL];
- p.square_copy_ps[i] = p.luma_copy_ps[partL];
- p.square_copy_sp[i] = p.luma_copy_sp[partL];
- p.square_copy_ss[i] = p.luma_copy_ss[partL];
- }
-
primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4];
primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8];
primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4];
primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12];
primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16];
+
+ // Chroma SATD can often reuse luma primitives
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = primitives.satd[LUMA_4x4];
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = primitives.satd[LUMA_8x8];
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = primitives.satd[LUMA_16x16];
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = primitives.satd[LUMA_32x32];
+
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = primitives.satd[LUMA_8x4];
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = primitives.satd[LUMA_4x8];
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = primitives.satd[LUMA_16x8];
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = primitives.satd[LUMA_8x16];
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = primitives.satd[LUMA_32x16];
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = primitives.satd[LUMA_16x32];
+
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = primitives.satd[LUMA_16x12];
+ p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = primitives.satd[LUMA_12x16];
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = primitives.satd[LUMA_16x4];
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = primitives.satd[LUMA_4x16];
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = primitives.satd[LUMA_32x24];
+ p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = primitives.satd[LUMA_24x32];
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = primitives.satd[LUMA_32x8];
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = primitives.satd[LUMA_8x32];
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = primitives.satd[LUMA_4x8];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = primitives.satd[LUMA_8x16];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = primitives.satd[LUMA_16x32];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = primitives.satd[LUMA_32x64];
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = primitives.satd[LUMA_4x4];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = primitives.satd[LUMA_8x8];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = primitives.satd[LUMA_4x16];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = primitives.satd[LUMA_16x16];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = primitives.satd[LUMA_8x32];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = primitives.satd[LUMA_32x32];
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = primitives.satd[LUMA_16x64];
+
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = primitives.satd[LUMA_8x4];
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = primitives.satd[LUMA_16x8];
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16];
+ //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;
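+ /* The CHROMA/CHROMA422 enums name chroma block dimensions, so any entry
+ * with a same-sized luma partition simply reuses the luma satd. The
+ * commented-out sizes (8x12, 16x24, 12x32, 4x32, 32x48, 24x64, 8x64)
+ * have no luma twin and would need dedicated satd4/satd8 instances. */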
}
}
using namespace x265;
if (!primitives.sad[0])
{
Setup_C_Primitives(primitives);
- Setup_Instrinsic_Primitives(primitives, cpuid);
#if ENABLE_ASSEMBLY
+ Setup_Instrinsic_Primitives(primitives, cpuid);
Setup_Assembly_Primitives(primitives, cpuid);
#else
x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
#endif
Setup_Alias_Primitives(primitives);
-
- initROM();
}
if (param->logLevel >= X265_LOG_INFO)
}
}
-#if !defined(ENABLE_ASSEMBLY)
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
+#if ENABLE_ASSEMBLY
+/* these functions are implemented in assembly. When assembly is not being
+ * compiled, they are unnecessary and can be NOPs */
+#else
extern "C" {
-// the intrinsic primitives will not use MMX instructions, so if assembly
-// is disabled there should be no reason to use EMMS.
+int x265_cpu_cpuid_test(void) { return 0; }
void x265_cpu_emms(void) {}
-
-#if defined(X265_ARCH_X86)
-
-#if defined(_MSC_VER)
-# pragma warning(disable: 4100)
-#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
-# define __cpuidex(regsArray, level, index) \
- __asm__ __volatile__ ("cpuid" \
- : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \
- : "0" (level), "2" (index));
-#else
-# error "compiler not supported"
-#endif
-
-int x265_cpu_cpuid_test(void)
-{
- return 0;
+void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {}
+void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
}
-
-void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
-{
- int output[4];
-
- __cpuidex(output, op, 0);
- *eax = output[0];
- *ebx = output[1];
- *ecx = output[2];
- *edx = output[3];
-}
-
-void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
-{
- uint64_t out = 0;
-
-#if X265_ARCH_X86
-
-#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
- // MSVC 2010 SP1 or later, or similar Intel release
- out = _xgetbv(op);
-
-#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
-
- uint32_t a, d;
- __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op) :);
- *eax = a;
- *edx = d;
- return;
-
-#elif defined(_WIN64) // On x64 with older compilers, this is impossible
-
-#endif // if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
-#endif // if x86
-
- *eax = (uint32_t)out;
- *edx = (uint32_t)(out >> 32);
-}
-
-#endif // X265_ARCH_X86
-}
-#endif // if !ENABLE_ASSEMBLY
+#endif
return log2Size - 2;
}
-typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
-typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
-typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
-typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride);
-typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
-typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
-typedef void (*blockcpy_sp_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
-typedef void (*blockcpy_sc_t)(int bx, int by, int16_t *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned
-typedef void (*pixelsub_ps_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
-typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
-
-typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
-typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
-
-typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
-typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
-typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
-typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
-typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
-typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
-typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
-
-typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
-typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
-typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
-
-typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
-typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
-
-typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
-typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
+typedef int (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef int (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef int (*pixelcmp_sp_t)(const int16_t* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride);
+typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
+typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
+
+typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
+typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
+
+typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+
+typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
+typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
+
+typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
+typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
+
+typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height);
typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
-typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]);
+typedef void (*ssim_4x4x2_core_t)(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width);
-typedef uint64_t (*var_t)(pixel *pix, intptr_t stride);
-typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h);
+typedef uint64_t (*var_t)(const pixel* pix, intptr_t stride);
+typedef void (*plane_copy_deinterleave_t)(pixel* dstu, intptr_t dstuStride, pixel* dstv, intptr_t dstvStride, const pixel* src, intptr_t srcStride, int w, int h);
-typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-typedef void (*filter_ps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_sp_t) (int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_ss_t) (int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
-typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+typedef void (*filter_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+typedef void (*filter_ps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
-typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
-typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
-typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-typedef void (*copy_ss_t)(int16_t *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
+typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
+typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
-typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+typedef void (*pixel_sub_ps_t)(int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
-typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft);
-typedef void (*planecopy_cp_t) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
-typedef void (*planecopy_sp_t) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
-typedef void (*cutree_propagate_cost) (int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len);
+typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
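+/* A pattern runs through these typedefs: every operand that is only read is
+ * now declared const, which is what lets call sites elsewhere in this change
+ * drop their const_cast workarounds. */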
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
* a vectorized primitive, or a C function. */
struct EncoderPrimitives
{
- pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size
- pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size
- pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size
- pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
- pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed
- pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed
- pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed
- pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD)
- pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
- pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks
- pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks
- pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
-
- blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value
- cvt16to32_shl_t cvt16to32_shl;
- cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
- cvt32to16_shr_t cvt32to16_shr;
- cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
- copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
- copy_shr_t copy_shr;
- copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1];
-
- copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
- copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
- copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS];
- copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS];
- pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS];
- pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS];
- copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS];
- copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS];
- copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS];
- copy_ss_t square_copy_ss[NUM_SQUARE_BLOCKS];
-
- filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
- filter_hps_t luma_hps[NUM_LUMA_PARTITIONS];
- filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];
- filter_ps_t luma_vps[NUM_LUMA_PARTITIONS];
- filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS];
- filter_ss_t luma_vss[NUM_LUMA_PARTITIONS];
- filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
- filter_p2s_t luma_p2s;
- filter_p2s_t chroma_p2s[X265_CSP_COUNT];
-
- weightp_sp_t weight_sp;
- weightp_pp_t weight_pp;
- pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS];
- addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS];
-
- intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
- intra_allangs_t intra_pred_allangs[NUM_TR_SIZE];
- scale_t scale1D_128to64;
- scale_t scale2D_64to32;
-
- dct_t dct[NUM_DCTS];
- idct_t idct[NUM_IDCTS];
- quant_t quant;
- nquant_t nquant;
- dequant_scaling_t dequant_scaling;
- dequant_normal_t dequant_normal;
- count_nonzero_t count_nonzero;
- denoiseDct_t denoiseDct;
-
- calcresidual_t calcresidual[NUM_SQUARE_BLOCKS];
- transpose_t transpose[NUM_SQUARE_BLOCKS];
-
- var_t var[NUM_SQUARE_BLOCKS];
- ssim_4x4x2_core_t ssim_4x4x2_core;
- ssim_end4_t ssim_end_4;
-
- downscale_t frame_init_lowres_core;
- plane_copy_deinterleave_t plane_copy_deinterleave_c;
- extendCURowBorder_t extendRowBorder;
- // sao primitives
- saoCuOrgE0_t saoCuOrgE0;
- planecopy_cp_t planecopy_cp;
- planecopy_sp_t planecopy_sp;
-
- cutree_propagate_cost propagateCost;
+ pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size
+ pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size
+ pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size
+ pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
+ pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed
+ pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed
+ pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed
+ pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD)
+ pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
+ pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks
+ pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks
+ pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
+
+ dct_t dct[NUM_DCTS];
+ idct_t idct[NUM_IDCTS];
+ quant_t quant;
+ nquant_t nquant;
+ dequant_scaling_t dequant_scaling;
+ dequant_normal_t dequant_normal;
+ count_nonzero_t count_nonzero;
+ denoiseDct_t denoiseDct;
+ calcresidual_t calcresidual[NUM_SQUARE_BLOCKS];
+ blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value
+ cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
+ cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
+ cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
+ cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
+ copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
+
+ intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
+ intra_allangs_t intra_pred_allangs[NUM_TR_SIZE];
+ transpose_t transpose[NUM_SQUARE_BLOCKS];
+ scale_t scale1D_128to64;
+ scale_t scale2D_64to32;
+
+ var_t var[NUM_SQUARE_BLOCKS];
+ ssim_4x4x2_core_t ssim_4x4x2_core;
+ ssim_end4_t ssim_end_4;
+
+ saoCuOrgE0_t saoCuOrgE0;
+
+ downscale_t frameInitLowres;
+ cutree_propagate_cost propagateCost;
+
+ extendCURowBorder_t extendRowBorder;
+ planecopy_cp_t planecopy_cp;
+ planecopy_sp_t planecopy_sp;
+
+ weightp_sp_t weight_sp;
+ weightp_pp_t weight_pp;
+ pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS];
+ addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS];
+
+ filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
+ filter_hps_t luma_hps[NUM_LUMA_PARTITIONS];
+ filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];
+ filter_ps_t luma_vps[NUM_LUMA_PARTITIONS];
+ filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS];
+ filter_ss_t luma_vss[NUM_LUMA_PARTITIONS];
+ filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
+ filter_p2s_t luma_p2s;
+
+ copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
+ copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
+ copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS];
+ copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS];
+ pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS];
+ pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS];
struct
{
+ pixelcmp_t satd[NUM_LUMA_PARTITIONS];
filter_pp_t filter_vpp[NUM_LUMA_PARTITIONS];
filter_ps_t filter_vps[NUM_LUMA_PARTITIONS];
filter_sp_t filter_vsp[NUM_LUMA_PARTITIONS];
copy_ss_t copy_ss[NUM_LUMA_PARTITIONS];
pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS];
pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS];
- } chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here
+ filter_p2s_t p2s;
+ } chroma[X265_CSP_COUNT];
};
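/* Usage sketch: once setup has run, kernels are invoked through the table,
 * e.g.
 *   int cost = primitives.satd[LUMA_8x8](fenc, fencStride, fref, frefStride);
 *   primitives.chroma[csp].copy_ss[part](dst, dstStride, src, srcStride);
 * so a call site never needs to know whether it reached C, intrinsic, or
 * assembly code. */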
void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
}
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
// NOTE: mapping to x86 hardware instruction BSR
unsigned long size;
- CLZ32(size, absLevel);
+ CLZ(size, absLevel);
int egs = size * 2 + 1;
rate += egs << 15;
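/* CLZ yields floor(log2(absLevel)), making egs the bit length of the
 * Exp-Golomb-style escape code; the << 15 scales bits into FIX15 rate
 * units (IEP_RATE) */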
}
/* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(absLevel, "absLevel should not be zero\n");
if (symbol)
{
unsigned long idx;
- CLZ32(idx, symbol + 1);
+ CLZ(idx, symbol + 1);
length = idx;
}
m_useRDOQ = useRDOQ;
m_psyRdoqScale = (int64_t)(psyScale * 256.0);
m_scalingList = &scalingList;
- m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
+ m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
int qpy = ctu.m_qp[0];
m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
- setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat);
- setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat);
+ setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+ setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
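+ // chromaQpOffset[0] holds the Cb offset and [1] the Cr offset, replacing
+ // the separate chromaCbQpOffset/chromaCrQpOffset fields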
}
void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
{
const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
- const uint16_t *scan = codeParams.scan;
+ const uint16_t* scan = codeParams.scan;
bool lastCG = true;
for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
return numSig;
}
-uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride,
+uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
{
+ const uint32_t sizeIdx = log2TrSize - 2;
if (cu.m_tqBypass[absPartIdx])
{
X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
- return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
+ return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
}
bool isLuma = ttype == TEXT_LUMA;
bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip;
- bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA;
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
- int trSize = 1 << log2TrSize;
X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
if (useTransformSkip)
{
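/* transform skip: the residual bypasses the DCT entirely; only the bit-depth
 * scaling shift is applied while copying it out */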
#if X265_DEPTH <= 10
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ X265_CHECK(transformShift >= 0, "invalid transformShift\n");
+ primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
#else
if (transformShift >= 0)
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
else
- {
- int shift = -transformShift;
- int offset = (1 << (shift - 1));
- primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
- }
+ primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
+ bool isIntra = cu.isIntra(absPartIdx);
int useDST = !sizeIdx && isLuma && isIntra;
int index = DCT_4x4 + sizeIdx - useDST;
- primitives.dct[index](residual, m_resiDctCoeff, stride);
+ primitives.dct[index](residual, m_resiDctCoeff, resiStride);
/* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
* there is no risk of performing this DCT unnecessarily */
if (usePsy)
{
+ int trSize = 1 << log2TrSize;
/* perform DCT on source pixels for psy-rdoq */
- primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
+ primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
}
- if (m_nr && !isIntra)
+ if (m_nr)
{
/* denoise now covers intra residual too (so the 4x4 intra luma DST output
 * is accumulated as well); intra and inter use separate category ranges,
 * see NoiseReduction */
- int cat = sizeIdx + 4 * !isLuma;
+ int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
int numCoeff = 1 << (log2TrSize * 2);
primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
m_nr->count[cat]++;
int scalingListType = ttype + (isLuma ? 3 : 0);
int rem = m_qpParam[ttype].rem;
int per = m_qpParam[ttype].per;
- int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
int qbits = QUANT_SHIFT + per + transformShift;
int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
}
}
-void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
{
+ const uint32_t sizeIdx = log2TrSize - 2;
if (transQuantBypass)
{
- primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
+ primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
return;
}
if (m_scalingList->m_bEnabled)
{
int scalingListType = (bIntra ? 0 : 3) + ttype;
- int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
}
else
if (useTransformSkip)
{
- int trSize = 1 << log2TrSize;
-
#if X265_DEPTH <= 10
- primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ X265_CHECK(transformShift > 0, "invalid transformShift\n");
+ primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
#else
if (transformShift > 0)
- primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
else
- primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
+ primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
// DC only
if (numSig == 1 && coeff[0] != 0 && !useDST)
{
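/* a DC-only block reduces the whole IDCT to scaling one coefficient; the
 * folded constants (64 >> 6 with shift_1st of 7 - 6, 64 >> 3 with shift_2nd
 * lowered by 3) compute exactly the same dc_val as the original form that
 * multiplied by 64 with shifts of 7 and 12 - (X265_DEPTH - 8) */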
- const int shift_1st = 7;
+ const int shift_1st = 7 - 6;
const int add_1st = 1 << (shift_1st - 1);
- const int shift_2nd = 12 - (X265_DEPTH - 8);
+ const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
const int add_2nd = 1 << (shift_2nd - 1);
- int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
- primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
+ int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
+ primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
return;
}
- primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
+ primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
}
}
/* Rate distortion optimized quantization for entropy coding engines using
* probability models like CABAC */
-uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
{
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
int per = m_qpParam[ttype].per;
int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
int add = (1 << (qbits - 1));
- int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
int numCoeff = 1 << (log2TrSize * 2);
/* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
* scale applied that must be removed during unquant. Note that in real dequant there is clipping
* at several stages. We skip the clipping for simplicity when measuring RD cost */
- int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
int scaleBits = SCALE_BITS - 2 * transformShift;
// coefficient level estimation
const uint32_t oneCtx = 4 * ctxSet + c1;
const uint32_t absCtx = ctxSet + c2;
- const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
- const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
+ const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
+ const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
uint16_t level = 0;
uint32_t sigCoefBits = 0;
}
};
+#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
+#define MAX_NUM_TR_CATEGORIES 16 /* 32, 16, 8, 4 transform categories each for luma and chroma */
+
+// NOTE: MUST be 16-byte aligned for asm code
+struct NoiseReduction
+{
+ /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
+ * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
+ * Intra 0..7 - Inter 8..15 */
+ uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+ uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+ uint32_t count[MAX_NUM_TR_CATEGORIES];
+};
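+/* The category index used by Quant::transformNxN is
+ *   cat = sizeIdx + 4 * !isLuma + 8 * !isIntra,
+ * so e.g. an inter chroma 16x16 block accumulates into category 2 + 4 + 8 = 14. */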
+
class Quant
{
protected:
bool m_useRDOQ;
int64_t m_psyRdoqScale;
- int32_t* m_resiDctCoeff;
- int32_t* m_fencDctCoeff;
+ int16_t* m_resiDctCoeff;
+ int16_t* m_fencDctCoeff;
int16_t* m_fencShortBuf;
enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */
/* CU setup */
void setQPforQuant(const CUData& ctu);
- uint32_t transformNxN(CUData& cu, pixel *fenc, uint32_t fencstride, int16_t* residual, uint32_t stride, coeff_t* coeff,
+ uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
- void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+ void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
/* static methods shared with entropy.cpp */
uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
- uint32_t rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
+ uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
inline uint32_t getRateLast(uint32_t posx, uint32_t posy) const;
};
const int16_t* src = getLumaAddr(absPartIdx);
int16_t* dst = dstYuv.getLumaAddr(absPartIdx);
- primitives.square_copy_ss[log2Size - 2](dst, dstYuv.m_size, const_cast<int16_t*>(src), m_size);
+ primitives.luma_copy_ss[log2Size - 2](dst, dstYuv.m_size, src, m_size);
}
void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
const int16_t* src = getLumaAddr(absPartIdx);
pixel* dst = dstYuv.getLumaAddr(absPartIdx);
- primitives.square_copy_sp[log2Size - 2](dst, dstYuv.m_size, const_cast<int16_t*>(src), m_size);
+ primitives.luma_copy_sp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
}
void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
int16_t* dstU = dstYuv.getCbAddr(absPartIdx);
int16_t* dstV = dstYuv.getCrAddr(absPartIdx);
- primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, const_cast<int16_t*>(srcU), m_csize);
- primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, const_cast<int16_t*>(srcV), m_csize);
+ primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, srcV, m_csize);
}
void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
pixel* dstU = dstYuv.getCbAddr(absPartIdx);
pixel* dstV = dstYuv.getCrAddr(absPartIdx);
- primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, const_cast<int16_t*>(srcU), m_csize);
- primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, const_cast<int16_t*>(srcV), m_csize);
+ primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, srcV, m_csize);
}
{
uint32_t maxCuDQPDepth;
- int chromaCbQpOffset; // use param
- int chromaCrQpOffset; // use param
+ int chromaQpOffset[2]; // use param
bool bUseWeightPred; // use param
bool bUseWeightedBiPred; // use param
void setRefPicList(PicList& picList);
+ const Frame* getRefPic(int list, int refIdx) const { return refIdx >= 0 ? m_refPicList[list][refIdx] : NULL; }
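+ // a negative refIdx encodes "no reference in this list", hence NULL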
+
bool getRapPicFlag() const
{
return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
/*****************************************************************************
- * x265: threading class and intrinsics
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
{
DWORD threadId;
- this->thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId);
+ thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId);
return threadId > 0;
}
void Thread::stop()
{
- if (this->thread)
- WaitForSingleObject(this->thread, INFINITE);
+ if (thread)
+ WaitForSingleObject(thread, INFINITE);
}
Thread::~Thread()
{
- if (this->thread)
- CloseHandle(this->thread);
+ if (thread)
+ CloseHandle(thread);
}
#else /* POSIX / pthreads */
bool Thread::start()
{
- if (pthread_create(&this->thread, NULL, ThreadShim, this))
+ if (pthread_create(&thread, NULL, ThreadShim, this))
{
- this->thread = 0;
-
+ thread = 0;
return false;
}
void Thread::stop()
{
- if (this->thread)
- pthread_join(this->thread, NULL);
+ if (thread)
+ pthread_join(thread, NULL);
}
Thread::~Thread() {}
Thread::Thread()
{
- this->thread = 0;
+ thread = 0;
}
+
}
/*****************************************************************************
- * x265: threading class and intrinsics
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
#include <sys/time.h>
#include <unistd.h>
-#define CLZ32(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
-#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
-#define ATOMIC_OR(ptr, mask) __sync_or_and_fetch(ptr, mask)
-#define ATOMIC_CAS(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
-#define ATOMIC_CAS32(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
+#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
+#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x)
+#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask)
+#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1)
#define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1)
#define GIVE_UP_TIME() usleep(0)
#include <intrin.h>
-#if !_WIN64
-inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
- uint32_t high32 = (uint32_t)(x64 >> 32);
- uint32_t low32 = (uint32_t)x64;
-
- if (high32)
- {
- _BitScanReverse(id, high32);
- *id += 32;
- return 1;
- }
- else if (low32)
- return _BitScanReverse(id, low32);
- else
- return *id = 0;
-}
-
-inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
- uint32_t high32 = (uint32_t)(x64 >> 32);
- uint32_t low32 = (uint32_t)x64;
-
- if (high32)
- {
- _BitScanForward(id, high32);
- *id += 32;
- return 1;
- }
- else if (low32)
- return _BitScanForward(id, low32);
- else
- return *id = 0;
-}
-
-#endif // if !_WIN64
-
-#ifndef ATOMIC_OR
-#define ATOMIC_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, mask)
-#endif
-
-#define CLZ32(id, x) _BitScanReverse(&id, x)
-#define CTZ64(id, x) _BitScanForward64(&id, x)
-#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
-#define ATOMIC_CAS32(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
+#define CLZ(id, x) _BitScanReverse(&id, x)
+#define CTZ(id, x) _BitScanForward(&id, x)
#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr)
#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr)
+#define ATOMIC_OR(ptr, mask) _InterlockedOr((volatile LONG*)ptr, (LONG)mask)
+#define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
#define GIVE_UP_TIME() Sleep(0)
#endif // ifdef __GNUC__
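For reference, the bit-scan macros above resolve to single instructions on both toolchains: CLZ(id, x) stores the index of the highest set bit of x (the GCC form computes __builtin_clz(x) ^ 31, i.e. floor(log2 x)), and CTZ(id, x) the index of the lowest set bit. Both are undefined for x == 0, which is why every caller tests the mask first. A minimal GCC-only sketch of those semantics (standalone illustration, not part of the patch):

#include <cassert>
#include <cstdint>

static inline unsigned long msbIndex(uint32_t x) { return (unsigned long)__builtin_clz(x) ^ 31; }
static inline unsigned long lsbIndex(uint32_t x) { return (unsigned long)__builtin_ctz(x); }

int main()
{
    assert(msbIndex(0x10) == 4);  // highest set bit of 16 is bit 4
    assert(lsbIndex(0x18) == 3);  // lowest set bit of 24 is bit 3
    return 0;
}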
/*****************************************************************************
- * x265: singleton thread pool and interface classes
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
int m_numThreads;
int m_numSleepMapWords;
PoolThread *m_threads;
- volatile uint64_t *m_sleepMap;
+ volatile uint32_t *m_sleepMap;
/* Lock for write access to the provider lists. Threads are
* always allowed to read m_firstProvider and follow the
void ThreadPoolImpl::markThreadAsleep(int id)
{
- int word = id >> 6;
- uint64_t bit = 1LL << (id & 63);
+ int word = id >> 5;
+ uint32_t bit = 1 << (id & 31);
ATOMIC_OR(&m_sleepMap[word], bit);
}
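Moving the sleep map from 64-bit to 32-bit words only changes the word/bit split; a worked example of the arithmetic above (the thread id is arbitrary):

#include <cassert>

int main()
{
    int id = 37;                     // arbitrary thread id for illustration
    int word = id >> 5;              // 37 / 32 == 1
    unsigned bit = 1u << (id & 31);  // 37 % 32 == 5, so bit 5
    assert(word == 1 && bit == 32u); // thread 37 lives as bit 5 of m_sleepMap[1]
    return 0;
}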
* not give up until a thread is awakened or all of them are awake */
for (int i = 0; i < m_numSleepMapWords; i++)
{
- uint64_t oldval = m_sleepMap[i];
+ uint32_t oldval = m_sleepMap[i];
while (oldval)
{
unsigned long id;
- CTZ64(id, oldval);
+ CTZ(id, oldval);
- uint64_t newval = oldval & ~(1LL << id);
- if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval)
+ uint32_t bit = 1 << id;
+ if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit)
{
- m_threads[(i << 6) | id].poke();
+ m_threads[i * 32 + id].poke();
return;
}
, m_firstProvider(NULL)
, m_lastProvider(NULL)
{
- m_numSleepMapWords = (numThreads + 63) >> 6;
- m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
+ m_numSleepMapWords = (numThreads + 31) >> 5;
+ m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords);
char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
m_threads = reinterpret_cast<PoolThread*>(buffer);
if (m_threads && m_sleepMap)
{
for (int i = 0; i < m_numSleepMapWords; i++)
- {
m_sleepMap[i] = 0;
- }
m_ok = true;
int i;
}
if (m_ok)
- {
waitForAllIdle();
- }
else
{
// stop threads that did start up
int id = 0;
do
{
- int word = id >> 6;
- uint64_t bit = 1LL << (id & 63);
+ int word = id >> 5;
+ uint32_t bit = 1 << (id & 31);
if (m_sleepMap[word] & bit)
- {
id++;
- }
else
{
GIVE_UP_TIME();
{
// cleanup thread handles
for (int i = 0; i < m_numThreads; i++)
- {
m_threads[i].~PoolThread();
- }
X265_FREE(reinterpret_cast<char*>(m_threads));
}
/*****************************************************************************
- * x265: singleton thread pool and interface classes
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
{ 83, 36, 83, 36, 83, 36, 83, 36 },
{ 36, -83, 36, -83, 36, -83, 36, -83 }
};
-void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
{
__m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
__m128i T00, T01, T02, T03, T04, T05, T06, T07;
m128iAdd = _mm_set1_epi32(64);
- T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
- m128iS1 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
- m128iS3 = _mm_packs_epi32(T00, T01);
+ m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
+ m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
- T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
- m128iS5 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
- m128iS7 = _mm_packs_epi32(T00, T01);
+ m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
+ m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
/* ------- */
- T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
- m128iS0 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
- m128iS4 = _mm_packs_epi32(T00, T01);
+ m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
+ m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
/* ------- */
- T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
- m128iS2 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
- m128iS6 = _mm_packs_epi32(T00, T01);
+ m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
+ m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
_mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
}
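The idct* signature changes above follow the coefficient buffers shrinking from int32_t to int16_t: every removed pair of 32-bit loads plus _mm_packs_epi32 collapses into a single 16-bit load. A small standalone sketch of the before/after load pattern (array contents are arbitrary):

#include <emmintrin.h>
#include <cassert>
#include <cstdint>

// old layout: 8 coefficients = two 32-bit loads + saturating pack
static __m128i load8_old(const int32_t* src)
{
    __m128i lo = _mm_load_si128((const __m128i*)(src + 0));
    __m128i hi = _mm_load_si128((const __m128i*)(src + 4));
    return _mm_packs_epi32(lo, hi);
}

// new layout: 8 coefficients = one 16-bit load
static __m128i load8_new(const int16_t* src)
{
    return _mm_load_si128((const __m128i*)src);
}

int main()
{
    alignas(16) int32_t a32[8] = { 1, -2, 3, -4, 5, -6, 7, -8 };
    alignas(16) int16_t a16[8] = { 1, -2, 3, -4, 5, -6, 7, -8 };
    alignas(16) int16_t outOld[8], outNew[8];
    _mm_store_si128((__m128i*)outOld, load8_old(a32));
    _mm_store_si128((__m128i*)outNew, load8_new(a16));
    for (int i = 0; i < 8; i++)
        assert(outOld[i] == outNew[i]);
    return 0;
}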
-void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
for (int i = 0; i < 2; i++)
{
const int offset = (i << 3);
- __m128i T00, T01;
-
- T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
- in00[i] = _mm_packs_epi32(T00, T01); // [07 06 05 04 03 02 01 00]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
- in01[i] = _mm_packs_epi32(T00, T01); // [17 16 15 14 13 12 11 10]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
- in02[i] = _mm_packs_epi32(T00, T01); // [27 26 25 24 23 22 21 20]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
- in03[i] = _mm_packs_epi32(T00, T01); // [37 36 35 34 33 32 31 30]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
- in04[i] = _mm_packs_epi32(T00, T01); // [47 46 45 44 43 42 41 40]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
- in05[i] = _mm_packs_epi32(T00, T01); // [57 56 55 54 53 52 51 50]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
- in06[i] = _mm_packs_epi32(T00, T01); // [67 66 65 64 63 62 61 60]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
- in07[i] = _mm_packs_epi32(T00, T01); // [77 76 75 74 73 72 71 70]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
- in08[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
- in09[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
- in10[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
- in11[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
- in12[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
- in13[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
- in14[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
- in15[i] = _mm_packs_epi32(T00, T01);
+ in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]
+ in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]
+ in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]
+ in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]
+ in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]
+ in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]
+ in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]
+ in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]
+ in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
+ in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
+ in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
+ in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
+ in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
+ in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
+ in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
+ in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
}
for (int pass = 0; pass < 2; pass++)
_mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
}
-void idct32(int32_t *src, int16_t *dst, intptr_t stride)
+void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
//Odd
const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0
for (int i = 0; i < 4; i++)
{
const int offset = (i << 3);
- __m128i T00, T01;
-
- T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
- in00[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
- in01[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
- in02[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
- in03[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
- in04[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
- in05[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
- in06[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
- in07[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
- in08[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
- in09[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
- in10[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
- in11[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
- in12[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
- in13[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
- in14[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
- in15[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
- in16[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
- in17[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
- in18[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
- in19[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
- in20[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
- in21[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
- in22[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
- in23[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
- in24[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
- in25[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
- in26[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
- in27[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
- in28[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
- in29[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
- in30[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
- in31[i] = _mm_packs_epi32(T00, T01);
+ in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
+ in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
+ in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
+ in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
+ in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
+ in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
+ in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
+ in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
+ in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
+ in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
+ in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
+ in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
+ in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
+ in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
+ in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
+ in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
+ in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
+ in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
+ in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
+ in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
+ in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
+ in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
+ in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
+ in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
+ in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
+ in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
+ in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
+ in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
+ in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
+ in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
+ in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
+ in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
}
for (int pass = 0; pass < 2; pass++)
using namespace x265;
namespace {
-void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num too large\n");
quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
- sign = _mm_srai_epi16(quantCoef12, 15);
- quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
- quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
}
}
else
quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
- sign = _mm_srai_epi16(quantCoef12, 15);
- quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
- quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
}
}
}
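With coef now int16_t, dequant_scaling can store the result of _mm_packs_epi32 directly: the pack saturates each 32-bit lane to [-32768, 32767], which is exactly the range the deleted sign-extend/unpack/store sequence was preserving. A scalar model of that one-lane saturation (for illustration only):

#include <algorithm>
#include <cassert>
#include <cstdint>

static inline int16_t packs32to16(int32_t v)   // what _mm_packs_epi32 does per lane
{
    return (int16_t)std::min(32767, std::max(-32768, v));
}

int main()
{
    assert(packs32to16(40000)  ==  32767);
    assert(packs32to16(-50000) == -32768);
    assert(packs32to16(123)    ==    123);
    return 0;
}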
#undef MAKE_COEF
};
-void dct16(int16_t *src, int32_t *dst, intptr_t stride)
+void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
// Const
__m128i c_4 = _mm_set1_epi32(4);
T41 = _mm_hsub_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
- _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
+ T40 = _mm_packs_epi32(T40, T40);
+ T41 = _mm_packs_epi32(T41, T41);
+ _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
+ _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
T40 = _mm_hadd_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
T40 = _mm_hadd_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
T40 = _mm_hadd_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
T40 = _mm_hadd_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
T40 = _mm_hadd_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
T40 = _mm_hadd_epi32(T30, T31);
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
#define MAKE_ODD(tab, dstPos) \
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
\
T40 = _mm_hadd_epi32(T30, T31); \
T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
- _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);
+ T40 = _mm_packs_epi32(T40, T40); \
+ _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
MAKE_ODD(14, 1);
MAKE_ODD(16, 3);
#undef MAKE_COEF16
};
-void dct32(int16_t *src, int32_t *dst, intptr_t stride)
+void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
// Const
__m128i c_8 = _mm_set1_epi32(8);
T60 = _mm_hadd_epi32(T60, T61); \
\
T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
- _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
+ T60 = _mm_packs_epi32(T60, T60); \
+ _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
MAKE_ODD(44, 44, 44, 44, 0);
MAKE_ODD(45, 45, 45, 45, 16);
{
m_numRows = numRows;
- m_numWords = (numRows + 63) >> 6;
- m_internalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+ m_numWords = (numRows + 31) >> 5;
+ m_internalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
if (m_internalDependencyBitmap)
- memset((void*)m_internalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
- m_externalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+ m_externalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
if (m_externalDependencyBitmap)
- memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
return m_internalDependencyBitmap && m_externalDependencyBitmap;
}
void WaveFront::clearEnabledRowMask()
{
- memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
}
void WaveFront::enqueueRow(int row)
{
- // thread safe
- uint64_t bit = 1LL << (row & 63);
-
- X265_CHECK(row < m_numRows, "invalid row\n");
- ATOMIC_OR(&m_internalDependencyBitmap[row >> 6], bit);
+ uint32_t bit = 1 << (row & 31);
+ ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
if (m_pool) m_pool->pokeIdleThread();
}
void WaveFront::enableRow(int row)
{
- // thread safe
- uint64_t bit = 1LL << (row & 63);
-
- X265_CHECK(row < m_numRows, "invalid row\n");
- ATOMIC_OR(&m_externalDependencyBitmap[row >> 6], bit);
+ uint32_t bit = 1 << (row & 31);
+ ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
}
void WaveFront::enableAllRows()
{
- memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint64_t) * m_numWords);
-}
-
-bool WaveFront::checkHigherPriorityRow(int curRow)
-{
- int fullwords = curRow >> 6;
- uint64_t mask = (1LL << (curRow & 63)) - 1;
-
- // Check full bitmap words before curRow
- for (int i = 0; i < fullwords; i++)
- {
- if (m_internalDependencyBitmap[i] & m_externalDependencyBitmap[i])
- return true;
- }
-
- // check the partially masked bitmap word of curRow
- if (m_internalDependencyBitmap[fullwords] & m_externalDependencyBitmap[fullwords] & mask)
- return true;
- return false;
+ memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint32_t) * m_numWords);
}
bool WaveFront::dequeueRow(int row)
{
- uint64_t oldval, newval;
-
- oldval = m_internalDependencyBitmap[row >> 6];
- newval = oldval & ~(1LL << (row & 63));
- return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval;
+ uint32_t bit = 1 << (row & 31);
+ return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
}
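dequeueRow now depends on ATOMIC_AND returning the value held before the AND: the caller owns the row iff the claimed bit was still set in that return value, which replaces the old compare-and-swap retry loop. The same idiom expressed with std::atomic, as a standalone sketch (not the code above):

#include <atomic>
#include <cstdint>

static bool claimBit(std::atomic<uint32_t>& word, uint32_t bit)
{
    // fetch_and returns the previous value; the bit was ours iff it was set there
    return (word.fetch_and(~bit) & bit) != 0;
}

int main()
{
    std::atomic<uint32_t> w(0x5u);        // bits 0 and 2 set
    bool first  = claimBit(w, 1u << 2);   // true: this call cleared bit 2
    bool second = claimBit(w, 1u << 2);   // false: the bit was already gone
    return (first && !second) ? 0 : 1;
}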
bool WaveFront::findJob(int threadId)
// thread safe
for (int w = 0; w < m_numWords; w++)
{
- uint64_t oldval = m_internalDependencyBitmap[w];
- while (oldval & m_externalDependencyBitmap[w])
+ uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
+ while (oldval)
{
- uint64_t mask = oldval & m_externalDependencyBitmap[w];
-
- CTZ64(id, mask);
+ CTZ(id, oldval);
- uint64_t newval = oldval & ~(1LL << id);
- if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval)
+ uint32_t bit = 1 << id;
+ if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
{
- // we cleared the bit, process row
- processRow(w * 64 + id, threadId);
+ /* we cleared the bit, we get to process the row */
+ processRow(w * 32 + id, threadId);
return true;
}
+
// some other thread cleared the bit, try another bit
- oldval = m_internalDependencyBitmap[w];
+ oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
}
}
// Dependencies are categorized as internal and external. Internal dependencies
// are caused by neighbor block availability. External dependencies are generally
// reference frame reconstructed pixels being available.
- uint64_t volatile *m_internalDependencyBitmap;
- uint64_t volatile *m_externalDependencyBitmap;
+ uint32_t volatile *m_internalDependencyBitmap;
+ uint32_t volatile *m_externalDependencyBitmap;
// number of words in the bitmap
int m_numWords;
// Start or resume encode processing of this row, must be implemented by
// derived classes.
virtual void processRow(int row, int threadId) = 0;
-
- // Returns true if a row above curRow is available for processing. The processRow()
- // method may call this function periodically and voluntarily exit
- bool checkHigherPriorityRow(int curRow);
};
} // end namespace x265
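As the comment above notes, a row is runnable only once its bit is set in both the internal and the external dependency bitmap; findJob() ANDs the two words before scanning. A standalone model of that readiness test (names invented for illustration):

#include <cstdint>

static inline bool rowReady(const uint32_t* internalMap, const uint32_t* externalMap, int row)
{
    uint32_t bit = 1u << (row & 31);
    return (internalMap[row >> 5] & externalMap[row >> 5] & bit) != 0;
}

int main()
{
    uint32_t internalMap[2] = { 0, 1u << 6 };   // row 38 enqueued (word 1, bit 6)
    uint32_t externalMap[2] = { 0, 1u << 6 };   // row 38 enabled
    return rowReady(internalMap, externalMap, 38) ? 0 : 1;
}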
#define WakeAllConditionVariable x265::cond_broadcast
#define XP_CONDITION_VAR_FREE x265::cond_destroy
-#if defined(_MSC_VER)
-
-/* Windows XP did not define atomic OR 64, but gcc has a good version, so
- * only use this workaround when targeting XP with MSVC */
-FORCEINLINE LONGLONG interlocked_OR64(__inout LONGLONG volatile *Destination,
- __in LONGLONG Value)
-{
- LONGLONG Old;
-
- do
- {
- Old = *Destination;
- }
- while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old);
-
- return Old;
-}
-
-#define ATOMIC_OR(ptr, mask) x265::interlocked_OR64((volatile LONG64*)ptr, mask)
-
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#pragma intrinsic(_InterlockedCompareExchange64)
-#endif
-#endif // defined(_MSC_VER)
} // namespace x265
#else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600)
p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
- p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
CHROMA_PIXELSUB_PS(_sse2);
CHROMA_PIXELSUB_PS_422(_sse2);
CHROMA_VERT_FILTERS_422(_sse2);
CHROMA_VERT_FILTERS_444(_sse2);
p.luma_p2s = x265_luma_p2s_sse2;
- p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
- p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
- p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
+ p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
+ p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
+ p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444, chroma_p2s can be replaced by luma_p2s
p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
+#if X86_64
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
+#endif
p.idct[IDST_4x4] = x265_idst4_sse2;
LUMA_SS_FILTERS(_sse2);
p.quant = x265_quant_sse4;
p.nquant = x265_nquant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
- p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
- p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
- p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
- p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
- p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
}
if (cpuMask & X265_CPU_XOP)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
+ p.frameInitLowres = x265_frame_init_lowres_core_xop;
SA8D_INTER_FROM_BLOCK(xop);
INIT7(satd, _xop);
HEVC_SATD(xop);
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
#if X86_64
p.dct[DCT_8x8] = x265_dct8_avx2;
p.dct[DCT_16x16] = x265_dct16_avx2;
p.idct[IDCT_8x8] = x265_idct8_avx2;
p.idct[IDCT_16x16] = x265_idct16_avx2;
p.idct[IDCT_32x32] = x265_idct32_avx2;
-
p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
INIT8(sad_x4, _mmx2);
p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
- p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
+ p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
- p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
+ p.frameInitLowres = x265_frame_init_lowres_core_sse2;
SA8D_INTER_FROM_BLOCK(sse2);
- p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
+
p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
+
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
+#if X86_64
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
+#endif
p.idct[IDST_4x4] = x265_idst4_sse2;
+
p.planecopy_sp = x265_downShift_16_sse2;
- p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
- p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
- p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
- p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
}
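The cvt* primitives replaced above are renamed to make the data-flow direction explicit: cpy2Dto1D_* reads a strided 2D block into a linear coefficient array, cpy1Dto2D_* writes it back, shifting left or right (with rounding) in flight. A plain-C sketch of the assumed semantics; the explicit size parameter stands in for the fixed per-block-size entry points, and the signatures are a reading of the names rather than copies from a header:

#include <cstdint>

static void cpy2Dto1D_shl_c(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift, int size)
{
    for (int y = 0; y < size; y++, src += srcStride, dst += size)
        for (int x = 0; x < size; x++)
            dst[x] = (int16_t)(src[x] << shift);
}

static void cpy1Dto2D_shr_c(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift, int size)
{
    int round = 1 << (shift - 1);   // assumes shift >= 1
    for (int y = 0; y < size; y++, dst += dstStride, src += size)
        for (int x = 0; x < size; x++)
            dst[x] = (int16_t)((src[x] + round) >> shift);
}

int main()
{
    int16_t block[4] = { 1, 2, 3, 4 };       // a 2x2 block, stride 2
    int16_t line[4], back[4];
    cpy2Dto1D_shl_c(line, block, 2, 1, 2);   // line = { 2, 4, 6, 8 }
    cpy1Dto2D_shr_c(back, line, 2, 1, 2);    // rounds back to the original block
    return 0;
}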
if (cpuMask & X265_CPU_SSSE3)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3;
+ p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
SA8D_INTER_FROM_BLOCK(ssse3);
p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
ASSGN_SSE(ssse3);
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
p.luma_p2s = x265_luma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
+ p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
+ p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
+ p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_ssse3; // for i444, chroma_p2s can use luma_p2s
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
CHROMA_ADDAVG_422(_sse4);
- p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
- p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
- p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
- p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
- p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
// TODO: check POPCNT flag!
p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
INTRA_ANG_SSE4(sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
- p.copy_shr = x265_copy_shr_sse4;
- p.denoiseDct = x265_denoise_dct_sse4;
+// p.denoiseDct = x265_denoise_dct_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
+ p.frameInitLowres = x265_frame_init_lowres_core_avx;
HEVC_SATD(avx);
SA8D_INTER_FROM_BLOCK(avx);
ASSGN_SSE(avx);
}
if (cpuMask & X265_CPU_XOP)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
+ p.frameInitLowres = x265_frame_init_lowres_core_xop;
SA8D_INTER_FROM_BLOCK(xop);
INIT7(satd, _xop);
INIT5_NAME(sse_pp, ssd, _xop);
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
- p.denoiseDct = x265_denoise_dct_avx2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
+
+// p.denoiseDct = x265_denoise_dct_avx2;
p.dct[DCT_4x4] = x265_dct4_avx2;
p.quant = x265_quant_avx2;
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
+
p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
p.weight_pp = x265_weight_pp_avx2;
#if X86_64
+
p.dct[DCT_8x8] = x265_dct8_avx2;
p.dct[DCT_16x16] = x265_dct16_avx2;
p.dct[DCT_32x32] = x265_dct32_avx2;
p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
+
+ p.luma_vpp[LUMA_12x16] = x265_interp_8tap_vert_pp_12x16_avx2;
+
+ p.luma_vpp[LUMA_16x4] = x265_interp_8tap_vert_pp_16x4_avx2;
+ p.luma_vpp[LUMA_16x8] = x265_interp_8tap_vert_pp_16x8_avx2;
+ p.luma_vpp[LUMA_16x12] = x265_interp_8tap_vert_pp_16x12_avx2;
+ p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
+ p.luma_vpp[LUMA_16x32] = x265_interp_8tap_vert_pp_16x32_avx2;
+ p.luma_vpp[LUMA_16x64] = x265_interp_8tap_vert_pp_16x64_avx2;
+
+ p.luma_vpp[LUMA_24x32] = x265_interp_8tap_vert_pp_24x32_avx2;
+
+ p.luma_vpp[LUMA_32x8] = x265_interp_8tap_vert_pp_32x8_avx2;
+ p.luma_vpp[LUMA_32x16] = x265_interp_8tap_vert_pp_32x16_avx2;
+ p.luma_vpp[LUMA_32x24] = x265_interp_8tap_vert_pp_32x24_avx2;
+ p.luma_vpp[LUMA_32x32] = x265_interp_8tap_vert_pp_32x32_avx2;
+ p.luma_vpp[LUMA_32x64] = x265_interp_8tap_vert_pp_32x64_avx2;
+
+ p.luma_vpp[LUMA_48x64] = x265_interp_8tap_vert_pp_48x64_avx2;
+
+ p.luma_vpp[LUMA_64x16] = x265_interp_8tap_vert_pp_64x16_avx2;
+ p.luma_vpp[LUMA_64x32] = x265_interp_8tap_vert_pp_64x32_avx2;
+ p.luma_vpp[LUMA_64x48] = x265_interp_8tap_vert_pp_64x48_avx2;
+ p.luma_vpp[LUMA_64x64] = x265_interp_8tap_vert_pp_64x64_avx2;
#endif
p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
+
+ p.luma_hpp[LUMA_8x4] = x265_interp_8tap_horiz_pp_8x4_avx2;
+ p.luma_hpp[LUMA_8x8] = x265_interp_8tap_horiz_pp_8x8_avx2;
+ p.luma_hpp[LUMA_8x16] = x265_interp_8tap_horiz_pp_8x16_avx2;
+ p.luma_hpp[LUMA_8x32] = x265_interp_8tap_horiz_pp_8x32_avx2;
+
+ p.luma_hpp[LUMA_16x4] = x265_interp_8tap_horiz_pp_16x4_avx2;
+ p.luma_hpp[LUMA_16x8] = x265_interp_8tap_horiz_pp_16x8_avx2;
+ p.luma_hpp[LUMA_16x12] = x265_interp_8tap_horiz_pp_16x12_avx2;
+ p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
+ p.luma_hpp[LUMA_16x32] = x265_interp_8tap_horiz_pp_16x32_avx2;
+ p.luma_hpp[LUMA_16x64] = x265_interp_8tap_horiz_pp_16x64_avx2;
+
+ p.luma_hpp[LUMA_32x8] = x265_interp_8tap_horiz_pp_32x8_avx2;
+ p.luma_hpp[LUMA_32x16] = x265_interp_8tap_horiz_pp_32x16_avx2;
+ p.luma_hpp[LUMA_32x24] = x265_interp_8tap_horiz_pp_32x24_avx2;
+ p.luma_hpp[LUMA_32x32] = x265_interp_8tap_horiz_pp_32x32_avx2;
+ p.luma_hpp[LUMA_32x64] = x265_interp_8tap_horiz_pp_32x64_avx2;
+
+ p.luma_hpp[LUMA_64x64] = x265_interp_8tap_horiz_pp_64x64_avx2;
+ p.luma_hpp[LUMA_64x48] = x265_interp_8tap_horiz_pp_64x48_avx2;
+ p.luma_hpp[LUMA_64x32] = x265_interp_8tap_horiz_pp_64x32_avx2;
+ p.luma_hpp[LUMA_64x16] = x265_interp_8tap_horiz_pp_64x16_avx2;
+
+ p.luma_hpp[LUMA_48x64] = x265_interp_8tap_horiz_pp_48x64_avx2;
+
+ p.chroma[X265_CSP_I420].filter_hpp[CHROMA_8x8] = x265_interp_4tap_horiz_pp_8x8_avx2;
+ p.chroma[X265_CSP_I420].filter_hpp[CHROMA_4x4] = x265_interp_4tap_horiz_pp_4x4_avx2;
+ p.chroma[X265_CSP_I420].filter_hpp[CHROMA_32x32] = x265_interp_4tap_horiz_pp_32x32_avx2;
+ p.chroma[X265_CSP_I420].filter_hpp[CHROMA_16x16] = x265_interp_4tap_horiz_pp_16x16_avx2;
+
+ p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
+
+ p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2;
+ p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
+ p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
+ p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
+
+ // color space i420
+ p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
+ p.chroma[X265_CSP_I420].filter_vpp[CHROMA_8x8] = x265_interp_4tap_vert_pp_8x8_avx2;
+
+ // color space i422
+ p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
+
+ p.luma_vps[LUMA_4x4] = x265_interp_8tap_vert_ps_4x4_avx2;
+
+#if X86_64
+ p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;
+ p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;
+#endif
}
#endif // if HIGH_BIT_DEPTH
}
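These tables are populated in ascending CPU-feature order, so each later cpuMask block simply overwrites the slower pointer installed before it. A minimal standalone model of that dispatch pattern (the struct, stand-in kernels, and mask bits are invented for illustration):

#include <cstdint>

typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t stride);

struct Primitives { idct_t idct8; };

static void idct8_c(const int16_t*, int16_t*, intptr_t)    {}  // stand-ins for real kernels
static void idct8_sse2(const int16_t*, int16_t*, intptr_t) {}
static void idct8_avx2(const int16_t*, int16_t*, intptr_t) {}

enum { CPU_SSE2 = 1 << 0, CPU_AVX2 = 1 << 1 };  // illustrative mask bits

static void setupPrimitives(Primitives& p, uint32_t cpuMask)
{
    p.idct8 = idct8_c;                            // portable baseline first
    if (cpuMask & CPU_SSE2) p.idct8 = idct8_sse2;
    if (cpuMask & CPU_AVX2) p.idct8 = idct8_avx2; // later features win
}

int main()
{
    Primitives p;
    setupPrimitives(p, CPU_SSE2 | CPU_AVX2);
    return p.idct8 == idct8_avx2 ? 0 : 1;
}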
SECTION .text
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x2, 4, 6, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
BLOCKCOPY_PP_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x8, 4, 7, 8
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x16, 4, 7, 2
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x2, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x6, 4, 7, 6
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x12, 4, 5, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
INIT_XMM sse2
BLOCKCOPY_PP_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
BLOCKCOPY_PP_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
INIT_XMM sse2
BLOCKCOPY_PP_W16_H4 16, 12
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
INIT_XMM sse2
BLOCKCOPY_PP_W16_H8 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W24_H4 2
INIT_XMM sse2
BLOCKCOPY_PP_W24_H4 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H4 2
INIT_XMM sse2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_pp_32x24, 4, 7, 6
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H16_avx 2
INIT_YMM avx
BLOCKCOPY_PP_W32_H16_avx 32, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H2 2
INIT_XMM sse2
BLOCKCOPY_PP_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
INIT_XMM sse2
BLOCKCOPY_PP_W64_H4 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x4, 4, 5, 2
;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x8, 4, 5, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W2_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
add r3, r3
mov r6d, %2/2
.loop:
BLOCKCOPY_SP_W2_H2 2, 16
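Behaviorally, the blockcopy_sp_* kernels narrow 16-bit data back to pixel width, and the SSE paths do it with a saturating pack. A scalar model for an 8-bit build (the width/height parameters stand in for the fixed per-kernel sizes; this is a sketch, not the project's C reference):

#include <algorithm>
#include <cstdint>

typedef uint8_t pixel;   // assuming an 8-bit build for this sketch

static void blockcopy_sp_c(pixel* dst, intptr_t dstStride,
                           const int16_t* src, intptr_t srcStride,
                           int width, int height)
{
    for (int y = 0; y < height; y++, dst += dstStride, src += srcStride)
        for (int x = 0; x < width; x++)
            dst[x] = (pixel)std::min(255, std::max(0, (int)src[x]));  // packuswb-style saturation
}

int main()
{
    int16_t src[2] = { 300, -7 };   // out-of-range values to show the clamping
    pixel dst[2];
    blockcopy_sp_c(dst, 2, src, 2, 2, 1);
    return (dst[0] == 255 && dst[1] == 0) ? 0 : 1;
}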
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W4_H8 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/8
BLOCKCOPY_SP_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_6x8, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W6_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
add r3, r3
mov r6d, %2/2
.loop:
BLOCKCOPY_SP_W6_H2 6, 16
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
add r3, r3
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
add r3, r3
mov r4d, %2/4
.loop:
BLOCKCOPY_SP_W8_H4 8, 12
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/8
BLOCKCOPY_SP_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W12_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/4
BLOCKCOPY_SP_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/4
BLOCKCOPY_SP_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W24_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2/2
BLOCKCOPY_SP_W24_H2 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/2
BLOCKCOPY_SP_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W48_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2
BLOCKCOPY_SP_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H1 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2
BLOCKCOPY_SP_W64_H1 64, 64
;-----------------------------------------------------------------------------
-; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_4x4(int16_t* dst, intptr_t dstStride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
add r1, r1
RET
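;-----------------------------------------------------------------------------
; Reference sketch (assumed): blockfill_s_NxN broadcasts one 16-bit value
; across an NxN block (N baked into each kernel). In C, roughly:
;
;   void blockfill_s_c(int16_t* dst, intptr_t dstStride, int16_t val)
;   {
;       for (int y = 0; y < N; y++, dst += dstStride)
;           for (int x = 0; x < N; x++)
;               dst[x] = val;
;   }
;
; Here too the element stride is doubled (add r1, r1) to get a byte stride.
;-----------------------------------------------------------------------------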
;-----------------------------------------------------------------------------
-; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_8x8(int16_t* dst, intptr_t dstStride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
add r1, r1
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W16_H8 2
INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
mov r3d, %2/8
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W32_H4 2
INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
mov r3d, %2/4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
add r1, r1
mov r4d, 16/2
.loop:
;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W4_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
BLOCKCOPY_PS_W6_H4 6, 16
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
RET
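;-----------------------------------------------------------------------------
; Reference sketch (assumed): blockcopy_ps_WxH widens pixels to int16_t, the
; inverse of blockcopy_sp above. In C, roughly:
;
;   void blockcopy_ps_c(int16_t* dst, intptr_t dstStride,
;                       const pixel* src, intptr_t srcStride)
;   {
;       for (int y = 0; y < H; y++)
;       {
;           for (int x = 0; x < W; x++)
;               dst[x] = (int16_t)src[x];
;           dst += dstStride;
;           src += srcStride;
;       }
;   }
;
; Only r1 (dstStride) is doubled: in the 8-bit build assumed here the source
; pixels are one byte each, while the destination elements are two.
;-----------------------------------------------------------------------------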
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W8_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W12_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
BLOCKCOPY_PS_W12_H2 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
add r1, r1
pxor m0, m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
BLOCKCOPY_PS_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W24_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
BLOCKCOPY_PS_W24_H2 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W32_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
BLOCKCOPY_PS_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W48_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
BLOCKCOPY_PS_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W64_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
BLOCKCOPY_PS_W64_H2 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x4, 4, 6, 0
RET
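;-----------------------------------------------------------------------------
; Reference sketch (assumed): blockcopy_ss_WxH is a straight 16-bit to 16-bit
; strided copy, so both strides get doubled to byte units:
;
;   #include <string.h>
;
;   void blockcopy_ss_c(int16_t* dst, intptr_t dstStride,
;                       const int16_t* src, intptr_t srcStride)
;   {
;       for (int y = 0; y < H; y++)   /* W, H fixed per kernel */
;       {
;           memcpy(dst, src, W * sizeof(int16_t));
;           dst += dstStride;
;           src += srcStride;
;       }
;   }
;-----------------------------------------------------------------------------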
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x8, 4, 6, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x16, 4, 7, 0
;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x2, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W4_H8 2
INIT_XMM sse2
BLOCKCOPY_SS_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x8, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x16, 4, 5, 4
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x2, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
BLOCKCOPY_SS_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
BLOCKCOPY_SS_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
BLOCKCOPY_SS_W16_H4 16, 12
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4_avx 2
INIT_YMM avx
BLOCKCOPY_SS_W16_H4_avx 16, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H8 2
INIT_XMM sse2
BLOCKCOPY_SS_W16_H8 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W24_H4 2
INIT_XMM sse2
BLOCKCOPY_SS_W24_H4 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W32_H4 2
INIT_XMM sse2
BLOCKCOPY_SS_W32_H4 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W48_H2 2
INIT_XMM sse2
BLOCKCOPY_SS_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4 2
INIT_XMM sse2
-cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2/4
add r1, r1
add r3, r3
BLOCKCOPY_SS_W64_H4 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4_avx 2
INIT_YMM avx
-cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
mov r4d, %2/4
add r1, r1
add r3, r3
BLOCKCOPY_SS_W64_H4_avx 64, 48
BLOCKCOPY_SS_W64_H4_avx 64, 64
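;-----------------------------------------------------------------------------
; Note (assumed rationale): the _avx variants run under INIT_YMM, so each
; mova moves 32 bytes instead of 16; a 64-wide row (128 bytes of int16_t)
; takes four loads and stores instead of eight per pass.
;-----------------------------------------------------------------------------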
-;-----------------------------------------------------------------------------
-; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride
-%define rnd m2
-%define shift m1
-
- ; make shift
- mov r5d, r3m
- movd shift, r5d
-
- ; make round
- dec r5
- xor r6, r6
- bts r6, r5
-
- movd rnd, r6d
- pshufd rnd, rnd, 0
-
- ; register alloc
- ; r0 - dst
- ; r1 - src
- ; r2 - stride * 2 (short*)
- ; r3 - lx
- ; r4 - size
- ; r5 - ly
- ; r6 - diff
- add r2d, r2d
-
- mov r4d, r4m
- mov r5, r4
- mov r6, r2
- sub r6, r4
- add r6, r6
-
- shr r5, 1
-.loop_row:
-
- mov r3, r4
- shr r3, 2
-.loop_col:
- ; row 0
- movu m0, [r1]
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0], m0
-
- ; row 1
- movu m0, [r1 + r4 * 4]
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0 + r2], m0
-
- ; move col pointer
- add r1, 16
- add r0, 8
-
- dec r3
- jg .loop_col
-
- ; update pointer
- lea r1, [r1 + r4 * 4]
- add r0, r6
-
- ; end of loop_row
- dec r5
- jg .loop_row
-
- RET
-
-
-;--------------------------------------------------------------------------------------
-; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
-;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
-%define shift m1
-
- ; make shift
- mov r5d, r3m
- movd shift, r5d
-
- ; register alloc
- ; r0 - dst
- ; r1 - src
- ; r2 - stride
- ; r3 - shift
- ; r4 - size
-
- sub r2d, r4d
- add r2d, r2d
- mov r5d, r4d
- shr r4d, 2
-.loop_row:
- mov r6d, r4d
-
-.loop_col:
- pmovsxwd m0, [r1]
- pslld m0, shift
- movu [r0], m0
-
- add r1, 8
- add r0, 16
-
- dec r6d
- jnz .loop_col
-
- add r1, r2
- dec r5d
- jnz .loop_row
- RET
-
-
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_4, 3,3,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_4, 3, 4, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; m0 - shift
- ; m1 - dword [offset]
-
- ; Row 0
- pmovsxwd m2, [r1]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 0 * mmsize], m2
-
- ; Row 1
- pmovsxwd m2, [r1 + r2]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 1 * mmsize], m2
+ ; m1 - word [-round]
- ; Row 2
+ ; Row 0-3
+ movh m2, [r1]
+ movhps m2, [r1 + r2]
lea r1, [r1 + r2 * 2]
- pmovsxwd m2, [r1]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 2 * mmsize], m2
-
- ; Row 3
- pmovsxwd m2, [r1 + r2]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 3 * mmsize], m2
+ movh m3, [r1]
+ movhps m3, [r1 + r2]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
RET
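;-----------------------------------------------------------------------------
; Note on the rounding constant (assumed reading): pcmpeqw sets every word of
; m1 to -1, psllw by the shift gives -(1 << shift), and psraw by 1 halves it
; to -(1 << (shift - 1)) == -round. The kernels then evaluate
; (src + round) >> shift as (src - (-round)) >> shift via psubw/psraw,
; staying entirely in 16-bit lanes. A C sketch of cpy2Dto1D_shr for an NxN
; block (N baked into each kernel):
;
;   void cpy2Dto1D_shr_c(int16_t* dst, const int16_t* src,
;                        intptr_t srcStride, int shift)
;   {
;       const int16_t round = (int16_t)(1 << (shift - 1));  /* shift >= 1 */
;       for (int y = 0; y < N; y++)
;       {
;           for (int x = 0; x < N; x++)
;               dst[x] = (int16_t)((src[x] + round) >> shift);
;           src += srcStride;
;           dst += N;   /* output is a dense 1D buffer */
;       }
;   }
;-----------------------------------------------------------------------------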
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_8, 3,5,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_8, 3, 5, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 8/4
lea r4, [r2 * 3]
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; r4 - stride * 3
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
- ; Row 0
- pmovsxwd m2, [r1]
- pmovsxwd m3, [r1 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
+ ; Row 0-1
+ mova m2, [r1]
+ mova m3, [r1 + r2]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
- ; Row 1
- pmovsxwd m2, [r1 + r2]
- pmovsxwd m3, [r1 + r2 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 2 * mmsize], m2
- movu [r0 + 3 * mmsize], m3
-
- ; Row 2
- pmovsxwd m2, [r1 + r2 * 2]
- pmovsxwd m3, [r1 + r2 * 2 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
-
- ; Row 3
- pmovsxwd m2, [r1 + r4]
- pmovsxwd m3, [r1 + r4 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 6 * mmsize], m2
- movu [r0 + 7 * mmsize], m3
-
- add r0, 8 * mmsize
+ ; Row 2-3
+ mova m2, [r1 + r2 * 2]
+ mova m3, [r1 + r4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
lea r1, [r1 + r2 * 4]
dec r3d
jnz .loop
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_16, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_16, 3, 4, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 16/2
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
; Row 0
- pmovsxwd m2, [r1 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + 1 * mmsize/2]
- pmovsxwd m4, [r1 + 2 * mmsize/2]
- pmovsxwd m5, [r1 + 3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
; Row 1
- pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + r2 +1 * mmsize/2]
- pmovsxwd m4, [r1 + r2 +2 * mmsize/2]
- pmovsxwd m5, [r1 + r2 +3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
+ mova m2, [r1 + r2 + 0 * mmsize]
+ mova m3, [r1 + r2 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
lea r1, [r1 + r2 * 2]
dec r3d
jnz .loop
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_32, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_32, 3, 4, 6
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 32/1
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
; Row 0
- pmovsxwd m2, [r1 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + 1 * mmsize/2]
- pmovsxwd m4, [r1 + 2 * mmsize/2]
- pmovsxwd m5, [r1 + 3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
-
- pmovsxwd m2, [r1 + 4 * mmsize/2]
- pmovsxwd m3, [r1 + 5 * mmsize/2]
- pmovsxwd m4, [r1 + 6 * mmsize/2]
- pmovsxwd m5, [r1 + 7 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ add r0, 4 * mmsize
add r1, r2
dec r3d
jnz .loop
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_4, 3,3,5
+cglobal cpy1Dto2D_shl_4, 3, 3, 3
add r2d, r2d
movd m0, r3m
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
psllw m1, m0
- psllw m3, m0
+ psllw m2, m0
movh [r0], m1
movhps [r0 + r2], m1
- movh [r0 + r2 * 2], m3
+ movh [r0 + r2 * 2], m2
lea r2, [r2 * 3]
- movhps [r0 + r2], m3
+ movhps [r0 + r2], m2
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_4, 3,3,3
+cglobal cpy1Dto2D_shl_4, 3, 3, 2
add r2d, r2d
movd xm0, r3m
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- packssdw m1, m2
+ movu m1, [r1]
psllw m1, xm0
vextracti128 xm0, m1, 1
movq [r0], xm1
- movq [r0 + r2], xm0
+ movhps [r0 + r2], xm1
lea r0, [r0 + r2 * 2]
- movhps [r0], xm1
+ movq [r0], xm0
movhps [r0 + r2], xm0
RET
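;-----------------------------------------------------------------------------
; Reference sketch (assumed): cpy1Dto2D_shl reads a dense 1D coefficient
; buffer and writes a strided 2D block, shifting each value left:
;
;   void cpy1Dto2D_shl_c(int16_t* dst, const int16_t* src,
;                        intptr_t dstStride, int shift)
;   {
;       for (int y = 0; y < N; y++)   /* N fixed per kernel */
;       {
;           for (int x = 0; x < N; x++)
;               dst[x] = (int16_t)(src[x] << shift);
;           src += N;
;           dst += dstStride;
;       }
;   }
;
; Unlike the retired cvt32to16_shl path, source and destination are both
; int16_t here, so the packssdw narrowing steps disappear.
;-----------------------------------------------------------------------------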
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_8, 3,5,5
+cglobal cpy1Dto2D_shl_8, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 8/4
- lea r4, [r2 * 3]
+ lea r3, [r2 * 3]
-.loop:
- ; Row 0-1
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 0-3
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0], m1
- movu [r0 + r2], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + r2 * 2], m3
+ mova [r0 + r3], m4
+ lea r0, [r0 + r2 * 4]
- ; Row 2-3
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 4-7
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + r2 * 2], m1
- movu [r0 + r4], m3
-
- add r1, 8 * mmsize
- lea r0, [r0 + r2 * 4]
- dec r3d
- jnz .loop
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + r2 * 2], m3
+ mova [r0 + r3], m4
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_8, 3,4,3
+cglobal cpy1Dto2D_shl_8, 3, 4, 3
add r2d, r2d
movd xm0, r3m
lea r3, [r2 * 3]
- ; Row 0-1
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0], xm1
- vextracti128 [r0 + r2], m1, 1
-
- ; Row 2-3
- movu xm1, [r1 + 2 * mmsize]
- vinserti128 m1, m1, [r1 + 3 * mmsize], 1
- movu xm2, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2 * 2], xm1
- vextracti128 [r0 + r3], m1, 1
-
- add r1, 4 * mmsize
- lea r0, [r0 + r2 * 4]
-
- ; Row 4-5
+ ; Row 0-3
movu m1, [r1 + 0 * mmsize]
movu m2, [r1 + 1 * mmsize]
- packssdw m1, m2
- vpermq m1, m1, 11011000b
psllw m1, xm0
+ psllw m2, xm0
movu [r0], xm1
vextracti128 [r0 + r2], m1, 1
+ movu [r0 + r2 * 2], xm2
+ vextracti128 [r0 + r3], m2, 1
- ; Row 6-7
+ ; Row 4-7
movu m1, [r1 + 2 * mmsize]
movu m2, [r1 + 3 * mmsize]
- packssdw m1, m2
- vpermq m1, m1, 11011000b
+ lea r0, [r0 + r2 * 4]
psllw m1, xm0
- movu [r0 + r2 * 2], xm1
- vextracti128 [r0 + r3], m1, 1
+ psllw m2, xm0
+ movu [r0], xm1
+ vextracti128 [r0 + r2], m1, 1
+ movu [r0 + r2 * 2], xm2
+ vextracti128 [r0 + r3], m2, 1
RET
+
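+;-----------------------------------------------------------------------------
+; Note (assumed reading): with 8-wide rows each 256-bit load spans two rows,
+; so the low xmm half is stored with movu and the high half goes straight to
+; the next row with vextracti128; the cross-lane vpermq fixups that the old
+; packssdw-based code needed are gone.
+;-----------------------------------------------------------------------------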
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_16, 3,4,5
+cglobal cpy1Dto2D_shl_16, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 16/2
+ mov r3d, 16/4
.loop:
- ; Row 0
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 0-1
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0], m1
- movu [r0 + mmsize], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + 16], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 + 16], m4
- ; Row 1
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 2-3
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 2]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + r2], m1
- movu [r0 + r2 + mmsize], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + 16], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 + 16], m4
add r1, 8 * mmsize
lea r0, [r0 + r2 * 2]
INIT_YMM avx2
-cglobal cvt32to16_shl_16, 3,5,3
+cglobal cpy1Dto2D_shl_16, 3, 5, 3
add r2d, r2d
movd xm0, r3m
mov r3d, 16/4
lea r4, [r2 * 3]
.loop:
- ; Row 0
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
+ ; Row 0-1
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
psllw m1, xm0
+ psllw m2, xm0
movu [r0], m1
+ movu [r0 + r2], m2
- ; Row 1
- movu xm1, [r1 + 2 * mmsize]
- vinserti128 m1, m1, [r1 + 3 * mmsize], 1
- movu xm2, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2], m1
-
- add r1, 4 * mmsize
-
- ; Row 2
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2 * 2], m1
-
- ; Row 3
+ ; Row 2-3
movu m1, [r1 + 2 * mmsize]
movu m2, [r1 + 3 * mmsize]
- packssdw m1, m2
psllw m1, xm0
- vpermq m1, m1, 11011000b
- movu [r0 + r4], m1
+ psllw m2, xm0
+ movu [r0 + r2 * 2], m1
+ movu [r0 + r4], m2
add r1, 4 * mmsize
lea r0, [r0 + r2 * 4]
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_32, 3,4,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 32/1
+ mov r3d, 32/2
.loop:
; Row 0
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + 0 * mmsize], m1
- movu [r0 + 1 * mmsize], m3
-
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ psllw m4, m0
+ mova [r0 + 0 * mmsize], m1
+ mova [r0 + 1 * mmsize], m2
+ mova [r0 + 2 * mmsize], m3
+ mova [r0 + 3 * mmsize], m4
+
+ ; Row 1
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m3
+ psllw m4, m0
+ mova [r0 + r2 + 0 * mmsize], m1
+ mova [r0 + r2 + 1 * mmsize], m2
+ mova [r0 + r2 + 2 * mmsize], m3
+ mova [r0 + r2 + 3 * mmsize], m4
add r1, 8 * mmsize
- add r0, r2
+ lea r0, [r0 + r2 * 2]
dec r3d
jnz .loop
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_32, 3,4,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
add r2d, r2d
movd xm0, r3m
mov r3d, 32/2
.loop:
- ; Row 0
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- movu xm3, [r1 + 2 * mmsize]
- vinserti128 m3, m3, [r1 + 3 * mmsize], 1
- movu xm4, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, xm0
- psllw m3, xm0
- movu [r0], m1
- movu [r0 + mmsize], m3
-
- add r1, 4 * mmsize
-
- ; Row 1
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ ; Row 0-1
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
movu m3, [r1 + 2 * mmsize]
movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
psllw m1, xm0
+ psllw m2, xm0
psllw m3, xm0
- vpermq m3, m3, 11011000b
- movu [r0 + r2], m1
- movu [r0 + r2 + mmsize], m3
+ psllw m4, xm0
+ movu [r0], m1
+ movu [r0 + mmsize], m2
+ movu [r0 + r2], m3
+ movu [r0 + r2 + mmsize], m4
add r1, 4 * mmsize
lea r0, [r0 + r2 * 2]
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_4, 3,3,3
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_8, 3,3,6
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_16, 3,4,6
RET
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3,4,6
movd eax, xm4
RET
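;-----------------------------------------------------------------------------
; Reference sketch (assumed): copy_cnt copies a strided 16-bit block into a
; dense coefficient buffer and returns how many values are nonzero:
;
;   uint32_t copy_cnt_c(int16_t* dst, const int16_t* src, intptr_t srcStride)
;   {
;       uint32_t numSig = 0;
;       for (int y = 0; y < N; y++)   /* N fixed per kernel */
;       {
;           for (int x = 0; x < N; x++)
;           {
;               dst[y * N + x] = src[x];
;               numSig += (src[x] != 0);
;           }
;           src += srcStride;
;       }
;       return numSig;
;   }
;-----------------------------------------------------------------------------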
-;-----------------------------------------------------------------------------
-; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal copy_shr, 4, 7, 4, dst, src, stride
-%define rnd m2
-%define shift m1
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_4, 4, 4, 4
+ add r2d, r2d
+ movd m0, r3d
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; m0 - shift
- ; make shift
- mov r5d, r3m
- movd shift, r5d
+ ; Row 0-3
+ movh m2, [r1]
+ movhps m2, [r1 + r2]
+ lea r1, [r1 + r2 * 2]
+ movh m3, [r1]
+ movhps m3, [r1 + r2]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
- ; make round
- dec r5
- xor r6, r6
- bts r6, r5
+ RET
- movd rnd, r6d
- pshufd rnd, rnd, 0
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_8, 4, 5, 4
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 8/4
+ lea r4, [r2 * 3]
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride * 2 (short*)
- ; r3 - lx
- ; r4 - size
- ; r5 - ly
- ; r6 - diff
- add r2d, r2d
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; r4 - srcStride * 3
+ ; m0 - shift
+
+.loop:
+ ; Row 0-1
+ mova m2, [r1]
+ mova m3, [r1 + r2]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+
+ ; Row 2-3
+ mova m2, [r1 + r2 * 2]
+ mova m3, [r1 + r4]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
- mov r4d, r4m
- mov r5, r4 ; size
- mov r6, r2 ; stride
- sub r6, r4
- add r6, r6
- shr r5, 1
-.loop_row:
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_16, 4, 4, 4
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 16/2
- mov r3, r4
- shr r3, 2
-.loop_col:
- ; row 0
- movh m3, [r1]
- pmovsxwd m0, m3
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0], m0
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; m0 - shift
- ; row 1
- movh m3, [r1 + r4 * 2]
- pmovsxwd m0, m3
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0 + r2], m0
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
- ; move col pointer
- add r1, 8
- add r0, 8
+ ; Row 1
+ mova m2, [r1 + r2 + 0 * mmsize]
+ mova m3, [r1 + r2 + 1 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
- dec r3
- jg .loop_col
- ; update pointer
- lea r1, [r1 + r4 * 2]
- add r0, r6
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_32, 4, 4, 6
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 32/1
- ; end of loop_row
- dec r5
- jg .loop_row
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; m0 - shift
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ psllw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ add r0, 4 * mmsize
+ add r1, r2
+ dec r3d
+ jnz .loop
RET
+
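+;-----------------------------------------------------------------------------
+; Reference sketch (assumed): cpy2Dto1D_shl is the strided-to-dense
+; counterpart of cpy1Dto2D_shl; note these kernels take four register
+; arguments and read the shift from r3d directly instead of from the stack:
+;
+;   void cpy2Dto1D_shl_c(int16_t* dst, const int16_t* src,
+;                        intptr_t srcStride, int shift)
+;   {
+;       for (int y = 0; y < N; y++)   /* N fixed per kernel */
+;       {
+;           for (int x = 0; x < N; x++)
+;               dst[x] = (int16_t)(src[x] << shift);
+;           src += srcStride;
+;           dst += N;
+;       }
+;   }
+;-----------------------------------------------------------------------------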
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_4, 3,3,3
+cglobal cpy1Dto2D_shr_4, 3, 3, 4
add r2d, r2d
movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- psllw m1, m0
- psllw m2, m0
- movh [r0], m1
- movhps [r0 + r2], m1
- movh [r0 + r2 * 2], m2
- lea r2, [r2 * 3]
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ movh [r0], m2
movhps [r0 + r2], m2
+ movh [r0 + r2 * 2], m3
+ lea r2, [r2 * 3]
+ movhps [r0 + r2], m3
RET
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_4, 3, 3, 3
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+
+ ; Row 0-3
+ movu m2, [r1]
+ psubw m2, m1
+ psraw m2, xm0
+ vextracti128 xm1, m2, 1
+ movq [r0], xm2
+ movhps [r0 + r2], xm2
+ lea r0, [r0 + r2 * 2]
+ movq [r0], xm1
+ movhps [r0 + r2], xm1
+ RET
+
+
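+;-----------------------------------------------------------------------------
+; Reference sketch (assumed): cpy1Dto2D_shr mirrors cpy2Dto1D_shr, using the
+; same -round construction, but walks the dense buffer on the input side:
+;
+;   void cpy1Dto2D_shr_c(int16_t* dst, const int16_t* src,
+;                        intptr_t dstStride, int shift)
+;   {
+;       const int16_t round = (int16_t)(1 << (shift - 1));  /* shift >= 1 */
+;       for (int y = 0; y < N; y++)   /* N fixed per kernel */
+;       {
+;           for (int x = 0; x < N; x++)
+;               dst[x] = (int16_t)((src[x] + round) >> shift);
+;           src += N;
+;           dst += dstStride;
+;       }
+;   }
+;-----------------------------------------------------------------------------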
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_8, 3,4,5
+cglobal cpy1Dto2D_shr_8, 3, 4, 6
add r2d, r2d
movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ lea r3, [r2 * 3]
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_8, 3, 4, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ lea r3, [r2 * 3]
+
+ ; Row 0-3
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
+
+ ; Row 4-7
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
RET
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_16, 3,4,5
+cglobal cpy1Dto2D_shr_16, 3, 5, 6
add r2d, r2d
movd m0, r3m
- mov r3d, 256/64
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
.loop:
- ; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + 16], m2
- movu [r0 + r2], m3
- movu [r0 + r2 + 16], m4
+ ; Row 0-1
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + mmsize], m3
+ mova [r0 + r2], m4
+ mova [r0 + r2 + mmsize], m5
- ; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- movu [r0 + r2 * 2 + 16], m2
- lea r0, [r0 + r2 * 2]
- movu [r0 + r2], m3
- movu [r0 + r2 + 16], m4
+ ; Row 2-3
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + r2 * 2], m2
+ mova [r0 + r2 * 2 + mmsize], m3
+ mova [r0 + r4], m4
+ mova [r0 + r4 + mmsize], m5
add r1, 8 * mmsize
- lea r0, [r0 + r2 * 2]
+ lea r0, [r0 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
+
+.loop:
+ ; Row 0-1
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], m2
+ movu [r0 + r2], m3
+
+ ; Row 2-3
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0 + r2 * 2], m2
+ movu [r0 + r4], m3
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
dec r3d
jnz .loop
RET
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_32, 3,4,5
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
add r2d, r2d
movd m0, r3m
- mov r3d, 1024/64
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ mov r3d, 32/2
.loop:
- ; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + 16], m2
- movu [r0 + 32], m3
- movu [r0 + 48], m4
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
- ; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2], m1
- movu [r0 + r2 + 16], m2
- movu [r0 + r2 + 32], m3
- movu [r0 + r2 + 48], m4
+ ; Row 1
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + r2 + 0 * mmsize], m2
+ mova [r0 + r2 + 1 * mmsize], m3
+ mova [r0 + r2 + 2 * mmsize], m4
+ mova [r0 + r2 + 3 * mmsize], m5
add r1, 8 * mmsize
lea r0, [r0 + r2 * 2]
dec r3d
jnz .loop
RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ mov r3d, 32/2
+
+.loop:
+ ; Row 0-1
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ movu m4, [r1 + 2 * mmsize]
+ movu m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ psraw m4, xm0
+ psraw m5, xm0
+ movu [r0], m2
+ movu [r0 + mmsize], m3
+ movu [r0 + r2], m4
+ movu [r0 + r2 + mmsize], m5
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
#ifndef X265_BLOCKCOPY8_H
#define X265_BLOCKCOPY8_H
-void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
-void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_32_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_4_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int);
-void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_32_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t);
+void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
- void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
- void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); \
- void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
+ void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \
+ void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#define SETUP_BLOCKCOPY_PS(W, H, cpu) \
- void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride);
+ void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
#define SETUP_BLOCKCOPY_SP(W, H, cpu) \
- void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+ void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
- void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
- void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
+ void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#define BLOCKCOPY_COMMON(cpu) \
SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
BLOCKCOPY_SP(_sse2);
-void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockcopy_ss_16x4_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x8_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x12_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x24_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x48_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-
-void x265_blockcopy_pp_32x8_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x16_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x24_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x32_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-
-void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+
+void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x24_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val);
#undef BLOCKCOPY_COMMON
#undef BLOCKCOPY_SS_PP
avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
-const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
cextern pw_ppppmmmm
;------------------------------------------------------
-;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
+;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM sse2
cglobal dct4, 3, 4, 8
paddd m1, m3
paddd m1, m7
psrad m1, 8
- movu [r1 + 0 * 16], m1
- pmaddwd m1, m2, m5
+ pmaddwd m4, m2, m5
pmaddwd m3, m0, m5
- psubd m1, m3
- paddd m1, m7
- psrad m1, 8
- movu [r1 + 1 * 16], m1
+ psubd m4, m3
+ paddd m4, m7
+ psrad m4, 8
+ packssdw m1, m4
+ movu [r1 + 0 * 16], m1
pmaddwd m1, m2, m6
pmaddwd m3, m0, m6
paddd m1, m3
paddd m1, m7
psrad m1, 8
- movu [r1 + 2 * 16], m1
pmaddwd m2, [r3 + 3 * 16]
pmaddwd m0, [r3 + 3 * 16]
psubd m2, m0
paddd m2, m7
psrad m2, 8
- movu [r1 + 3 * 16], m2
+ packssdw m1, m2
+ movu [r1 + 1 * 16], m1
RET
; DCT 4x4
paddd m2, m7
psrad m2, 8
- movu [r1], xm3
- movu [r1 + mmsize/2], m2
- vextracti128 [r1 + mmsize], m3, 1
- vextracti128 [r1 + mmsize + mmsize/2], m2, 1
+ packssdw m3, m2
+ movu [r1], m3
RET
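+
+; Note on the store change above: dct4 now emits int16_t coefficients, so
+; the two row-pair results are packed to words with packssdw and the whole
+; 4x4 block goes out in a single 32-byte store.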
;-------------------------------------------------------
-;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal idct4, 3, 4, 7
movu m0, [r0 + 0 * 16]
movu m1, [r0 + 1 * 16]
- packssdw m0, m1
-
- movu m1, [r0 + 2 * 16]
- movu m2, [r0 + 3 * 16]
- packssdw m1, m2
punpcklwd m2, m0, m1
pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
RET
;------------------------------------------------------
-;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
phaddd m0, m1
paddd m0, m5
psrad m0, 8
- movu [r1 + 0 * 16], m0
- pmaddwd m0, m2, coef1
+ pmaddwd m4, m2, coef1
pmaddwd m1, m3, coef1
- phaddd m0, m1
- paddd m0, m5
- psrad m0, 8
- movu [r1 + 1 * 16], m0
+ phaddd m4, m1
+ paddd m4, m5
+ psrad m4, 8
+ packssdw m0, m4
+ movu [r1 + 0 * 16], m0
pmaddwd m0, m2, coef2
pmaddwd m1, m3, coef2
phaddd m0, m1
paddd m0, m5
psrad m0, 8
- movu [r1 + 2 * 16], m0
pmaddwd m2, coef3
pmaddwd m3, coef3
phaddd m2, m3
paddd m2, m5
psrad m2, 8
- movu [r1 + 3 * 16], m2
+ packssdw m0, m2
+ movu [r1 + 1 * 16], m0
RET
;-------------------------------------------------------
-;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
+;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal idst4, 3, 4, 7
movu m0, [r0 + 0 * 16]
movu m1, [r0 + 1 * 16]
- packssdw m0, m1
-
- movu m1, [r0 + 2 * 16]
- movu m2, [r0 + 3 * 16]
- packssdw m1, m2
punpcklwd m2, m0, m1 ; m2 = m128iAC
punpckhwd m0, m1 ; m0 = m128iBD
;-------------------------------------------------------
-; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;-------------------------------------------------------
INIT_XMM sse4
cglobal dct8, 3,6,7,0-16*mmsize
phsubd m4, m2 ; m4 = [Row6 Row4]
paddd m4, m6
psrad m4, 9
- movh [r1 + 0*2*mmsize], m3
- movhps [r1 + 2*2*mmsize], m3
- movh [r1 + 4*2*mmsize], m4
- movhps [r1 + 6*2*mmsize], m4
+
+ packssdw m3, m3
+ movd [r1 + 0*mmsize], m3
+ pshufd m3, m3, 1
+ movd [r1 + 2*mmsize], m3
+
+ packssdw m4, m4
+ movd [r1 + 4*mmsize], m4
+ pshufd m4, m4, 1
+ movd [r1 + 6*mmsize], m4
; odd
pmulld m2, m0, [r4 + 2*16]
phaddd m2, m4 ; m2 = [Row3 Row1]
paddd m2, m6
psrad m2, 9
- movh [r1 + 1*2*mmsize], m2
- movhps [r1 + 3*2*mmsize], m2
+
+ packssdw m2, m2
+ movd [r1 + 1*mmsize], m2
+ pshufd m2, m2, 1
+ movd [r1 + 3*mmsize], m2
pmulld m2, m0, [r4 + 4*16]
pmulld m3, m1, [r4 + 4*16]
phaddd m2, m4 ; m2 = [Row7 Row5]
paddd m2, m6
psrad m2, 9
- movh [r1 + 5*2*mmsize], m2
- movhps [r1 + 7*2*mmsize], m2
- add r1, mmsize/2
+ packssdw m2, m2
+ movd [r1 + 5*mmsize], m2
+ pshufd m2, m2, 1
+ movd [r1 + 7*mmsize], m2
+
+ add r1, mmsize/4
add r0, 2*2*mmsize
%endrep
RET
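+
+; Note on the stores above: with int16_t output each coefficient takes two
+; bytes, so every row is written two coefficients at a time (packssdw to
+; words, then movd / pshufd + movd) and r1 advances by mmsize/4 bytes per
+; %rep iteration instead of mmsize/2.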
;-------------------------------------------------------
-; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
+;-------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse2
+%if BIT_DEPTH == 10
+ %define IDCT_SHIFT 10
+ %define IDCT_ADD pd_512
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+ %define IDCT_ADD pd_2048
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+cglobal idct8, 3, 6, 16, 0-5*mmsize
+ mova m9, [r0 + 1 * mmsize]
+ mova m1, [r0 + 3 * mmsize]
+ mova m7, m9
+ punpcklwd m7, m1
+ punpckhwd m9, m1
+ mova m14, [tab_idct8_3]
+ mova m3, m14
+ pmaddwd m14, m7
+ pmaddwd m3, m9
+ mova m0, [r0 + 5 * mmsize]
+ mova m10, [r0 + 7 * mmsize]
+ mova m2, m0
+ punpcklwd m2, m10
+ punpckhwd m0, m10
+ mova m15, [tab_idct8_3 + 1 * mmsize]
+ mova m11, [tab_idct8_3 + 1 * mmsize]
+ pmaddwd m15, m2
+ mova m4, [tab_idct8_3 + 2 * mmsize]
+ pmaddwd m11, m0
+ mova m1, [tab_idct8_3 + 2 * mmsize]
+ paddd m15, m14
+ mova m5, [tab_idct8_3 + 4 * mmsize]
+ mova m12, [tab_idct8_3 + 4 * mmsize]
+ paddd m11, m3
+ mova [rsp + 0 * mmsize], m11
+ mova [rsp + 1 * mmsize], m15
+ pmaddwd m4, m7
+ pmaddwd m1, m9
+ mova m14, [tab_idct8_3 + 3 * mmsize]
+ mova m3, [tab_idct8_3 + 3 * mmsize]
+ pmaddwd m14, m2
+ pmaddwd m3, m0
+ paddd m14, m4
+ paddd m3, m1
+ mova [rsp + 2 * mmsize], m3
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
+ mova m6, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m12, m7
+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
+ mova m4, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m6, m2
+ paddd m6, m12
+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
+ paddd m7, m2
+ mova [rsp + 3 * mmsize], m6
+ pmaddwd m4, m0
+ pmaddwd m0, [tab_idct8_3 + 7 * mmsize]
+ paddd m9, m0
+ paddd m5, m4
+ mova m6, [r0 + 0 * mmsize]
+ mova m0, [r0 + 4 * mmsize]
+ mova m4, m6
+ punpcklwd m4, m0
+ punpckhwd m6, m0
+ mova m12, [r0 + 2 * mmsize]
+ mova m0, [r0 + 6 * mmsize]
+ mova m13, m12
+ mova m8, [tab_dct4]
+ punpcklwd m13, m0
+ mova m10, [tab_dct4]
+ punpckhwd m12, m0
+ pmaddwd m8, m4
+ mova m3, m8
+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
+ pmaddwd m10, m6
+ mova m2, [tab_dct4 + 1 * mmsize]
+ mova m1, m10
+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
+ mova m0, [tab_dct4 + 1 * mmsize]
+ pmaddwd m2, m13
+ paddd m3, m2
+ psubd m8, m2
+ mova m2, m6
+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
+ pmaddwd m0, m12
+ paddd m1, m0
+ psubd m10, m0
+ mova m0, m4
+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
+ paddd m3, [pd_64]
+ paddd m1, [pd_64]
+ paddd m8, [pd_64]
+ paddd m10, [pd_64]
+ paddd m0, m13
+ paddd m2, m12
+ paddd m0, [pd_64]
+ paddd m2, [pd_64]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [pd_64]
+ paddd m6, [pd_64]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, 7
+ paddd m15, m3
+ psubd m3, [rsp + 1 * mmsize]
+ psrad m15, 7
+ paddd m12, m7
+ psrad m12, 7
+ paddd m11, m1
+ mova m13, m14
+ psrad m11, 7
+ packssdw m15, m11
+ psubd m1, [rsp + 0 * mmsize]
+ psrad m1, 7
+ mova m11, [rsp + 2 * mmsize]
+ paddd m14, m0
+ psrad m14, 7
+ psubd m0, m13
+ psrad m0, 7
+ paddd m11, m2
+ mova m13, [rsp + 3 * mmsize]
+ psrad m11, 7
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, 7
+ psrad m6, 7
+ paddd m11, m5
+ psrad m11, 7
+ packssdw m13, m11
+ mova m11, m10
+ psubd m4, [rsp + 3 * mmsize]
+ psubd m10, m9
+ psrad m4, 7
+ psrad m10, 7
+ packssdw m4, m6
+ packssdw m8, m10
+ paddd m11, m9
+ psrad m11, 7
+ packssdw m12, m11
+ psubd m2, [rsp + 2 * mmsize]
+ mova m5, m15
+ psrad m2, 7
+ packssdw m0, m2
+ mova m2, m14
+ psrad m3, 7
+ packssdw m3, m1
+ mova m6, m13
+ punpcklwd m5, m8
+ punpcklwd m2, m4
+ mova m1, m12
+ punpcklwd m6, m0
+ punpcklwd m1, m3
+ mova m9, m5
+ punpckhwd m13, m0
+ mova m0, m2
+ punpcklwd m9, m6
+ punpckhwd m5, m6
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ punpckhwd m15, m8
+ mova m1, m5
+ punpckhwd m14, m4
+ punpckhwd m12, m3
+ mova m6, m9
+ punpckhwd m9, m0
+ punpcklwd m1, m2
+ mova m4, [tab_idct8_3 + 0 * mmsize]
+ punpckhwd m5, m2
+ punpcklwd m6, m0
+ mova m2, m15
+ mova m0, m14
+ mova m7, m9
+ punpcklwd m2, m13
+ punpcklwd m0, m12
+ punpcklwd m7, m5
+ punpckhwd m14, m12
+ mova m10, m2
+ punpckhwd m15, m13
+ punpckhwd m9, m5
+ pmaddwd m4, m7
+ mova m13, m1
+ punpckhwd m2, m0
+ punpcklwd m10, m0
+ mova m0, m15
+ punpckhwd m15, m14
+ mova m12, m1
+ mova m3, [tab_idct8_3 + 0 * mmsize]
+ punpcklwd m0, m14
+ pmaddwd m3, m9
+ mova m11, m2
+ punpckhwd m2, m15
+ punpcklwd m11, m15
+ mova m8, [tab_idct8_3 + 1 * mmsize]
+ punpcklwd m13, m0
+ punpckhwd m12, m0
+ pmaddwd m8, m11
+ paddd m8, m4
+ mova [rsp + 4 * mmsize], m8
+ mova m4, [tab_idct8_3 + 2 * mmsize]
+ pmaddwd m4, m7
+ mova m15, [tab_idct8_3 + 2 * mmsize]
+ mova m5, [tab_idct8_3 + 1 * mmsize]
+ pmaddwd m15, m9
+ pmaddwd m5, m2
+ paddd m5, m3
+ mova [rsp + 3 * mmsize], m5
+ mova m14, [tab_idct8_3 + 3 * mmsize]
+ mova m5, [tab_idct8_3 + 3 * mmsize]
+ pmaddwd m14, m11
+ paddd m14, m4
+ mova [rsp + 2 * mmsize], m14
+ pmaddwd m5, m2
+ paddd m5, m15
+ mova [rsp + 1 * mmsize], m5
+ mova m15, [tab_idct8_3 + 4 * mmsize]
+ mova m5, [tab_idct8_3 + 4 * mmsize]
+ pmaddwd m15, m7
+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
+ mova m4, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m4, m2
+ paddd m5, m4
+ mova m4, m6
+ mova m8, [tab_idct8_3 + 5 * mmsize]
+ punpckhwd m6, m10
+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
+ punpcklwd m4, m10
+ paddd m9, m2
+ pmaddwd m8, m11
+ mova m10, [tab_dct4]
+ paddd m8, m15
+ pmaddwd m11, [tab_idct8_3 + 7 * mmsize]
+ paddd m7, m11
+ mova [rsp + 0 * mmsize], m8
+ pmaddwd m10, m6
+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
+ mova m1, m10
+ mova m8, [tab_dct4]
+ mova m3, [tab_dct4 + 1 * mmsize]
+ pmaddwd m8, m4
+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
+ mova m0, m8
+ mova m2, [tab_dct4 + 1 * mmsize]
+ pmaddwd m3, m13
+ psubd m8, m3
+ paddd m0, m3
+ mova m3, m6
+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
+ pmaddwd m2, m12
+ paddd m1, m2
+ psubd m10, m2
+ mova m2, m4
+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
+ paddd m0, [IDCT_ADD]
+ paddd m1, [IDCT_ADD]
+ paddd m8, [IDCT_ADD]
+ paddd m10, [IDCT_ADD]
+ paddd m2, m13
+ paddd m3, m12
+ paddd m2, [IDCT_ADD]
+ paddd m3, [IDCT_ADD]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [IDCT_ADD]
+ paddd m6, [IDCT_ADD]
+ mova m15, [rsp + 4 * mmsize]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, IDCT_SHIFT
+ mova m11, [rsp + 3 * mmsize]
+ paddd m15, m0
+ psrad m15, IDCT_SHIFT
+ psubd m0, [rsp + 4 * mmsize]
+ psrad m0, IDCT_SHIFT
+ paddd m12, m7
+ paddd m11, m1
+ mova m14, [rsp + 2 * mmsize]
+ psrad m11, IDCT_SHIFT
+ packssdw m15, m11
+ psubd m1, [rsp + 3 * mmsize]
+ psrad m1, IDCT_SHIFT
+ mova m11, [rsp + 1 * mmsize]
+ paddd m14, m2
+ psrad m14, IDCT_SHIFT
+ packssdw m0, m1
+ psrad m12, IDCT_SHIFT
+ psubd m2, [rsp + 2 * mmsize]
+ paddd m11, m3
+ mova m13, [rsp + 0 * mmsize]
+ psrad m11, IDCT_SHIFT
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, IDCT_SHIFT
+ mova m1, m15
+ paddd m11, m5
+ psrad m11, IDCT_SHIFT
+ packssdw m13, m11
+ mova m11, m10
+ psubd m10, m9
+ psrad m10, IDCT_SHIFT
+ packssdw m8, m10
+ psrad m6, IDCT_SHIFT
+ psubd m4, [rsp + 0 * mmsize]
+ paddd m11, m9
+ psrad m11, IDCT_SHIFT
+ packssdw m12, m11
+ punpcklwd m1, m14
+ mova m5, m13
+ psrad m4, IDCT_SHIFT
+ packssdw m4, m6
+ psubd m3, [rsp + 1 * mmsize]
+ psrad m2, IDCT_SHIFT
+ mova m6, m8
+ psrad m3, IDCT_SHIFT
+ punpcklwd m5, m12
+ packssdw m2, m3
+ punpcklwd m6, m4
+ punpckhwd m8, m4
+ mova m4, m1
+ mova m3, m2
+ punpckhdq m1, m5
+ punpckldq m4, m5
+ punpcklwd m3, m0
+ punpckhwd m2, m0
+ mova m0, m6
+ lea r2, [r2 + r2]
+ lea r4, [r2 + r2]
+ lea r3, [r4 + r2]
+ lea r4, [r4 + r3]
+ lea r0, [r4 + r2 * 2]
+ movq [r1], m4
+ punpckhwd m15, m14
+ movhps [r1 + r2], m4
+ punpckhdq m0, m3
+ movq [r1 + r2 * 2], m1
+ punpckhwd m13, m12
+ movhps [r1 + r3], m1
+ mova m1, m6
+ punpckldq m1, m3
+ movq [r1 + 8], m1
+ movhps [r1 + r2 + 8], m1
+ movq [r1 + r2 * 2 + 8], m0
+ movhps [r1 + r3 + 8], m0
+ mova m0, m15
+ punpckhdq m15, m13
+ punpckldq m0, m13
+ movq [r1 + r2 * 4], m0
+ movhps [r1 + r4], m0
+ mova m0, m8
+ punpckhdq m8, m2
+ movq [r1 + r3 * 2], m15
+ punpckldq m0, m2
+ movhps [r1 + r0], m15
+ movq [r1 + r2 * 4 + 8], m0
+ movhps [r1 + r4 + 8], m0
+ movq [r1 + r3 * 2 + 8], m8
+ movhps [r1 + r0 + 8], m8
+ RET
+
+%undef IDCT_SHIFT
+%undef IDCT_ADD
+%endif
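+
+; The IDCT_SHIFT/IDCT_ADD pairing above follows the usual HEVC final-stage
+; rounding rule (a sketch of the assumed arithmetic, with shift =
+; 12 - (BIT_DEPTH - 8) and IDCT_ADD = 1 << (IDCT_SHIFT - 1)):
+;
+;   out = (int16_t)((sum + (1 << (IDCT_SHIFT - 1))) >> IDCT_SHIFT);
+;
+; which yields pd_2048 with shift 12 for 8-bit and pd_512 with shift 10
+; for 10-bit builds.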
+
+;-------------------------------------------------------
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM ssse3
cglobal patial_butterfly_inverse_internal_pass1
- movu m0, [r0]
- movu m1, [r0 + 4 * 32]
- movu m2, [r0 + 2 * 32]
- movu m3, [r0 + 6 * 32]
- packssdw m0, m2
- packssdw m1, m3
+ movh m0, [r0]
+ movhps m0, [r0 + 2 * 16]
+ movh m1, [r0 + 4 * 16]
+ movhps m1, [r0 + 6 * 16]
+
punpckhwd m2, m0, m1 ; [2 6]
punpcklwd m0, m1 ; [0 4]
pmaddwd m1, m0, [r6] ; EE[0]
paddd m3, m5
paddd m4, m5
- movu m2, [r0 + 32]
- movu m5, [r0 + 5 * 32]
- packssdw m2, m5
- movu m5, [r0 + 3 * 32]
- movu m6, [r0 + 7 * 32]
- packssdw m5, m6
+ movh m2, [r0 + 16]
+ movhps m2, [r0 + 5 * 16]
+ movh m5, [r0 + 3 * 16]
+ movhps m5, [r0 + 7 * 16]
punpcklwd m6, m2, m5 ;[1 3]
punpckhwd m2, m5 ;[5 7]
call patial_butterfly_inverse_internal_pass1
- add r0, 16
+ add r0, 8
add r5, 8
call patial_butterfly_inverse_internal_pass1
;-----------------------------------------------------------------------------
-; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
+; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal denoise_dct, 4, 4, 6
pxor m5, m5
- shr r3d, 2
+ shr r3d, 3
.loop:
mova m0, [r0]
- pabsd m1, m0
+ pabsw m1, m0
+
mova m2, [r1]
- paddd m2, m1
+ pmovsxwd m3, m1
+ paddd m2, m3
mova [r1], m2
- pmovzxwd m3, [r2]
- psubd m1, m3
- pcmpgtd m4, m1, m5
+ mova m2, [r1 + 16]
+ psrldq m3, m1, 8
+ pmovsxwd m4, m3
+ paddd m2, m4
+ mova [r1 + 16], m2
+
+ movu m3, [r2]
+ psubsw m1, m3
+ pcmpgtw m4, m1, m5
pand m1, m4
- psignd m1, m0
+ psignw m1, m0
mova [r0], m1
add r0, 16
- add r1, 16
- add r2, 8
+ add r1, 32
+ add r2, 16
dec r3d
jnz .loop
RET
INIT_YMM avx2
cglobal denoise_dct, 4, 4, 6
pxor m5, m5
- shr r3d, 3
+ shr r3d, 4
.loop:
movu m0, [r0]
- pabsd m1, m0
+ pabsw m1, m0
movu m2, [r1]
- paddd m2, m1
+ pmovsxwd m4, xm1
+ paddd m2, m4
movu [r1], m2
- pmovzxwd m3, [r2]
- psubd m1, m3
- pcmpgtd m4, m1, m5
+ vextracti128 xm4, m1, 1
+ movu m2, [r1 + 32]
+ pmovsxwd m3, xm4
+ paddd m2, m3
+ movu [r1 + 32], m2
+ movu m3, [r2]
+ psubw m1, m3
+ pcmpgtw m4, m1, m5
pand m1, m4
- psignd m1, m0
+ psignw m1, m0
movu [r0], m1
add r0, 32
- add r1, 32
- add r2, 16
+ add r1, 64
+ add r2, 32
dec r3d
jnz .loop
RET
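+
+; Both denoise_dct kernels implement the same per-coefficient soft
+; threshold; a minimal C sketch of the assumed behaviour (the helper name
+; is illustrative, not the shipped C primitive):
+;
+;   void denoise_dct_c(int16_t* dct, uint32_t* sum,
+;                      const uint16_t* offset, int size)
+;   {
+;       for (int i = 0; i < size; i++)
+;       {
+;           int level = abs(dct[i]);
+;           sum[i] += (uint32_t)level;      /* accumulate magnitudes   */
+;           level -= offset[i];             /* soft-threshold          */
+;           if (level < 0) level = 0;       /* pcmpgtw/pand clamp      */
+;           dct[i] = (int16_t)(dct[i] < 0 ? -level : level); /* psignw */
+;       }
+;   }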
+
%if ARCH_X86_64 == 1
%macro DCT8_PASS_1 4
vpbroadcastq m0, [r6 + %1]
mova [r5 + %2], xm2
%endmacro
-%macro DCT8_PASS_2 1
+%macro DCT8_PASS_2 2
vbroadcasti128 m4, [r6 + %1]
pmaddwd m6, m0, m4
pmaddwd m7, m1, m4
phaddd m6, m8
paddd m6, m5
psrad m6, DCT_SHIFT2
+
+ vbroadcasti128 m4, [r6 + %2]
+ pmaddwd m10, m0, m4
+ pmaddwd m7, m1, m4
+ pmaddwd m8, m2, m4
+ pmaddwd m9, m3, m4
+ phaddd m10, m7
+ phaddd m8, m9
+ phaddd m10, m8
+ paddd m10, m5
+ psrad m10, DCT_SHIFT2
+
+ packssdw m6, m10
+ vpermq m10, m6, 0xD8
+
%endmacro
INIT_YMM avx2
-cglobal dct8, 3, 7, 10, 0-8*16
+cglobal dct8, 3, 7, 11, 0-8*16
%if BIT_DEPTH == 10
%define DCT_SHIFT 4
vbroadcasti128 m5, [pd_8]
DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
;pass2
- mov r2d, 32
- lea r3, [r2 * 3]
- lea r4, [r1 + r2 * 4]
vbroadcasti128 m5, [pd_256]
mova m0, [r5]
mova m2, [r5 + 64]
mova m3, [r5 + 96]
- DCT8_PASS_2 0 * 16
- movu [r1], m6
- DCT8_PASS_2 1 * 16
- movu [r1 + r2], m6
- DCT8_PASS_2 2 * 16
- movu [r1 + r2 * 2], m6
- DCT8_PASS_2 3 * 16
- movu [r1 + r3], m6
- DCT8_PASS_2 4 * 16
- movu [r4], m6
- DCT8_PASS_2 5 * 16
- movu [r4 + r2], m6
- DCT8_PASS_2 6 * 16
- movu [r4 + r2 * 2], m6
- DCT8_PASS_2 7 * 16
- movu [r4 + r3], m6
+ DCT8_PASS_2 0 * 16, 1 * 16
+ movu [r1], m10
+ DCT8_PASS_2 2 * 16, 3 * 16
+ movu [r1 + 32], m10
+ DCT8_PASS_2 4 * 16, 5 * 16
+ movu [r1 + 64], m10
+ DCT8_PASS_2 6 * 16, 7 * 16
+ movu [r1 + 96], m10
RET
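+
+; Note on pass 2 above: DCT8_PASS_2 now takes two coefficient-table
+; offsets, computes two 8-coefficient rows, packs the dword sums to words
+; (packssdw) and restores row order with vpermq 0xD8, so the 8x8 int16
+; block is written with four contiguous 32-byte stores.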
%macro DCT16_PASS_1_E 2
mova [r5 + %2], xm10
%endmacro
-%macro DCT16_PASS_2 1
+%macro DCT16_PASS_2 2
vbroadcasti128 m8, [r7 + %1]
vbroadcasti128 m13, [r8 + %1]
phaddd m10, m11
paddd m10, m9
psrad m10, DCT_SHIFT2
+
+ vbroadcasti128 m8, [r7 + %2]
+ vbroadcasti128 m13, [r8 + %2]
+
+ pmaddwd m14, m0, m8
+ pmaddwd m11, m1, m13
+ paddd m14, m11
+
+ pmaddwd m11, m2, m8
+ pmaddwd m12, m3, m13
+ paddd m11, m12
+ phaddd m14, m11
+
+ pmaddwd m11, m4, m8
+ pmaddwd m12, m5, m13
+ paddd m11, m12
+
+ pmaddwd m12, m6, m8
+ pmaddwd m13, m7, m13
+ paddd m12, m13
+ phaddd m11, m12
+
+ phaddd m14, m11
+ paddd m14, m9
+ psrad m14, DCT_SHIFT2
+
+ packssdw m10, m14
+ vextracti128 xm14, m10, 1
+ movlhps xm15, xm10, xm14
+ movhlps xm14, xm10
%endmacro
INIT_YMM avx2
-cglobal dct16, 3, 9, 15, 0-16*mmsize
+cglobal dct16, 3, 9, 16, 0-16*mmsize
%if BIT_DEPTH == 10
%define DCT_SHIFT 5
vbroadcasti128 m9, [pd_16]
mov r5, rsp
mov r4d, 2
- mov r2d, 64
+ mov r2d, 32
lea r3, [r2 * 3]
vbroadcasti128 m9, [pd_512]
mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
- DCT16_PASS_2 -8 * 16
- movu [r1], m10
- DCT16_PASS_2 -7 * 16
- movu [r1 + r2], m10
- DCT16_PASS_2 -6 * 16
- movu [r1 + r2 * 2], m10
- DCT16_PASS_2 -5 * 16
- movu [r1 + r3], m10
+ DCT16_PASS_2 -8 * 16, -7 * 16
+ movu [r1], xm15
+ movu [r1 + r2], xm14
+
+ DCT16_PASS_2 -6 * 16, -5 * 16
+ movu [r1 + r2 * 2], xm15
+ movu [r1 + r3], xm14
lea r6, [r1 + r2 * 4]
- DCT16_PASS_2 -4 * 16
- movu [r6], m10
- DCT16_PASS_2 -3 * 16
- movu [r6 + r2], m10
- DCT16_PASS_2 -2 * 16
- movu [r6 + r2 * 2], m10
- DCT16_PASS_2 -1 * 16
- movu [r6 + r3], m10
+ DCT16_PASS_2 -4 * 16, -3 * 16
+ movu [r6], xm15
+ movu [r6 + r2], xm14
+
+ DCT16_PASS_2 -2 * 16, -1 * 16
+ movu [r6 + r2 * 2], xm15
+ movu [r6 + r3], xm14
lea r6, [r6 + r2 * 4]
- DCT16_PASS_2 0 * 16
- movu [r6], m10
- DCT16_PASS_2 1 * 16
- movu [r6 + r2], m10
- DCT16_PASS_2 2 * 16
- movu [r6 + r2 * 2], m10
- DCT16_PASS_2 3 * 16
- movu [r6 + r3], m10
+ DCT16_PASS_2 0 * 16, 1 * 16
+ movu [r6], xm15
+ movu [r6 + r2], xm14
+
+ DCT16_PASS_2 2 * 16, 3 * 16
+ movu [r6 + r2 * 2], xm15
+ movu [r6 + r3], xm14
lea r6, [r6 + r2 * 4]
- DCT16_PASS_2 4 * 16
- movu [r6], m10
- DCT16_PASS_2 5 * 16
- movu [r6 + r2], m10
- DCT16_PASS_2 6 * 16
- movu [r6 + r2 * 2], m10
- DCT16_PASS_2 7 * 16
- movu [r6 + r3], m10
-
- add r1, 32
+ DCT16_PASS_2 4 * 16, 5 * 16
+ movu [r6], xm15
+ movu [r6 + r2], xm14
+
+ DCT16_PASS_2 6 * 16, 7 * 16
+ movu [r6 + r2 * 2], xm15
+ movu [r6 + r3], xm14
+
+ add r1, 16
add r5, 128
dec r4d
paddd xm11, xm9
psrad xm11, DCT_SHIFT2
+ packssdw xm11, xm11
%endmacro
dec r4d
jnz .pass1
- mov r2d, 128
+ mov r2d, 64
lea r3, [r2 * 3]
mov r5, rsp
mov r4d, 8
mova m7, [r5 + 3 * 64 + 32]
DCT32_PASS_2 0 * 32
- movu [r1], xm11
+ movq [r1], xm11
DCT32_PASS_2 1 * 32
- movu [r1 + r2], xm11
+ movq [r1 + r2], xm11
DCT32_PASS_2 2 * 32
- movu [r1 + r2 * 2], xm11
+ movq [r1 + r2 * 2], xm11
DCT32_PASS_2 3 * 32
- movu [r1 + r3], xm11
+ movq [r1 + r3], xm11
lea r6, [r1 + r2 * 4]
DCT32_PASS_2 4 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 5 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 6 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 7 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 8 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 9 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 10 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 11 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 12 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 13 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 14 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 15 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 16 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 17 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 18 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 19 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 20 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 21 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 22 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 23 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 24 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 25 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 26 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 27 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 28 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 29 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 30 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 31 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
add r5, 256
- add r1, 16
+ add r1, 8
dec r4d
jnz .pass2
lea r6, [avx2_idct8_2]
;pass1
- mova m0, [r0 + 0 * 32]
- mova m1, [r0 + 4 * 32]
- packssdw m0, m1 ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
- mova m1, [r0 + 2 * 32]
- mova m2, [r0 + 6 * 32]
- packssdw m1, m2 ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
- mova m2, [r0 + 1 * 32]
- mova m3, [r0 + 5 * 32]
- packssdw m2, m3 ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
- mova m3, [r0 + 3 * 32]
- mova m4, [r0 + 7 * 32]
- packssdw m3, m4 ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
+ mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
+ mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
+ vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+ vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+ vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
+ vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3]
+ vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
+
+ mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
+ mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
+ vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+ vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+ vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
+ vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7]
+ vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
mova m5, [idct8_shuf1]
-
- punpcklwd m4, m0, m1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
- punpckhwd m0, m1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
vpermd m4, m5, m4
vpermd m0, m5, m0
-
- punpcklwd m1, m2, m3 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
- punpckhwd m2, m3 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
vpermd m1, m5, m1
vpermd m2, m5, m2
%endmacro
;-------------------------------------------------------
-; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
mov r4d, 2
.pass1:
- movu m0, [r0 + 0 * 64]
- movu m1, [r0 + 8 * 64]
- packssdw m0, m1 ;[0L 8L 0H 8H]
-
- movu m1, [r0 + 1 * 64]
- movu m2, [r0 + 9 * 64]
- packssdw m1, m2 ;[1L 9L 1H 9H]
-
- movu m2, [r0 + 2 * 64]
- movu m3, [r0 + 10 * 64]
- packssdw m2, m3 ;[2L 10L 2H 10H]
-
- movu m3, [r0 + 3 * 64]
- movu m4, [r0 + 11 * 64]
- packssdw m3, m4 ;[3L 11L 3H 11H]
-
- movu m4, [r0 + 4 * 64]
- movu m5, [r0 + 12 * 64]
- packssdw m4, m5 ;[4L 12L 4H 12H]
-
- movu m5, [r0 + 5 * 64]
- movu m6, [r0 + 13 * 64]
- packssdw m5, m6 ;[5L 13L 5H 13H]
-
- movu m6, [r0 + 6 * 64]
- movu m7, [r0 + 14 * 64]
- packssdw m6, m7 ;[6L 14L 6H 14H]
-
- movu m7, [r0 + 7 * 64]
- movu m8, [r0 + 15 * 64]
- packssdw m7, m8 ;[7L 15L 7H 15H]
+ movu xm0, [r0 + 0 * 32]
+ movu xm1, [r0 + 8 * 32]
+ punpckhqdq xm2, xm0, xm1
+ punpcklqdq xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+
+ movu xm1, [r0 + 1 * 32]
+ movu xm2, [r0 + 9 * 32]
+ punpckhqdq xm3, xm1, xm2
+ punpcklqdq xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+
+ movu xm2, [r0 + 2 * 32]
+ movu xm3, [r0 + 10 * 32]
+ punpckhqdq xm4, xm2, xm3
+ punpcklqdq xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+
+ movu xm3, [r0 + 3 * 32]
+ movu xm4, [r0 + 11 * 32]
+ punpckhqdq xm5, xm3, xm4
+ punpcklqdq xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+
+ movu xm4, [r0 + 4 * 32]
+ movu xm5, [r0 + 12 * 32]
+ punpckhqdq xm6, xm4, xm5
+ punpcklqdq xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+
+ movu xm5, [r0 + 5 * 32]
+ movu xm6, [r0 + 13 * 32]
+ punpckhqdq xm7, xm5, xm6
+ punpcklqdq xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+
+ movu xm6, [r0 + 6 * 32]
+ movu xm7, [r0 + 14 * 32]
+ punpckhqdq xm8, xm6, xm7
+ punpcklqdq xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+
+ movu xm7, [r0 + 7 * 32]
+ movu xm8, [r0 + 15 * 32]
+ punpckhqdq xm9, xm7, xm8
+ punpcklqdq xm7, xm8
+ vinserti128 m7, m7, xm9, 1
punpckhwd m8, m0, m2 ;[8 10]
punpcklwd m0, m2 ;[0 2]
IDCT_PASS1 4, 10
IDCT_PASS1 6, 8
- add r0, 32
+ add r0, 16
add r3, 16
dec r4d
jnz .pass1
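+
+; Note on the pass-1 loads above: the coefficients now arrive as int16_t,
+; so each [rowN rowN+8] ymm pair is assembled with punpcklqdq/punpckhqdq
+; plus vinserti128 instead of the old packssdw of two int32 loads, and r0
+; steps by 16 bytes (8 coefficients) per pass.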
%endmacro
;-------------------------------------------------------
-; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
; TODO: reduce the PHADDD instruction count by using PADDD
mov r5d, 8
.pass1:
- movu xm0, [r0 + 2 * 128]
- movu xm1, [r0 + 18 * 128]
- vinserti128 m0, m0, [r0 + 0 * 128], 1
- vinserti128 m1, m1, [r0 + 16 * 128], 1
-
- packssdw m0, m1 ;[2 18 0 16]
-
- movu xm1, [r0 + 1 * 128]
- movu xm2, [r0 + 9 * 128]
- vinserti128 m1, m1, [r0 + 17 * 128], 1
- vinserti128 m2, m2, [r0 + 25 * 128], 1
- packssdw m1, m2 ;[1 9 17 25]
-
- movu xm2, [r0 + 6 * 128]
- movu xm3, [r0 + 22 * 128]
- vinserti128 m2, m2, [r0 + 4 * 128], 1
- vinserti128 m3, m3, [r0 + 20 * 128], 1
- packssdw m2, m3 ;[6 22 4 20]
-
- movu xm3, [r0 + 3 * 128]
- movu xm4, [r0 + 11 * 128]
- vinserti128 m3, m3, [r0 + 19 * 128], 1
- vinserti128 m4, m4, [r0 + 27 * 128], 1
- packssdw m3, m4 ;[3 11 19 27]
-
- movu xm4, [r0 + 10 * 128]
- movu xm5, [r0 + 26 * 128]
- vinserti128 m4, m4, [r0 + 8 * 128], 1
- vinserti128 m5, m5, [r0 + 24 * 128], 1
- packssdw m4, m5 ;[10 26 8 24]
-
- movu xm5, [r0 + 5 * 128]
- movu xm6, [r0 + 13 * 128]
- vinserti128 m5, m5, [r0 + 21 * 128], 1
- vinserti128 m6, m6, [r0 + 29 * 128], 1
- packssdw m5, m6 ;[5 13 21 29]
-
- movu xm6, [r0 + 14 * 128]
- movu xm7, [r0 + 30 * 128]
- vinserti128 m6, m6, [r0 + 12 * 128], 1
- vinserti128 m7, m7, [r0 + 28 * 128], 1
- packssdw m6, m7 ;[14 30 12 28]
-
- movu xm7, [r0 + 7 * 128]
- movu xm8, [r0 + 15 * 128]
- vinserti128 m7, m7, [r0 + 23 * 128], 1
- vinserti128 m8, m8, [r0 + 31 * 128], 1
- packssdw m7, m8 ;[7 15 23 31]
+ movq xm0, [r0 + 2 * 64]
+ movq xm1, [r0 + 18 * 64]
+ punpcklqdq xm0, xm0, xm1
+ movq xm1, [r0 + 0 * 64]
+ movq xm2, [r0 + 16 * 64]
+ punpcklqdq xm1, xm1, xm2
+ vinserti128 m0, m0, xm1, 1 ;[2 18 0 16]
+
+ movq xm1, [r0 + 1 * 64]
+ movq xm2, [r0 + 9 * 64]
+ punpcklqdq xm1, xm1, xm2
+ movq xm2, [r0 + 17 * 64]
+ movq xm3, [r0 + 25 * 64]
+ punpcklqdq xm2, xm2, xm3
+ vinserti128 m1, m1, xm2, 1 ;[1 9 17 25]
+
+ movq xm2, [r0 + 6 * 64]
+ movq xm3, [r0 + 22 * 64]
+ punpcklqdq xm2, xm2, xm3
+ movq xm3, [r0 + 4 * 64]
+ movq xm4, [r0 + 20 * 64]
+ punpcklqdq xm3, xm3, xm4
+ vinserti128 m2, m2, xm3, 1 ;[6 22 4 20]
+
+ movq xm3, [r0 + 3 * 64]
+ movq xm4, [r0 + 11 * 64]
+ punpcklqdq xm3, xm3, xm4
+ movq xm4, [r0 + 19 * 64]
+ movq xm5, [r0 + 27 * 64]
+ punpcklqdq xm4, xm4, xm5
+    vinserti128 m3, m3, xm4, 1 ;[3 11 19 27]
+
+ movq xm4, [r0 + 10 * 64]
+ movq xm5, [r0 + 26 * 64]
+ punpcklqdq xm4, xm4, xm5
+ movq xm5, [r0 + 8 * 64]
+ movq xm6, [r0 + 24 * 64]
+ punpcklqdq xm5, xm5, xm6
+ vinserti128 m4, m4, xm5, 1 ;[10 26 8 24]
+
+ movq xm5, [r0 + 5 * 64]
+ movq xm6, [r0 + 13 * 64]
+ punpcklqdq xm5, xm5, xm6
+ movq xm6, [r0 + 21 * 64]
+ movq xm7, [r0 + 29 * 64]
+ punpcklqdq xm6, xm6, xm7
+    vinserti128 m5, m5, xm6, 1 ;[5 13 21 29]
+
+ movq xm6, [r0 + 14 * 64]
+ movq xm7, [r0 + 30 * 64]
+ punpcklqdq xm6, xm6, xm7
+ movq xm7, [r0 + 12 * 64]
+ movq xm8, [r0 + 28 * 64]
+ punpcklqdq xm7, xm7, xm8
+ vinserti128 m6, m6, xm7, 1 ;[14 30 12 28]
+
+ movq xm7, [r0 + 7 * 64]
+ movq xm8, [r0 + 15 * 64]
+ punpcklqdq xm7, xm7, xm8
+ movq xm8, [r0 + 23 * 64]
+ movq xm9, [r0 + 31 * 64]
+ punpcklqdq xm8, xm8, xm9
+ vinserti128 m7, m7, xm8, 1 ;[7 15 23 31]
punpckhwd m8, m0, m2 ;[18 22 16 20]
punpcklwd m0, m2 ;[2 6 0 4]
IDCT32_PASS1 6
IDCT32_PASS1 7
- add r0, 16
+ add r0, 8
add r3, 4
add r4, 4
dec r5d
RET
;-------------------------------------------------------
-; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct4, 3, 4, 6
add r2d, r2d
lea r3, [r2 * 3]
- movu m0, [r0] ;[00 01 02 03 10 11 12 13]
- movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33]
+ movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
- packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
- pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
- vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
- vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
+ pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
+ vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33]
+ punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23]
+ punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33]
+ vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
+ vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
mova m1, [avx2_idct4_1]
mova m3, [avx2_idct4_1 + 32]
#ifndef X265_DCT8_H
#define X265_DCT8_H
-void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
-void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
+void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
#endif // ifndef X265_DCT8_H
db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
+ALIGN 32
+const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
+
+ALIGN 32
+const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
+ dd 2, 3, 3, 4, 4, 5, 5, 6
+
ALIGN 32
tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
-tab_c_512: times 8 dw 512
tab_c_526336: times 4 dd 8192*64+2048
tab_ChromaCoeff: db 0, 64, 0, 0
times 8 db 58, -10
times 8 db 4, -1
-tab_c_128: times 16 db 0x80
+ALIGN 32
+tab_LumaCoeffVer_32: times 16 db 0, 0
+ times 16 db 0, 64
+ times 16 db 0, 0
+ times 16 db 0, 0
+
+ times 16 db -1, 4
+ times 16 db -10, 58
+ times 16 db 17, -5
+ times 16 db 1, 0
+
+ times 16 db -1, 4
+ times 16 db -11, 40
+ times 16 db 40, -11
+ times 16 db 4, -1
+
+ times 16 db 0, 1
+ times 16 db -5, 17
+ times 16 db 58, -10
+ times 16 db 4, -1
+
+ALIGN 32
+tab_ChromaCoeffVer_32: times 16 db 0, 64
+ times 16 db 0, 0
+
+ times 16 db -2, 58
+ times 16 db 10, -2
+
+ times 16 db -4, 54
+ times 16 db 16, -2
+
+ times 16 db -6, 46
+ times 16 db 28, -4
+
+ times 16 db -4, 36
+ times 16 db 36, -4
+
+ times 16 db -4, 28
+ times 16 db 46, -6
+
+ times 16 db -2, 16
+ times 16 db 54, -4
+
+ times 16 db -2, 10
+ times 16 db 58, -2
+
tab_c_64_n64: times 8 db 64, -64
+const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+ALIGN 32
+interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
SECTION .text
-cextern idct4_shuf1
+cextern pb_128
cextern pw_1
cextern pw_512
cextern pw_2000
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 2
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 4
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mov r5d, 16/2
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
FILTER_H4_w4_2 t0, t1, t2
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 2
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 4
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 8
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mov r5d, 32/2
RET
+ALIGN 32
+const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
+
%macro FILTER_H4_w6 3
movu %1, [srcq - 1]
mov r5d, %2
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mova Tm1, [tab_Tm + 16]
mov r5d, %2
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mova Tm1, [tab_Tm + 16]
punpcklqdq m3, m3
%ifidn %3, pp
- mova m2, [tab_c_512]
+ mova m2, [pw_512]
%else
mova m2, [pw_2000]
%endif
pmulhrsw m3, [pw_512]
vextracti128 xm4, m3, 1
packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
- pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
+ pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0]
lea r0, [r3 * 3]
movd [r2], xm3
pextrd [r2+r0], xm3, 3
RET
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastq m0, [r5 + r4 * 8]
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+ mova m1, [tab_Lm]
+ mova m2, [tab_Lm + 32]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1, m2 - shuffle order table
+
+ sub r0, 3
+ lea r5, [r1 * 3]
+ lea r4, [r3 * 3]
+
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m3, m2
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ phaddw m3, m4
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+
+ phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
+ pmulhrsw m3, [pw_512]
+
+ ; Row 2
+ vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m6, m5, m2
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddubsw m6, m0
+ phaddw m5, m6
+
+ phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
+ vextracti128 xm4, m3, 1
+ punpcklwd xm5, xm3, xm4
+
+ movq [r2], xm5
+ movhps [r2 + r3], xm5
+
+ punpckhwd xm5, xm3, xm4
+ movq [r2 + r3 * 2], xm5
+ movhps [r2 + r4], xm5
+ RET
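+
+; Rounding sketch for the pp path (assuming the usual 6-bit gain of the
+; 8-tap luma coefficients): pmulhrsw with pw_512 evaluates
+; (sum * 512 + 16384) >> 15 == (sum + 32) >> 6, so each output pixel is
+;
+;   dst[x] = clip_uint8((sum_of_taps + 32) >> 6);
+;
+; with the final clip supplied by packuswb.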
+
+%macro IPFILTER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastq m0, [r5 + r4 * 8]
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+ mova m1, [tab_Lm]
+ mova m2, [tab_Lm + 32]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1, m2 - shuffle order table
+
+ sub r0, 3
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+ mov r4d, %2 / 4
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m3, m2
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ phaddw m3, m4
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+
+ phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
+ pmulhrsw m3, [pw_512]
+
+ ; Row 2
+ vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m6, m5, m2
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddubsw m6, m0
+ phaddw m5, m6
+
+ phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
+ vextracti128 xm4, m3, 1
+ punpcklwd xm5, xm3, xm4
+
+ movq [r2], xm5
+ movhps [r2 + r3], xm5
+
+ punpckhwd xm5, xm3, xm4
+ movq [r2 + r3 * 2], xm5
+ movhps [r2 + r6], xm5
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+IPFILTER_LUMA_AVX2_8xN 8, 8
+IPFILTER_LUMA_AVX2_8xN 8, 16
+IPFILTER_LUMA_AVX2_8xN 8, 32
+
+%macro IPFILTER_LUMA_AVX2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+    ; m0, m1 - interpolate coeff
+    ; m3     - shuffle order table (tab_Tm is also read directly)
+    ; m7     - pw_1
+ mov r4d, %2/2
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+    vbroadcasti128 m5, [r0 + 8]     ; second 8 elements in Row 0
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+    vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row 1
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2+r3], xm5
+ lea r0, [r0 + r1 * 2]
+ lea r2, [r2 + r3 * 2]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+%macro IPFILTER_LUMA_32x_avx2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0, m1 - interpolate coeff
+ ; m3 - shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, %2
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + 16]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 24]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2 + 16], xm5
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
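+; 64-wide AVX2 variant: the same split-coefficient pipeline as the
+; 32-wide macro, unrolled so each loop iteration filters one full
+; 64-pixel row with four 16-byte stores.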
+%macro IPFILTER_LUMA_64x_avx2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0, m1 - interpolate coeff
+ ; m3 - shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, %2
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + 16]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 24]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2 + 16], xm5
+
+ vbroadcasti128 m4, [r0 + 32]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 40]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + 48]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 56]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2 + 32], xm4
+ movu [r2 + 48], xm5
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_48x64, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0, m1 - interpolate coeff
+ ; m3 - shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, 64
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+
+ vbroadcasti128 m2, [r0 + 16]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 24]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2 + 16], xm5
+
+ vbroadcasti128 m4, [r0 + 32]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 40]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5
+ pmulhrsw m4, [pw_512]
+ packuswb m4, m4
+ vpermq m4, m4, 11011000b
+ pshufd xm4, xm4, 11011000b
+ movu [r2 + 32], xm4
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+ dec r4d
+ jnz .loop
+ RET
+
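+; 4x4 chroma horizontal filter: rows 0-1 and rows 2-3 are paired into
+; the two 128-bit lanes of a ymm register via vinserti128, so a single
+; pshufb/pmaddubsw/pmaddwd sequence filters two rows at once.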
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_4x4, 4,6,6
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ vpbroadcastd m2, [pw_1]
+ vbroadcasti128 m1, [tab_Tm]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ dec r0
+
+ ; Row 0-1
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ vinserti128 m3, m3, [r0 + r1], 1
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+
+ ; Row 2-3
+ lea r0, [r0 + r1 * 2]
+ vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ vinserti128 m4, m4, [r0 + r1], 1
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+
+ packssdw m3, m4
+ pmulhrsw m3, [pw_512]
+ vextracti128 xm4, m3, 1
+ packuswb xm3, xm4
+
+ lea r0, [r3 * 3]
+ movd [r2], xm3
+ pextrd [r2+r3], xm3, 2
+ pextrd [r2+r3*2], xm3, 1
+ pextrd [r2+r0], xm3, 3
+ RET
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_32x32, 4,6,7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ mova m1, [interp4_horiz_shuf1]
+ vpbroadcastd m2, [pw_1]
+ mova m6, [pw_512]
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ dec r0
+ mov r4d, 32
+
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+ vbroadcasti128 m4, [r0 + 4]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ packssdw m3, m4
+ pmulhrsw m3, m6
+
+ vbroadcasti128 m4, [r0 + 16]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ vbroadcasti128 m5, [r0 + 20]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ packssdw m4, m5
+ pmulhrsw m4, m6
+
+ packuswb m3, m4
+ vpermq m3, m3, 11011000b
+
+ movu [r2], m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ dec r4d
+ jnz .loop
+ RET
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ mova m6, [pw_512]
+ mova m1, [interp4_horiz_shuf1]
+ vpbroadcastd m2, [pw_1]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ dec r0
+ mov r4d, 8
+
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+ vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ packssdw m3, m4
+ pmulhrsw m3, m6
+
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ packssdw m4, m5
+ pmulhrsw m4, m6
+
+ packuswb m3, m4
+ vpermq m3, m3, 11011000b
+
+ vextracti128 xm4, m3, 1
+ movu [r2], xm3
+ movu [r2 + r3], xm4
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ dec r4d
+ jnz .loop
+ RET
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA 12, 16, pp
IPFILTER_LUMA 4, 16, pp
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_8x8, 4,6,6
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ movu m1, [tab_Tm]
+ vpbroadcastd m2, [pw_1]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ sub r0, 1
+ mov r4d, 2
+
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ packssdw m3, m4
+ pmulhrsw m3, [pw_512]
+ lea r0, [r0 + r1 * 2]
+
+ ; Row 2
+ vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ packssdw m4, m5
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
+ mova m5, [interp_4tap_8x8_horiz_shuf]
+ vpermd m3, m5, m3
+ vextracti128 xm4, m3, 1
+ movq [r2], xm3
+ movhps [r2 + r3], xm3
+ lea r2, [r2 + r3 * 2]
+ movq [r2], xm4
+ movhps [r2 + r3], xm4
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1*2]
+ dec r4d
+ jnz .loop
+ RET
+
+ IPFILTER_LUMA_AVX2 16, 4
+ IPFILTER_LUMA_AVX2 16, 8
+ IPFILTER_LUMA_AVX2 16, 12
+ IPFILTER_LUMA_AVX2 16, 16
+ IPFILTER_LUMA_AVX2 16, 32
+ IPFILTER_LUMA_AVX2 16, 64
+
+ IPFILTER_LUMA_32x_avx2 32, 8
+ IPFILTER_LUMA_32x_avx2 32, 16
+ IPFILTER_LUMA_32x_avx2 32, 24
+ IPFILTER_LUMA_32x_avx2 32, 32
+ IPFILTER_LUMA_32x_avx2 32, 64
+
+ IPFILTER_LUMA_64x_avx2 64, 64
+ IPFILTER_LUMA_64x_avx2 64, 48
+ IPFILTER_LUMA_64x_avx2 64, 32
+ IPFILTER_LUMA_64x_avx2 64, 16
+
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
mov r4, rsp
.loopH:
- FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
+ FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
psubw m1, [pw_2000]
mova [r4], m1
lea r4, [r1 * 3]
lea r5, [r0 + 4 * r1]
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
movd m2, [r0]
movd m3, [r0 + r1]
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
mov r4d, %2
lea r5, [3 * r1]
phaddw m2, m3
-pmulhrsw m2, [tab_c_512]
+pmulhrsw m2, [pw_512]
packuswb m2, m2
movd [r2], m2
pextrd [r2 + r3], m2, 1
%endif
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
lea r5, [r0 + 4 * r1]
lea r4, [r1 * 3]
RET
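+; AVX2 4-tap vertical chroma filter for 4x4: the seven needed source
+; rows are gathered into one ymm register, then vpermd/pshufb build the
+; two row-pair operand sets so both pmaddubsw tap pairs run across all
+; four output rows in parallel.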
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
+ mov r4d, r4m
+ shl r4d, 6
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+
+ movd xm1, [r0]
+ pinsrd xm1, [r0 + r1], 1
+ pinsrd xm1, [r0 + r1 * 2], 2
+ pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0]
+ lea r0, [r0 + r1 * 4]
+ movd xm2, [r0]
+ pinsrd xm2, [r0 + r1], 1
+ pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4]
+ vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0]
+ mova m2, [interp4_vpp_shuf1]
+ vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0]
+ mova m2, [interp4_vpp_shuf1 + mmsize]
+ vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2]
+
+ mova m2, [interp4_vpp_shuf]
+ pshufb m0, m0, m2
+ pshufb m1, m1, m2
+ pmaddubsw m0, [r5]
+ pmaddubsw m1, [r5 + mmsize]
+ paddw m0, m1 ; m0 = WORD ROW[3 2 1 0]
+ pmulhrsw m0, [pw_512]
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ lea r5, [r3 * 3]
+ movd [r2], xm0
+ pextrd [r2 + r3], xm0, 1
+ pextrd [r2 + r3 * 2], xm0, 2
+ pextrd [r2 + r5], xm0, 3
+ RET
+
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
mov r4d, %2
paddw m0, m4
-mova m4, [tab_c_512]
+mova m4, [pw_512]
pmulhrsw m0, m4
packuswb m0, m0
pshufb m6, m5, [tab_Vm]
pshufb m5, [tab_Vm + 16]
-mova m4, [tab_c_512]
+mova m4, [pw_512]
lea r5, [r1 * 3]
mov r4d, %2
FILTER_V4_W8_H8_H16_H32 8, 12
FILTER_V4_W8_H8_H16_H32 8, 64
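+; Computes eight rows of the 4-tap vertical chroma filter at width 8.
+; Interleaved row pairs are stacked two-per-ymm with vinserti128 so each
+; pmaddubsw covers two output rows; on exit m5, m2, m1 and m4 hold the
+; unrounded word results for rows 0-1, 2-3, 4-5 and 6-7 respectively.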
+%macro PROCESS_CHROMA_AVX2_W8_8R 0
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+ vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ lea r0, [r0 + r1 * 4]
+ movq xm1, [r0] ; m1 = row 4
+ punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+ vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ movq xm4, [r0 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+ vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ pmaddubsw m1, [r5]
+ movq xm3, [r0 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 8
+ punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+ vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ pmaddubsw m3, m4, [r5 + 1 * mmsize]
+ paddw m1, m3
+ pmaddubsw m4, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ pmaddubsw m0, [r5 + 1 * mmsize]
+ paddw m4, m0
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
+ mov r4d, r4m
+ shl r4d, 6
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r1
+ PROCESS_CHROMA_AVX2_W8_8R
+ lea r4, [r3 * 3]
+ mova m3, [pw_512]
+ pmulhrsw m5, m3 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m3 ; m2 = word: row 2, row 3
+ pmulhrsw m1, m3 ; m1 = word: row 4, row 5
+ pmulhrsw m4, m3 ; m4 = word: row 6, row 7
+ packuswb m5, m2
+ packuswb m1, m4
+ vextracti128 xm2, m5, 1
+ vextracti128 xm4, m1, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ movhps [r2 + r3 * 2], xm5
+ movhps [r2 + r4], xm2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm1
+ movq [r2 + r3], xm4
+ movhps [r2 + r3 * 2], xm1
+ movhps [r2 + r4], xm4
+ RET
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
pshufb m6, m5, [tab_Vm]
pshufb m5, [tab_Vm + 16]
-mova m4, [tab_c_512]
+mova m4, [pw_512]
mov r4d, %2
lea r5, [3 * r1]
pmaddubsw m6, m0
paddw m2, m6
-mova m6, [tab_c_512]
+mova m6, [pw_512]
pmulhrsw m4, m6
pmulhrsw m2, m6
pmaddubsw m7, m0
paddw m4, m7
-mova m7, [tab_c_512]
+mova m7, [pw_512]
pmulhrsw m4, m7
pmulhrsw m2, m7
FILTER_V4_W16_H2 16, 24
FILTER_V4_W16_H2 16, 64
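+; Fully unrolled 16-row 4-tap vertical filter at width 16; it keeps the
+; coefficients, pw_512 and all row accumulators live in m0-m14, so it is
+; only built for x86-64 where sixteen ymm registers are available.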
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
+ mov r4d, r4m
+ shl r4d, 6
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+ mova m12, [r5]
+ mova m13, [r5 + mmsize]
+ lea r4, [r1 * 3]
+ sub r0, r1
+ lea r5, [r3 * 3]
+ mova m14, [pw_512]
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, m12
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, m12
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, m13
+ paddw m0, m4
+ pmaddubsw m2, m12
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, m13
+ paddw m1, m5
+ pmaddubsw m3, m12
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, m13
+ paddw m2, m6
+ pmaddubsw m4, m12
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, m13
+ paddw m3, m7
+ pmaddubsw m5, m12
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, m13
+ paddw m4, m8
+ pmaddubsw m6, m12
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, m13
+ paddw m5, m9
+ pmaddubsw m7, m12
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, m13
+ paddw m6, m10
+ pmaddubsw m8, m12
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, m13
+ paddw m7, m11
+ pmaddubsw m9, m12
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ packuswb m6, m7
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vpermq m6, m6, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ vextracti128 xm7, m6, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r5], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r5], xm7
+ lea r2, [r2 + r3 * 4]
+
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm6, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm6, 1
+ pmaddubsw m6, m10, m13
+ paddw m8, m6
+ pmaddubsw m10, m12
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 12
+ punpckhbw xm7, xm11, xm6
+ punpcklbw xm11, xm6
+ vinserti128 m11, m11, xm7, 1
+ pmaddubsw m7, m11, m13
+ paddw m9, m7
+ pmaddubsw m11, m12
+
+ movu xm7, [r0 + r1] ; m7 = row 13
+ punpckhbw xm0, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm0, 1
+ pmaddubsw m0, m6, m13
+ paddw m10, m0
+ pmaddubsw m6, m12
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm7, xm0
+ punpcklbw xm7, xm0
+ vinserti128 m7, m7, xm1, 1
+ pmaddubsw m1, m7, m13
+ paddw m11, m1
+ pmaddubsw m7, m12
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, m13
+ paddw m6, m2
+ pmaddubsw m0, m12
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, m13
+ paddw m7, m3
+ pmaddubsw m1, m12
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m2, m13
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m3, m13
+ paddw m1, m3
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m6, m14 ; m6 = word: row 12
+ pmulhrsw m7, m14 ; m7 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m6, m7
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m6, m6, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm7, m6, 1
+ vextracti128 xm1, m0, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r5], xm11
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm6
+ movu [r2 + r3], xm7
+ movu [r2 + r3 * 2], xm0
+ movu [r2 + r5], xm1
+ RET
+%endif
+
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
pmaddubsw m6, m0
paddw m2, m6
-mova m6, [tab_c_512]
+mova m6, [pw_512]
pmulhrsw m4, m6
pmulhrsw m2, m6
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
-mova m7, [tab_c_512]
+mova m7, [pw_512]
mov r4d, %2
FILTER_V4_W32 32, 48
FILTER_V4_W32 32, 64
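+; 32-wide vertical chroma filter: full 32-byte rows are filtered
+; directly in ymm registers, four output rows per iteration, with the
+; row counter in r6d (8 iterations for 32 rows).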
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
+ mov r4d, r4m
+ shl r4d, 6
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+ mova m10, [r5]
+ mova m11, [r5 + mmsize]
+ lea r4, [r1 * 3]
+ sub r0, r1
+ lea r5, [r3 * 3]
+ mova m12, [pw_512]
+ mov r6d, 8
+.loopW:
+ movu m0, [r0] ; m0 = row 0
+ movu m1, [r0 + r1] ; m1 = row 1
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
+ pmaddubsw m2, m10
+ pmaddubsw m3, m10
+ movu m0, [r0 + r1 * 2] ; m0 = row 2
+ punpcklbw m4, m1, m0
+ punpckhbw m5, m1, m0
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ movu m1, [r0 + r4] ; m1 = row 3
+ punpcklbw m6, m0, m1
+ punpckhbw m7, m0, m1
+ pmaddubsw m8, m6, m11
+ pmaddubsw m9, m7, m11
+ pmaddubsw m6, m10
+ pmaddubsw m7, m10
+ paddw m2, m8
+ paddw m3, m9
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ packuswb m2, m3
+ movu [r2], m2
+
+ lea r0, [r0 + r1 * 4]
+ movu m0, [r0] ; m0 = row 4
+ punpcklbw m2, m1, m0
+ punpckhbw m3, m1, m0
+ pmaddubsw m8, m2, m11
+ pmaddubsw m9, m3, m11
+ pmaddubsw m2, m10
+ pmaddubsw m3, m10
+ paddw m4, m8
+ paddw m5, m9
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ packuswb m4, m5
+ movu [r2 + r3], m4
+
+ movu m1, [r0 + r1] ; m1 = row 5
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m6, m4
+ paddw m7, m5
+ pmulhrsw m6, m12
+ pmulhrsw m7, m12
+ packuswb m6, m7
+ movu [r2 + r3 * 2], m6
+
+ movu m0, [r0 + r1 * 2] ; m0 = row 6
+ punpcklbw m6, m1, m0
+ punpckhbw m7, m1, m0
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m2, m6
+ paddw m3, m7
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ packuswb m2, m3
+ movu [r2 + r5], m2
+
+ lea r2, [r2 + r3 * 4]
+ dec r6d
+ jnz .loopW
+ RET
+%endif
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
pmaddubsw m7, m0
paddw m4, m7
-mova m7, [tab_c_512]
+mova m7, [pw_512]
pmulhrsw m4, m7
pmulhrsw m2, m7
mov r4d, r4m
; load constant
- mova m4, [tab_c_128]
+ mova m4, [pb_128]
mova m5, [tab_c_64_n64]
.loopH:
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
RET
%endmacro
+
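+; AVX2 8-tap vertical luma filter for 4x4: the eleven source rows (0-10)
+; are transposed into column-major dwords with punpcklbw/punpcklwd, taps
+; 0-3 and 4-7 are applied with two broadcast coefficient pairs, and
+; pmaddwd against pw_1 completes the per-pixel sums.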
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_4x4, 4,6,8
+ mov r4d, r4m
+ lea r5, [r1 * 3]
+ sub r0, r5
+
+ ; TODO: VPGATHERDD
+ movd xm1, [r0] ; m1 = row0
+ movd xm2, [r0 + r1] ; m2 = row1
+ punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]
+
+ movd xm3, [r0 + r1 * 2] ; m3 = row2
+ punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]
+ movd xm4, [r0 + r5]
+ punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]
+ punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+
+ lea r0, [r0 + r1 * 4]
+ movd xm5, [r0] ; m5 = row4
+ punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]
+ punpcklwd xm2, xm4 ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
+ vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+ movd xm2, [r0 + r1] ; m2 = row5
+ punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]
+ punpcklwd xm3, xm5 ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
+ movd xm6, [r0 + r1 * 2] ; m6 = row6
+ punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]
+ punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
+ vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
+ movd xm4, [r0 + r5] ; m4 = row7
+ punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]
+ punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+
+ lea r0, [r0 + r1 * 4]
+ movd xm7, [r0] ; m7 = row8
+ punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]
+ punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
+ vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+ movd xm2, [r0 + r1] ; m2 = row9
+ punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]
+ punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+ movd xm7, [r0 + r1 * 2] ; m7 = rowA
+ punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]
+ punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
+ vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+
+ ; load filter coeff
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8 + 0]
+ vpbroadcastd m2, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
+ vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+
+ pmaddubsw m1, m0
+ pmaddubsw m3, m0
+ pmaddubsw m5, m2
+ pmaddubsw m6, m2
+ vbroadcasti128 m0, [pw_1]
+ pmaddwd m1, m0
+ pmaddwd m3, m0
+ pmaddwd m5, m0
+ pmaddwd m6, m0
+ paddd m1, m5 ; m1 = DQWORD ROW[1 0]
+ paddd m3, m6 ; m3 = DQWORD ROW[3 2]
+ packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]
+
+ ; TODO: does it overflow?
+ pmulhrsw m1, [pw_512]
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]
+ movd [r2], xm1
+ pextrd [r2 + r3], xm1, 2
+ pextrd [r2 + r3 * 2], xm1, 1
+ lea r4, [r3 * 3]
+ pextrd [r2 + r4], xm1, 3
+ RET
+
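+; ps variant: rows are gathered with pinsrd/vinserti128 and re-paired
+; with vpermd; the filtered values are kept as 16-bit intermediates
+; (pw_2000 subtracted rather than rounded to bytes), so the destination
+; stride is doubled up front with add r3d, r3d.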
+INIT_YMM avx2
+cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+
+ add r3d, r3d
+
+ movd xm1, [r0]
+ pinsrd xm1, [r0 + r1], 1
+ pinsrd xm1, [r0 + r1 * 2], 2
+ pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0]
+ lea r0, [r0 + r1 * 4]
+ movd xm2, [r0]
+ pinsrd xm2, [r0 + r1], 1
+ pinsrd xm2, [r0 + r1 * 2], 2
+ pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4]
+ vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0]
+ lea r0, [r0 + r1 * 4]
+ movd xm3, [r0]
+ pinsrd xm3, [r0 + r1], 1
+ pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8]
+ vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4]
+ mova m3, [interp4_vpp_shuf1]
+ vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0]
+ vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4]
+ mova m3, [interp4_vpp_shuf1 + mmsize]
+ vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2]
+ vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6]
+
+ mova m3, [interp4_vpp_shuf]
+ pshufb m0, m0, m3
+ pshufb m1, m1, m3
+ pshufb m4, m4, m3
+ pshufb m2, m2, m3
+ pmaddubsw m0, [r5]
+ pmaddubsw m1, [r5 + mmsize]
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ pmaddubsw m2, [r5 + 3 * mmsize]
+ paddw m0, m1
+ paddw m0, m4
+ paddw m0, m2 ; m0 = WORD ROW[3 2 1 0]
+
+ vbroadcasti128 m3, [pw_2000]
+ psubw m0, m3
+ vextracti128 xm2, m0, 1
+ lea r5, [r3 * 3]
+ movq [r2], xm0
+ movhps [r2 + r3], xm0
+ movq [r2 + r3 * 2], xm2
+ movhps [r2 + r5], xm2
+ RET
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 16, ps
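+; Computes eight rows of the 8-tap vertical luma filter at width 8,
+; following the same two-rows-per-ymm scheme as the chroma helper but
+; with four coefficient pairs ([r5] .. [r5 + 3 * mmsize]); on exit m5,
+; m2, m1 and m4 hold the word results for rows 0-1, 2-3, 4-5 and 6-7.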
+%macro PROCESS_LUMA_AVX2_W8_8R 0
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+ vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ lea r0, [r0 + r1 * 4]
+ movq xm1, [r0] ; m1 = row 4
+ punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+ vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ movq xm4, [r0 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+ vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m5, m3
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ pmaddubsw m1, [r5]
+ movq xm3, [r0 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 8
+ punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+ vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ pmaddubsw m3, m4, [r5 + 3 * mmsize]
+ paddw m5, m3
+ pmaddubsw m3, m4, [r5 + 2 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m4, [r5 + 1 * mmsize]
+ paddw m1, m3
+ pmaddubsw m4, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ pmaddubsw m3, m0, [r5 + 3 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m0, [r5 + 2 * mmsize]
+ paddw m1, m3
+ pmaddubsw m0, [r5 + 1 * mmsize]
+ paddw m4, m0
+
+ movq xm3, [r0 + r4] ; m3 = row 11
+ punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 12
+ punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
+ vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+ pmaddubsw m3, m6, [r5 + 3 * mmsize]
+ paddw m1, m3
+ pmaddubsw m6, [r5 + 2 * mmsize]
+ paddw m4, m6
+ movq xm3, [r0 + r1] ; m3 = row 13
+ punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 14
+ punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+ pmaddubsw m0, [r5 + 3 * mmsize]
+ paddw m4, m0
+%endmacro
+
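+; Shortened form of PROCESS_LUMA_AVX2_W8_8R that produces only the first
+; four output rows (word results in m5 and m2).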
+%macro PROCESS_LUMA_AVX2_W8_4R 0
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+ vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ lea r0, [r0 + r1 * 4]
+ movq xm1, [r0] ; m1 = row 4
+ punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+ vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ movq xm4, [r0 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+ vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m5, m3
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ movq xm3, [r0 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 8
+ punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+ vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ pmaddubsw m3, m4, [r5 + 3 * mmsize]
+ paddw m5, m3
+ pmaddubsw m3, m4, [r5 + 2 * mmsize]
+ paddw m2, m3
+ movq xm3, [r0 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ pmaddubsw m3, m0, [r5 + 3 * mmsize]
+ paddw m2, m3
+%endmacro
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
RET
%endmacro
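+; Loop wrapper around PROCESS_LUMA_AVX2_W8_8R for taller 8-wide blocks.
+; With every gpr in use, the iteration count lives in a stack word (the
+; 0-gprsize stack allocation); each pass emits eight rows, and r0 is
+; rewound four rows afterwards so the next pass re-reads the overlap the
+; 8-tap kernel needs.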
+%macro FILTER_VER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r1 * 4]
+ mov word [rsp], %2 / 8
+ mova m7, [pw_512]
+
+.loop:
+ PROCESS_LUMA_AVX2_W8_8R
+ pmulhrsw m5, m7 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m7 ; m2 = word: row 2, row 3
+ pmulhrsw m1, m7 ; m1 = word: row 4, row 5
+ pmulhrsw m4, m7 ; m4 = word: row 6, row 7
+ packuswb m5, m2
+ packuswb m1, m4
+ vextracti128 xm2, m5, 1
+ vextracti128 xm4, m1, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ lea r2, [r2 + r3 * 2]
+ movhps [r2], xm5
+ movhps [r2 + r3], xm2
+ lea r2, [r2 + r3 * 2]
+ movq [r2], xm1
+ movq [r2 + r3], xm4
+ lea r2, [r2 + r3 * 2]
+ movhps [r2], xm1
+ movhps [r2 + r3], xm4
+ lea r2, [r2 + r3 * 2]
+ sub r0, r6
+ dec word [rsp]
+ jnz .loop
+ RET
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ PROCESS_LUMA_AVX2_W8_8R
+ lea r4, [r3 * 3]
+ mova m3, [pw_512]
+ pmulhrsw m5, m3 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m3 ; m2 = word: row 2, row 3
+ pmulhrsw m1, m3 ; m1 = word: row 4, row 5
+ pmulhrsw m4, m3 ; m4 = word: row 6, row 7
+ packuswb m5, m2
+ packuswb m1, m4
+ vextracti128 xm2, m5, 1
+ vextracti128 xm4, m1, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ movhps [r2 + r3 * 2], xm5
+ movhps [r2 + r4], xm2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm1
+ movq [r2 + r3], xm4
+ movhps [r2 + r3 * 2], xm1
+ movhps [r2 + r4], xm4
+ RET
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ PROCESS_LUMA_AVX2_W8_4R
+ lea r4, [r3 * 3]
+ mova m3, [pw_512]
+ pmulhrsw m5, m3 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m3 ; m2 = word: row 2, row 3
+ packuswb m5, m2
+ vextracti128 xm2, m5, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ movhps [r2 + r3 * 2], xm5
+ movhps [r2 + r4], xm2
+ RET
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 16, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 16
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 32, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 32
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_12xN 12, 16, ps
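+; AVX2 8-tap vertical luma filter for 12x16, unrolled over all 16 rows
+; (x86-64 only, m0-m14 in use); each 16-wide row is filtered in full and
+; the 12 valid output pixels are written as a movq plus a pextrd dword.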
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movq [r2], xm0
+ pextrd [r2 + 8], xm0, 2
+ movq [r2 + r3], xm1
+ pextrd [r2 + r3 + 8], xm1, 2
+ movq [r2 + r3 * 2], xm2
+ pextrd [r2 + r3 * 2 + 8], xm2, 2
+ movq [r2 + r6], xm3
+ pextrd [r2 + r6 + 8], xm3, 2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm4
+ pextrd [r2 + 8], xm4, 2
+ movq [r2 + r3], xm5
+ pextrd [r2 + r3 + 8], xm5, 2
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movq [r2 + r3 * 2], xm6
+ pextrd [r2 + r3 * 2 + 8], xm6, 2
+ movq [r2 + r6], xm7
+ pextrd [r2 + r6 + 8], xm7, 2
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r0 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r0 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r0 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movq [r2], xm8
+ pextrd [r2 + 8], xm8, 2
+ movq [r2 + r3], xm9
+ pextrd [r2 + r3 + 8], xm9, 2
+ movq [r2 + r3 * 2], xm10
+ pextrd [r2 + r3 * 2 + 8], xm10, 2
+ movq [r2 + r6], xm11
+ pextrd [r2 + r6 + 8], xm11, 2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm12
+ pextrd [r2 + 8], xm12, 2
+ movq [r2 + r3], xm13
+ pextrd [r2 + r3 + 8], xm13, 2
+ movq [r2 + r3 * 2], xm0
+ pextrd [r2 + r3 * 2 + 8], xm0, 2
+ movq [r2 + r6], xm1
+ pextrd [r2 + r6 + 8], xm1, 2
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r0 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r0 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r0 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r6], xm11
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm12
+ movu [r2 + r3], xm13
+ movu [r2 + r3 * 2], xm0
+ movu [r2 + r6], xm1
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ packuswb m8, m9
+ packuswb m10, m11
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r6], xm11
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m12, [pw_512]
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+
+ pmulhrsw m0, m12 ; m0 = word: row 0
+ pmulhrsw m1, m12 ; m1 = word: row 1
+ pmulhrsw m2, m12 ; m2 = word: row 2
+ pmulhrsw m3, m12 ; m3 = word: row 3
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ RET
+%endif
+
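+; FILTER_VER_LUMA_AVX2_16xN %1 (width) %2 (height): looping version of the
+; 16-wide vertical filter. Each .loop iteration produces 16 output rows; r0
+; advances 20 source rows inside the body, so sub r0, r7 (r7 = r1 * 4) rewinds
+; it to a net +16 rows before the next iteration. r8d counts the %2 / 16
+; iterations.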
+%macro FILTER_VER_LUMA_AVX2_16xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ lea r7, [r1 * 4]
+ mova m14, [pw_512]
+ mov r8d, %2 / 16
+
+.loop:
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r0 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r0 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r0 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r6], xm11
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm12
+ movu [r2 + r3], xm13
+ movu [r2 + r3 * 2], xm0
+ movu [r2 + r6], xm1
+ lea r2, [r2 + r3 * 4]
+ sub r0, r7
+ dec r8d
+ jnz .loop
+ RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16xN 16, 32
+FILTER_VER_LUMA_AVX2_16xN 16, 64
+
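+; PROCESS_LUMA_AVX2_W16_16R: filter one 16x16 block. Unlike the fixed-size
+; routines above, the roving source/dest pointers are kept in r7/r8 so that
+; r0/r2 still point at the block origin on exit; callers tile horizontally
+; with add r0, 16 / add r2, 16 and then use r7/r8 to step down to the next
+; 16-row band.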
+%macro PROCESS_LUMA_AVX2_W16_16R 0
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r7, [r0 + r1 * 4]
+ movu xm4, [r7] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r7 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r7 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r7 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm8, [r7] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r7 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r7 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r7 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm12, [r7] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r8, [r2 + r3 * 4]
+ movu [r8], xm4
+ movu [r8 + r3], xm5
+
+ movu xm13, [r7 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r7 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r8 + r3 * 2], xm6
+ movu [r8 + r6], xm7
+ lea r8, [r8 + r3 * 4]
+
+ movu xm1, [r7 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm2, [r7] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r7 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r7 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r7 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r7, [r7 + r1 * 4]
+ movu xm6, [r7] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r7 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r7 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movu [r8], xm8
+ movu [r8 + r3], xm9
+ movu [r8 + r3 * 2], xm10
+ movu [r8 + r6], xm11
+ lea r8, [r8 + r3 * 4]
+ movu [r8], xm12
+ movu [r8 + r3], xm13
+ movu [r8 + r3 * 2], xm0
+ movu [r8 + r6], xm1
+%endmacro
+
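+; PROCESS_LUMA_AVX2_W16_8R: 8-row variant of the block above; reads source
+; rows 0-14 (8 outputs plus 7 rows of 8-tap context) and writes rows 0-7.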
+%macro PROCESS_LUMA_AVX2_W16_8R 0
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r7, [r0 + r1 * 4]
+ movu xm4, [r7] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r7 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r7 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r7 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm8, [r7] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r7 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ movu xm10, [r7 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ movu xm11, [r7 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ lea r7, [r7 + r1 * 4]
+ movu xm12, [r7] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r8, [r2 + r3 * 4]
+ movu [r8], xm4
+ movu [r8 + r3], xm5
+
+ movu xm13, [r7 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ movu xm0, [r7 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r8 + r3 * 2], xm6
+ movu [r8 + r6], xm7
+%endmacro
+
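+; 24x32: the left 16 columns go through PROCESS_LUMA_AVX2_W16_16R; the
+; remaining 8 columns are filtered inline with movq (8-pixel) loads, packing
+; two output rows per ymm register. r9d = 2 vertical halves of 16 rows each.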
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ lea r10, [r1 * 4]
+ mova m14, [pw_512]
+ mov r9d, 2
+.loopH:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3
+ vinserti128 m5, m1, xm2, 1
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4
+ lea r7, [r0 + r1 * 4]
+ movq xm1, [r7] ; m1 = row 4
+ punpcklbw xm4, xm1
+ vinserti128 m2, m3, xm4, 1
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3
+ movq xm4, [r7 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m5, m3
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ pmaddubsw m1, [r5]
+ movq xm3, [r7 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm0, [r7] ; m0 = row 8
+ punpcklbw xm3, xm0
+ vinserti128 m4, m4, xm3, 1
+ pmaddubsw m3, m4, [r5 + 3 * mmsize]
+ paddw m5, m3
+ pmaddubsw m3, m4, [r5 + 2 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m4, [r5 + 1 * mmsize]
+ paddw m1, m3
+ pmaddubsw m4, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3
+ movq xm6, [r7 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6
+ vinserti128 m0, m0, xm3, 1
+ pmaddubsw m3, m0, [r5 + 3 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m0, [r5 + 2 * mmsize]
+ paddw m1, m3
+ pmaddubsw m3, m0, [r5 + 1 * mmsize]
+ paddw m4, m3
+ pmaddubsw m0, [r5]
+
+ movq xm3, [r7 + r4] ; m3 = row 11
+ punpcklbw xm6, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm7, [r7] ; m7 = row 12
+ punpcklbw xm3, xm7
+ vinserti128 m6, m6, xm3, 1
+ pmaddubsw m3, m6, [r5 + 3 * mmsize]
+ paddw m1, m3
+ pmaddubsw m3, m6, [r5 + 2 * mmsize]
+ paddw m4, m3
+ pmaddubsw m3, m6, [r5 + 1 * mmsize]
+ paddw m0, m3
+ pmaddubsw m6, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 13
+ punpcklbw xm7, xm3
+ movq xm8, [r7 + r1 * 2] ; m8 = row 14
+ punpcklbw xm3, xm8
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m3, m7, [r5 + 3 * mmsize]
+ paddw m4, m3
+ pmaddubsw m3, m7, [r5 + 2 * mmsize]
+ paddw m0, m3
+ pmaddubsw m3, m7, [r5 + 1 * mmsize]
+ paddw m6, m3
+ pmaddubsw m7, [r5]
+ movq xm3, [r7 + r4] ; m3 = row 15
+ punpcklbw xm8, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm9, [r7] ; m9 = row 16
+ punpcklbw xm3, xm9
+ vinserti128 m8, m8, xm3, 1
+ pmaddubsw m3, m8, [r5 + 3 * mmsize]
+ paddw m0, m3
+ pmaddubsw m3, m8, [r5 + 2 * mmsize]
+ paddw m6, m3
+ pmaddubsw m3, m8, [r5 + 1 * mmsize]
+ paddw m7, m3
+ pmaddubsw m8, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 17
+ punpcklbw xm9, xm3
+ movq xm10, [r7 + r1 * 2] ; m10 = row 18
+ punpcklbw xm3, xm10
+ vinserti128 m9, m9, xm3, 1
+ pmaddubsw m3, m9, [r5 + 3 * mmsize]
+ paddw m6, m3
+ pmaddubsw m3, m9, [r5 + 2 * mmsize]
+ paddw m7, m3
+ pmaddubsw m3, m9, [r5 + 1 * mmsize]
+ paddw m8, m3
+ movq xm3, [r7 + r4] ; m3 = row 19
+ punpcklbw xm10, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm9, [r7] ; m9 = row 20
+ punpcklbw xm3, xm9
+ vinserti128 m10, m10, xm3, 1
+ pmaddubsw m3, m10, [r5 + 3 * mmsize]
+ paddw m7, m3
+ pmaddubsw m3, m10, [r5 + 2 * mmsize]
+ paddw m8, m3
+ movq xm3, [r7 + r1] ; m3 = row 21
+ punpcklbw xm9, xm3
+ movq xm10, [r7 + r1 * 2] ; m10 = row 22
+ punpcklbw xm3, xm10
+ vinserti128 m9, m9, xm3, 1
+ pmaddubsw m3, m9, [r5 + 3 * mmsize]
+ paddw m8, m3
+
+ pmulhrsw m5, m14 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2, row 3
+ pmulhrsw m1, m14 ; m1 = word: row 4, row 5
+ pmulhrsw m4, m14 ; m4 = word: row 6, row 7
+ pmulhrsw m0, m14 ; m0 = word: row 8, row 9
+ pmulhrsw m6, m14 ; m6 = word: row 10, row 11
+ pmulhrsw m7, m14 ; m7 = word: row 12, row 13
+ pmulhrsw m8, m14 ; m8 = word: row 14, row 15
+ packuswb m5, m2
+ packuswb m1, m4
+ packuswb m0, m6
+ packuswb m7, m8
+ vextracti128 xm2, m5, 1
+ vextracti128 xm4, m1, 1
+ vextracti128 xm6, m0, 1
+ vextracti128 xm8, m7, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ movhps [r2 + r3 * 2], xm5
+ movhps [r2 + r6], xm2
+ lea r8, [r2 + r3 * 4]
+ movq [r8], xm1
+ movq [r8 + r3], xm4
+ movhps [r8 + r3 * 2], xm1
+ movhps [r8 + r6], xm4
+ lea r8, [r8 + r3 * 4]
+ movq [r8], xm0
+ movq [r8 + r3], xm6
+ movhps [r8 + r3 * 2], xm0
+ movhps [r8 + r6], xm6
+ lea r8, [r8 + r3 * 4]
+ movq [r8], xm7
+ movq [r8 + r3], xm8
+ movhps [r8 + r3 * 2], xm7
+ movhps [r8 + r6], xm8
+
+ sub r7, r10
+ lea r0, [r7 - 16]
+ lea r2, [r8 + r3 * 4 - 16]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+
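+; FILTER_VER_LUMA_AVX2_32xN %1 (width) %2 (height): tile the block into
+; 16x16 sub-blocks; r10d walks the %1 / 16 columns, r9d the %2 / 16 row
+; bands, and r7/r8 (left behind by the W16_16R macro) reposition src/dst
+; for the next band.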
+%macro FILTER_VER_LUMA_AVX2_32xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ lea r11, [r1 * 4]
+ mova m14, [pw_512]
+ mov r9d, %2 / 16
+.loopH:
+ mov r10d, %1 / 16
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r10d
+ jnz .loopW
+ sub r7, r11
+ lea r0, [r7 - 16]
+ lea r2, [r8 + r3 * 4 - 16]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32xN 32, 32
+FILTER_VER_LUMA_AVX2_32xN 32, 64
+
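+; 32x16: a single band of two 16x16 blocks, so no row-band loop is needed.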
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+ mov r9d, 2
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ RET
+%endif
+
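+; 32x24: two 16-wide columns of 16 rows, reposition via r7/r8, then two
+; 16-wide columns of the 8-row block for the remaining rows.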
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+ mov r9d, 2
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ lea r9, [r1 * 4]
+ sub r7, r9
+ lea r0, [r7 - 16]
+ lea r2, [r8 + r3 * 4 - 16]
+ mov r9d, 2
+.loop:
+ PROCESS_LUMA_AVX2_W16_8R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loop
+ RET
+%endif
+
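+; 32x8: two side-by-side passes of the 8-row block.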
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+ mov r9d, 2
+.loopW:
+ PROCESS_LUMA_AVX2_W16_8R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ RET
+%endif
+
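+; 48x64: three 16-wide columns (r10d = 3) by four 16-row bands (r9d = 4);
+; lea r0, [r7 - 32] rewinds from the last column's origin back to column 0
+; when stepping down to the next band.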
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ lea r11, [r1 * 4]
+ mova m14, [pw_512]
+ mov r9d, 4
+.loopH:
+ mov r10d, 3
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r10d
+ jnz .loopW
+ sub r7, r11
+ lea r0, [r7 - 32]
+ lea r2, [r8 + r3 * 4 - 32]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+
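+; FILTER_VER_LUMA_AVX2_64xN %1 (width) %2 (height): same tiling as the 32xN
+; macro but with four 16-wide columns, hence the -48 rewind per row band.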
+%macro FILTER_VER_LUMA_AVX2_64xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ lea r11, [r1 * 4]
+ mova m14, [pw_512]
+ mov r9d, %2 / 16
+.loopH:
+ mov r10d, %1 / 16
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r10d
+ jnz .loopW
+ sub r7, r11
+ lea r0, [r7 - 48]
+ lea r2, [r8 + r3 * 4 - 48]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_64xN 64, 32
+FILTER_VER_LUMA_AVX2_64xN 64, 48
+FILTER_VER_LUMA_AVX2_64xN 64, 64
+
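+; 64x16: a single band of four 16x16 blocks.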
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ mova m14, [pw_512]
+ mov r9d, 4
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ RET
+%endif
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
mov r4d, r4m
; load constant
- mova m2, [tab_c_128]
+ mova m2, [pb_128]
mova m3, [tab_c_64_n64]
.loopH:
#define X265_IPFILTER8_H
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
- void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
- void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+ void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
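+/* for illustration, SETUP_LUMA_FUNC_DEF(4, 4, _avx2) declares:
+ *   void x265_interp_8tap_horiz_pp_4x4_avx2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+ *   void x265_interp_8tap_horiz_ps_4x4_avx2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+ * plus the matching _vert_pp / _vert_ps prototypes */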
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
SETUP_LUMA_FUNC_DEF(16, 64, cpu)
#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
- void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
#define LUMA_SP_FILTERS(cpu) \
SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \
SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
- void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#define LUMA_SS_FILTERS(cpu) \
SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \
#if HIGH_BIT_DEPTH
#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#define CHROMA_VERT_FILTERS(cpu) \
SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu)
#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+ void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
#define CHROMA_HORIZ_FILTERS(cpu) \
SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \
SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu)
-void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
+void x265_luma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
CHROMA_VERT_FILTERS(_sse2);
CHROMA_HORIZ_FILTERS(_sse4);
#else // if HIGH_BIT_DEPTH
#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
- void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+ void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#define CHROMA_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_FUNC_DEF(16, 64, cpu);
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
#define CHROMA_SP_FILTERS(cpu) \
SETUP_CHROMA_SP_FUNC_DEF(8, 2, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu);
#define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#define CHROMA_SS_FILTERS(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu);
CHROMA_FILTERS(_sse4);
+CHROMA_FILTERS(_avx2);
CHROMA_SP_FILTERS(_sse2);
CHROMA_SP_FILTERS_SSE4(_sse4);
CHROMA_SS_FILTERS(_sse2);
CHROMA_SS_FILTERS_SSE4(_sse4);
CHROMA_FILTERS_422(_sse4);
+CHROMA_FILTERS_422(_avx2);
CHROMA_SP_FILTERS_422(_sse2);
CHROMA_SP_FILTERS_422_SSE4(_sse4);
CHROMA_SS_FILTERS_422(_sse2);
CHROMA_SP_FILTERS_444(_sse4);
CHROMA_SS_FILTERS_444(_sse2);
-void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
#undef SETUP_CHROMA_FUNC_DEF
#undef SETUP_CHROMA_SP_FUNC_DEF
LUMA_SS_FILTERS(_sse2);
LUMA_FILTERS(_avx2);
-void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
-void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_interp_8tap_hv_pp_8x8_ssse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
+void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
#define X265_MC_H
#define LOWRES(cpu) \
- void x265_frame_init_lowres_core_ ## cpu(pixel * src0, pixel * dst0, pixel * dsth, pixel * dstv, pixel * dstc, \
+ void x265_frame_init_lowres_core_ ## cpu(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
intptr_t src_stride, intptr_t dst_stride, int width, int height);
LOWRES(mmx2)
LOWRES(sse2)
void func ## _mmx2 args; \
void func ## _sse2 args; \
void func ## _ssse3 args;
-DECL_SUF(x265_pixel_avg_64x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x48, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_48x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x24, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_24x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x12, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_12x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x48, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_48x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x24, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_24x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x12, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_12x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
#undef LOWRES
#undef DECL_SUF
#ifndef X265_PIXEL_UTIL_H
#define X265_PIXEL_UTIL_H
-void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-
-void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual16_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual32_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-
-void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
-
-void x265_transpose8_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride);
-
-uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
-
-void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-
-void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
- const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_sse2(const pixel * pix1, intptr_t stride1,
- const pixel * pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_avx(const pixel * pix1, intptr_t stride1,
- const pixel * pix2, intptr_t stride2, int sums[2][4]);
+void x265_getResidual4_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual8_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+
+void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose16_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose32_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose64_sse2(pixel* dest, const pixel* src, intptr_t stride);
+
+void x265_transpose8_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose16_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose32_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose64_avx2(pixel* dest, const pixel* src, intptr_t stride);
+
+uint32_t x265_quant_sse4(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_quant_avx2(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_sse4(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+int x265_count_nonzero_ssse3(const int16_t* quantCoeff, int numCoeff);
+
+void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_sp_sse4(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+
+void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t* pix1, intptr_t stride1,
+ const uint8_t* pix2, intptr_t stride2, int sums[2][4]);
+void x265_pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1,
+ const pixel* pix2, intptr_t stride2, int sums[2][4]);
+void x265_pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1,
+ const pixel* pix2, intptr_t stride2, int sums[2][4]);
float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width);
float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width);
-void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
-void x265_scale1D_128to64_avx2(pixel *, pixel *, intptr_t);
-void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
+void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
+void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
- void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
- void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
+ void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
+    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* src1, intptr_t srcStride0, intptr_t srcStride1);
#define CHROMA_PIXELSUB_DEF(cpu) \
SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
- void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
- void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
+ void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
+    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* src1, intptr_t srcStride0, intptr_t srcStride1);
#define LUMA_PIXELSUB_DEF(cpu) \
SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
CHROMA_PIXELSUB_DEF_422(_sse2);
#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
- uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
+ uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(const pixel* pix, intptr_t pixstride);
#define LUMA_PIXELVAR_DEF(cpu) \
SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \
cextern pd_32767
cextern pd_n32768
-;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 4/2
-.loop:
- movh m0, [r0]
- movh m1, [r0 + r4]
- punpcklqdq m0, m1
- movh m2, [r1]
- movh m3, [r1 + r4]
- punpcklqdq m2, m3
- paddw m0, m2
- CLIPW m0, m4, m5
-
- ; store recipred[]
- movh [r3], m0
- movhps [r3 + r6], m0
-
- ; store recqt[]
- movh [r2], m0
- movhps [r2 + r5], m0
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 4/2
-.loop:
- movd m1, [r0]
- movd m2, [r0 + r4]
- punpckldq m1, m2
- punpcklbw m1, m0
- movh m2, [r1]
- movh m3, [r1 + r4 * 2]
- punpcklqdq m2, m3
- paddw m1, m2
- packuswb m1, m1
-
- ; store recon[] and recipred[]
- movd [r3], m1
- pshufd m2, m1, 1
- movd [r3 + r6], m2
-
- ; store recqt[]
- punpcklbw m1, m0
- movh [r2], m1
- movhps [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons8, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons8, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
-%if HIGH_BIT_DEPTH
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 8/2
-.loop:
- movu m0, [r0]
- movu m1, [r0 + r4]
- movu m2, [r1]
- movu m3, [r1 + r4]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[]
- movu [r3], m0
- movu [r3 + r6], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 8/2
-.loop:
- movh m1, [r0]
- movh m2, [r0 + r4]
- punpcklbw m1, m0
- punpcklbw m2, m0
- movu m3, [r1]
- movu m4, [r1 + r4 * 2]
- paddw m1, m3
- paddw m2, m4
- packuswb m1, m2
-
- ; store recon[] and recipred[]
- movh [r3], m1
- movhps [r3 + r6], m1
-
- ; store recqt[]
- punpcklbw m2, m1, m0
- punpckhbw m1, m0
- movu [r2], m2
- movu [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 16/2
-.loop:
- movu m0, [r0]
- movu m1, [r0 + 16]
- movu m2, [r1]
- movu m3, [r1 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[]
- movu [r3], m0
- movu [r3 + 16], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + r4]
- movu m1, [r0 + r4 + 16]
- movu m2, [r1 + r4]
- movu m3, [r1 + r4 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + r6], m0
- movu [r3 + r6 + 16], m1
-
- ; store recqt[]
- movu [r2 + r5], m0
- movu [r2 + r5 + 16], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 16
-.loop:
- movu m2, [r0]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- paddw m1, [r1]
- paddw m2, [r1 + 16]
- packuswb m1, m2
-
- ; store recon[] and recipred[]
- movu [r3], m1
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [r2], m2
- movu [r2 + 16], m1
-
- add r2, r5
- add r3, r6
- add r0, r4
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 32/2
-.loop:
-
- movu m0, [r0]
- movu m1, [r0 + 16]
- movu m2, [r1]
- movu m3, [r1 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[]
- movu [r3], m0
- movu [r3 + 16], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + 32]
- movu m1, [r0 + 48]
- movu m2, [r1 + 32]
- movu m3, [r1 + 48]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + 32], m0
- movu [r3 + 48], m1
-
- ; store recqt[]
- movu [r2 + 32], m0
- movu [r2 + 48], m1
- add r2, r5
-
- movu m0, [r0 + r4]
- movu m1, [r0 + r4 + 16]
- movu m2, [r1 + r4]
- movu m3, [r1 + r4 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + r6], m0
- movu [r3 + r6 + 16], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + r4 + 32]
- movu m1, [r0 + r4 + 48]
- movu m2, [r1 + r4 + 32]
- movu m3, [r1 + r4 + 48]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + r6 + 32], m0
- movu [r3 + r6 + 48], m1
- lea r3, [r3 + r6 * 2]
-
- ; store recqt[]
- movu [r2 + 32], m0
- movu [r2 + 48], m1
- add r2, r5
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 32
-.loop:
- movu m2, [r0]
- movu m4, [r0 + 16]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- pmovzxbw m3, m4
- punpckhbw m4, m0
-
- paddw m1, [r1 + 0 * 16]
- paddw m2, [r1 + 1 * 16]
- packuswb m1, m2
-
- paddw m3, [r1 + 2 * 16]
- paddw m4, [r1 + 3 * 16]
- packuswb m3, m4
-
- ; store recon[] and recipred[]
- movu [r3], m1
- movu [r3 + 16], m3
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [r2 + 0 * 16], m2
- movu [r2 + 1 * 16], m1
- pmovzxbw m4, m3
- punpckhbw m3, m0
- movu [r2 + 2 * 16], m4
- movu [r2 + 3 * 16], m3
-
- add r2, r5
- add r3, r6
- add r0, r4
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
-; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
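For readers tracking the coefficient type change (int32_t to int16_t, loaded below with pmovsxwd), this is roughly the scalar loop the quant kernels vectorize. It is a sketch only, with the clamp spelled out and a 64-bit intermediate for safety, not verbatim x265 source; nquant further down is the same loop minus the deltaU side output.

    #include <stdint.h>

    /* sketch of the forward quant these kernels implement; names illustrative */
    static uint32_t quant_ref(const int16_t* coef, const int32_t* quantCoeff,
                              int32_t* deltaU, int16_t* qCoef,
                              int qBits, int add, int numCoeff)
    {
        uint32_t numSig = 0;                     /* count of nonzero levels */
        for (int i = 0; i < numCoeff; i++)
        {
            int sign = coef[i] < 0 ? -1 : 1;
            int64_t tmp = (int64_t)(sign * coef[i]) * quantCoeff[i];  /* abs * scale */
            int level = (int)((tmp + add) >> qBits);
            /* rounding remainder, later used for sign-bit hiding */
            deltaU[i] = (int32_t)((tmp - ((int64_t)level << qBits)) >> (qBits - 8));
            if (level)
                numSig++;
            level *= sign;
            qCoef[i] = (int16_t)(level < -32768 ? -32768 : (level > 32767 ? 32767 : level));
        }
        return numSig;
    }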
INIT_XMM sse4
cglobal quant, 5,6,8
pxor m7, m7 ; m7 = numZero
.loop:
; 4 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
movh [r3], m3
; 4 coeff
- movu m0, [r0 + 16] ; m0 = level
+ pmovsxwd m0, [r0 + 8] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + 16] ; m0 = tmpLevel1
paddd m2, m1, m5
packssdw m3, m3
movh [r3 + 8], m3
- add r0, 32
+ add r0, 16
add r1, 32
add r2, 32
add r3, 16
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
psignd m2, m0
; 8 coeff
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
paddd m3, m1, m5
pminuw m2, m9
paddw m7, m2
- add r0, mmsize*2
+ add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
movu [r3], xm3
; 8 coeff
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
paddd m2, m1, m5
vpermq m3, m3, q0020
movu [r3 + mmsize/2], xm3
- add r0, mmsize*2
+ add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
;-----------------------------------------------------------------------------
-; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal nquant, 3,5,8
shr r4d, 3
.loop:
- movu m0, [r0] ; m0 = level
- movu m1, [r0 + 16] ; m1 = level
+ pmovsxwd m0, [r0] ; m0 = level
+ pmovsxwd m1, [r0 + 8] ; m1 = level
pabsd m2, m0
pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff
packssdw m2, m3
movu [r2], m2
- add r0, 32
+ add r0, 16
add r1, 32
add r2, 16
shr r4d, 4
.loop:
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff
paddd m1, m4
psrad m1, xm3 ; m0 = level1
psignd m1, m0
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m2, m0
pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff
paddd m2, m4
vpermq m2, m1, q3120
movu [r2], m2
- add r0, mmsize * 2
+ add r0, mmsize
add r1, mmsize * 2
add r2, mmsize
pmaddwd m4, m1
psrad m3, m0
psrad m4, m0
- packssdw m3, m3 ; OPT_ME: store must be 32 bits
- pmovsxwd m3, m3
- packssdw m4, m4
- pmovsxwd m4, m4
+ packssdw m3, m4
mova [r1], m3
- mova [r1 + 16], m4
add r0, 16
- add r1, 32
+ add r1, 16
sub r2d, 8
jnz .loop
pmaxsd m3, m6
pminsd m4, m5
pmaxsd m4, m6
+ packssdw m3, m4
mova [r1 + 0 * mmsize/2], xm3
- mova [r1 + 1 * mmsize/2], xm4
- vextracti128 [r1 + 2 * mmsize/2], m3, 1
- vextracti128 [r1 + 3 * mmsize/2], m4, 1
+ vextracti128 [r1 + 1 * mmsize/2], m3, 1
add r0, mmsize
- add r1, mmsize * 2
+ add r1, mmsize
dec r2d
jnz .loop
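In both dequant stores above the single packssdw replaces the old pack/re-extend round trip (the removed "packssdw m3, m3" followed by "pmovsxwd m3, m3", flagged by the OPT_ME comment) and the 32-bit stores: packssdw saturates signed 32-bit lanes to [-32768, 32767], exactly the output range now that coefficients travel as int16_t, and the store traffic is halved (hence "add r1, 16" and "add r1, mmsize" in place of 32 and mmsize*2).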
ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
#define DECL_X1(name, suffix) \
- DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
+ DECL_PIXELS(int, name, suffix, (const pixel*, intptr_t, const pixel*, intptr_t))
#define DECL_X1_SS(name, suffix) \
- DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
+ DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const int16_t*, intptr_t))
#define DECL_X1_SP(name, suffix) \
- DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, pixel *, intptr_t))
+ DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const pixel*, intptr_t))
#define DECL_X4(name, suffix) \
- DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
- DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
+ DECL_PIXELS(void, name ## _x3, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) \
+ DECL_PIXELS(void, name ## _x4, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*))
/* sad-a.asm */
DECL_X1(sad, mmx2)
DECL_X1(satd, avx)
DECL_X1(satd, xop)
DECL_X1(satd, avx2)
-int x265_pixel_satd_8x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x4_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x12_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x64_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
DECL_X1(sa8d, mmx2)
DECL_X1(sa8d, sse2)
DECL_X1_SS(ssd_ss, avx2)
DECL_X1_SP(ssd_sp, sse4)
#define DECL_HEVC_SSD(suffix) \
- int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x24_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_24x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x12_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t);
+ int x265_pixel_ssd_32x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x24_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_24x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x12_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t);
DECL_HEVC_SSD(sse2)
DECL_HEVC_SSD(ssse3)
DECL_HEVC_SSD(avx)
-int x265_pixel_ssd_12x16_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_24x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_48x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x16_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_ssd_12x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_24x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_48x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x48_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_4_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
#define ADDAVG(func) \
- void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+ void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
ADDAVG(addAvg_2x4)
ADDAVG(addAvg_2x8)
ADDAVG(addAvg_4x2);
ADDAVG(addAvg_24x64)
ADDAVG(addAvg_32x48)
-void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
-void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
+void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
m_hChromaShift = CHROMA_H_SHIFT(csp);
m_vChromaShift = CHROMA_V_SHIFT(csp);
- // set width and height
m_size = size;
- m_csize = size >> m_hChromaShift;
m_part = partitionFromSizes(size, size);
- size_t sizeL = size * size;
- size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift);
+ if (csp == X265_CSP_I400)
+ {
+ CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
+ m_buf[1] = m_buf[2] = 0;
+ m_csize = MAX_INT;
+ return true;
+ }
+ else
+ {
+ m_csize = size >> m_hChromaShift;
- X265_CHECK((sizeC & 15) == 0, "invalid size");
+ size_t sizeL = size * size;
+ size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift);
- // memory allocation (padded for SIMD reads)
- CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8);
- m_buf[1] = m_buf[0] + sizeL;
- m_buf[2] = m_buf[0] + sizeL + sizeC;
- return true;
+ X265_CHECK((sizeC & 15) == 0, "invalid size");
+
+ // memory allocation (padded for SIMD reads)
+ CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8);
+ m_buf[1] = m_buf[0] + sizeL;
+ m_buf[2] = m_buf[0] + sizeL + sizeC;
+ return true;
+ }
fail:
return false;
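A worked example of the allocation arithmetic above: a 64x64 CU in i420 has m_hChromaShift = m_vChromaShift = 1, so sizeL = 64 * 64 = 4096 pixels and sizeC = 4096 >> 2 = 1024. The single CHECKED_MALLOC therefore reserves 4096 + 2 * 1024 + 8 = 6152 pixels, with m_buf[1] (Cb) starting at offset 4096, m_buf[2] (Cr) at offset 5120, and 8 pixels of tail padding so SIMD kernels may safely overread the final row.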
void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) const
{
pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
-
primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size);
pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
{
- /* We cheat with const_cast internally because the get methods are not capable of
- * returning const buffers and the primitives are not const aware, but we know
- * this function does not modify srcPic */
- PicYuv& srcPicSafe = const_cast<PicYuv&>(srcPic);
- pixel* srcY = srcPicSafe.getLumaAddr(cuAddr, absPartIdx);
-
+ const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride);
- pixel* srcU = srcPicSafe.getCbAddr(cuAddr, absPartIdx);
- pixel* srcV = srcPicSafe.getCrAddr(cuAddr, absPartIdx);
- primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPicSafe.m_strideC);
- primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPicSafe.m_strideC);
+ const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
+ const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
+ primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPic.m_strideC);
+ primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPic.m_strideC);
}
void Yuv::copyFromYuv(const Yuv& srcYuv)
{
- X265_CHECK(m_size <= srcYuv.m_size, "invalid size\n");
+ X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
}
+/* This version is intended for use by ME, which requires FENC_STRIDE for luma fenc pixels */
+void Yuv::copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma)
+{
+ X265_CHECK(m_size == FENC_STRIDE && m_size >= srcYuv.m_size, "PU buffer size mismatch\n");
+
+ const pixel* srcY = srcYuv.m_buf[0] + getAddrOffset(absPartIdx, srcYuv.m_size);
+ primitives.luma_copy_pp[partEnum](m_buf[0], m_size, srcY, srcYuv.m_size);
+
+ if (bChroma)
+ {
+ const pixel* srcU = srcYuv.m_buf[1] + srcYuv.getChromaAddrOffset(absPartIdx);
+ const pixel* srcV = srcYuv.m_buf[2] + srcYuv.getChromaAddrOffset(absPartIdx);
+ primitives.chroma[m_csp].copy_pp[partEnum](m_buf[1], m_csize, srcU, srcYuv.m_csize);
+ primitives.chroma[m_csp].copy_pp[partEnum](m_buf[2], m_csize, srcV, srcYuv.m_csize);
+ }
+}
+
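A hedged usage sketch for the new method (the caller below is hypothetical; only copyPUFromYuv itself comes from this patch). The X265_CHECK demands a FENC_STRIDE-sized destination, so a typical caller stages one PU's fenc pixels before motion search:

    Yuv meFenc;                       /* illustrative ME staging buffer */
    meFenc.create(FENC_STRIDE, csp);  /* satisfies the m_size == FENC_STRIDE check */
    meFenc.copyPUFromYuv(*interMode.fencYuv, puAbsPartIdx, partEnum, false /* luma only */);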
void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const
{
pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
{
pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
pixel* dstY = dstYuv.m_buf[0];
-
primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size);
pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
if (bLuma)
{
- int16_t* srcY0 = const_cast<ShortYuv&>(srcYuv0).getLumaAddr(absPartIdx);
- int16_t* srcY1 = const_cast<ShortYuv&>(srcYuv1).getLumaAddr(absPartIdx);
+ const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
+ const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
pixel* dstY = getLumaAddr(absPartIdx);
-
primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
}
if (bChroma)
{
- int16_t* srcU0 = const_cast<ShortYuv&>(srcYuv0).getCbAddr(absPartIdx);
- int16_t* srcV0 = const_cast<ShortYuv&>(srcYuv0).getCrAddr(absPartIdx);
- int16_t* srcU1 = const_cast<ShortYuv&>(srcYuv1).getCbAddr(absPartIdx);
- int16_t* srcV1 = const_cast<ShortYuv&>(srcYuv1).getCrAddr(absPartIdx);
+ const int16_t* srcU0 = srcYuv0.getCbAddr(absPartIdx);
+ const int16_t* srcV0 = srcYuv0.getCrAddr(absPartIdx);
+ const int16_t* srcU1 = srcYuv1.getCbAddr(absPartIdx);
+ const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
pixel* dstU = getCbAddr(absPartIdx);
pixel* dstV = getCrAddr(absPartIdx);
-
primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
}
{
const pixel* src = getLumaAddr(absPartIdx);
pixel* dst = dstYuv.getLumaAddr(absPartIdx);
- primitives.square_copy_pp[log2Size - 2](dst, dstYuv.m_size, const_cast<pixel*>(src), m_size);
+ primitives.luma_copy_pp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
}
void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
const pixel* srcV = getCrAddr(absPartIdx);
pixel* dstU = dstYuv.getCbAddr(absPartIdx);
pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-
- primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, const_cast<pixel*>(srcU), m_csize);
- primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, const_cast<pixel*>(srcV), m_csize);
+ primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, srcV, m_csize);
}
// Copy from same size YUV buffer
void copyFromYuv(const Yuv& srcYuv);
+ // Copy portion of srcYuv into ME prediction buffer
+ void copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma);
+
    // Copy a small YUV buffer into part of a larger YUV buffer
void copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const;
if(GCC)
add_definitions(-Wno-uninitialized)
endif()
+if(MSVC)
+ add_definitions(/wd4701) # potentially uninitialized local variable 'foo' used
+endif()
add_library(encoder OBJECT ../x265.h
analysis.cpp analysis.h
#include "rdcost.h"
#include "encoder.h"
-#include "PPA/ppa.h"
-
using namespace x265;
/* An explanation of rate distortion levels (--rd-level)
*
* RDO selection between merge and skip
* sa8d selection of best inter mode
+ * sa8d decisions include chroma residual cost
* RDO selection between (merge/skip) / best inter mode / intra / split
*
* rd-level 4 enables RDOQuant
+ * chroma residual cost included in satd decisions, including subpel refine
+ * (as a result of --subme 3 being used by preset slow)
*
* rd-level 5,6 does RDO for each inter mode
*/
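The chroma additions above funnel through a single flag, m_bChromaSa8d, set in Analysis::create() below as rdLevel >= 3. A minimal sketch of the decision it gates, with sa8d() standing in for the real primitive dispatch (illustrative shorthand, not the exact x265 call):

    uint32_t dist = sa8d(fencY, predY);                  /* luma, always counted */
    if (m_bChromaSa8d)                                   /* rd-level >= 3 */
        dist += sa8d(fencU, predU) + sa8d(fencV, predV); /* fold in chroma residual */
    interMode.sa8dCost = m_rdCost.calcRdSADCost(dist, interMode.sa8dBits);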
Analysis::Analysis()
{
m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
+ m_reuseIntraDataCTU = NULL;
+ m_reuseInterDataCTU = NULL;
}
bool Analysis::create(ThreadLocalData *tld)
{
m_tld = tld;
m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
+ m_bChromaSa8d = m_param->rdLevel >= 3;
int csp = m_param->internalCsp;
uint32_t cuSize = g_maxCUSize;
}
}
-Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
+Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
m_slice = ctu.m_slice;
m_frame = &frame;
invalidateContexts(0);
m_quant.setQPforQuant(ctu);
m_rqt[0].cur.load(initialContext);
- m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0);
+ m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
uint32_t numPartition = ctu.m_numPartitions;
+ if (m_param->analysisMode)
+ {
+ m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
+ m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
+ }
+
if (m_slice->m_sliceType == I_SLICE)
{
uint32_t zOrder = 0;
- if (m_param->analysisMode == X265_ANALYSIS_LOAD)
- compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder);
- else
+ compressIntraCU(ctu, cuGeom, zOrder);
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
{
- compressIntraCU(ctu, cuGeom, NULL, zOrder);
-
- if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData)
- {
- CUData *bestCU = &m_modeDepth[0].bestMode->cu;
- memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
- memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
- memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
- m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr;
- m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc;
- }
+ CUData *bestCU = &m_modeDepth[0].bestMode->cu;
+ memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+ memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+ memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
}
}
else
if (!m_param->rdLevel)
{
/* In RD Level 0/1, copy source pixels into the reconstructed block so
- * they are available for intra predictions */
- m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0);
-
- compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1
+ * they are available for intra predictions */
+ m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
+
+ compressInterCU_rd0_4(ctu, cuGeom);
/* generate residual for entire CTU at once and copy to reconPic */
encodeResidue(ctu, cuGeom);
if (!md.bestMode->distortion)
/* already lossless */
return;
- else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA)
+ else if (md.bestMode->cu.isIntra(0))
{
md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
}
}
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
{
uint32_t depth = cuGeom.depth;
ModeDepth& md = m_modeDepth[depth];
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
- if (shared)
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
{
- uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
- char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
- uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
- if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
+ if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
{
m_quant.setQPforQuant(parentCTU);
- PartSize size = (PartSize)sharedPartSizes[zOrder];
+ PartSize size = (PartSize)reusePartSizes[zOrder];
Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
mode.cu.initSubCU(parentCTU, cuGeom);
- checkIntra(mode, cuGeom, size, sharedModes);
+ checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
checkBestMode(mode, depth);
if (m_bTryLossless)
addSplitFlagCost(*md.bestMode, cuGeom.depth);
        // increment zOrder offset to point to next best depth in reuseDepth buffer
- zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];
+ zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
mightSplit = false;
}
}
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressIntraCU(parentCTU, childCuData, shared, zOrder);
+ compressIntraCU(parentCTU, childGeom, zOrder);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
nextContext = &nd.bestMode->contexts;
}
else
{
/* record the depth of this non-present sub-CU */
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
}
}
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
bool Analysis::findJob(int threadId)
{
/* try to acquire a CU mode to analyze */
+ m_pmodeLock.acquire();
if (m_totalNumJobs > m_numAcquiredJobs)
{
- /* ATOMIC_INC returns the incremented value */
- int id = ATOMIC_INC(&m_numAcquiredJobs);
- if (m_totalNumJobs >= id)
- {
- parallelModeAnalysis(threadId, id - 1);
+ int id = m_numAcquiredJobs++;
+ m_pmodeLock.release();
- if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs)
- m_modeCompletionEvent.trigger();
- return true;
- }
+ parallelModeAnalysis(threadId, id);
+
+ m_pmodeLock.acquire();
+ if (++m_numCompletedJobs == m_totalNumJobs)
+ m_modeCompletionEvent.trigger();
+ m_pmodeLock.release();
+ return true;
}
+ else
+ m_pmodeLock.release();
+ m_meLock.acquire();
if (m_totalNumME > m_numAcquiredME)
{
- int id = ATOMIC_INC(&m_numAcquiredME);
- if (m_totalNumME >= id)
- {
- parallelME(threadId, id - 1);
+ int id = m_numAcquiredME++;
+ m_meLock.release();
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
- m_meCompletionEvent.trigger();
- return true;
- }
+ parallelME(threadId, id);
+
+ m_meLock.acquire();
+ if (++m_numCompletedME == m_totalNumME)
+ m_meCompletionEvent.trigger();
+ m_meLock.release();
+ return true;
}
+ else
+ m_meLock.release();
return false;
}
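Note the shape of the locking change: the old ATOMIC_INC scheme had to increment first and re-check (m_totalNumJobs >= id) because the counter could overshoot. Now the check and increment form one critical section under m_pmodeLock / m_meLock, the completion counters advance under the same lock, and that lock also brackets the job-batch setup around JobProvider::enqueue() below, so a worker can no longer observe a partially initialized batch.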
slave->m_slice = m_slice;
slave->m_frame = m_frame;
- PicYuv* fencPic = m_frame->m_origPicYuv;
- pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
- slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
- slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
-
- slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart);
+ slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+ slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
}
if (meId < m_slice->m_numRefIdx[0])
- slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId);
+ slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
else
- slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
+ slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}
void Analysis::parallelModeAnalysis(int threadId, int jobId)
slave->m_frame = m_frame;
slave->setQP(*m_slice, m_rdCost.m_qp);
slave->invalidateContexts(0);
- if (jobId)
- slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride);
}
ModeDepth& md = m_modeDepth[m_curGeom->depth];
case 0:
if (slave != this)
slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
- slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom);
+ slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
if (m_param->rdLevel > 2)
slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
break;
case 1:
slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
+ if (m_slice->m_sliceType == B_SLICE)
+ slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
break;
case 2:
case 1:
slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
+ md.pred[PRED_BIDIR].rdCost = MAX_INT64;
+ if (m_slice->m_sliceType == B_SLICE)
+ {
+ slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
+ if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+ slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
+ }
break;
case 2:
/* Initialize all prediction CUs based on parentCTU */
md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+ md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
if (m_param->bEnableRectInter)
md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
}
+ m_pmodeLock.acquire();
m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
m_numAcquiredJobs = !bTryIntra;
m_numCompletedJobs = m_numAcquiredJobs;
m_curGeom = &cuGeom;
m_bJobsQueued = true;
JobProvider::enqueue();
+ m_pmodeLock.release();
for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
m_pool->pokeIdleThread();
if (m_param->rdLevel > 2)
{
- /* encode best inter */
- for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ /* RD selection between merge, inter, bidir and intra */
+ if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
{
- prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
- motionCompensation(bestInter->predYuv, false, true);
+ for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ {
+ prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+ motionCompensation(bestInter->predYuv, false, true);
+ }
}
encodeResAndCalcRdInterCU(*bestInter, cuGeom);
-
- /* RD selection between merge, inter and intra */
checkBestMode(*bestInter, depth);
+ /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+ md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+ {
+ encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+ checkBestMode(md.pred[PRED_BIDIR], depth);
+ }
+
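The 17/16 screen above is an integer cross-multiplication, sa8dCost(BIDIR) * 16 <= sa8dCost(best) * 17: BIDIR earns a full RDO evaluation whenever its sa8d cost is at most 1/16 (6.25%) worse than the best uni-directional inter mode, with no division required. The same screen reappears in compressInterCU_rd0_4 below.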
if (bTryIntra)
checkBestMode(md.pred[PRED_INTRA], depth);
}
if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
md.bestMode = bestInter;
+ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+ md.bestMode = &md.pred[PRED_BIDIR];
+
if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
{
md.bestMode = &md.pred[PRED_INTRA];
m_modeCompletionEvent.wait();
checkBestMode(md.pred[PRED_2Nx2N], depth);
+ checkBestMode(md.pred[PRED_BIDIR], depth);
if (m_param->bEnableRectInter)
{
if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
- checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+ checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
checkBestMode(md.pred[PRED_INTRA], depth);
}
bool bNoSplit = false;
if (md.bestMode)
{
- bNoSplit = !!md.bestMode->cu.isSkipped(0);
+ bNoSplit = md.bestMode->cu.isSkipped(0);
if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
}
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressInterCU_dist(parentCTU, childCuData);
+ compressInterCU_dist(parentCTU, childGeom);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
nextContext = &nd.bestMode->contexts;
}
else
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
}
nextContext->store(splitPred->contexts);
checkBestMode(*splitPred, depth);
}
- if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+ if (mightNotSplit)
{
/* early-out statistics */
- FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+ FrameData& curEncData = *m_frame->m_encData;
FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
cuStat.count[depth] += 1;
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
{
bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
- /* Initialize all prediction CUs based on parentCTU */
- md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+ /* Compute Merge Cost */
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
- if (m_param->bEnableRectInter)
- {
- md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
- }
- if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
- {
- md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
- }
-
- /* Compute Merge Cost */
checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
bool earlyskip = false;
if (!earlyskip)
{
+ md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);
- Mode *bestInter = &md.pred[PRED_2Nx2N];
+ if (m_slice->m_sliceType == B_SLICE)
+ {
+ md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+ checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+ }
+
+ Mode *bestInter = &md.pred[PRED_2Nx2N];
if (m_param->bEnableRectInter)
{
+ md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_Nx2N];
+
+ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_2NxN];
if (bHor)
{
+ md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_2NxnU];
+
+ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_2NxnD];
}
if (bVer)
{
+ md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_nLx2N];
+
+ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_nRx2N];
if (m_param->rdLevel >= 3)
{
/* Calculate RD cost of best inter option */
- for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
{
- prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
- motionCompensation(bestInter->predYuv, false, true);
+ for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ {
+ prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+ motionCompensation(bestInter->predYuv, false, true);
+ }
}
-
encodeResAndCalcRdInterCU(*bestInter, cuGeom);
+ checkBestMode(*bestInter, depth);
- if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost)
- md.bestMode = bestInter;
+ /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+ md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+ {
+ encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+ checkBestMode(md.pred[PRED_BIDIR], depth);
+ }
if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
md.bestMode->sa8dCost == MAX_INT64)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
- checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+ checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
- if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost)
- md.bestMode = &md.pred[PRED_INTRA];
+ checkBestMode(md.pred[PRED_INTRA], depth);
}
}
else
{
- /* SA8D choice between merge/skip, inter, and intra */
+ /* SA8D choice between merge/skip, inter, bidir, and intra */
if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
md.bestMode = bestInter;
+ if (m_slice->m_sliceType == B_SLICE &&
+ md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+ md.bestMode = &md.pred[PRED_BIDIR];
+
if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
- checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+ checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
md.bestMode = &md.pred[PRED_INTRA];
}
/* prediction already generated for this CU, and if rd level
* is not 0, it is already fully encoded */
}
- else if (md.bestMode->cu.m_predMode[0] == MODE_INTER)
+ else if (md.bestMode->cu.isInter(0))
{
for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
{
encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
else if (m_param->rdLevel == 1)
{
- m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
- generateCoeffRecon(*md.bestMode, cuGeom);
+ /* generate recon pixels with no rate distortion considerations */
+ CUData& cu = md.bestMode->cu;
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getInterTUQtDepthRange(tuDepthRange, 0);
+
+ m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
+ residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+ if (cu.getQtRootCbf(0))
+ md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
+ else
+ {
+ md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
+ if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
+ cu.setPredModeSubParts(MODE_SKIP);
+ }
}
}
else
if (m_param->rdLevel == 2)
encodeIntraInInter(*md.bestMode, cuGeom);
else if (m_param->rdLevel == 1)
- generateCoeffRecon(*md.bestMode, cuGeom);
+ {
+ /* generate recon pixels with no rate distortion considerations */
+ CUData& cu = md.bestMode->cu;
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+ residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+ getBestIntraModeChroma(*md.bestMode, cuGeom);
+ residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+ md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
+ }
}
}
} // !earlyskip
bool bNoSplit = false;
if (md.bestMode)
{
- bNoSplit = !!md.bestMode->cu.isSkipped(0);
+ bNoSplit = md.bestMode->cu.isSkipped(0);
if (mightSplit && depth && depth >= minDepth && !bNoSplit)
bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
}
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressInterCU_rd0_4(parentCTU, childCuData);
+ compressInterCU_rd0_4(parentCTU, childGeom);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
if (m_param->rdLevel)
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
else
- nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
if (m_param->rdLevel > 1)
nextContext = &nd.bestMode->contexts;
}
else
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
}
nextContext->store(splitPred->contexts);
if (mightNotSplit)
addSplitFlagCost(*splitPred, cuGeom.depth);
- else if (m_param->rdLevel <= 1)
- splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
- else
+ else if (m_param->rdLevel > 1)
updateModeCost(*splitPred);
+ else
+ splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
if (!md.bestMode)
md.bestMode = splitPred;
- else if (m_param->rdLevel >= 1)
- {
- if (splitPred->rdCost < md.bestMode->rdCost)
- md.bestMode = splitPred;
- }
- else
- {
- if (splitPred->sa8dCost < md.bestMode->sa8dCost)
- md.bestMode = splitPred;
- }
+ else if (m_param->rdLevel > 1)
+ checkBestMode(*splitPred, cuGeom.depth);
+ else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
+ md.bestMode = splitPred;
}
- if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+ if (mightNotSplit)
{
/* early-out statistics */
- FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+ FrameData& curEncData = *m_frame->m_encData;
FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
cuStat.count[depth] += 1;
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
if (mightNotSplit)
{
- for (int i = 0; i < MAX_PRED_TYPES; i++)
- md.pred[i].cu.initSubCU(parentCTU, cuGeom);
-
+ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
+ md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
if (!earlySkip)
{
+ md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+ if (m_slice->m_sliceType == B_SLICE)
+ {
+ md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+ checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+ if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+ {
+ encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+ checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
+ }
+ }
+
if (m_param->bEnableRectInter)
{
- // Nx2N rect
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
}
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
}
{
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
}
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
}
{
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
}
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
}
if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
(!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
{
+ md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
checkBestMode(md.pred[PRED_INTRA], depth);
if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
{
+ md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
checkBestMode(md.pred[PRED_INTRA_NxN], depth);
}
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressInterCU_rd5_6(parentCTU, childCuData);
+ compressInterCU_rd5_6(parentCTU, childGeom);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
nextContext = &nd.bestMode->contexts;
}
else
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
}
nextContext->store(splitPred->contexts);
if (mightNotSplit)
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
bestPred->sa8dCost = MAX_INT64;
int bestSadCand = -1;
- int sizeIdx = cuGeom.log2CUSize - 2;
+ int cpart, sizeIdx = cuGeom.log2CUSize - 2;
+ if (m_bChromaSa8d)
+ {
+ int cuSize = 1 << cuGeom.log2CUSize;
+ cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+ }
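+ /* e.g. assuming 4:2:0 subsampling, a 16x16 CU has chroma shifts of 1 in
+ * both directions, so cpart selects the 8x8 sa8d_inter primitive used to
+ * accumulate chroma distortion below */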
for (uint32_t i = 0; i < maxNumMergeCand; ++i)
{
if (m_bFrameParallel &&
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
tempPred->cu.m_interDir[0] = interDirNeighbours[i];
tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
- tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+ tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
- tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
+ tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
- // do MC only for Luma part
prepMotionCompensation(tempPred->cu, cuGeom, 0);
- motionCompensation(tempPred->predYuv, true, false);
+ motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
+ if (m_bChromaSa8d)
+ {
+ tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
+ tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
+ }
tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
if (tempPred->sa8dCost < bestPred->sa8dCost)
return;
/* calculate the motion compensation for chroma for the best mode selected */
- prepMotionCompensation(bestPred->cu, cuGeom, 0);
- motionCompensation(bestPred->predYuv, false, true);
+ if (!m_bChromaSa8d) /* Chroma MC was done above */
+ {
+ prepMotionCompensation(bestPred->cu, cuGeom, 0);
+ motionCompensation(bestPred->predYuv, false, true);
+ }
if (m_param->rdLevel)
{
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
- tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+ tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
- tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+ tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
tempPred->sa8dCost = bestPred->sa8dCost;
tempPred->predYuv.copyFromYuv(bestPred->predYuv);
/* broadcast sets of MV field data */
bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
- bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
- bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
tempPred->cu.m_interDir[0] = interDirNeighbours[i];
tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
- tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+ tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
- tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
- tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */
+ tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+ tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
prepMotionCompensation(tempPred->cu, cuGeom, 0);
motionCompensation(tempPred->predYuv, true, true);
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
tempPred->cu.m_interDir[0] = interDirNeighbours[i];
tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
- tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+ tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
- tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
- tempPred->cu.setSkipFlagSubParts(false);
+ tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+ tempPred->cu.setPredModeSubParts(MODE_INTER);
tempPred->predYuv.copyFromYuv(bestPred->predYuv);
}
uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
- bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
- bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
}
}
interMode.initCosts();
interMode.cu.setPartSizeSubParts(partSize);
interMode.cu.setPredModeSubParts(MODE_INTER);
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
- if (predInterSearch(interMode, cuGeom, false, false))
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+ {
+ for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
+ {
+ MotionData* bestME = interMode.bestME[part];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ bestME[i].ref = m_reuseInterDataCTU->ref;
+ m_reuseInterDataCTU++;
+ }
+ }
+ }
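+ /* the reuse stream is a flat array of per-PU records, one per prediction
+ * direction in analysis order, so the pointer simply advances as entries
+ * are consumed (load) or appended (save) */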
+ if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
{
/* predInterSearch sets interMode.sa8dBits */
const Yuv& fencYuv = *interMode.fencYuv;
Yuv& predYuv = interMode.predYuv;
- interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+ int part = partitionFromLog2Size(cuGeom.log2CUSize);
+ interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+ if (m_bChromaSa8d)
+ {
+ uint32_t cuSize = 1 << cuGeom.log2CUSize;
+ int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+ interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
+ interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
+ }
interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
+
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+ {
+ for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+ {
+ MotionData* bestME = interMode.bestME[puIdx];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ m_reuseInterDataCTU->ref = bestME[i].ref;
+ m_reuseInterDataCTU++;
+ }
+ }
+ }
}
else
{
interMode.initCosts();
interMode.cu.setPartSizeSubParts(partSize);
interMode.cu.setPredModeSubParts(MODE_INTER);
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+ {
+ for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+ {
+ MotionData* bestME = interMode.bestME[puIdx];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ bestME[i].ref = m_reuseInterDataCTU->ref;
+ m_reuseInterDataCTU++;
+ }
+ }
+ }
if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
{
/* predInterSearch sets interMode.sa8dBits, but this is ignored */
encodeResAndCalcRdInterCU(interMode, cuGeom);
+
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+ {
+ for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+ {
+ MotionData* bestME = interMode.bestME[puIdx];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ m_reuseInterDataCTU->ref = bestME[i].ref;
+ m_reuseInterDataCTU++;
+ }
+ }
+ }
}
else
{
}
}
-/* Note that this function does not save the best intra prediction, it must
- * be generated later. It records the best mode in the cu */
-void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom)
+void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
- CUData& cu = intraMode.cu;
- uint32_t depth = cu.m_cuDepth[0];
+ CUData& cu = bidir2Nx2N.cu;
- cu.setPartSizeSubParts(SIZE_2Nx2N);
- cu.setPredModeSubParts(MODE_INTRA);
-
- uint32_t initTrDepth = 0;
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
- uint32_t tuSize = 1 << log2TrSize;
- const uint32_t absPartIdx = 0;
-
- // Reference sample smoothing
- initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
-
- pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0];
- uint32_t stride = m_modeDepth[depth].fencYuv.m_size;
-
- pixel *above = m_refAbove + tuSize - 1;
- pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel *left = m_refLeft + tuSize - 1;
- pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
- int sad, bsad;
- uint32_t bits, bbits, mode, bmode;
- uint64_t cost, bcost;
-
- // 33 Angle modes once
- ALIGN_VAR_32(pixel, bufScale[32 * 32]);
- ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
- ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
- int scaleTuSize = tuSize;
- int scaleStride = stride;
- int costShift = 0;
- int sizeIdx = log2TrSize - 2;
-
- if (tuSize > 32)
+ if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
{
- // origin is 64x64, we scale to 32x32 and setup required parameters
- primitives.scale2D_64to32(bufScale, fenc, stride);
- fenc = bufScale;
-
- // reserve space in case primitives need to store data in above
- // or left buffers
- pixel _above[4 * 32 + 1];
- pixel _left[4 * 32 + 1];
- pixel *aboveScale = _above + 2 * 32;
- pixel *leftScale = _left + 2 * 32;
- aboveScale[0] = leftScale[0] = above[0];
- primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
- primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
-
- scaleTuSize = 32;
- scaleStride = 32;
- costShift = 2;
- sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
- // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
- above = aboveScale;
- left = leftScale;
- aboveFiltered = aboveScale;
- leftFiltered = leftScale;
+ bidir2Nx2N.sa8dCost = MAX_INT64;
+ bidir2Nx2N.rdCost = MAX_INT64;
+ return;
}
- pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
- int predsize = scaleTuSize * scaleTuSize;
-
- m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+ const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
+ MV mvzero(0, 0);
+ int cpart, partEnum = cuGeom.log2CUSize - 2;
- /* there are three cost tiers for intra modes:
- * pred[0] - mode probable, least cost
- * pred[1], pred[2] - less probable, slightly more cost
- * non-mpm modes - all cost the same (rbits) */
- uint64_t mpms;
- uint32_t preds[3];
- uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
-
- // DC
- primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
- bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
- bmode = mode = DC_IDX;
- bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
- bcost = m_rdCost.calcRdSADCost(bsad, bbits);
-
- pixel *abovePlanar = above;
- pixel *leftPlanar = left;
-
- if (tuSize & (8 | 16 | 32))
+ if (m_bChromaSa8d)
{
- abovePlanar = aboveFiltered;
- leftPlanar = leftFiltered;
+ int cuSize = 1 << cuGeom.log2CUSize;
+ cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
}
- // PLANAR
- primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
- sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
- mode = PLANAR_IDX;
- bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
- cost = m_rdCost.calcRdSADCost(sad, bits);
- COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
-
- // Transpose NxN
- primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
-
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
- bool modeHor;
- pixel *cmp;
- intptr_t srcStride;
-
-#define TRY_ANGLE(angle) \
- modeHor = angle < 18; \
- cmp = modeHor ? bufTrans : fenc; \
- srcStride = modeHor ? scaleTuSize : scaleStride; \
- sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
- bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
- cost = m_rdCost.calcRdSADCost(sad, bits)
+ bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
+ bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
+ MotionData* bestME = bidir2Nx2N.bestME[0];
+ int ref0 = bestME[0].ref;
+ MV mvp0 = bestME[0].mvp;
+ int mvpIdx0 = bestME[0].mvpIdx;
+ int ref1 = bestME[1].ref;
+ MV mvp1 = bestME[1].mvp;
+ int mvpIdx1 = bestME[1].mvpIdx;
+
+ bidir2Nx2N.initCosts();
+ cu.setPartSizeSubParts(SIZE_2Nx2N);
+ cu.setPredModeSubParts(MODE_INTER);
+ cu.setPUInterDir(3, 0, 0);
+ cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
+ cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
+ cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
+ cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
+ cu.m_mergeFlag[0] = 0;
+
+ /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
+ cu.setPUMv(0, bestME[0].mv, 0, 0);
+ cu.m_mvd[0][0] = bestME[0].mv - mvp0;
+
+ cu.setPUMv(1, bestME[1].mv, 0, 0);
+ cu.m_mvd[1][0] = bestME[1].mv - mvp1;
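+ /* only the differences (mv - mvp) are signalled in the bitstream; the
+ * decoder reconstructs each vector as mvp + mvd */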
+
+ prepMotionCompensation(cu, cuGeom, 0);
+ motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+
+ int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
+ if (m_bChromaSa8d)
+ {
+ /* Add in chroma distortion */
+ sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
+ sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
+ }
+ bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
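+ /* this is D + lambda*R in sa8d terms: the bidir bits reuse the two unidir
+ * ME bit counts, swapping in the bi-list selection cost for the two
+ * single-list costs */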
- if (m_param->bEnableFastIntra)
+ bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
+ if (bTryZero)
+ {
+ /* Do not try zero MV if unidir motion predictors are beyond
+ * valid search area */
+ MV mvmin, mvmax;
+ int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
+ setSearchRange(cu, mvzero, merange, mvmin, mvmax);
+ mvmax.y += 2; // there is some pad for subpel refine
+ mvmin <<= 2;
+ mvmax <<= 2;
+
+ bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
+ }
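+ /* mvmin/mvmax are in quarter-pel units after the shifts above; the zero-MV
+ * trial is skipped when either predictor falls outside this window */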
+ if (bTryZero)
{
- int asad = 0;
- uint32_t lowmode, highmode, amode = 5, abits = 0;
- uint64_t acost = MAX_INT64;
+ /* Estimate cost of BIDIR using the colocated (zero-MV) blocks */
+ Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
- /* pick the best angle, sampling at distance of 5 */
- for (mode = 5; mode < 35; mode += 5)
- {
- TRY_ANGLE(mode);
- COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
- }
+ int zsa8d;
- /* refine best angle at distance 2, then distance 1 */
- for (uint32_t dist = 2; dist >= 1; dist--)
+ if (m_bChromaSa8d)
{
- lowmode = amode - dist;
- highmode = amode + dist;
+ cu.m_mv[0][0] = mvzero;
+ cu.m_mv[1][0] = mvzero;
- X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
- TRY_ANGLE(lowmode);
- COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+ prepMotionCompensation(cu, cuGeom, 0);
+ motionCompensation(tmpPredYuv, true, true);
- X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
- TRY_ANGLE(highmode);
- COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
+ zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
}
-
- if (amode == 33)
+ else
{
- TRY_ANGLE(34);
- COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
- }
+ pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+ pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+ intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
- COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
- }
- else // calculate and search all intra prediction angles for lowest cost
- {
- for (mode = 2; mode < 35; mode++)
- {
- TRY_ANGLE(mode);
- COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
+ zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
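+ /* with both MVs zero no subpel interpolation is required, so the luma-only
+ * path above averages the two colocated reference blocks directly instead
+ * of running full motion compensation */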
- }
- cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth);
- intraMode.initCosts();
- intraMode.totalBits = bbits;
- intraMode.distortion = bsad;
- intraMode.sa8dCost = bcost;
- intraMode.sa8dBits = bbits;
-}
+ uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+ uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+ uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
-void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
-{
- CUData& cu = intraMode.cu;
- Yuv* reconYuv = &intraMode.reconYuv;
- Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv;
+ /* refine MVP selection for the zero MV; updates mvp, mvpIdx, bits and cost */
+ checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
+ checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);
- X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
- X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+ uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ zcost = zsa8d + m_rdCost.getCost(zbits);
- m_quant.setQPforQuant(cu);
-
- uint32_t tuDepthRange[2];
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
- m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
-
- Cost icosts;
- codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
- extractIntraResultQT(cu, *reconYuv, 0, 0);
-
- intraMode.distortion = icosts.distortion;
- intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
-
- m_entropyCoder.resetBits();
- if (m_slice->m_pps->bTransquantBypassEnabled)
- m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
- m_entropyCoder.codeSkipFlag(cu, 0);
- m_entropyCoder.codePredMode(cu.m_predMode[0]);
- m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
- m_entropyCoder.codePredInfo(cu, 0);
- intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+ if (zcost < bidir2Nx2N.sa8dCost)
+ {
+ bidir2Nx2N.sa8dBits = zbits;
+ bidir2Nx2N.sa8dCost = zcost;
- bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange);
+ cu.setPUMv(0, mvzero, 0, 0);
+ cu.m_mvd[0][0] = mvzero - mvp0;
+ cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
- intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
- intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
- if (m_rdCost.m_psyRd)
- intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+ cu.setPUMv(1, mvzero, 0, 0);
+ cu.m_mvd[1][0] = mvzero - mvp1;
+ cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
- m_entropyCoder.store(intraMode.contexts);
- updateModeCost(intraMode);
+ if (m_bChromaSa8d)
+ /* real MC was already performed */
+ bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
+ else
+ {
+ prepMotionCompensation(cu, cuGeom, 0);
+ motionCompensation(bidir2Nx2N.predYuv, true, true);
+ }
+ }
+ else if (m_bChromaSa8d)
+ {
+ /* recover overwritten motion vectors */
+ cu.m_mv[0][0] = bestME[0].mv;
+ cu.m_mv[1][0] = bestME[1].mv;
+ }
+ }
}
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
- encodeResidue(ctu, childCuData);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
+ encodeResidue(ctu, childGeom);
}
return;
}
uint32_t absPartIdx = cuGeom.encodeIdx;
int sizeIdx = cuGeom.log2CUSize - 2;
- Yuv& fencYuv = m_modeDepth[0].fencYuv;
-
/* reuse the bestMode data structures at the current depth */
Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
- Yuv& reconYuv = bestMode->reconYuv;
CUData& cu = bestMode->cu;
cu.copyFromPic(ctu, cuGeom);
m_quant.setQPforQuant(cu);
- if (cu.m_predMode[0] == MODE_INTRA)
+ Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
+ if (cuGeom.depth)
+ m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
+ X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
+
+ if (cu.isIntra(0))
{
uint32_t tuDepthRange[2];
cu.getIntraTUQtDepthRange(tuDepthRange, 0);
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
- residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange);
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+ residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
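+ /* intra NxN starts one transform depth below the CU, so initTuDepth is 1
+ * for any part size other than 2Nx2N */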
getBestIntraModeChroma(*bestMode, cuGeom);
residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
}
- else if (cu.m_predMode[0] == MODE_INTER)
+ else // if (cu.isInter(0))
{
- X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n");
+ X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
/* Calculate residual for the current CU part into the depth-sized resiYuv */
pixel* predV = predYuv.getCrAddr(absPartIdx);
primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
- fencYuv.getLumaAddr(absPartIdx), predY,
+ fencYuv.m_buf[0], predY,
fencYuv.m_size, predYuv.m_size);
primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
- fencYuv.getCbAddr(absPartIdx), predU,
- fencYuv.m_csize, predYuv.m_csize);
+ fencYuv.m_buf[1], predU,
+ fencYuv.m_csize, predYuv.m_csize);
primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
- fencYuv.getCrAddr(absPartIdx), predV,
- fencYuv.m_csize, predYuv.m_csize);
+ fencYuv.m_buf[2], predV,
+ fencYuv.m_csize, predYuv.m_csize);
uint32_t tuDepthRange[2];
cu.getInterTUQtDepthRange(tuDepthRange, 0);
residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
- PicYuv& reconPicYuv = *m_frame->m_reconPicYuv;
- if (cu.getQtRootCbf(0)) // TODO: split to each component
- {
- /* residualTransformQuantInter() wrote transformed residual back into
- * resiYuv. Generate the recon pixels by adding it to the prediction */
+ /* residualTransformQuantInter() wrote transformed residual back into
+ * resiYuv. Generate the recon pixels by adding it to the prediction */
- primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size,
+ PicYuv& reconPic = *m_frame->m_reconPic;
+ if (cu.m_cbf[0][0])
+ primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
- primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize,
- predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
- primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize,
- predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
-
- /* copy the reconstructed part to the recon pic for later intra
- * predictions */
- reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx);
- }
else
- {
- /* copy the prediction pixels to the recon pic for later intra
- * predictions */
-
- primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride,
+ primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
predY, predYuv.m_size);
- primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
+
+ if (cu.m_cbf[1][0])
+ primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+ else
+ primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
predU, predYuv.m_csize);
- primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
+
+ if (cu.m_cbf[2][0])
+ primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+ else
+ primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
predV, predYuv.m_csize);
- }
}
- /* else if (cu.m_predMode[0] == MODE_NONE) {} */
checkDQP(cu, cuGeom);
cu.updatePic(cuGeom.depth);
}
-/* check whether current try is the best with identifying the depth of current try */
-void Analysis::checkBestMode(Mode& mode, uint32_t depth)
-{
- ModeDepth& md = m_modeDepth[depth];
- if (md.bestMode)
- {
- if (mode.rdCost < md.bestMode->rdCost)
- md.bestMode = &mode;
- }
- else
- md.bestMode = &mode;
-}
-
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
if (m_param->rdLevel >= 3)
* each quantity */
uint32_t depth = cuGeom.depth;
- FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+ FrameData& curEncData = *m_frame->m_encData;
FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
uint64_t cuCount = cuStat.count[depth];
}
// give 60% weight to this CTU's statistics and 40% to those of its neighbour CTUs
- if (neighCost + cuCount)
+ if (neighCount + cuCount)
{
uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
PRED_SKIP,
PRED_INTRA,
PRED_2Nx2N,
+ PRED_BIDIR,
PRED_Nx2N,
PRED_2NxN,
PRED_SPLIT,
ModeDepth m_modeDepth[NUM_CU_DEPTH];
bool m_bTryLossless;
+ bool m_bChromaSa8d;
+ /* Analysis data for load/save modes; these pointers advance as CTU analysis
+ * proceeds and data is consumed (load) or generated (save) */
+ analysis_intra_data* m_reuseIntraDataCTU;
+ analysis_inter_data* m_reuseInterDataCTU;
Analysis();
bool create(ThreadLocalData* tld);
void destroy();
- Search::Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
+ Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
protected:
int m_totalNumJobs;
volatile int m_numAcquiredJobs;
volatile int m_numCompletedJobs;
+ Lock m_pmodeLock;
Event m_modeCompletionEvent;
bool findJob(int threadId);
void parallelModeAnalysis(int threadId, int jobId);
void parallelME(int threadId, int meId);
/* full analysis for an I-slice CU */
- void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* sdata, uint32_t &zOrder);
+ void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder);
/* full analysis for a P or B slice CU */
void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom);
void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly);
- /* measure intra options */
- void checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom);
- void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+ void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
/* encode current bestMode losslessly, pick best RD cost */
void tryLossless(const CUGeom& cuGeom);
- void checkDQP(CUData& cu, const CUGeom& cuGeom);
+ /* add the RD cost of coding a split flag (0 or 1) to the given mode */
void addSplitFlagCost(Mode& mode, uint32_t depth);
- void checkBestMode(Mode& mode, uint32_t depth);
+
+ /* update CBF flags and QP values to be internally consistent */
+ void checkDQP(CUData& cu, const CUGeom& cuGeom);
+
+ /* work-avoidance heuristics for RD levels < 5 */
uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
+ /* generate residual and recon pixels for an entire CTU recursively (RD0) */
void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
+
+ /* check whether current mode is the new best */
+ inline void checkBestMode(Mode& mode, uint32_t depth)
+ {
+ ModeDepth& md = m_modeDepth[depth];
+ if (md.bestMode)
+ {
+ if (mode.rdCost < md.bestMode->rdCost)
+ md.bestMode = &mode;
+ }
+ else
+ md.bestMode = &mode;
+ }
};
struct ThreadLocalData
determineLevel(*param, encoder->m_vps);
encoder->create();
- encoder->init();
+ if (encoder->m_aborted)
+ {
+ delete encoder;
+ return NULL;
+ }
x265_print_params(param);
extern "C"
void x265_cleanup(void)
{
- destroyROM();
BitCost::destroy();
}
pic->forceqp = X265_QP_AUTO;
if (param->analysisMode)
{
- uint32_t numPartitions = 1 << (g_maxFullDepth * 2);
uint32_t widthInCU = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
uint32_t heightInCU = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
uint32_t numCUsInFrame = widthInCU * heightInCU;
pic->analysisData.numCUsInFrame = numCUsInFrame;
- pic->analysisData.numPartitions = numPartitions;
+ pic->analysisData.numPartitions = NUM_CU_PARTITIONS;
}
}
{
return x265_free(p);
}
-
-int x265_alloc_analysis_data(x265_picture* pic)
-{
- CHECKED_MALLOC(pic->analysisData.interData, x265_inter_data, pic->analysisData.numCUsInFrame * 85);
- CHECKED_MALLOC(pic->analysisData.intraData, x265_intra_data, 1);
- pic->analysisData.intraData->cuAddr = NULL;
- pic->analysisData.intraData->depth = NULL;
- pic->analysisData.intraData->modes = NULL;
- pic->analysisData.intraData->partSizes = NULL;
- pic->analysisData.intraData->poc = NULL;
- CHECKED_MALLOC(pic->analysisData.intraData->depth, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->modes, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->partSizes, char, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->cuAddr, uint32_t, pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->poc, int, pic->analysisData.numCUsInFrame);
- return 0;
-
-fail:
- x265_free_analysis_data(pic);
- return -1;
-}
-
-void x265_free_analysis_data(x265_picture* pic)
-{
- X265_FREE(pic->analysisData.interData);
- pic->analysisData.interData = NULL;
- X265_FREE(pic->analysisData.intraData->depth);
- X265_FREE(pic->analysisData.intraData->modes);
- X265_FREE(pic->analysisData.intraData->partSizes);
- X265_FREE(pic->analysisData.intraData->cuAddr);
- X265_FREE(pic->analysisData.intraData->poc);
- X265_FREE(pic->analysisData.intraData);
- pic->analysisData.intraData = NULL;
-}
{
public:
- BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0) {}
+ BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0), m_mvp(0) {}
void setQP(unsigned int qp);
FrameData* next = m_picSymFreeList->m_freeListNext;
m_picSymFreeList->destroy();
- m_picSymFreeList->m_reconPicYuv->destroy();
- delete m_picSymFreeList->m_reconPicYuv;
+ m_picSymFreeList->m_reconPic->destroy();
+ delete m_picSymFreeList->m_reconPic;
delete m_picSymFreeList;
m_picSymFreeList = next;
curFrame->m_encData->m_freeListNext = m_picSymFreeList;
m_picSymFreeList = curFrame->m_encData;
curFrame->m_encData = NULL;
- curFrame->m_reconPicYuv = NULL;
+ curFrame->m_reconPic = NULL;
}
}
}
"B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
"Version\n";
+const char* defaultAnalysisFileName = "x265_analysis.dat";
+
using namespace x265;
Encoder::Encoder()
m_buOffsetC = NULL;
m_threadPool = 0;
m_numThreadLocalData = 0;
+ m_analysisFile = NULL;
}
void Encoder::create()
int cpuCount = getCpuCount();
if (!p->bEnableWavefront)
p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2);
- else if (cpuCount > 32)
+ else if (cpuCount >= 32)
p->frameNumThreads = 6; // dual-socket 10-core IvyBridge or higher
else if (cpuCount >= 16)
p->frameNumThreads = 5; // 8 HT cores, or dual socket
m_csvfpt = fopen(m_param->csvfn, "r");
if (m_csvfpt)
{
- // file already exists, re-open for append
+ /* file already exists, re-open for append */
fclose(m_csvfpt);
m_csvfpt = fopen(m_param->csvfn, "ab");
}
else
{
- // new CSV file, write header
+ /* new CSV file, write header */
m_csvfpt = fopen(m_param->csvfn, "wb");
if (m_csvfpt)
{
}
}
+ if (m_frameEncoder)
+ {
+ int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
+ int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
+ for (int i = 0; i < m_param->frameNumThreads; i++)
+ {
+ if (!m_frameEncoder[i].init(this, numRows, numCols, i))
+ {
+ x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
+ m_aborted = true;
+ }
+ }
+ }
+
+ if (m_param->bEmitHRDSEI)
+ m_rateControl->initHRD(&m_sps);
+ if (!m_rateControl->init(&m_sps))
+ m_aborted = true;
+
+ m_lookahead->init();
+
+ if (m_param->analysisMode)
+ {
+ const char* name = m_param->analysisFileName;
+ if (!name)
+ name = defaultAnalysisFileName;
+ const char* mode = m_param->analysisMode == X265_ANALYSIS_LOAD ? "rb" : "wb";
+ m_analysisFile = fopen(name, mode);
+ if (!m_analysisFile)
+ {
+ x265_log(NULL, X265_LOG_ERROR, "Analysis load/save: failed to open file %s\n", name);
+ m_aborted = true;
+ }
+ }
+
m_aborted |= parseLambdaFile(m_param);
+
+ m_encodeStartTime = x265_mdate();
}
void Encoder::destroy()
X265_FREE(m_buOffsetY);
X265_FREE(m_buOffsetC);
+ if (m_analysisFile)
+ fclose(m_analysisFile);
+ free(m_param->analysisFileName);
+ free(m_param->csvfn);
if (m_csvfpt)
fclose(m_csvfpt);
free(m_param->rc.statFileName); // alloc'd by strdup
X265_FREE(m_param);
}
-void Encoder::init()
-{
- if (m_frameEncoder)
- {
- int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
- int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
- for (int i = 0; i < m_param->frameNumThreads; i++)
- {
- if (!m_frameEncoder[i].init(this, numRows, numCols, i))
- {
- x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
- m_aborted = true;
- }
- }
- }
- if (m_param->bEmitHRDSEI)
- m_rateControl->initHRD(&m_sps);
- if (!m_rateControl->init(&m_sps))
- m_aborted = true;
- m_lookahead->init();
- m_encodeStartTime = x265_mdate();
-}
-
void Encoder::updateVbvPlan(RateControl* rc)
{
for (int i = 0; i < m_param->frameNumThreads; i++)
* allocated by this top level encoder */
if (m_cuOffsetY)
{
- inFrame->m_origPicYuv->m_cuOffsetC = m_cuOffsetC;
- inFrame->m_origPicYuv->m_cuOffsetY = m_cuOffsetY;
- inFrame->m_origPicYuv->m_buOffsetC = m_buOffsetC;
- inFrame->m_origPicYuv->m_buOffsetY = m_buOffsetY;
+ inFrame->m_fencPic->m_cuOffsetC = m_cuOffsetC;
+ inFrame->m_fencPic->m_cuOffsetY = m_cuOffsetY;
+ inFrame->m_fencPic->m_buOffsetC = m_buOffsetC;
+ inFrame->m_fencPic->m_buOffsetY = m_buOffsetY;
}
else
{
- if (!inFrame->m_origPicYuv->createOffsets(m_sps))
+ if (!inFrame->m_fencPic->createOffsets(m_sps))
{
m_aborted = true;
x265_log(m_param, X265_LOG_ERROR, "memory allocation failure, aborting encode\n");
}
else
{
- m_cuOffsetC = inFrame->m_origPicYuv->m_cuOffsetC;
- m_cuOffsetY = inFrame->m_origPicYuv->m_cuOffsetY;
- m_buOffsetC = inFrame->m_origPicYuv->m_buOffsetC;
- m_buOffsetY = inFrame->m_origPicYuv->m_buOffsetY;
+ m_cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
+ m_cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
+ m_buOffsetC = inFrame->m_fencPic->m_buOffsetC;
+ m_buOffsetY = inFrame->m_fencPic->m_buOffsetY;
}
}
}
/* Copy input picture into a Frame and PicYuv, send to lookahead */
inFrame->m_poc = ++m_pocLast;
- inFrame->m_origPicYuv->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
- inFrame->m_intraData = pic_in->analysisData.intraData;
- inFrame->m_interData = pic_in->analysisData.interData;
+ inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
+
inFrame->m_userData = pic_in->userData;
inFrame->m_pts = pic_in->pts;
inFrame->m_forceqp = pic_in->forceqp;
/* Use the frame types from the first pass, if available */
int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : pic_in->sliceType;
+
+ /* In analysis load mode, the x265_analysis_data buffers are allocated into pic_in and inFrame borrows those pointers */
+ /* Load analysis data before lookahead->addPicture, since sliceType has been decided */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+ {
+ x265_picture* inputPic = const_cast<x265_picture*>(pic_in);
+ /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */
+ readAnalysisFile(&inputPic->analysisData, inFrame->m_poc);
+ inFrame->m_analysisData.poc = inFrame->m_poc;
+ inFrame->m_analysisData.sliceType = inputPic->analysisData.sliceType;
+ inFrame->m_analysisData.numCUsInFrame = inputPic->analysisData.numCUsInFrame;
+ inFrame->m_analysisData.numPartitions = inputPic->analysisData.numPartitions;
+ inFrame->m_analysisData.interData = inputPic->analysisData.interData;
+ inFrame->m_analysisData.intraData = inputPic->analysisData.intraData;
+ sliceType = inputPic->analysisData.sliceType;
+ }
+
m_lookahead->addPicture(inFrame, sliceType);
m_numDelayedPic++;
}
if (outFrame)
{
Slice *slice = outFrame->m_encData->m_slice;
+
+ /* Free the analysis buffers loaded from file now that they have been consumed */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+ freeAnalysis(&outFrame->m_analysisData);
+
if (pic_out)
{
- PicYuv *recpic = outFrame->m_reconPicYuv;
+ PicYuv *recpic = outFrame->m_reconPic;
pic_out->poc = slice->m_poc;
pic_out->bitDepth = X265_DEPTH;
pic_out->userData = outFrame->m_userData;
pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel));
pic_out->planes[2] = recpic->m_picOrg[2];
pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel));
- }
- if (m_param->analysisMode)
- {
- pic_out->analysisData.interData = outFrame->m_interData;
- pic_out->analysisData.intraData = outFrame->m_intraData;
- pic_out->analysisData.numCUsInFrame = slice->m_sps->numCUsInFrame;
- pic_out->analysisData.numPartitions = slice->m_sps->numPartitions;
+ /* In save mode, write the analysis data carried in pic_out to the analysis file, then free it */
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+ {
+ pic_out->analysisData.poc = pic_out->poc;
+ pic_out->analysisData.sliceType = pic_out->sliceType;
+ pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
+ pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
+ pic_out->analysisData.interData = outFrame->m_analysisData.interData;
+ pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
+ writeAnalysisFile(&pic_out->analysisData);
+ freeAnalysis(&pic_out->analysisData);
+ }
}
-
if (slice->m_sliceType == P_SLICE)
{
if (slice->m_weightPredTable[0][0][0].bPresentFlag)
}
if (m_aborted)
return -1;
-
finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits);
+
// Allow this frame to be recycled if no frame encoders are using it for reference
if (!pic_out)
{
slice->m_pps = &m_pps;
slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS);
- frameEnc->m_reconPicYuv->m_cuOffsetC = m_cuOffsetC;
- frameEnc->m_reconPicYuv->m_cuOffsetY = m_cuOffsetY;
- frameEnc->m_reconPicYuv->m_buOffsetC = m_buOffsetC;
- frameEnc->m_reconPicYuv->m_buOffsetY = m_buOffsetY;
+ frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC;
+ frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY;
+ frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC;
+ frameEnc->m_reconPic->m_buOffsetY = m_buOffsetY;
}
curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
if (m_bframeDelay)
}
else
frameEnc->m_dts = frameEnc->m_reorderedPts;
+ /* In save mode, allocate analysis data before the encode begins; the buffers live in frameEnc */
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+ {
+ x265_analysis_data* analysis = &frameEnc->m_analysisData;
+ analysis->poc = frameEnc->m_poc;
+ analysis->sliceType = frameEnc->m_lowres.sliceType;
+ uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
+ uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
+
+ uint32_t numCUsInFrame = widthInCU * heightInCU;
+ analysis->numCUsInFrame = numCUsInFrame;
+ analysis->numPartitions = NUM_CU_PARTITIONS;
+ allocAnalysis(analysis);
+ }
// determine references, setup RPS, etc
m_dpb->prepareEncode(frameEnc);
void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64_t bits)
{
- PicYuv* reconPic = curFrame->m_reconPicYuv;
+ PicYuv* reconPic = curFrame->m_reconPic;
//===== calculate PSNR =====
int width = reconPic->m_picWidth - m_sps.conformanceWindow.rightOffset;
pps->maxCuDQPDepth = 0;
}
- pps->chromaCbQpOffset = m_param->cbQpOffset;
- pps->chromaCrQpOffset = m_param->crQpOffset;
+ pps->chromaQpOffset[0] = m_param->cbQpOffset;
+ pps->chromaQpOffset[1] = m_param->crQpOffset;
pps->bConstrainedIntraPred = m_param->bEnableConstrainedIntra;
pps->bUseWeightPred = m_param->bEnableWeightedPred;
pps->bTransformSkipEnabled = m_param->bEnableTransformSkip;
pps->bSignHideEnabled = m_param->bEnableSignHiding;
- /* If offsets are ever configured, enable bDeblockingFilterControlPresent and set
- * deblockingFilterBetaOffsetDiv2 / deblockingFilterTcOffsetDiv2 */
- bool bDeblockOffsetInPPS = 0;
- pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || bDeblockOffsetInPPS;
+ pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || m_param->deblockingFilterBetaOffset || m_param->deblockingFilterTCOffset;
pps->bPicDisableDeblockingFilter = !m_param->bEnableLoopFilter;
- pps->deblockingFilterBetaOffsetDiv2 = 0;
- pps->deblockingFilterTcOffsetDiv2 = 0;
+ pps->deblockingFilterBetaOffsetDiv2 = m_param->deblockingFilterBetaOffset;
+ pps->deblockingFilterTcOffsetDiv2 = m_param->deblockingFilterTCOffset;
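+ /* the control-present flag must be coded whenever deblocking is disabled
+ * outright or non-zero beta/tC offsets have to be signalled */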
pps->bEntropyCodingSyncEnabled = m_param->bEnableWavefront;
}
p->bBPyramid = 0;
/* Disable features which are not supported by the current RD level */
+ if (p->rdLevel < 5)
+ {
+ if (p->bEnableCbfFastMode) /* impossible */
+ x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n");
+ p->bEnableCbfFastMode = 0;
+ }
if (p->rdLevel < 4)
{
if (p->psyRdoq > 0) /* impossible */
x265_log(p, X265_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s);
}
- //========= set default display window ==================================
+ /* initialize the conformance window */
m_conformanceWindow.bEnabled = false;
m_conformanceWindow.rightOffset = 0;
m_conformanceWindow.topOffset = 0;
m_conformanceWindow.bottomOffset = 0;
m_conformanceWindow.leftOffset = 0;
- //======== set pad size if width is not multiple of the minimum CU size =========
- const uint32_t minCUSize = MIN_CU_SIZE;
- if (p->sourceWidth & (minCUSize - 1))
+ /* set pad size if width is not multiple of the minimum CU size */
+ if (p->sourceWidth & (MIN_CU_SIZE - 1))
{
- uint32_t rem = p->sourceWidth & (minCUSize - 1);
- uint32_t padsize = minCUSize - rem;
+ uint32_t rem = p->sourceWidth & (MIN_CU_SIZE - 1);
+ uint32_t padsize = MIN_CU_SIZE - rem;
p->sourceWidth += padsize;
- /* set the confirmation window offsets */
m_conformanceWindow.bEnabled = true;
m_conformanceWindow.rightOffset = padsize;
}
- //======== set pad size if height is not multiple of the minimum CU size =========
- if (p->sourceHeight & (minCUSize - 1))
+ /* set pad size if height is not multiple of the minimum CU size */
+ if (p->sourceHeight & (MIN_CU_SIZE - 1))
{
- uint32_t rem = p->sourceHeight & (minCUSize - 1);
- uint32_t padsize = minCUSize - rem;
+ uint32_t rem = p->sourceHeight & (MIN_CU_SIZE - 1);
+ uint32_t padsize = MIN_CU_SIZE - rem;
p->sourceHeight += padsize;
- /* set the confirmation window offsets */
m_conformanceWindow.bEnabled = true;
m_conformanceWindow.bottomOffset = padsize;
}
+ if (p->bDistributeModeAnalysis && p->analysisMode)
+ {
+ p->analysisMode = X265_ANALYSIS_OFF;
+ x265_log(p, X265_LOG_WARNING, "Analysis save and load mode not supported for distributed mode analysis\n");
+ }
+}
+
+void Encoder::allocAnalysis(x265_analysis_data* analysis)
+{
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
+ CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
+ CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+ CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+ CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
+ analysis->intraData = intraData;
+ }
+ else
+ {
+ analysis_inter_data *interData = (analysis_inter_data*)analysis->interData;
+ CHECKED_MALLOC(interData, analysis_inter_data, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2);
+ analysis->interData = interData;
+ }
+ return;
+
+fail:
+ freeAnalysis(analysis);
+ m_aborted = true;
+}
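+
+/* Sizing sketch: the intra buffers hold one byte each of depth, luma mode
+ * and part size per 4x4 partition, while the inter buffer holds
+ * X265_MAX_PRED_MODE_PER_CTU records per CTU, doubled to cover both
+ * prediction directions of a B frame */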
+
+void Encoder::freeAnalysis(x265_analysis_data* analysis)
+{
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ X265_FREE(((analysis_intra_data*)analysis->intraData)->depth);
+ X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
+ X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes);
+ X265_FREE(analysis->intraData);
+ }
+ else
+ X265_FREE(analysis->interData);
+}
+
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc)
+{
+
+#define X265_FREAD(val, size, readSize, fileOffset)\
+ if (fread(val, size, readSize, fileOffset) != readSize)\
+ {\
+ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
+ freeAnalysis(analysis);\
+ m_aborted = true;\
+ return;\
+ }\
+
+ static uint64_t consumedBytes = 0;
+ static uint64_t totalConsumedBytes = 0;
+ fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET);
+
+ int poc; uint32_t frameRecordSize;
+ X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+ X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
+
+ uint64_t currentOffset = totalConsumedBytes;
+
+ /* seek forward to the record for the requested frame */
+ while (poc != curPoc && !feof(m_analysisFile))
+ {
+ currentOffset += frameRecordSize;
+ fseeko(m_analysisFile, currentOffset, SEEK_SET);
+ X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+ X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
+ }
+
+ if (poc != curPoc || feof(m_analysisFile))
+ {
+ x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
+ freeAnalysis(analysis);
+ return;
+ }
+
+ /* Now arrived at the right frame, read the record */
+ analysis->poc = poc;
+ analysis->frameRecordSize = frameRecordSize;
+ X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
+ X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
+ X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
+
+ /* Memory is allocated for inter and intra analysis data based on the slicetype */
+ allocAnalysis(analysis);
+
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ X265_FREAD(((analysis_intra_data *)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FREAD(((analysis_intra_data *)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ analysis->sliceType = X265_TYPE_I;
+ consumedBytes += frameRecordSize;
+ }
+ else if (analysis->sliceType == X265_TYPE_P)
+ {
+ X265_FREAD(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
+ consumedBytes += frameRecordSize;
+ totalConsumedBytes = consumedBytes;
+ }
+ else
+ {
+ X265_FREAD(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
+ consumedBytes += frameRecordSize;
+ }
+#undef X265_FREAD
+}
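+
+/* On-disk layout of one frame record, as written by writeAnalysisFile()
+ * below: a uint32_t frameRecordSize (total bytes in the record, header
+ * included), then poc, sliceType, numCUsInFrame and numPartitions as ints,
+ * then the payload: the depth/modes/partSizes arrays for I/IDR frames, or
+ * analysis_inter_data records (X265_MAX_PRED_MODE_PER_CTU per CTU for P,
+ * twice that for B) */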
+
+void Encoder::writeAnalysisFile(x265_analysis_data* analysis)
+{
+
+#define X265_FWRITE(val, size, writeSize, fileOffset)\
+ if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
+ {\
+ x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n");\
+ freeAnalysis(analysis);\
+ m_aborted = true;\
+ return;\
+ }\
+
+ /* calculate frameRecordSize */
+ analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
+ sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions);
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 3;
+ else if (analysis->sliceType == X265_TYPE_P)
+ analysis->frameRecordSize += sizeof(analysis_inter_data) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
+ else
+ analysis->frameRecordSize += sizeof(analysis_inter_data) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
+
+ X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+ X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile);
+ X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
+ X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
+ X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
+
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ }
+ else if (analysis->sliceType == X265_TYPE_P)
+ {
+ X265_FWRITE(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
+ }
+ else
+ {
+ X265_FWRITE(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
+ }
+#undef X265_FWRITE
}
class Encoder : public x265_encoder
{
-private:
+public:
int m_pocLast; // time index (POC)
int m_encodedFrameNum;
int m_numChromaWPFrames; // number of P frames with weighted chroma reference
int m_numLumaWPBiFrames; // number of B frames with weighted luma reference
int m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
-
-public:
-
+ FILE* m_analysisFile;
int m_conformanceMode;
VPS m_vps;
SPS m_sps;
bool m_aborted; // fatal error detected
Encoder();
-
~Encoder() {}
void create();
void destroy();
- void init();
int encode(const x265_picture* pic, x265_picture *pic_out);
void updateVbvPlan(RateControl* rc);
+ void allocAnalysis(x265_analysis_data* analysis);
+
+ void freeAnalysis(x265_analysis_data* analysis);
+
+ void readAnalysisFile(x265_analysis_data* analysis, int poc);
+
+ void writeAnalysisFile(x265_analysis_data* pic);
+
+ void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
+
protected:
void initSPS(SPS *sps);
void initPPS(PPS *pps);
-
- void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
};
}
if (pps.bUseDQP)
WRITE_UVLC(pps.maxCuDQPDepth, "diff_cu_qp_delta_depth");
- WRITE_SVLC(pps.chromaCbQpOffset, "pps_cb_qp_offset");
- WRITE_SVLC(pps.chromaCrQpOffset, "pps_cr_qp_offset");
+ WRITE_SVLC(pps.chromaQpOffset[0], "pps_cb_qp_offset");
+ WRITE_SVLC(pps.chromaQpOffset[1], "pps_cr_qp_offset");
WRITE_FLAG(0, "pps_slice_chroma_qp_offsets_present_flag");
WRITE_FLAG(pps.bUseWeightPred, "weighted_pred_flag");
// Ideally this process should not be repeated for each slice in a picture
if (slice.isIRAP())
for (int picIdx = 0; picIdx < slice.m_rps.numberOfPictures; picIdx++)
+ {
X265_CHECK(!slice.m_rps.bUsed[picIdx], "pic unused failure\n");
+ }
#endif
WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
}
/* encode a CU block recursively */
-void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP)
+void Entropy::encodeCU(const CUData& ctu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP)
{
- const Slice* slice = cu.m_slice;
+ const Slice* slice = ctu.m_slice;
if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
bEncodeDQP = true;
if (!cuUnsplitFlag)
{
- uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2;
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
- encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
+ encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP);
}
return;
}
// We need to split, so don't try these modes.
if (cuSplitFlag)
- codeSplitFlag(cu, absPartIdx, depth);
+ codeSplitFlag(ctu, absPartIdx, depth);
- if (depth < cu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
+ if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
{
- uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2;
-
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
+ encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP);
}
return;
}
if (slice->m_pps->bTransquantBypassEnabled)
- codeCUTransquantBypassFlag(cu.m_tqBypass[absPartIdx]);
+ codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]);
if (!slice->isIntra())
- codeSkipFlag(cu, absPartIdx);
-
- if (cu.isSkipped(absPartIdx))
{
- codeMergeIndex(cu, absPartIdx);
- finishCU(cu, absPartIdx, depth);
- return;
+ codeSkipFlag(ctu, absPartIdx);
+ if (ctu.isSkipped(absPartIdx))
+ {
+ codeMergeIndex(ctu, absPartIdx);
+ finishCU(ctu, absPartIdx, depth);
+ return;
+ }
+ codePredMode(ctu.m_predMode[absPartIdx]);
}
- if (!slice->isIntra())
- codePredMode(cu.m_predMode[absPartIdx]);
-
- codePartSize(cu, absPartIdx, depth);
+ codePartSize(ctu, absPartIdx, depth);
// prediction info (intra: direction mode; inter: MV, reference index)
- codePredInfo(cu, absPartIdx);
+ codePredInfo(ctu, absPartIdx);
uint32_t tuDepthRange[2];
- if (cu.isIntra(absPartIdx))
- cu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx);
+ if (ctu.isIntra(absPartIdx))
+ ctu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx);
else
- cu.getInterTUQtDepthRange(tuDepthRange, absPartIdx);
+ ctu.getInterTUQtDepthRange(tuDepthRange, absPartIdx);
// Encode Coefficients, allow codeCoeff() to modify bEncodeDQP
- codeCoeff(cu, absPartIdx, depth, bEncodeDQP, tuDepthRange);
+ codeCoeff(ctu, absPartIdx, bEncodeDQP, tuDepthRange);
// --- write terminating bit ---
- finishCU(cu, absPartIdx, depth);
+ finishCU(ctu, absPartIdx, depth);
+}
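+
+ /* Note on the subdivision loops above: cuGeom.numPartitions >> 2 replaces
+  * the old (NUM_CU_PARTITIONS >> (depth << 1)) >> 2 expression. Both yield
+  * the number of 4x4 minimal units covered by one quadrant: for a 64x64 CTU,
+  * NUM_CU_PARTITIONS is 256, so each depth-1 child spans 256 / 4 = 64
+  * partitions and absPartIdx advances by 64 per iteration. */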
+
+/* Return bit count of signaling inter mode */
+uint32_t Entropy::bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const
+{
+ uint32_t bits;
+ bits = bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); /* not skip */
+ bits += bitsCodeBin(0, m_contextState[OFF_PRED_MODE_CTX]); /* inter */
+ PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
+ switch (partSize)
+ {
+ case SIZE_2Nx2N:
+ bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX]);
+ break;
+
+ case SIZE_2NxN:
+ case SIZE_2NxnU:
+ case SIZE_2NxnD:
+ bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]);
+ bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 1]);
+ if (cu.m_slice->m_sps->maxAMPDepth > depth)
+ {
+ bits += bitsCodeBin((partSize == SIZE_2NxN) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]);
+ if (partSize != SIZE_2NxN)
+ bits++; // encodeBinEP((partSize == SIZE_2NxnU ? 0 : 1));
+ }
+ break;
+
+ case SIZE_Nx2N:
+ case SIZE_nLx2N:
+ case SIZE_nRx2N:
+ bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]);
+ bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]);
+ if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3))
+ bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]);
+ if (cu.m_slice->m_sps->maxAMPDepth > depth)
+ {
+ bits += bitsCodeBin((partSize == SIZE_Nx2N) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]);
+ if (partSize != SIZE_Nx2N)
+ bits++; // encodeBinEP((partSize == SIZE_nLx2N ? 0 : 1));
+ }
+ break;
+ default:
+ X265_CHECK(0, "invalid CU partition\n");
+ break;
+ }
+
+ return bits;
}
/* finish encoding a CU and handle end-of-slice conditions */
-void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
+void Entropy::finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth)
{
- const Slice* slice = cu.m_slice;
- X265_CHECK(cu.m_slice->m_endCUAddr == cu.m_slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n");
+ const Slice* slice = ctu.m_slice;
uint32_t realEndAddress = slice->m_endCUAddr;
- uint32_t cuAddr = cu.getSCUAddr() + absPartIdx;
+ uint32_t cuAddr = ctu.getSCUAddr() + absPartIdx;
+ X265_CHECK(realEndAddress == slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n");
uint32_t granularityMask = g_maxCUSize - 1;
- uint32_t cuSize = 1 << cu.m_log2CUSize[absPartIdx];
- uint32_t rpelx = cu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize;
- uint32_t bpely = cu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize;
+ uint32_t cuSize = 1 << ctu.m_log2CUSize[absPartIdx];
+ uint32_t rpelx = ctu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize;
+ uint32_t bpely = ctu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize;
bool granularityBoundary = (((rpelx & granularityMask) == 0 || (rpelx == slice->m_sps->picWidthInLumaSamples )) &&
((bpely & granularityMask) == 0 || (bpely == slice->m_sps->picHeightInLumaSamples)));
}
}
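
/* Note: granularityMask checks CTU-grid alignment; with 64x64 CTUs the mask
 * is 63, so granularityBoundary is true only when the CU's right/bottom edge
 * lands on a multiple of 64 or on the picture border -- the positions where
 * the end-of-slice flag may be signaled. */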
-void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx,
- uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, bool& bCodeDQP, uint32_t depthRange[2])
+void Entropy::encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+ bool& bCodeDQP, const uint32_t depthRange[2])
{
- const bool subdiv = cu.m_tuDepth[absPartIdx] + cu.m_cuDepth[absPartIdx] > (uint8_t)depth;
- uint32_t hChromaShift = cu.m_hChromaShift;
- uint32_t vChromaShift = cu.m_vChromaShift;
- uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, trIdx);
- uint32_t cbfU = cu.getCbf(absPartIdx, TEXT_CHROMA_U, trIdx);
- uint32_t cbfV = cu.getCbf(absPartIdx, TEXT_CHROMA_V, trIdx);
-
- if (!trIdx)
- state.bakAbsPartIdxCU = absPartIdx;
-
- if (log2TrSize == 2 && cu.m_chromaFormat != X265_CSP_I444)
- {
- uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- if (!(absPartIdx & (partNum - 1)))
- {
- state.bakAbsPartIdx = absPartIdx;
- state.bakChromaOffset = offsetChroma;
- }
- else if ((absPartIdx & (partNum - 1)) == (partNum - 1))
- {
- cbfU = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_U, trIdx);
- cbfV = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_V, trIdx);
- }
- }
+ const bool subdiv = cu.m_tuDepth[absPartIdx] > tuDepth;
/* in each of these conditions, the subdiv flag is implied and not signaled,
* so we have checks to make sure the implied value matches our intentions */
- if (cu.m_predMode[absPartIdx] == MODE_INTRA && cu.m_partSize[absPartIdx] == SIZE_NxN && depth == cu.m_cuDepth[absPartIdx])
+ if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth)
{
X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
}
- else if (cu.m_predMode[absPartIdx] == MODE_INTER && (cu.m_partSize[absPartIdx] != SIZE_2Nx2N) && depth == cu.m_cuDepth[absPartIdx] &&
+ else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth &&
cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
{
X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2TrSize %d, depthRange[0] %d\n", log2TrSize, depthRange[0]);
codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
}
- const uint32_t trDepthCurr = depth - cu.m_cuDepth[absPartIdx];
- const bool bFirstCbfOfCU = trDepthCurr == 0;
-
- bool mCodeAll = true;
- const uint32_t numPels = 1 << (log2TrSize * 2 - hChromaShift - vChromaShift);
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
-
- if (bFirstCbfOfCU || mCodeAll)
+ uint32_t hChromaShift = cu.m_hChromaShift;
+ uint32_t vChromaShift = cu.m_vChromaShift;
+ bool bSmallChroma = (log2TrSize - hChromaShift < 2);
+ if (!tuDepth || !bSmallChroma)
{
- uint32_t tuSize = 1 << log2TrSize;
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1))
- codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_U, trDepthCurr, (subdiv == 0));
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1))
- codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_V, trDepthCurr, (subdiv == 0));
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
else
{
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1), "chroma xform size match failure\n");
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1), "chroma xform size match failure\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma xform size match failure\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma xform size match failure\n");
}
if (subdiv)
{
- log2TrSize--;
- uint32_t numCoeff = 1 << (log2TrSize * 2);
- uint32_t numCoeffC = (numCoeff >> (hChromaShift + vChromaShift));
- trIdx++;
- ++depth;
- absPartIdxStep >>= 2;
- const uint32_t partNum = NUM_CU_PARTITIONS >> (depth << 1);
+ --log2TrSize;
+ ++tuDepth;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
- absPartIdx += partNum;
- offsetLuma += numCoeff;
- offsetChroma += numCoeffC;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 0 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 1 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 2 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 3 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ return;
+ }
- absPartIdx += partNum;
- offsetLuma += numCoeff;
- offsetChroma += numCoeffC;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ uint32_t absPartIdxC = bSmallChroma ? absPartIdx & 0xFC : absPartIdx;
- absPartIdx += partNum;
- offsetLuma += numCoeff;
- offsetChroma += numCoeffC;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ if (cu.isInter(absPartIdxC) && !tuDepth && !cu.getCbf(absPartIdxC, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdxC, TEXT_CHROMA_V, 0))
+ {
+ X265_CHECK(cu.getCbf(absPartIdxC, TEXT_LUMA, 0), "CBF should have been set\n");
}
else
+ codeQtCbfLuma(cu, absPartIdx, tuDepth);
+
+ uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth);
+ uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, tuDepth);
+ uint32_t cbfV = cu.getCbf(absPartIdxC, TEXT_CHROMA_V, tuDepth);
+ if (!(cbfY || cbfU || cbfV))
+ return;
+
+ // dQP: signaled at most once per CTU
+ if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
{
- if (cu.m_predMode[absPartIdx] != MODE_INTRA && depth == cu.m_cuDepth[absPartIdx] && !cu.getCbf(absPartIdx, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdx, TEXT_CHROMA_V, 0))
- {
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
- }
- else
- codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+ uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
+ codeDeltaQP(cu, absPartIdxLT);
+ bCodeDQP = false;
+ }
- if (cbfY || cbfU || cbfV)
- {
- // dQP: only for CTU once
- if (cu.m_slice->m_pps->bUseDQP)
- {
- if (bCodeDQP)
- {
- codeDeltaQP(cu, state.bakAbsPartIdxCU);
- bCodeDQP = false;
- }
- }
- }
- if (cbfY)
- codeCoeffNxN(cu, cu.m_trCoeff[0] + offsetLuma, absPartIdx, log2TrSize, TEXT_LUMA);
+ if (cbfY)
+ {
+ uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
+ codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2TrSize, TEXT_LUMA);
+ if (!(cbfU || cbfV))
+ return;
+ }
- int chFmt = cu.m_chromaFormat;
- if (log2TrSize == 2 && chFmt != X265_CSP_I444)
+ if (bSmallChroma)
+ {
+ if ((absPartIdx & 3) != 3)
+ return;
+
+ const uint32_t log2TrSizeC = 2;
+ const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422);
+ const uint32_t curPartNum = 4;
+ uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
+ for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- if ((absPartIdx & (partNum - 1)) == (partNum - 1))
+ TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC);
+ const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
+ do
{
- const uint32_t log2TrSizeC = 2;
- const bool splitIntoSubTUs = (chFmt == X265_CSP_I422);
-
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs))
{
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, state.bakAbsPartIdx);
- const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
- do
- {
- uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
- if (cbf)
- {
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- codeCoeffNxN(cu, coeffChroma + state.bakChromaOffset + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
- }
- }
- while (tuIterator.isNextSection());
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
}
}
+ while (tuIterator.isNextSection());
}
- else
+ }
+ else
+ {
+ uint32_t log2TrSizeC = log2TrSize - hChromaShift;
+ const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422);
+ uint32_t curPartNum = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
+ uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
+ for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- uint32_t log2TrSizeC = log2TrSize - hChromaShift;
- const bool splitIntoSubTUs = (chFmt == X265_CSP_I422);
- uint32_t curPartNum = NUM_CU_PARTITIONS >> (depth << 1);
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC);
+ const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
+ do
{
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx);
- const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
- do
+ if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs))
{
- uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
- if (cbf)
- {
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- codeCoeffNxN(cu, coeffChroma + offsetChroma + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
- }
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
}
- while (tuIterator.isNextSection());
}
+ while (tuIterator.isNextSection());
}
}
}
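
/* Worked example for the coefficient offsets above (LOG2_UNIT_SIZE == 2,
 * i.e. 4x4 minimal units): each unit contributes 4 * 4 = 16 luma
 * coefficients, so
 *
 *     coeffOffset  = absPartIdx << 4;                // absPartIdx * 16
 *     coeffOffsetC = absPartIdxC << (4 - hChromaShift - vChromaShift);
 *
 * and for 4:2:0 (both chroma shifts == 1) the chroma offset is a quarter of
 * the luma offset, matching the half-width, half-height chroma planes. */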
codeIntraDirChroma(cu, absPartIdx, chromaDirMode);
- if ((cu.m_chromaFormat == X265_CSP_I444) && (cu.m_partSize[absPartIdx] == SIZE_NxN))
+ if (cu.m_chromaFormat == X265_CSP_I444 && cu.m_partSize[absPartIdx] != SIZE_2Nx2N)
{
- uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2;
- for (uint32_t i = 1; i <= 3; i++)
+ uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 1; qIdx < 4; ++qIdx)
{
- uint32_t offset = absPartIdx + i * partOffset;
- cu.getAllowedChromaDir(offset, chromaDirMode);
- codeIntraDirChroma(cu, offset, chromaDirMode);
+ absPartIdx += qNumParts;
+ cu.getAllowedChromaDir(absPartIdx, chromaDirMode);
+ codeIntraDirChroma(cu, absPartIdx, chromaDirMode);
}
}
}
codeRefFrmIdx(cu, absPartIdx, list);
}
-void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2])
+void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2])
{
if (!cu.isIntra(absPartIdx))
{
return;
}
- uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
- uint32_t lumaOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
- uint32_t chromaOffset = lumaOffset >> (cu.m_hChromaShift + cu.m_vChromaShift);
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> (depth << 1);
- CoeffCodeState state;
- encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange);
+ uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+ encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
}
void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
if (codeNumber != 0)
{
unsigned long idx;
- CLZ32(idx, codeNumber + 1);
+ CLZ(idx, codeNumber + 1);
length = idx;
codeNumber -= (1 << idx) - 1;
}
markValid();
}
-void Entropy::codeMVPIdx(uint32_t symbol)
-{
- encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]);
-}
-
void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
{
PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
}
}
-void Entropy::codePredMode(int predMode)
-{
- encodeBin(predMode == MODE_INTER ? 0 : 1, m_contextState[OFF_PRED_MODE_CTX]);
-}
-
-void Entropy::codeCUTransquantBypassFlag(uint32_t symbol)
-{
- encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]);
-}
-
-void Entropy::codeSkipFlag(const CUData& cu, uint32_t absPartIdx)
-{
- // get context function is here
- uint32_t symbol = cu.isSkipped(absPartIdx) ? 1 : 0;
- uint32_t ctxSkip = cu.getCtxSkipFlag(absPartIdx);
-
- encodeBin(symbol, m_contextState[OFF_SKIP_FLAG_CTX + ctxSkip]);
-}
-
-void Entropy::codeMergeFlag(const CUData& cu, uint32_t absPartIdx)
-{
- const uint32_t symbol = cu.m_mergeFlag[absPartIdx] ? 1 : 0;
-
- encodeBin(symbol, m_contextState[OFF_MERGE_FLAG_EXT_CTX]);
-}
-
void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx)
{
uint32_t numCand = cu.m_slice->m_maxNumMergeCand;
}
}
-void Entropy::codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
-{
- X265_CHECK(depth < g_maxCUDepth, "invalid depth\n");
-
- uint32_t ctx = cu.getCtxSplitFlag(absPartIdx, depth);
- uint32_t currSplitFlag = (cu.m_cuDepth[absPartIdx] > depth) ? 1 : 0;
-
- X265_CHECK(ctx < 3, "ctx out of range\n");
- encodeBin(currSplitFlag, m_contextState[OFF_SPLIT_FLAG_CTX + ctx]);
-}
-
-void Entropy::codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx)
-{
- encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]);
-}
-
-uint32_t Entropy::bitsIntraModeNonMPM() const
-{
- uint32_t mstate = m_contextState[OFF_ADI_CTX];
- uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 0)) >> 15;
- return bits + 5; /* fixed cost for encodeBinsEP() */
-}
-
-uint32_t Entropy::bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const
-{
- X265_CHECK(dir == preds[0] || dir == preds[1] || dir == preds[2], "dir must be a most probable mode\n");
- uint32_t mstate = m_contextState[OFF_ADI_CTX];
- uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 1)) >> 15;
- return bits + (dir == preds[0] ? 1 : 2);
-}
-
void Entropy::codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple)
{
uint32_t dir[4], j;
uint32_t preds[4][3];
int predIdx[4];
- PartSize mode = (PartSize)cu.m_partSize[absPartIdx];
- uint32_t partNum = isMultiple ? (mode == SIZE_NxN ? 4 : 1) : 1;
- uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2;
+ uint32_t partNum = isMultiple && cu.m_partSize[absPartIdx] != SIZE_2Nx2N ? 4 : 1;
+ uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2;
- for (j = 0; j < partNum; j++)
+ for (j = 0; j < partNum; j++, absPartIdx += qNumParts)
{
- dir[j] = cu.m_lumaIntraDir[absPartIdx + partOffset * j];
- cu.getIntraDirLumaPredictor(absPartIdx + partOffset * j, preds[j]);
+ dir[j] = cu.m_lumaIntraDir[absPartIdx];
+ cu.getIntraDirLumaPredictor(absPartIdx, preds[j]);
predIdx[j] = -1;
for (uint32_t i = 0; i < 3; i++)
if (dir[j] == preds[j][i])
}
}
-void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel)
+void Entropy::codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel)
{
- uint32_t ctx = ctxCbf[ttype][trDepth];
+ uint32_t ctx = tuDepth + 2;
- bool canQuadSplit = (width >= (MIN_TU_SIZE * 2)) && (height >= (MIN_TU_SIZE * 2));
- uint32_t lowestTUDepth = trDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF
+ uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+ bool canQuadSplit = (log2TrSize - cu.m_hChromaShift > 2);
+ uint32_t lowestTUDepth = tuDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF
- if ((width != height) && (lowestLevel || !canQuadSplit)) // if sub-TUs are present
+ if (cu.m_chromaFormat == X265_CSP_I422 && (lowestLevel || !canQuadSplit)) // if sub-TUs are present
{
uint32_t subTUDepth = lowestTUDepth + 1; // if this is the lowest level of the TU-tree, the sub-TUs are directly below.
// Otherwise, this must be the level above the lowest level (as specified above)
- uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
- uint32_t cbf = cu.getCbf(subTUAbsPartIdx, ttype, subTUDepth);
+ uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
- }
+ encodeBin(cu.getCbf(absPartIdx , ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
+ encodeBin(cu.getCbf(absPartIdx + tuNumParts, ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
}
else
- {
- uint32_t cbf = cu.getCbf(absPartIdx, ttype, lowestTUDepth);
-
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
- }
-}
-
-void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth)
-{
- uint32_t ctx = ctxCbf[ttype][trDepth];
- uint32_t cbf = cu.getCbf(absPartIdx, ttype, trDepth);
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth)
-{
- uint32_t ctx = ctxCbf[ttype][trDepth];
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
+ encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
}
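
/* Context note: within OFF_QT_CBF_CTX the luma CBF uses the first two
 * contexts (selected by !tuDepth in codeQtCbfLuma) and the chroma CBF uses
 * the contexts after them indexed by tuDepth, so the hard-coded
 * ctx = tuDepth + 2 above matches the chroma entries of the old ctxCbf[][]
 * table. */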
void Entropy::codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype)
encodeBin(useTransformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]);
}
-void Entropy::codeQtRootCbf(uint32_t cbf)
-{
- encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]);
-}
-
-void Entropy::codeQtCbfZero(TextType ttype, uint32_t trDepth)
-{
- // this function is only used to estimate the bits when cbf is 0
- // and will never be called when writing the bitsream.
- uint32_t ctx = ctxCbf[ttype][trDepth];
- encodeBin(0, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeQtRootCbfZero()
-{
- // this function is only used to estimate the bits when cbf is 0
- // and will never be called when writing the bistream.
- encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]);
-}
-
/** Encode (X,Y) position of the last significant coefficient
* \param posx X component of last coefficient
* \param posy Y component of last coefficient
if ((binValue ^ mstate) & 1)
{
// NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256
- //numBits = g_renormTable[lps >> 3];
+ //numBits = g_renormTable[lps >> 3];
unsigned long idx;
- CLZ32(idx, lps);
+ CLZ(idx, lps);
X265_CHECK(state != 63 || idx == 1, "state failure\n");
numBits = 8 - idx;
#include "common.h"
#include "bitstream.h"
#include "frame.h"
+#include "cudata.h"
#include "contexts.h"
#include "slice.h"
struct SaoCtuParam;
struct EstBitsSbac;
-class CUData;
-struct CUGeom;
class ScalingList;
enum SplitType
void finishSlice() { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
void encodeCTU(const CUData& cu, const CUGeom& cuGeom);
- void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
- void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
- void codeCUTransquantBypassFlag(uint32_t symbol);
- void codeSkipFlag(const CUData& cu, uint32_t absPartIdx);
- void codeMergeFlag(const CUData& cu, uint32_t absPartIdx);
+ void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
+ void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+
void codeMergeIndex(const CUData& cu, uint32_t absPartIdx);
- void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
- void codeMVPIdx(uint32_t symbol);
void codeMvd(const CUData& cu, uint32_t absPartIdx, int list);
void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
- void codePredMode(int predMode);
void codePredInfo(const CUData& cu, uint32_t absPartIdx);
- void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx);
- void codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel);
- void codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth);
- void codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth);
- void codeQtCbfZero(TextType ttype, uint32_t trDepth);
- void codeQtRootCbfZero();
- void codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]);
+ inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
+
+ void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
+ void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
- uint32_t bitsIntraModeNonMPM() const;
- uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const;
- void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
- void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+ inline void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
+ inline void codeMVPIdx(uint32_t symbol) { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
+ inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
+ inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
+ inline void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) { encodeBin(cu.m_cuDepth[absPartIdx] > depth, m_contextState[OFF_SPLIT_FLAG_CTX + cu.getCtxSplitFlag(absPartIdx, depth)]); }
+ inline void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx) { encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); }
+ inline void codePredMode(int predMode) { encodeBin(predMode == MODE_INTRA ? 1 : 0, m_contextState[OFF_PRED_MODE_CTX]); }
+ inline void codeCUTransquantBypassFlag(uint32_t symbol) { encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); }
+ inline void codeQtCbfLuma(uint32_t cbf, uint32_t tuDepth) { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + !tuDepth]); }
+ inline void codeQtCbfChroma(uint32_t cbf, uint32_t tuDepth) { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + 2 + tuDepth]); }
+ inline void codeQtRootCbf(uint32_t cbf) { encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+
+ void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
- // RDO functions
+ /* RDO functions */
void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
void estCBFBit(EstBitsSbac& estBitsSbac) const;
void estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
void estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
void estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
+ inline uint32_t bitsIntraModeNonMPM() const { return bitsCodeBin(0, m_contextState[OFF_ADI_CTX]) + 5; }
+ inline uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const { return bitsCodeBin(1, m_contextState[OFF_ADI_CTX]) + (dir == preds[0] ? 1 : 2); }
+ inline uint32_t estimateCbfBits(uint32_t cbf, TextType ttype, uint32_t tuDepth) const { return bitsCodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctxCbf[ttype][tuDepth]]); }
+ uint32_t bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const;
+ uint32_t bitsIntraMode(const CUData& cu, uint32_t absPartIdx) const
+ {
+ return bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]) + /* not skip */
+ bitsCodeBin(1, m_contextState[OFF_PRED_MODE_CTX]); /* intra */
+ }
+
+ /* this function is only used to estimate the bits when cbf is 0 and will never be called when writing the bitstream */
+ inline void codeQtRootCbfZero() { encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+
private:
/* CABAC private methods */
void encodeBinsEP(uint32_t binValues, int numBins);
void encodeBinTrm(uint32_t binValue);
- void encodeCU(const CUData& cu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
- void finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
+ /* return the bits of coding the given bin with this context, without updating the context state */
+ inline uint32_t bitsCodeBin(uint32_t binValue, uint32_t ctxModel) const
+ {
+ uint64_t fracBits = (m_fracBits & 32767) + sbacGetEntropyBits(ctxModel, binValue);
+ return (uint32_t)(fracBits >> 15);
+ }
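+
+ /* Arithmetic sketch for bitsCodeBin(): m_fracBits accumulates cost in Q15
+  * fixed point (32768 units per bit) and sbacGetEntropyBits() returns the
+  * Q15 cost of coding the bin in the given context state. Adding the pending
+  * fractional part and shifting right by 15 yields whole bits, e.g. 0.4 bits
+  * pending plus a 1.8-bit bin: (13107 + 58982) >> 15 == 2. */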
+
+ void encodeCU(const CUData& ctu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
+ void finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth);
void writeOut();
void codePredWeightTable(const Slice& slice);
void codeInterDir(const CUData& cu, uint32_t absPartIdx);
void codePUWise(const CUData& cu, uint32_t absPartIdx);
- void codeQtRootCbf(uint32_t cbf);
void codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list);
void codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list);
void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx);
void codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype);
- struct CoeffCodeState
- {
- uint32_t bakAbsPartIdx;
- uint32_t bakChromaOffset;
- uint32_t bakAbsPartIdxCU;
- };
-
- void encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma,
- uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx,
- bool& bCodeDQP, uint32_t depthRange[2]);
+ void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+ bool& bCodeDQP, const uint32_t depthRange[2]);
void copyFrom(const Entropy& src);
void copyContextsFrom(const Entropy& src);
#include "wavefront.h"
#include "param.h"
-#include "PPA/ppa.h"
-
#include "encoder.h"
#include "frameencoder.h"
#include "common.h"
ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
}
- if (m_param->noiseReduction)
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
m_nr = X265_MALLOC(NoiseReduction, 1);
if (m_nr)
memset(m_nr, 0, sizeof(NoiseReduction));
else
- m_param->noiseReduction = 0;
+ m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
start();
return ok;
}
/* Generate a complete list of unique geom sets for the current picture dimensions */
-bool FrameEncoder::initializeGeoms(const FrameData& encData)
+bool FrameEncoder::initializeGeoms()
{
/* Geoms only vary between CTUs in the presence of picture edges */
- int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1);
- int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1);
+ int maxCUSize = m_param->maxCUSize;
+ int heightRem = m_param->sourceHeight & (maxCUSize - 1);
+ int widthRem = m_param->sourceWidth & (maxCUSize - 1);
int allocGeoms = 1; // body
if (heightRem && widthRem)
allocGeoms = 4; // body, right, bottom, corner
if (!m_cuGeoms || !m_ctuGeomMap)
return false;
- CUGeom cuLocalData[CUGeom::MAX_GEOMS];
- memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp
+ // body
+ CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms);
+ memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
+ if (allocGeoms == 1)
+ return true;
- int countGeoms = 0;
- for (uint32_t ctuAddr = 0; ctuAddr < m_numRows * m_numCols; ctuAddr++)
+ int countGeoms = 1;
+ if (widthRem)
{
- /* TODO: detach this logic from TComDataCU */
- encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0);
- encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData);
-
- m_ctuGeomMap[ctuAddr] = MAX_INT;
- for (int i = 0; i < countGeoms; i++)
+ // right
+ CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+ for (int i = 0; i < m_numRows; i++)
{
- if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS))
- {
- m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS;
- break;
- }
+ uint32_t ctuAddr = m_numCols * (i + 1) - 1;
+ m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
}
+ countGeoms++;
+ }
+ if (heightRem)
+ {
+ // bottom
+ CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+ for (uint32_t i = 0; i < m_numCols; i++)
+ {
+ uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
+ m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
+ }
+ countGeoms++;
- if (m_ctuGeomMap[ctuAddr] == MAX_INT)
+ if (widthRem)
{
- X265_CHECK(countGeoms < allocGeoms, "geometry match check failure\n");
+ // corner
+ CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+
+ uint32_t ctuAddr = m_numCols * m_numRows - 1;
m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
- memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS);
countGeoms++;
}
+ X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
}
return true;
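
/* Worked example, assuming 1920x1080 with 64x64 CTUs: widthRem = 1920 & 63
 * == 0 and heightRem = 1080 & 63 == 56, so only a "bottom" geometry set is
 * needed besides the body set. Every CTU of the last row is mapped to that
 * set; all other entries keep the body offset 0 written by the memset above. */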
m_frame = curFrame;
curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
curFrame->m_encData->m_slice->m_mref = m_mref;
+
if (!m_cuGeoms)
{
- if (!initializeGeoms(*curFrame->m_encData))
+ if (!initializeGeoms())
return false;
}
+
m_enable.trigger();
return true;
}
void FrameEncoder::compressFrame()
{
- PPAScopeEvent(FrameEncoder_compressFrame);
+ //ProfileScopeEvent(frameThread);
int64_t startCompressTime = x265_mdate();
Slice* slice = m_frame->m_encData->m_slice;
WeightParam *w = NULL;
if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
w = slice->m_weightPredTable[l][ref];
- m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w);
+ m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
}
}
for (int i = 0; i < m_top->m_numThreadLocalData; i++)
{
NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
- memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
+ memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
}
void FrameEncoder::compressCTURows()
{
- PPAScopeEvent(FrameEncoder_compressRows);
Slice* slice = m_frame->m_encData->m_slice;
m_bAllRowsStop = false;
}
}
- processRow(i * 2 + 0, -1);
+ processRowEncoder(i, *m_tld);
}
// Filter
if (i >= m_filterRowDelay)
- processRow((i - m_filterRowDelay) * 2 + 1, -1);
+ m_frameFilter.processRow(i - m_filterRowDelay);
}
}
m_frameTime = (double)m_totalTime / 1000000;
processRowEncoder(realRow, tld);
else
{
- processRowFilter(realRow);
+ m_frameFilter.processRow(realRow);
// NOTE: activate next row
if (realRow != m_numRows - 1)
// Called by worker threads
void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
{
- PPAScopeEvent(Thread_ProcessRow);
-
CTURow& curRow = m_rows[row];
{
Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
FrameData& curEncData = *m_frame->m_encData;
Slice *slice = curEncData.m_slice;
- PicYuv* fencPic = m_frame->m_origPicYuv;
-
- tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
int64_t startTime = x265_mdate();
const uint32_t numCols = m_numCols;
while (curRow.completed < numCols)
{
+ ProfileScopeEvent(encodeCTU);
+
int col = curRow.completed;
const uint32_t cuAddr = lineStartCUAddr + col;
CUData* ctu = curEncData.getPicCTU(cuAddr);
int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp);
tld.analysis.setQP(*slice, qp);
qp = Clip3(QP_MIN, QP_MAX_SPEC, qp);
- ctu->setQPSubParts((char)qp, 0, 0);
+ ctu->setQPSubParts((int8_t)qp, 0, 0);
curEncData.m_rowStat[row].sumQpAq += qp;
}
else
}
// Does all the CU analysis, returns best top level mode decision
- Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
+ Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
/* advance top-level row coder to include the context of this CTU.
* if SAO is disabled, rowCoder writes the final CTU bitstream */
if (dequeueRow(r * 2))
stopRow.active = false;
else
+ {
+ /* we must release the row lock to allow the thread to exit */
+ stopRow.lock.release();
GIVE_UP_TIME();
+ stopRow.lock.acquire();
+ }
}
-
stopRow.lock.release();
bool bRowBusy = true;
m_top->m_rateControl->rateControlUpdateStats(&m_rce);
}
- // trigger row-wise loop filters
- if (row >= m_filterRowDelay)
+ if (m_param->bEnableWavefront)
{
- enableRowFilter(row - m_filterRowDelay);
+ /* trigger row-wise loop filters */
+ if (row >= m_filterRowDelay)
+ {
+ enableRowFilter(row - m_filterRowDelay);
- // NOTE: Active Filter to first row (row 0)
- if (row == m_filterRowDelay)
- enqueueRowFilter(0);
- }
- if (row == m_numRows - 1)
- {
- for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
- enableRowFilter(i);
+ /* NOTE: activate the filter for the first row (row 0) */
+ if (row == m_filterRowDelay)
+ enqueueRowFilter(0);
+ }
+ if (row == m_numRows - 1)
+ {
+ for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
+ enableRowFilter(i);
+ }
}
m_totalTime += x265_mdate() - startTime;
log->cntIntra[depth]++;
log->qTreeIntraCnt[depth]++;
- if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
+ if (ctu.m_predMode[absPartIdx] == MODE_NONE)
{
log->totalCu--;
log->cntIntra[depth]--;
log->qTreeIntraCnt[depth]--;
}
- else if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
+ else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
{
/* TODO: log intra modes at absPartIdx +0 to +3 */
X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
log->totalCu++;
log->cntTotalCu[depth]++;
- if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
+ if (ctu.m_predMode[absPartIdx] == MODE_NONE)
{
log->totalCu--;
log->cntTotalCu[depth]--;
log->cntSkipCu[depth]++;
log->qTreeSkipCnt[depth]++;
}
- else if (ctu.m_predMode[absPartIdx] == MODE_INTER)
+ else if (ctu.isInter(absPartIdx))
{
log->cntInter[depth]++;
log->qTreeInterCnt[depth]++;
else
log->cuInterDistribution[depth][AMP_ID]++;
}
- else if (ctu.m_predMode[absPartIdx] == MODE_INTRA)
+ else if (ctu.isIntra(absPartIdx))
{
log->cntIntra[depth]++;
log->qTreeIntraCnt[depth]++;
- if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
+ if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
{
X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
log->cntIntraNxN++;
m_nr->count[cat] >>= 1;
}
- uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
+ int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
+ uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
for (int i = 0; i < coefCount; i++)
{
/* Derive qpOffset for each CU by averaging offsets for all 16x16 blocks in the CU. */
double qp_offset = 0;
- uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16;
- uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16;
+ uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
+ uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
uint32_t noOfBlocks = g_maxCUSize / 16;
uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks;
uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth;
protected:
- bool initializeGeoms(const FrameData& encData);
+ bool initializeGeoms();
/* analyze / compress frame, can be run in parallel within reference constraints */
void compressFrame();
/* Called by WaveFront::findJob() */
void processRow(int row, int threadId);
void processRowEncoder(int row, ThreadLocalData& tld);
- void processRowFilter(int row) { m_frameFilter.processRow(row); }
void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); }
#include "framefilter.h"
#include "frameencoder.h"
#include "wavefront.h"
-#include "PPA/ppa.h"
using namespace x265;
void FrameFilter::processRow(int row)
{
- PPAScopeEvent(Thread_filterCU);
+ ProfileScopeEvent(filterCTURow);
if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
{
for (uint32_t col = 0; col < numCols; col++)
{
uint32_t cuAddr = lineStartCUAddr + col;
- CUData* cu = encData.getPicCTU(cuAddr);
+ const CUData* ctu = encData.getPicCTU(cuAddr);
- m_deblock.deblockCTU(cu, Deblock::EDGE_VER);
+ m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);
if (col > 0)
{
- CUData* cuPrev = encData.getPicCTU(cuAddr - 1);
- m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
+ const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
+ m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
}
}
- CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
- m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
+ const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
+ m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
}
// SAO
void FrameFilter::processRowPost(int row)
{
- PicYuv *reconPic = m_frame->m_reconPicYuv;
+ PicYuv *reconPic = m_frame->m_reconPic;
const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
const uint32_t lineStartCUAddr = row * numCols;
const int realH = getCUHeight(row);
uint32_t cuAddr = lineStartCUAddr;
if (m_param->bEnablePsnr)
{
- PicYuv* origPic = m_frame->m_origPicYuv;
+ PicYuv* fencPic = m_frame->m_fencPic;
intptr_t stride = reconPic->m_stride;
uint32_t width = reconPic->m_picWidth - m_pad[0];
uint32_t height = getCUHeight(row);
- uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
+ uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
height >>= m_vChromaShift;
width >>= m_hChromaShift;
stride = reconPic->m_strideC;
- uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
- uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
+ uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
+ uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
m_frameEncoder->m_SSDY += ssdY;
m_frameEncoder->m_SSDU += ssdU;
}
if (m_param->bEnableSsim && m_ssimBuf)
{
- pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0];
- pixel *org = m_frame->m_origPicYuv->m_picOrg[0];
- intptr_t stride1 = m_frame->m_origPicYuv->m_stride;
- intptr_t stride2 = m_frame->m_reconPicYuv->m_stride;
+ pixel *rec = m_frame->m_reconPic->m_picOrg[0];
+ pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
+ intptr_t stride1 = m_frame->m_fencPic->m_stride;
+ intptr_t stride2 = m_frame->m_reconPic->m_stride;
uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
uint32_t bStart = (row == 0);
uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
/* SSIM is done for each row in blocks of 4x4. The first blocks are offset by 2 pixels to the right
* to avoid alignment of SSIM blocks with DCT blocks. */
minPixY += bStart ? 2 : -6;
- m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2,
+ m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
m_frameEncoder->m_ssimCnt += ssim_cnt;
}
uint32_t size = g_maxCUSize >> depth;
int part = partitionFromSizes(size, size);
- PicYuv* reconPic = frame.m_reconPicYuv;
- PicYuv* fencPic = frame.m_origPicYuv;
+ PicYuv* reconPic = frame.m_reconPic;
+ PicYuv* fencPic = frame.m_fencPic;
pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
using namespace x265;
namespace {
+
struct SubpelWorkload
{
int hpel_iters;
bool hpel_satd;
};
-SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
+const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
{ 1, 4, 0, 4, false }, // 4 SAD HPEL only
{ 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
{ 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL
{ 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL
};
-}
-static int size_scale[NUM_LUMA_PARTITIONS];
-#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum])))
+int sizeScale[NUM_LUMA_PARTITIONS];
+#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
-static void init_scales(void)
+void initScales(void)
{
#define SETUP_SCALE(W, H) \
- size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
+ sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
SETUP_SCALE(4, 4);
SETUP_SCALE(8, 8);
SETUP_SCALE(8, 4);
#undef SETUP_SCALE
}
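
/* Example of the threshold scaling (a sketch based on the macro above):
 * sizeScale is the block height squared in 4x4 units, e.g.
 * sizeScale[LUMA_16x16] = (16 * 16) >> 4 = 16, so SAD_THRESH(1000) expands
 * to bcost < (1000 >> 4) * 16 = 992 -- approximately a constant per-pixel
 * criterion across partition sizes. */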
-MotionEstimate::MotionEstimate()
- : searchMethod(3)
- , subpelRefine(5)
-{
- if (size_scale[0] == 0)
- init_scales();
-
- fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-}
-
-MotionEstimate::~MotionEstimate()
-{
- X265_FREE(fenc);
-}
-
-void MotionEstimate::setSourcePU(intptr_t offset, int width, int height)
-{
- partEnum = partitionFromSizes(width, height);
- X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
- sad = primitives.sad[partEnum];
- satd = primitives.satd[partEnum];
- sa8d = primitives.sa8d_inter[partEnum];
- sad_x3 = primitives.sad_x3[partEnum];
- sad_x4 = primitives.sad_x4[partEnum];
-
- blockwidth = width;
- blockheight = height;
- blockOffset = offset;
-
- /* copy PU block into cache */
- primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
-}
-
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
-static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
-static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
-static const MV hex4[16] =
+const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
+const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
+const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
+const MV hex4[16] =
{
- MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
+ MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
- MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
+ MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};
-static const MV offsets[] =
+const MV offsets[] =
{
MV(-1, 0), MV(0, -1),
MV(-1, -1), MV(1, -1),
MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search
-/* sum of absolute differences between MV candidates */
-static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates)
+/* sum of absolute differences between MV candidates, used for adaptive ME range */
+inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
{
int sum = 0;
return sum;
}
+}
+
+MotionEstimate::MotionEstimate()
+{
+ ctuAddr = -1;
+ absPartIdx = -1;
+ searchMethod = X265_HEX_SEARCH;
+ subpelRefine = 2;
+ bChromaSATD = false;
+ chromaSatd = NULL;
+}
+
+void MotionEstimate::init(int method, int refine, int csp)
+{
+ if (!sizeScale[0])
+ initScales();
+
+ searchMethod = method;
+ subpelRefine = refine;
+ fencPUYuv.create(FENC_STRIDE, csp);
+}
+
+MotionEstimate::~MotionEstimate()
+{
+ fencPUYuv.destroy();
+}
+
+/* Called by lookahead, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
+{
+ partEnum = partitionFromSizes(pwidth, pheight);
+ X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+ sad = primitives.sad[partEnum];
+ satd = primitives.satd[partEnum];
+ sad_x3 = primitives.sad_x3[partEnum];
+ sad_x4 = primitives.sad_x4[partEnum];
+
+ blockwidth = pwidth;
+ blockOffset = offset;
+ absPartIdx = ctuAddr = -1;
+
+ /* copy PU block into cache */
+ primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
+ X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
+}
+
+/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
+{
+ partEnum = partitionFromSizes(pwidth, pheight);
+ X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+ sad = primitives.sad[partEnum];
+ satd = primitives.satd[partEnum];
+ sad_x3 = primitives.sad_x3[partEnum];
+ sad_x4 = primitives.sad_x4[partEnum];
+ chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum];
+
+ /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
+ * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
+ bChromaSATD = subpelRefine > 2 && chromaSatd;
+ X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
+
+ ctuAddr = _ctuAddr;
+ absPartIdx = cuPartIdx + puPartIdx;
+ blockwidth = pwidth;
+ blockOffset = 0;
+
+ /* copy PU from CU Yuv */
+ fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
+}
+
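/* Usage note (inferred from the two overloads above): the lookahead path
 * passes a raw luma plane and asserts bChromaSATD stays false, while the
 * predInterSearch()/--pme path copies the PU out of a CU-sized Yuv and may
 * enable chroma SATD when subpelRefine > 2. Both paths leave the source
 * block in fencPUYuv.m_buf[0] with FENC_STRIDE for the SAD/SATD primitives. */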
#define COST_MV_PT_DIST(mx, my, point, dist) \
do \
{ \
int merange)
{
ALIGN_VAR_16(int, costs[16]);
- pixel *fref = ref->fpelPlane + blockOffset;
- size_t stride = ref->lumaStride;
+ pixel* fenc = fencPUYuv.m_buf[0];
+ pixel* fref = ref->fpelPlane[0] + blockOffset;
+ intptr_t stride = ref->lumaStride;
MV omv = bmv;
int saved = bcost;
MV & outQMv)
{
ALIGN_VAR_16(int, costs[16]);
- size_t stride = ref->lumaStride;
- pixel *fref = ref->fpelPlane + blockOffset;
+ if (ctuAddr >= 0)
+ blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
+ intptr_t stride = ref->lumaStride;
+ pixel* fenc = fencPUYuv.m_buf[0];
+ pixel* fref = ref->fpelPlane[0] + blockOffset;
setMVP(qmvp);
MV bmv = pmv.roundToFPel();
int bcost = bprecost;
if (pmv.isSubpel())
- {
bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
- }
// measure SAD cost at MV(0) if MVP is not zero
if (pmv.notZero())
}
// measure SAD cost at each QPEL motion vector candidate
- for (int i = 0; i < numCandidates; i++)
+ if (ref->isLowres)
{
- MV m = mvc[i].clipped(qmvmin, qmvmax);
- if (m.notZero() && m != pmv && m != bestpre) // check already measured
+ for (int i = 0; i < numCandidates; i++)
{
- int cost;
- if (ref->isLowres)
- cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
- else
- cost = subpelCompare(ref, m, sad) + mvcost(m);
-
- if (cost < bprecost)
+ MV m = mvc[i].clipped(qmvmin, qmvmax);
+ if (m.notZero() && m != pmv && m != bestpre) // check already measured
{
- bprecost = cost;
- bestpre = m;
+ int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
+ if (cost < bprecost)
+ {
+ bprecost = cost;
+ bestpre = m;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < numCandidates; i++)
+ {
+ MV m = mvc[i].clipped(qmvmin, qmvmax);
+ if (m.notZero() && m != pmv && m != bestpre) // check already measured
+ {
+ int cost = subpelCompare(ref, m, sad) + mvcost(m);
+ if (cost < bprecost)
+ {
+ bprecost = cost;
+ bestpre = m;
+ }
}
}
}
mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
denom++;
}
- mvd += x265_predictor_difference(mvc, numCandidates);
+ mvd += predictorDifference(mvc, numCandidates);
}
sad_ctx = SAD_THRESH(1000) ? 0
else
bmv = bmv.toQPel(); // promote search bmv to qpel
- SubpelWorkload& wl = workload[this->subpelRefine];
+ const SubpelWorkload& wl = workload[this->subpelRefine];
if (!bcost)
{
}
else if (ref->isLowres)
{
- int bdir = 0, cost;
+ int bdir = 0;
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2;
- cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
- cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
for (int iter = 0; iter < wl.hpel_iters; iter++)
{
- int bdir = 0, cost;
+ int bdir = 0;
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2;
- cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
+ int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
for (int iter = 0; iter < wl.qpel_iters; iter++)
{
- int bdir = 0, cost;
+ int bdir = 0;
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
- cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
+ int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
+ intptr_t refStride = ref->lumaStride;
+ pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
int xFrac = qmv.x & 0x3;
int yFrac = qmv.y & 0x3;
+ int cost;
+ intptr_t lclStride = fencPUYuv.m_size;
+ X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
- if ((yFrac | xFrac) == 0)
- {
- pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
- return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride);
- }
+ if (!(yFrac | xFrac))
+ cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
else
{
- /* We are taking a short-cut here if the reference is weighted. To be
+ /* we are taking a short-cut here if the reference is weighted. To be
* accurate we should be interpolating unweighted pixels and weighting
- * the final 16bit values prior to rounding and downshifting. Instead we
+     * the final 16-bit values prior to rounding and downshifting. Instead we
* are simply interpolating the weighted full-pel pixels. Not 100%
* accurate but good enough for fast qpel ME */
ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
- pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
- if (yFrac == 0)
+ if (!yFrac)
+ primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac);
+ else if (!xFrac)
+ primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac);
+ else
{
- primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
+ ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]);
+
+ int filterSize = NTAPS_LUMA;
+ int halfFilterSize = filterSize >> 1;
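+            /* two-stage 8-tap luma interpolation: horizontal filter into a
+             * 16-bit intermediate buffer, then vertical filter back to pixel depth */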
+ primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1);
+ primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac);
}
- else if (xFrac == 0)
+ cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
+ }
+
+ if (bChromaSATD)
+ {
+ int csp = fencPUYuv.m_csp;
+ int hshift = fencPUYuv.m_hChromaShift;
+ int vshift = fencPUYuv.m_vChromaShift;
+ int shiftHor = (2 + hshift);
+ int shiftVer = (2 + vshift);
+ lclStride = fencPUYuv.m_csize;
+
+ intptr_t refStrideC = ref->reconPic->m_strideC;
+ intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
+
+ const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
+ const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
+
+ xFrac = qmv.x & ((1 << shiftHor) - 1);
+ yFrac = qmv.y & ((1 << shiftVer) - 1);
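+        /* e.g. in 4:2:0 both shifts are 3, so chroma motion has eighth-pel
+         * precision; frac << (1 - shift) below selects the 8-phase chroma filter */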
+
+ if (!(yFrac | xFrac))
{
- primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
}
else
{
-        ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
-        int filterSize = NTAPS_LUMA;
-        int halfFilterSize = filterSize >> 1;
-        primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1);
-        primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
+        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+        if (!yFrac)
+        {
+            primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+            primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+ }
+ else if (!xFrac)
+ {
+ primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+ primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+ }
+ else
+ {
+ ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
+
+ int extStride = blockwidth >> hshift;
+ int filterSize = NTAPS_CHROMA;
+ int halfFilterSize = (filterSize >> 1);
+
+ primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+ primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+ primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+ primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+ }
}
- return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
}
+
+ return cost;
}
#include "reference.h"
#include "mv.h"
#include "bitcost.h"
+#include "yuv.h"
namespace x265 {
// private x265 namespace
{
protected:
- /* Aligned copy of original pixels, extra room for manual alignment */
- pixel *fencplane;
- intptr_t fencLumaStride;
-
- pixelcmp_t sad;
- pixelcmp_t satd;
- pixelcmp_t sa8d;
- pixelcmp_x3_t sad_x3;
- pixelcmp_x4_t sad_x4;
-
intptr_t blockOffset;
- int partEnum;
+
+ int ctuAddr;
+ int absPartIdx; // part index of PU, including CU offset within CTU
+
int searchMethod;
int subpelRefine;
- /* subpel generation buffers */
int blockwidth;
int blockheight;
+ pixelcmp_t sad;
+ pixelcmp_x3_t sad_x3;
+ pixelcmp_x4_t sad_x4;
+ pixelcmp_t satd;
+ pixelcmp_t chromaSatd;
+
MotionEstimate& operator =(const MotionEstimate&);
public:
static const int COST_MAX = 1 << 28;
- pixel *fenc;
+ Yuv fencPUYuv;
+ int partEnum;
+ bool bChromaSATD;
MotionEstimate();
-
~MotionEstimate();
- void setSearchMethod(int i) { searchMethod = i; }
-
- void setSubpelRefine(int i) { subpelRefine = i; }
+ void init(int method, int refine, int csp);
/* Methods called at slice setup */
- void setSourcePlane(pixel *Y, intptr_t luma)
- {
- fencplane = Y;
- fencLumaStride = luma;
- }
-
- void setSourcePU(intptr_t offset, int pwidth, int pheight);
+ void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
+ void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
/* buf*() and motionEstimate() methods all use cached fenc pixels and thus
* require setSourcePU() to be called prior. */
- inline int bufSAD(pixel *fref, intptr_t stride) { return sad(fenc, FENC_STRIDE, fref, stride); }
+ inline int bufSAD(const pixel* fref, intptr_t stride) { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
- inline int bufSA8D(pixel *fref, intptr_t stride) { return sa8d(fenc, FENC_STRIDE, fref, stride); }
+ inline int bufSATD(const pixel* fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
- inline int bufSATD(pixel *fref, intptr_t stride) { return satd(fenc, FENC_STRIDE, fref, stride); }
+ inline int bufChromaSATD(const Yuv& refYuv, int puPartIdx)
+ {
+ return chromaSatd(refYuv.getCbAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[1], fencPUYuv.m_csize) +
+ chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize);
+ }
- int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
+ int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
- int subpelCompare(ReferencePlanes * ref, const MV &qmv, pixelcmp_t);
+ int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
protected:
{
for (uint32_t i = 0; i < inSize; i++)
{
- if (bytes > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03)
+ if (bytes >= 2 && !out[bytes - 2] && !out[bytes - 1] && inBytes[i] <= 0x03)
{
/* inject 0x03 to prevent emulating a start code */
- out[bytes] = out[bytes - 1];
- out[bytes - 1] = 0x03;
- bytes++;
+ out[bytes++] = 3;
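+            /* e.g. a payload run of 00 00 01 is escaped to 00 00 03 01 */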
}
out[bytes++] = inBytes[i];
using namespace x265;
/* Amortize the partial cost of I frames over the next N frames */
-const double RateControl::s_amortizeFraction = 0.85;
-const int RateControl::s_amortizeFrames = 75;
+
const int RateControl::s_slidingWindowFrames = 20;
const char *RateControl::s_defaultStatFileName = "x265_2pass.log";
/* Find the total AC energy of each block in all planes */
uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y)
{
- intptr_t stride = curFrame->m_origPicYuv->m_stride;
- intptr_t cStride = curFrame->m_origPicYuv->m_strideC;
+ intptr_t stride = curFrame->m_fencPic->m_stride;
+ intptr_t cStride = curFrame->m_fencPic->m_strideC;
intptr_t blockOffsetLuma = block_x + (block_y * stride);
int colorFormat = m_param->internalCsp;
int hShift = CHROMA_H_SHIFT(colorFormat);
uint32_t var;
- var = acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
- var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
- var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
+ var = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
+ var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
+ var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
x265_emms();
return var;
}
void RateControl::calcAdaptiveQuantFrame(Frame *curFrame)
{
/* Actual adaptive quantization */
- int maxCol = curFrame->m_origPicYuv->m_picWidth;
- int maxRow = curFrame->m_origPicYuv->m_picHeight;
+ int maxCol = curFrame->m_fencPic->m_picWidth;
+ int maxRow = curFrame->m_fencPic->m_picHeight;
for (int y = 0; y < 3; y++)
{
m_bTerminated = false;
m_finalFrameCount = 0;
m_numEntries = 0;
+ m_amortizeFraction = 0.85;
+ m_amortizeFrames = 75;
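+    /* clips of two seconds or less have too few frames to amortize I frame cost over */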
+ if (m_param->totalFrames <= 2 * m_fps)
+ {
+ m_amortizeFraction = m_amortizeFrames = 0;
+ }
if (m_param->rc.rateControlMode == X265_RC_CRF)
{
m_param->rc.qp = (int)m_param->rc.rfConstant;
/* Frame Predictors and Row predictors used in vbv */
for (int i = 0; i < 5; i++)
{
- m_pred[i].coeff = 2.0;
+ m_pred[i].coeff = 1.5;
m_pred[i].count = 1.0;
m_pred[i].decay = 0.5;
m_pred[i].offset = 0.0;
}
- m_predBfromP = m_pred[0];
+ m_pred[0].coeff = 1.0;
if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
{
/* If the user hasn't defined the stat filename, use the default value */
rce->leadingNoBSatd = m_leadingNoBSatd;
if (curFrame->m_forceqp)
{
- m_qp = int32_t(curFrame->m_forceqp + 0.5) - 1;
+ m_qp = (int32_t)(curFrame->m_forceqp + 0.5) - 1;
m_qp = Clip3(QP_MIN, QP_MAX_MAX, m_qp);
rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp;
}
return false;
}
+double RateControl::tuneAbrQScaleFromFeedback(double qScale)
+{
+ double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate;
+ if (m_currentSatd)
+ {
+ /* use framesDone instead of POC as poc count is not serial with bframes enabled */
+ double overflow = 1.0;
+ double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
+ double wantedBits = timeDone * m_bitrate;
+ if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
+ {
+ abrBuffer *= X265_MAX(1, sqrt(timeDone));
+ overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
+ qScale *= overflow;
+ }
+ }
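+    /* overflow > 1 means more bits than budgeted have been spent so far;
+     * scaling qScale by it steers later frames back toward the target bitrate */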
+ return qScale;
+}
+
double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
{
double q;
q += m_pbOffset / 2;
else
q += m_pbOffset;
- rce->qpNoVbv = q;
- double qScale = x265_qp2qScale(q);
- if (!m_2pass && m_isVbv)
+ double qScale = x265_qp2qScale(q);
+ if (m_isCbr)
{
- if (m_leadingBframes > 5)
+ qScale = tuneAbrQScaleFromFeedback(qScale);
+ if (!m_isAbrReset)
{
- qScale = clipQscale(curFrame, rce, qScale);
- m_lastQScaleFor[m_sliceType] = qScale;
+ double lmin = m_lastQScaleFor[P_SLICE] / m_lstep;
+ double lmax = m_lastQScaleFor[P_SLICE] * m_lstep;
+ qScale = Clip3(lmin, lmax, qScale);
}
- rce->frameSizePlanned = predictSize(&m_predBfromP, qScale, (double)m_leadingNoBSatd);
+ q = x265_qScale2qp(qScale);
+ }
+ rce->qpNoVbv = q;
+ if (!m_2pass && m_isVbv)
+ {
+ qScale = clipQscale(curFrame, rce, qScale);
+ m_lastQScaleFor[m_sliceType] = qScale;
+ rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], qScale, (double)m_currentSatd);
}
else if (m_2pass && m_isVbv)
{
* tradeoff between quality and bitrate precision. But at large
* tolerances, the bit distribution approaches that of 2pass. */
- double wantedBits, overflow = 1;
+ double overflow = 1;
m_shortTermCplxSum *= 0.5;
m_shortTermCplxCount *= 0.5;
{
if (!m_param->rc.bStatRead)
checkAndResetABR(rce, false);
- q = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
-
- /* ABR code can potentially be counterproductive in CBR, so just
- * don't bother. Don't run it if the frame complexity is zero
- * either. */
- if (!m_isCbr && m_currentSatd)
- {
- /* use framesDone instead of POC as poc count is not serial with bframes enabled */
- double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
- wantedBits = timeDone * m_bitrate;
- if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
- {
- abrBuffer *= X265_MAX(1, sqrt(timeDone));
- overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
- q *= overflow;
- }
- }
+ double initialQScale = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
+ q = tuneAbrQScaleFromFeedback(initialQScale);
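+        /* recover the overflow factor applied inside tuneAbrQScaleFromFeedback() */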
+ overflow = q / initialQScale;
}
-
if (m_sliceType == I_SLICE && m_param->keyframeMax > 1
&& m_lastNonBPictType != I_SLICE && !m_isAbrReset)
{
{
q = x265_qp2qScale(CRF_INIT_QP) / fabs(m_param->rc.ipFactor);
}
- else if (m_framesDone == 0 && !m_isVbv)
+ else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR)
{
/* for ABR alone, clip the first I frame qp */
double lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep;
if (m_partialResidualFrames)
rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames;
- m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
- m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames);
+ m_partialResidualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax);
+        m_partialResidualCost = (int)((rce->rowTotalBits * m_amortizeFraction) / m_partialResidualFrames);
rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames;
}
else if (m_partialResidualFrames)
{
double frameQ[3];
double curBits;
- if (m_sliceType == B_SLICE)
- curBits = predictSize(&m_predBfromP, q, (double)m_currentSatd);
- else
- curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
+ curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
double bufferFillCur = m_bufferFill - curBits;
double targetFill;
- double totalDuration = 0;
+ double totalDuration = m_frameDuration;
+        bool isIFramePresent = (m_sliceType == I_SLICE);
frameQ[P_SLICE] = m_sliceType == I_SLICE ? q * m_param->rc.ipFactor : (m_sliceType == B_SLICE ? q / m_param->rc.pbFactor : q);
frameQ[B_SLICE] = frameQ[P_SLICE] * m_param->rc.pbFactor;
frameQ[I_SLICE] = frameQ[P_SLICE] / m_param->rc.ipFactor;
bufferFillCur += wantedFrameSize;
int64_t satd = curFrame->m_lowres.plannedSatd[j] >> (X265_DEPTH - 8);
type = IS_X265_TYPE_I(type) ? I_SLICE : IS_X265_TYPE_B(type) ? B_SLICE : P_SLICE;
+ if (type == I_SLICE)
+ isIFramePresent = true;
curBits = predictSize(&m_pred[type], frameQ[type], (double)satd);
bufferFillCur -= curBits;
}
- /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
- targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * 0.5);
+            /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
+            double tol = isIFramePresent ? (1 / totalDuration) : (totalDuration < 0.5 ? 2 : 1);
+            targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (1 - 0.8 * totalDuration * tol));
if (bufferFillCur < targetFill)
{
q *= 1.01;
loopTerminate |= 1;
continue;
}
- /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
- targetFill = Clip3(m_bufferSize * 0.8, m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
+            /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
+ targetFill = Clip3(m_bufferSize - (m_bufferSize * totalDuration * 0.5), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
if (m_isCbr && bufferFillCur > targetFill)
{
q /= 1.01;
if (pbits > rce->frameSizeMaximum)
q *= pbits / rce->frameSizeMaximum;
- // Check B-frame complexity, and use up any bits that would
- // overflow before the next P-frame.
- if (m_leadingBframes <= 5 && m_sliceType == P_SLICE && !m_singleFrameVbv)
- {
- int nb = m_leadingBframes;
- double bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
- double bbits = predictSize(&m_predBfromP, q * m_param->rc.pbFactor, (double)m_currentSatd);
- double space;
- if (bbits > m_bufferRate)
- nb = 0;
- double pbbits = nb * bbits;
-
- space = m_bufferFill + (1 + nb) * m_bufferRate - m_bufferSize;
- if (pbbits < space)
- q *= X265_MAX(pbbits / space, bits / (0.5 * m_bufferSize));
-
- q = X265_MAX(q0 / 2, q);
- }
-
if (!m_isCbr || (m_isAbr && m_currentSatd >= rce->movingAvgSum && q <= q0 / 2))
q = X265_MAX(q0, q);
&& refQScale > 0
&& refRowSatdCost > 0)
{
- if (abs(int32_t(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2)
+ if (abs((int32_t)(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2)
{
double predTotal = refRowBits * satdCostForPendingCus / refRowSatdCost * refQScale / qScale;
- totalSatdBits += int32_t((pred_s + predTotal) * 0.5);
+ totalSatdBits += (int32_t)((pred_s + predTotal) * 0.5);
continue;
}
}
- totalSatdBits += int32_t(pred_s);
+ totalSatdBits += (int32_t)pred_s;
}
- else
+ else if (picType == P_SLICE)
{
/* Our QP is lower than the reference! */
double pred_intra = predictSize(rce->rowPred[1], qScale, intraCost);
/* Sum: better to overestimate than underestimate by using only one of the two predictors. */
- totalSatdBits += int32_t(pred_intra + pred_s);
+ totalSatdBits += (int32_t)(pred_intra + pred_s);
}
+ else
+ totalSatdBits += (int32_t)pred_s;
}
}
if (row < sps.numCuInHeight - 1)
{
- /* B-frames shouldn't use lower QP than their reference frames. */
- if (rce->sliceType == B_SLICE)
- {
- Frame* refSlice1 = curEncData.m_slice->m_refPicList[0][0];
- Frame* refSlice2 = curEncData.m_slice->m_refPicList[1][0];
- qpMin = X265_MAX(qpMin, X265_MAX(refSlice1->m_encData->m_rowStat[row].diagQp, refSlice2->m_encData->m_rowStat[row].diagQp));
- qpVbv = X265_MAX(qpVbv, qpMin);
- }
/* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
- double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_param->rc.rateTolerance;
+ double rcTol = (bufferLeftPlanned * 0.2) / m_param->frameNumThreads * m_param->rc.rateTolerance;
int32_t encodedBitsSoFar = 0;
double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
while (qpVbv < qpMax
&& ((accFrameBits > rce->frameSizePlanned + rcTol) ||
- (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) ||
+ (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.2) ||
(accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv)))
{
qpVbv += stepSize;
{
if (var < 10)
return;
- const double range = 1.5;
+ const double range = 2;
double old_coeff = p->coeff / p->count;
double new_coeff = bits * q / var;
double new_coeff_clipped = Clip3(old_coeff / range, old_coeff * range, new_coeff);
/* previous I still had a residual; roll it into the new loan */
if (m_residualFrames)
bits += m_residualCost * m_residualFrames;
- m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
- m_residualCost = (int)((bits * s_amortizeFraction) / m_residualFrames);
+ m_residualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax);
+ m_residualCost = (int)((bits * m_amortizeFraction) / m_residualFrames);
bits -= m_residualCost * m_residualFrames;
}
else if (m_residualFrames)
if (m_isVbv)
{
- if (rce->sliceType == B_SLICE)
- {
- m_bframeBits += actualBits;
- if (rce->bLastMiniGopBFrame)
- {
- if (rce->bframes != 0)
- updatePredictor(&m_predBfromP, x265_qp2qScale(rce->qpaRc), (double)rce->leadingNoBSatd, (double)m_bframeBits / rce->bframes);
- m_bframeBits = 0;
- }
- }
updateVbv(actualBits, rce);
if (m_param->bEmitHRDSEI)
void initHRD(SPS* sps);
int rateControlSliceType(int frameNum);
bool cuTreeReadFor2Pass(Frame* curFrame);
+ double tuneAbrQScaleFromFeedback(double qScale);
protected:
- static const double s_amortizeFraction;
- static const int s_amortizeFrames;
static const int s_slidingWindowFrames;
static const char *s_defaultStatFileName;
int m_partialResidualFrames;
int m_residualCost;
int m_partialResidualCost;
+ int m_amortizeFrames;
+ double m_amortizeFraction;
double getQScale(RateControlEntry *rce, double rateFactor);
double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
/* all weights and factors stored as FIX8 */
uint64_t m_lambda2;
uint64_t m_lambda;
- uint64_t m_cbDistortionWeight;
- uint64_t m_crDistortionWeight;
+ uint32_t m_chromaDistWeight[2];
+ uint32_t m_psyRdBase;
uint32_t m_psyRd;
int m_qp;
- void setPsyRdScale(double scale) { m_psyRd = (uint32_t)floor(256.0 * scale * 0.33); }
- void setCbDistortionWeight(uint16_t weightFix8) { m_cbDistortionWeight = weightFix8; }
- void setCrDistortionWeight(uint16_t weightFix8) { m_crDistortionWeight = weightFix8; }
+ void setPsyRdScale(double scale) { m_psyRdBase = (uint32_t)floor(256.0 * scale * 0.33); }
void setQP(const Slice& slice, int qp)
{
m_qp = qp;
+ int qpCb, qpCr;
+ /* Scale PSY RD factor by a slice type factor */
+ static const uint32_t psyScaleFix8[3] = { 300, 256, 96 }; /* B, P, I */
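+        /* i.e. scale psy-rd by roughly 1.17 for B, 1.0 for P and 0.375 for I slices */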
+ m_psyRd = (m_psyRdBase * psyScaleFix8[slice.m_sliceType]) >> 8;
setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
-
- int qpCb = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCbQpOffset);
+ if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+ qpCb = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[0]]);
+ else
+ qpCb = X265_MIN(qp + slice.m_pps->chromaQpOffset[0], QP_MAX_SPEC);
int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
- setCbDistortionWeight(lambdaOffset);
+ m_chromaDistWeight[0] = lambdaOffset;
- int qpCr = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCrQpOffset);
+ if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+            qpCr = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[1]]);
+ else
+            qpCr = X265_MIN(qp + slice.m_pps->chromaQpOffset[1], QP_MAX_SPEC);
chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
- setCrDistortionWeight(lambdaOffset);
+ m_chromaDistWeight[1] = lambdaOffset;
}
void setLambda(double lambda2, double lambda)
inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const
{
X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
- "calcRdCost wrap detected dist: %d, bits %d, lambda: %d\n", distortion, bits, (int)m_lambda2);
+ "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
return distortion + ((bits * m_lambda2 + 128) >> 8);
}
/* return the difference in energy between the source block and the recon block */
- inline int psyCost(int size, pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) const
+ inline int psyCost(int size, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) const
{
return primitives.psy_cost_pp[size](source, sstride, recon, rstride);
}
/* return the difference in energy between the source block and the recon block */
- inline int psyCost(int size, int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) const
+ inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
{
return primitives.psy_cost_ss[size](source, sstride, recon, rstride);
}
inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
{
X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
- "calcRdSADCost wrap detected dist: %d, bits %d, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
+ "calcRdSADCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
return sadCost + ((bits * m_lambda + 128) >> 8);
}
- inline uint32_t scaleChromaDistCb(uint32_t dist) const
- {
- X265_CHECK(dist <= (UINT64_MAX - 128) / m_cbDistortionWeight,
- "scaleChromaDistCb wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_cbDistortionWeight);
- return (uint32_t)(((dist * m_cbDistortionWeight) + 128) >> 8);
- }
-
- inline uint32_t scaleChromaDistCr(uint32_t dist) const
+ inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
{
- X265_CHECK(dist <= (UINT64_MAX - 128) / m_crDistortionWeight,
- "scaleChromaDistCr wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_crDistortionWeight);
- return (uint32_t)(((dist * m_crDistortionWeight) + 128) >> 8);
+ X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
+ "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
+ return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
}
inline uint32_t getCost(uint32_t bits) const
MotionReference::MotionReference()
{
- m_weightBuffer = NULL;
+ weightBuffer[0] = NULL;
+ weightBuffer[1] = NULL;
+ weightBuffer[2] = NULL;
}
-int MotionReference::init(PicYuv* recPic, WeightParam *w)
+MotionReference::~MotionReference()
+{
+ X265_FREE(weightBuffer[0]);
+ X265_FREE(weightBuffer[1]);
+ X265_FREE(weightBuffer[2]);
+}
+
+int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p)
{
- m_reconPic = recPic;
+ reconPic = recPic;
+ numWeightedRows = 0;
lumaStride = recPic->m_stride;
- intptr_t startpad = recPic->m_lumaMarginY * lumaStride + recPic->m_lumaMarginX;
+ chromaStride = recPic->m_strideC;
+ numInterpPlanes = p.subpelRefine > 2 ? 3 : 1; /* is chroma satd possible? */
- /* directly reference the pre-extended integer pel plane */
- fpelPlane = recPic->m_picBuf[0] + startpad;
+ /* directly reference the extended integer pel planes */
+ fpelPlane[0] = recPic->m_picOrg[0];
+ fpelPlane[1] = recPic->m_picOrg[1];
+ fpelPlane[2] = recPic->m_picOrg[2];
isWeighted = false;
- if (w)
+ if (wp)
{
- if (!m_weightBuffer)
+ uint32_t numCUinHeight = (reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
+
+ int marginX = reconPic->m_lumaMarginX;
+ int marginY = reconPic->m_lumaMarginY;
+ intptr_t stride = reconPic->m_stride;
+ int cuHeight = g_maxCUSize;
+
+ for (int c = 0; c < numInterpPlanes; c++)
{
- uint32_t numCUinHeight = (recPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
- size_t padheight = (numCUinHeight * g_maxCUSize) + recPic->m_lumaMarginY * 2;
- m_weightBuffer = X265_MALLOC(pixel, lumaStride * padheight);
- if (!m_weightBuffer)
- return -1;
+ if (c == 1)
+ {
+ marginX = reconPic->m_chromaMarginX;
+ marginY = reconPic->m_chromaMarginY;
+ stride = reconPic->m_strideC;
+ cuHeight >>= reconPic->m_vChromaShift;
+ }
+
+ if (wp[c].bPresentFlag)
+ {
+ if (!weightBuffer[c])
+ {
+ size_t padheight = (numCUinHeight * cuHeight) + marginY * 2;
+ weightBuffer[c] = X265_MALLOC(pixel, stride * padheight);
+ if (!weightBuffer[c])
+ return -1;
+ }
+
+ /* use our buffer which will have weighted pixels written to it */
+ fpelPlane[c] = weightBuffer[c] + marginY * stride + marginX;
+ X265_CHECK(recPic->m_picOrg[c] - recPic->m_picBuf[c] == marginY * stride + marginX, "PicYuv pad calculation mismatch\n");
+
+ w[c].weight = wp[c].inputWeight;
+ w[c].offset = wp[c].inputOffset * (1 << (X265_DEPTH - 8));
+ w[c].shift = wp[c].log2WeightDenom;
+ w[c].round = w[c].shift ? 1 << (w[c].shift - 1) : 0;
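+                /* these feed weight_pp in applyWeight(), which essentially computes
+                 * dst = ((src * weight + round) >> shift) + offset, clipped to pixel range */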
+ }
}
isWeighted = true;
- weight = w->inputWeight;
- offset = w->inputOffset * (1 << (X265_DEPTH - 8));
- shift = w->log2WeightDenom;
- round = shift ? 1 << (shift - 1) : 0;
- m_numWeightedRows = 0;
-
- /* use our buffer which will have weighted pixels written to it */
- fpelPlane = m_weightBuffer + startpad;
}
return 0;
}
-MotionReference::~MotionReference()
-{
- X265_FREE(m_weightBuffer);
-}
-
-void MotionReference::applyWeight(int rows, int numRows)
+void MotionReference::applyWeight(int finishedRows, int maxNumRows)
{
- rows = X265_MIN(rows, numRows);
- if (m_numWeightedRows >= rows)
+ finishedRows = X265_MIN(finishedRows, maxNumRows);
+ if (numWeightedRows >= finishedRows)
return;
- int marginX = m_reconPic->m_lumaMarginX;
- int marginY = m_reconPic->m_lumaMarginY;
- pixel* src = (pixel*)m_reconPic->m_picOrg[0] + (m_numWeightedRows * (int)g_maxCUSize * lumaStride);
- pixel* dst = fpelPlane + ((m_numWeightedRows * (int)g_maxCUSize) * lumaStride);
- int width = m_reconPic->m_picWidth;
- int height = ((rows - m_numWeightedRows) * g_maxCUSize);
- if (rows == numRows)
- height = ((m_reconPic->m_picHeight % g_maxCUSize) ? (m_reconPic->m_picHeight % g_maxCUSize) : g_maxCUSize);
-
- // Computing weighted CU rows
- int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
- int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
- primitives.weight_pp(src, dst, lumaStride, padwidth, height,
- weight, round << correction, shift + correction, offset);
-
- // Extending Left & Right
- primitives.extendRowBorder(dst, lumaStride, width, height, marginX);
-
- // Extending Above
- if (m_numWeightedRows == 0)
+
+ int marginX = reconPic->m_lumaMarginX;
+ int marginY = reconPic->m_lumaMarginY;
+ intptr_t stride = reconPic->m_stride;
+ int width = reconPic->m_picWidth;
+ int height = (finishedRows - numWeightedRows) * g_maxCUSize;
+ if (finishedRows == maxNumRows && (reconPic->m_picHeight % g_maxCUSize))
{
- pixel *pixY = fpelPlane - marginX;
- for (int y = 0; y < marginY; y++)
- memcpy(pixY - (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel));
+ /* the last row may be partial height */
+ height -= g_maxCUSize;
+ height += reconPic->m_picHeight % g_maxCUSize;
}
+ int cuHeight = g_maxCUSize;
- // Extending Bottom
- if (rows == numRows)
+ for (int c = 0; c < numInterpPlanes; c++)
{
- pixel *pixY = fpelPlane - marginX + (m_reconPic->m_picHeight - 1) * lumaStride;
- for (int y = 0; y < marginY; y++)
- memcpy(pixY + (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel));
+ if (c == 1)
+ {
+ marginX = reconPic->m_chromaMarginX;
+ marginY = reconPic->m_chromaMarginY;
+ stride = reconPic->m_strideC;
+ width >>= reconPic->m_hChromaShift;
+ height >>= reconPic->m_vChromaShift;
+ cuHeight >>= reconPic->m_vChromaShift;
+ }
+
+ /* Do not generate weighted predictions if using original picture */
+ if (fpelPlane[c] == reconPic->m_picOrg[c])
+ continue;
+
+ const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
+ pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
+
+ // Computing weighted CU rows
+ int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+ int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
+ primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
+
+ // Extending Left & Right
+ primitives.extendRowBorder(dst, stride, width, height, marginX);
+
+ // Extending Above
+ if (numWeightedRows == 0)
+ {
+ pixel *pixY = fpelPlane[c] - marginX;
+ for (int y = 0; y < marginY; y++)
+ memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
+ }
+
+ // Extending Bottom
+ if (finishedRows == maxNumRows)
+ {
+ int picHeight = reconPic->m_picHeight;
+ if (c) picHeight >>= reconPic->m_vChromaShift;
+ pixel *pixY = fpelPlane[c] - marginX + (picHeight - 1) * stride;
+ for (int y = 0; y < marginY; y++)
+ memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
+ }
}
- m_numWeightedRows = rows;
+
+ numWeightedRows = finishedRows;
}
#define X265_REFERENCE_H
#include "primitives.h"
+#include "picyuv.h"
#include "lowres.h"
#include "mv.h"
namespace x265 {
// private x265 namespace
-class PicYuv;
struct WeightParam;
class MotionReference : public ReferencePlanes
MotionReference();
~MotionReference();
- int init(PicYuv*, WeightParam* w = NULL);
+ int init(PicYuv*, WeightParam* wp, const x265_param& p);
void applyWeight(int rows, int numRows);
- PicYuv* m_reconPic;
- pixel* m_weightBuffer;
- int m_numWeightedRows;
+ pixel* weightBuffer[3];
+ int numInterpPlanes;
+ int numWeightedRows;
protected:
void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
{
Slice* slice = frame->m_encData->m_slice;
-
- int qpCb = Clip3(0, QP_MAX_MAX, qp + slice->m_pps->chromaCbQpOffset);
+ int qpCb = qp;
+ if (m_param->internalCsp == X265_CSP_I420)
+ qpCb = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
+ else
+ qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
m_lumaLambda = x265_lambda2_tab[qp];
m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
m_frame = frame;
{
int x, y;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
- pixel* rec = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
- intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+ pixel* rec = m_frame->m_reconPic->getPlaneAddr(plane, addr);
+ intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
/* Process SAO all units */
void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
{
- intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+ intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
int ctuWidth = g_maxCUSize;
int ctuHeight = g_maxCUSize;
if (!idxY)
{
- pixel* rec = m_frame->m_reconPicYuv->m_picOrg[plane];
+ pixel* rec = m_frame->m_reconPic->m_picOrg[plane];
memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
}
int addr = idxY * m_numCuInWidth;
- pixel* rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr);
+ pixel* rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
for (int i = 0; i < ctuHeight + 1; i++)
{
}
else if (idxX != (m_numCuInWidth - 1))
{
- rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr);
+ rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
for (int i = 0; i < ctuHeight + 1; i++)
{
void SAO::calcSaoStatsCu(int addr, int plane)
{
int x, y;
- CUData* cu = m_frame->m_encData->getPicCTU(addr);
- const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr);
- const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
+ const CUData* cu = m_frame->m_encData->getPicCTU(addr);
+ const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
+ const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr);
const pixel* fenc;
const pixel* rec;
- intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+ intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
int addr = idxX + m_numCuInWidth * idxY;
int x, y;
- CUData* cu = frame->m_encData->getPicCTU(addr);
+ const CUData* cu = frame->m_encData->getPicCTU(addr);
const pixel* fenc;
const pixel* rec;
- intptr_t stride = m_frame->m_reconPicYuv->m_stride;
+ intptr_t stride = m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
{
if (plane == 1)
{
- stride = frame->m_reconPicYuv->m_strideC;
+ stride = frame->m_reconPic->m_strideC;
picWidth >>= m_hChromaShift;
picHeight >>= m_vChromaShift;
ctuWidth >>= m_hChromaShift;
stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
count = m_countPreDblk[addr][plane][SAO_BO];
- const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr);
- const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
+ const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
+ const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr);
fenc = fenc0;
rec = rec0;
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
#endif
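+/* a single flag bit selects between the two AMVP motion vector predictor candidates */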
+#define MVP_IDX_BITS 1
+
ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
m_numLayers = g_log2Size[param.maxCUSize] - 2;
m_rdCost.setPsyRdScale(param.psyRd);
- m_me.setSearchMethod(param.searchMethod);
- m_me.setSubpelRefine(param.subpelRefine);
+ m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
- if (m_param->noiseReduction)
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
ok &= m_quant.allocNoiseReduction(param);
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
void Search::invalidateContexts(int) {}
#endif
-void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
+void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
- uint32_t subdiv = tuDepthL > trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- bool mCodeAll = true;
- const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
-
- if (mCodeAll)
+ if (!(log2TrSize - m_hChromaShift < 2))
{
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
-
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
if (subdiv)
{
- absPartIdxStep >>= 2;
- width >>= 1;
- height >>= 1;
-
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
}
}
-void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
+void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
- if (!cu.getCbf(absPartIdx, ttype, trDepth))
+ if (!cu.getCbf(absPartIdx, ttype, tuDepth))
return;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
return;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
- uint32_t trDepthC = trDepth;
+ uint32_t tuDepthC = tuDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (log2TrSizeC == 1)
- {
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+
+ if (log2TrSizeC < 2)
+ {
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
- if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
}
}
-void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
+void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
uint32_t sizeIdx = log2TrSize - 2;
if (mightSplit)
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = mode.fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
}
else
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
- bCBF = !!numSig << trDepth;
+ bCBF = !!numSig << tuDepth;
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
if (log2TrSize != depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
// code split block
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- uint32_t absPartIdxSub = absPartIdx;
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
if (m_param->bEnableTSkipFast)
- checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
+ checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
Cost splitCost;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
if (checkTransformSkip)
- codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
+ codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
else
- codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
+ codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
if (mightNotSplit && log2TrSize != depthRange[0])
{
m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
// recover transform index and Cbf values
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
// set reconstruction for next intra prediction blocks if full TU prediction won
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.energy += fullCost.energy;
}
-void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
+void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t tuSize = 1 << log2TrSize;
int bTSkip = 0;
uint32_t bCBF = 0;
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
pixel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
}
else
// no residual coded, recon = pred
- primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
if (useTSkip)
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
if (bTSkip)
{
memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
}
else if (checkTransformSkip)
{
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
}
// set reconstruction for next intra prediction blocks
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
}
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
bool bCheckFull = log2TrSize <= depthRange[1];
if (bCheckFull)
{
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
uint32_t stride = mode.fencYuv->m_size;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
- cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
}
else
{
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
/* code split block */
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
}
}
-void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
+void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepth == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
// copy transform coefficients
}
else
{
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
}
}
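+/* helper for 4:2:2 sub-TUs: push each CBF down a level and OR in the combined bit */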
+inline void offsetCBFs(uint8_t subTUCBF[2])
+{
+ uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
+ subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
+ subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
+}
+
/* 4:2:2 post-TU split processing */
-void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
+void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
uint32_t depth = cu.m_cuDepth[0];
- uint32_t fullDepth = depth + trDepth;
+ uint32_t fullDepth = depth + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t trDepthC = trDepth;
if (log2TrSize == 2)
{
- X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
- trDepthC--;
+ X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ ++log2TrSize;
}
- uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
+ uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
// move the CBFs down a level and set the parent CBF
uint8_t subTUCBF[2];
- uint8_t combinedSubTUCBF = 0;
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
+ subTUCBF[0] = cu.getCbf(absPartIdx, ttype, tuDepth);
+ subTUCBF[1] = cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth);
+ offsetCBFs(subTUCBF);
- subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
- combinedSubTUCBF |= subTUCBF[subTU];
- }
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
- const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
-
- cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
- }
+ cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx, tuNumParts);
+ cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
}
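
For 4:2:2 the two vertically stacked chroma sub-TUs each keep their own CBF one level down, while the parent level must record whether either sub-TU coded coefficients; that is what the new offsetCBFs() helper packs into the per-depth CBF bits. A compilable sketch of the behaviour (the helper body is copied from the hunk above, the assertions are illustrative):

    #include <cassert>
    #include <cstdint>

    // copied from the hunk above: shift each sub-TU CBF down one level and
    // OR the combined flag into the parent-level bit
    static inline void offsetCBFs(uint8_t subTUCBF[2])
    {
        uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
        subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
        subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
    }

    int main()
    {
        uint8_t cbf[2] = { 0, 1 }; // only the bottom sub-TU coded coefficients
        offsetCBFs(cbf);
        assert(cbf[0] == 1);       // parent bit set, own sub-TU bit clear
        assert(cbf[1] == 3);       // parent bit and own sub-TU bit set
        return 0;
    }
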
/* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
- splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
+ outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
return outDist;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return 0;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
if (m_bEnableRDOQ)
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
if (checkTransformSkip)
- return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
+ return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
uint32_t qtLayer = log2TrSize - 2;
uint32_t tuSize = 1 << log2TrSizeC;
uint32_t outDist = 0;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
- uint32_t tmpDist;
if (numSig)
{
m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
- outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride));
if (m_rdCost.m_psyRd)
psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
}
return outDist;
}
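
The paired scaleChromaDistCb()/scaleChromaDistCr() calls collapse into a single scaleChromaDist(chromaId, dist). A sketch of what such a unified entry point could look like, assuming the Cb/Cr distortion weights live in an array indexed by plane (the actual member names and fixed-point scaling in rdcost.h may differ):

    #include <cstdint>

    struct RDCostSketch
    {
        // hypothetical: fixed-point (<< 8) distortion weights, [0] = Cb, [1] = Cr
        uint64_t m_chromaDistWeight[2];

        uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
        {
            // callers pass TEXT_CHROMA_U (1) or TEXT_CHROMA_V (2)
            uint64_t scaled = dist * m_chromaDistWeight[plane - 1];
            return (uint32_t)((scaled + 128) >> 8);
        }
    };

Indexing by plane removes the per-call branch on ttype at every distortion accumulation site.
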
/* returns distortion */
-uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = 2;
+ const uint32_t log2TrSizeC = 2;
uint32_t tuSize = 4;
uint32_t qtLayer = log2TrSize - 2;
uint32_t outDist = 0;
ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
- uint32_t sizeIdxC = log2TrSizeC - 2;
+ const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
{
m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else if (useTSkip)
{
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
- tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
if (bTSkip)
{
memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
+ primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
}
- cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
- primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
+ pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
+ primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
outDist += bDist;
psyEnergy += bEnergy;
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
}
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
return outDist;
}
-void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
+void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- if (tuDepthL == trDepth)
+ if (tuDepthL == tuDepth || log2TrSizeC == 2)
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (tuQuad)
- {
- log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
- trDepth--; /* also adjust the number of coeff read */
- }
-
// copy transform coefficients
uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
- uint32_t qtLayer = log2TrSize - 2;
+ uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
}
else
{
- if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
- /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
- else
- {
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
- }
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
}
}
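
The log2TrSizeC == 2 early-out covers the case where luma split below 8x8 but chroma cannot (there are no 2x2 chroma TUs), so the chroma coefficients were written one RQT layer deeper than the current recursion level; the qtLayer adjustment compensates for that. A worked example under assumed defaults (64x64 CTU, 4:2:0, an 8x8 NxN intra CU whose luma splits to 4x4):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint32_t g_maxLog2CUSize = 6;                 // assumed 64x64 CTU
        const uint32_t cuDepth = 3, tuDepth = 0, tuDepthL = 1, hChromaShift = 1;

        uint32_t fullDepth   = cuDepth + tuDepth;
        uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth; // 3: 8x8 luma
        uint32_t log2TrSizeC = log2TrSize - hChromaShift;   // 2: 4x4 chroma

        // recursion stops here even though luma split one level further
        assert(tuDepthL != tuDepth && log2TrSizeC == 2);

        // coefficients were stored by the chroma coder at the deeper level,
        // so the RQT layer is corrected by the depth difference
        uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
        assert(qtLayer == 0);
        return 0;
    }
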
-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
uint32_t stride = mode.fencYuv->m_csize;
const int sizeIdxC = log2TrSizeC - 2;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- uint32_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ uint32_t picStride = m_frame->m_reconPic->m_strideC;
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
}
}
else
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
- splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
+ residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
}
}
intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
m_entropyCoder.store(intraMode.contexts);
intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
updateModeCost(intraMode);
}
-uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
+/* Note that this function does not save the best intra prediction; it must
+ * be generated again later. It only records the best mode in the CU. */
+void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ uint32_t depth = cu.m_cuDepth[0];
+
+ cu.setPartSizeSubParts(SIZE_2Nx2N);
+ cu.setPredModeSubParts(MODE_INTRA);
+
+ const uint32_t initTuDepth = 0;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
+ uint32_t tuSize = 1 << log2TrSize;
+ const uint32_t absPartIdx = 0;
+
+ // Reference sample smoothing
+ initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+
+ const pixel* fenc = intraMode.fencYuv->m_buf[0];
+ uint32_t stride = intraMode.fencYuv->m_size;
+
+ pixel* above = m_refAbove + tuSize - 1;
+ pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+ pixel* left = m_refLeft + tuSize - 1;
+ pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
+ int sad, bsad;
+ uint32_t bits, bbits, mode, bmode;
+ uint64_t cost, bcost;
+
+ // buffers for evaluating all 33 angular modes at once
+ ALIGN_VAR_32(pixel, bufScale[32 * 32]);
+ ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
+ ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
+ int scaleTuSize = tuSize;
+ int scaleStride = stride;
+ int costShift = 0;
+ int sizeIdx = log2TrSize - 2;
+
+ // reserve space in case primitives need to store data in the above or
+ // left buffers; these arrays must outlive the block below because the
+ // above/left pointers are redirected into them when tuSize > 32
+ pixel _above[4 * 32 + 1];
+ pixel _left[4 * 32 + 1];
+
+ if (tuSize > 32)
+ {
+ // origin is 64x64; we scale to 32x32 and set up the required parameters
+ primitives.scale2D_64to32(bufScale, fenc, stride);
+ fenc = bufScale;
+
+ pixel* aboveScale = _above + 2 * 32;
+ pixel* leftScale = _left + 2 * 32;
+ aboveScale[0] = leftScale[0] = above[0];
+ primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
+ primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+
+ scaleTuSize = 32;
+ scaleStride = 32;
+ costShift = 2;
+ sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
+
+ // filtered and unfiltered reference pointers now both point at the scaled above/left buffers
+ above = aboveScale;
+ left = leftScale;
+ aboveFiltered = aboveScale;
+ leftFiltered = leftScale;
+ }
+
+ pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+ int predsize = scaleTuSize * scaleTuSize;
+
+ m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+
+ /* there are three cost tiers for intra modes:
+ * pred[0] - most probable mode, least cost
+ * pred[1], pred[2] - less probable, slightly more cost
+ * non-mpm modes - all cost the same (rbits) */
+ uint64_t mpms;
+ uint32_t preds[3];
+ uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+
+ // DC
+ primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+ bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ bmode = mode = DC_IDX;
+ bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+ pixel* abovePlanar = above;
+ pixel* leftPlanar = left;
+
+ if (tuSize & (8 | 16 | 32))
+ {
+ abovePlanar = aboveFiltered;
+ leftPlanar = leftFiltered;
+ }
+
+ // PLANAR
+ primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ mode = PLANAR_IDX;
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ cost = m_rdCost.calcRdSADCost(sad, bits);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+ // Transpose NxN
+ primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
+
+ primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+
+ bool modeHor;
+ const pixel* cmp;
+ intptr_t srcStride;
+
+#define TRY_ANGLE(angle) \
+ modeHor = angle < 18; \
+ cmp = modeHor ? bufTrans : fenc; \
+ srcStride = modeHor ? scaleTuSize : scaleStride; \
+ sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits)
+
+ if (m_param->bEnableFastIntra)
+ {
+ int asad = 0;
+ uint32_t lowmode, highmode, amode = 5, abits = 0;
+ uint64_t acost = MAX_INT64;
+
+ /* pick the best angle, sampling at a distance of 5 */
+ for (mode = 5; mode < 35; mode += 5)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+ }
+
+ /* refine best angle at distance 2, then distance 1 */
+ for (uint32_t dist = 2; dist >= 1; dist--)
+ {
+ lowmode = amode - dist;
+ highmode = amode + dist;
+
+ X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
+ TRY_ANGLE(lowmode);
+ COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+
+ X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
+ TRY_ANGLE(highmode);
+ COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ }
+
+ if (amode == 33)
+ {
+ TRY_ANGLE(34);
+ COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+ }
+
+ COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
+ }
+ else // calculate and search all intra prediction angles for lowest cost
+ {
+ for (mode = 2; mode < 35; mode++)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ }
+ }
+
+ cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
+ intraMode.initCosts();
+ intraMode.totalBits = bbits;
+ intraMode.distortion = bsad;
+ intraMode.sa8dCost = bcost;
+ intraMode.sa8dBits = bbits;
+}
+
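With --fast-intra the angular search above probes at most 11 of the 33 directional modes: a coarse pass every fifth angle, refinement of the winner at a distance of 2 and then 1, plus the mode-34 edge case. A behavioral sketch of that schedule, with costOf() standing in for the TRY_ANGLE cost (SA8D plus mode bits); the real loop also tracks the winning sad/bits pair via COPY4_IF_LT:

    #include <cstdint>
    #include <functional>

    // coarse-to-fine search over angular modes 2..34 (sketch)
    static uint32_t fastAngularSearch(const std::function<uint64_t(uint32_t)>& costOf)
    {
        uint32_t bestMode = 5;
        uint64_t bestCost = costOf(5);
        for (uint32_t mode = 10; mode < 35; mode += 5)      // coarse pass: 5,10,...,30
        {
            uint64_t c = costOf(mode);
            if (c < bestCost) { bestCost = c; bestMode = mode; }
        }
        for (uint32_t dist = 2; dist >= 1; dist--)          // refine at +/-2, then +/-1
        {
            uint32_t lowMode = bestMode - dist, highMode = bestMode + dist;
            uint64_t c = costOf(lowMode);
            if (c < bestCost) { bestCost = c; bestMode = lowMode; }
            c = costOf(highMode);
            if (c < bestCost) { bestCost = c; bestMode = highMode; }
        }
        if (bestMode == 33)                                 // mode 34 only reachable here
        {
            if (costOf(34) < bestCost)
                bestMode = 34;
        }
        return bestMode;
    }
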
+void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ Yuv* reconYuv = &intraMode.reconYuv;
+ const Yuv* fencYuv = intraMode.fencYuv;
+
+ X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
+ X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+ m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
+
+ Cost icosts;
+ codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
+ extractIntraResultQT(cu, *reconYuv, 0, 0);
+
+ intraMode.distortion = icosts.distortion;
+ intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
+
+ m_entropyCoder.resetBits();
+ if (m_slice->m_pps->bTransquantBypassEnabled)
+ m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
+ m_entropyCoder.codeSkipFlag(cu, 0);
+ m_entropyCoder.codePredMode(cu.m_predMode[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
+ m_entropyCoder.codePredInfo(cu, 0);
+ intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+
+ bool bCodeDQP = m_slice->m_pps->bUseDQP;
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
+
+ intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
+ intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
+ if (m_rdCost.m_psyRd)
+ intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+
+ m_entropyCoder.store(intraMode.contexts);
+ updateModeCost(intraMode);
+}
+
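Both intra paths end by charging mvBits for the mode/partition signalling, deriving coeffBits as totalBits minus mvBits, and folding the optional psy energy into the mode cost. A hedged sketch of the updateModeCost() combination; the exact fixed-point shifts in rdcost.h differ, the >> 8 scaling here is illustrative only:

    #include <cstdint>

    // distortion plus lambda-scaled bits, plus a psy-energy term when
    // psy-rd is enabled (hypothetical scaling)
    static uint64_t modeRdCost(uint32_t distortion, uint32_t bits, uint32_t psyEnergy,
                               bool bPsyRd, uint64_t lambda2, uint64_t psyScale)
    {
        uint64_t cost = distortion + ((bits * lambda2 + 128) >> 8);
        if (bPsyRd)
            cost += (psyScale * psyEnergy + 128) >> 8;
        return cost;
    }
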
+uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
const Yuv* fencYuv = intraMode.fencYuv;
uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
- uint32_t numPU = 1 << (2 * initTrDepth);
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+ uint32_t numPU = 1 << (2 * initTuDepth);
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
uint32_t qNumParts = cuGeom.numPartitions >> 2;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
uint32_t totalDistortion = 0;
- int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
+ int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
// loop over partitions
- for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
+ for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
if (sharedModes)
- bmode = sharedModes[pu];
+ bmode = sharedModes[puIdx];
else
{
// Reference sample smoothing
- initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
+ initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
// determine set of modes to be tested (using prediction signal only)
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
- pixel *above = m_refAbove + tuSize - 1;
- pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel *left = m_refLeft + tuSize - 1;
- pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
+ pixel* above = m_refAbove + tuSize - 1;
+ pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+ pixel* left = m_refLeft + tuSize - 1;
+ pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
// buffers for evaluating all 33 angular modes at once
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
if (tuSize > 32)
{
- pixel *aboveScale = _above + 2 * 32;
- pixel *leftScale = _left + 2 * 32;
+ pixel* aboveScale = _above + 2 * 32;
+ pixel* leftScale = _left + 2 * 32;
// origin is 64x64; we scale to 32x32 and set up the required parameters
primitives.scale2D_64to32(bufScale, fenc, stride);
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR
- pixel *abovePlanar = above;
- pixel *leftPlanar = left;
+ pixel* abovePlanar = above;
+ pixel* leftPlanar = left;
if (tuSize >= 8 && tuSize <= 32)
{
abovePlanar = aboveFiltered;
for (int mode = 2; mode < 35; mode++)
{
bool modeHor = (mode < 18);
- pixel *cmp = (modeHor ? buf_trans : fenc);
+ const pixel* cmp = (modeHor ? buf_trans : fenc);
intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
* levels and at higher depths */
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
- int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
+ int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
if (candCostList[i] == MAX_INT64)
break;
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
}
/* remeasure best mode, allowing TU splits */
- cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
- extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
+ extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// set reconstruction for next intra prediction blocks
- if (pu != numPU - 1)
+ if (puIdx != numPU - 1)
{
/* This has important implications for parallelism and RDO. It is writing intermediate results into the
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
* that the contexts should be tracked through each PU */
- pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- uint32_t dststride = m_frame->m_reconPicYuv->m_stride;
- pixel* src = reconYuv->getLumaAddr(absPartIdx);
+ pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ uint32_t dststride = m_frame->m_reconPic->m_stride;
+ const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
- primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+ primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
}
}
if (numPU > 1)
{
uint32_t combCbfY = 0;
- uint32_t partIdx = 0;
- for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
- combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+ combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
cu.m_cbf[0][offs] |= combCbfY;
uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
uint32_t tuSize = 1 << log2TrSizeC;
int32_t scaleTuSize = tuSize;
+ uint32_t tuDepth = 0;
int32_t costShift = 0;
if (tuSize > 32)
{
scaleTuSize = 32;
+ tuDepth = 1;
costShift = 2;
log2TrSizeC = 5;
}
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
cu.getAllowedChromaDir(0, modeList);
// check chroma modes
uint64_t cost = 0;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- pixel* fenc = fencYuv->m_buf[chromaId];
+ const pixel* fenc = fencYuv->m_buf[chromaId];
pixel* pred = predYuv->m_buf[chromaId];
pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
Yuv& reconYuv = intraMode.reconYuv;
uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
uint32_t totalDistortion = 0;
int part = partitionFromLog2Size(log2TrSize);
- TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
+ TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
uint32_t bestMode = 0;
uint32_t bestDist = 0;
// restore context models
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
uint32_t psyEnergy = 0;
- uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
+ uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
if (m_slice->m_pps->bTransformSkipEnabled)
m_entropyCoder.load(m_rqt[depth].cur);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!(absPartIdxC & (qtNumParts - 1)))
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!(absPartIdxC & (qNumParts - 1)))
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
- codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
+ codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
bestCost = cost;
bestDist = dist;
bestMode = modeList[mode];
- extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
if (!tuIterator.isLastSection())
{
uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
- uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
- pixel *src, *dst;
+ uint32_t dststride = m_frame->m_reconPic->m_strideC;
+ const pixel* src;
+ pixel* dst;
- dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCbAddr(absPartIdxC);
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
- dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCrAddr(absPartIdxC);
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
}
memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
- cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
totalDistortion += bestDist;
}
while (tuIterator.isNextSection());
- if (initTrDepth != 0)
+ if (initTuDepth != 0)
{
uint32_t combCbfU = 0;
uint32_t combCbfV = 0;
- uint32_t partIdx = 0;
- for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
+ uint32_t qNumParts = tuIterator.absPartIdxStep;
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
- combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
+ combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
+ combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
}
- for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
cu.m_cbf[1][offs] |= combCbfU;
cu.m_cbf[2][offs] |= combCbfV;
continue;
cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
- cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
+ cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
- cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
+ cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(tempYuv, true, false);
+ motionCompensation(tempYuv, true, m_me.bChromaSATD);
+
uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+ if (m_me.bChromaSATD)
+ costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
+
uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
costCand = costCand + m_rdCost.getCost(bitsCand);
if (costCand < outCost)
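
Merge estimation now optionally folds chroma distortion into the candidate cost: when the ME engine was configured with bChromaSATD, motion compensation also produces the chroma planes and bufChromaSATD() is added on top of the luma SATD. A minimal sketch of the cost assembly (all inputs assumed precomputed by the caller; names are hypothetical):

    #include <cstdint>

    // sketch: merge-candidate cost after the chroma-SATD change
    static uint32_t mergeCandCost(uint32_t lumaSatd, uint32_t chromaSatd,
                                  bool bChromaSATD, uint32_t bitCost)
    {
        uint32_t cost = lumaSatd;
        if (bChromaSATD)        // m_me.bufChromaSATD() result in the diff
            cost += chromaSatd;
        return cost + bitCost;  // m_rdCost.getCost(bitsCand) analogue
    }
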
/* this function assumes the caller has configured its MotionEstimation engine with the
* correct source plane and source PU, and has called prepMotionCompensation() to set
* m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
{
uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
- MV amvpCand[AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
- int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
+ int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ MotionData* bestME = interMode.bestME[part];
+
+ if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
{
- MV mvCand = amvpCand[i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[list][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
+ interMode.cu.clipMv(mvCand);
- Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
- predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
+ predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
- setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
+ setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
/* tie goes to the smallest ref ID, just like --no-pme */
- ScopedLock _lock(master.m_outputLock);
- if (cost < master.m_bestME[list].cost ||
- (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
+ ScopedLock _lock(master.m_meLock);
+ if (cost < bestME[list].cost ||
+ (cost == bestME[list].cost && ref < bestME[list].ref))
{
- master.m_bestME[list].mv = outmv;
- master.m_bestME[list].mvp = mvp;
- master.m_bestME[list].mvpIdx = mvpIdx;
- master.m_bestME[list].ref = ref;
- master.m_bestME[list].cost = cost;
- master.m_bestME[list].bits = bits;
+ bestME[list].mv = outmv;
+ bestME[list].mvp = mvp;
+ bestME[list].mvpIdx = mvpIdx;
+ bestME[list].ref = ref;
+ bestME[list].cost = cost;
+ bestME[list].bits = bits;
}
}
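
The AMVP pre-selection loop is now guarded by amvpCand[0] != amvpCand[1]: when the two candidates are identical there is nothing to choose between, so the per-candidate interpolation-plus-SAD probe is skipped and mvpIdx stays 0. A sketch of that shortcut, with sadOf() as a hypothetical stand-in for the predInterLumaPixel() + bufSAD() probe (the frame-parallel merange clamp is omitted):

    #include <cstdint>

    struct MV
    {
        int16_t x, y;
        bool operator!=(const MV& o) const { return x != o.x || y != o.y; }
    };

    // sketch of the guarded MVP pre-selection (AMVP_NUM_CANDS == 2)
    template <typename SadFn>
    static int selectMvpIdx(const MV amvpCand[2], SadFn sadOf)
    {
        int mvpIdx = 0;
        if (amvpCand[0] != amvpCand[1]) // identical candidates: nothing to probe
        {
            uint32_t cost0 = sadOf(amvpCand[0]);
            uint32_t cost1 = sadOf(amvpCand[1]);
            if (cost1 < cost0)
                mvpIdx = 1;
        }
        return mvpIdx;
    }
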
/* search for the best inter prediction candidate
* returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
{
CUData& cu = interMode.cu;
Yuv* predYuv = &interMode.predYuv;
- MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
const Slice *slice = m_slice;
- PicYuv* fencPic = m_frame->m_origPicYuv;
int numPart = cu.getNumPartInter();
int numPredDir = slice->isInterP() ? 1 : 2;
const int* numRefIdx = slice->m_numRefIdx;
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
+ MotionData* bestME = interMode.bestME[puIdx];
+
/* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
initMotionCompensation(cu, cuGeom, puIdx);
- pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
- m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
+ m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
uint32_t mrgCost = MAX_UINT;
- /* find best cost merge candidate */
- if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
+ /* find the best-cost merge candidate; note: 2Nx2N merge and bidir are handled as separate modes */
+ if (cu.m_partSize[0] != SIZE_2Nx2N)
{
merge.absPartIdx = m_puAbsPartIdx;
merge.width = m_puWidth;
merge.height = m_puHeight;
mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
- if (bMergeOnly && cu.m_log2CUSize[0] > 3)
+ if (bMergeOnly)
{
if (mrgCost == MAX_UINT)
{
totalmebits += merge.bits;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
continue;
}
}
- MotionData bidir[2];
- uint32_t bidirCost = MAX_UINT;
- int bidirBits = 0;
-
- m_bestME[0].cost = MAX_UINT;
- m_bestME[1].cost = MAX_UINT;
+ bestME[0].cost = MAX_UINT;
+ bestME[1].cost = MAX_UINT;
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
- if (bDistributed)
+ /* Uni-directional prediction */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
{
- m_curMECu = &cu;
- m_curGeom = &cuGeom;
+ for (int l = 0; l < numPredDir; l++)
+ {
+ int ref = bestME[l].ref;
+ uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
+ bits += getTUBits(ref, numRefIdx[l]);
+
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+
+ // Pick the best possible MVP from AMVP candidates based on least residual
+ int mvpIdx = 0;
+ int merange = m_param->searchRange;
+
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
+ {
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
+
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
+
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
+ }
+ }
+
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
+
+ int satdCost;
+ setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
+
+ /* Get total cost of partition, but only include MV bit cost once */
+ bits += m_me.bitcost(outmv);
+ uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
+
+ /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+
+ if (cost < bestME[l].cost)
+ {
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
+ }
+ }
+ }
+ else if (bDistributed)
+ {
+ m_meLock.acquire();
+ m_curInterMode = &interMode;
+ m_curGeom = &cuGeom;
m_curPart = puIdx;
m_totalNumME = 0;
m_numAcquiredME = 1;
m_numCompletedME = 0;
- /* this worker might already be enqueued for pmode, so other threads
- * might be looking at the ME job counts at any time, do these sets
- * in a safe order */
m_totalNumME = numRefIdx[0] + numRefIdx[1];
+ m_meLock.release();
if (!m_bJobsQueued)
JobProvider::enqueue();
for (int i = 1; i < m_totalNumME; i++)
m_pool->pokeIdleThread();
- while (m_totalNumME > m_numAcquiredME)
+ do
{
- int id = ATOMIC_INC(&m_numAcquiredME);
- if (m_totalNumME >= id)
+ m_meLock.acquire();
+ if (m_totalNumME > m_numAcquiredME)
{
- id -= 1;
+ int id = m_numAcquiredME++;
+ m_meLock.release();
+
if (id < numRefIdx[0])
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
else
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
- m_meCompletionEvent.trigger();
+ m_meLock.acquire();
+ m_numCompletedME++;
+ m_meLock.release();
}
+ else
+ m_meLock.release();
}
+ while (m_totalNumME > m_numAcquiredME);
+
if (!m_bJobsQueued)
JobProvider::dequeue();
/* we saved L0-0 for ourselves */
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
+
+ m_meLock.acquire();
+ if (++m_numCompletedME == m_totalNumME)
m_meCompletionEvent.trigger();
+ m_meLock.release();
m_meCompletionEvent.wait();
}
else
{
- // Uni-directional prediction
for (int l = 0; l < numPredDir; l++)
{
for (int ref = 0; ref < numRefIdx[l]; ref++)
uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[l]);
- int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
// Pick the best possible MVP from AMVP candidates based on least residual
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
{
- MV mvCand = amvpCand[l][ref][i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
- predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
setSearchRange(cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
- if (cost < m_bestME[l].cost)
+ if (cost < bestME[l].cost)
{
- m_bestME[l].mv = outmv;
- m_bestME[l].mvp = mvp;
- m_bestME[l].mvpIdx = mvpIdx;
- m_bestME[l].ref = ref;
- m_bestME[l].cost = cost;
- m_bestME[l].bits = bits;
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].ref = ref;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
}
}
}
}
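
The distributed-ME bookkeeping above trades the ATOMIC_INC counters for plain integers guarded by m_meLock: a worker either claims job id m_numAcquiredME++ under the lock or observes that every reference already has an owner and backs off. A reduced sketch of that acquisition loop (the worker framework, the reserved L0-0 job, and the completion event are assumed):

    #include <mutex>

    struct DistributedME
    {
        std::mutex meLock;                   // stands in for m_meLock
        int totalNumME = 0, numAcquiredME = 0, numCompletedME = 0;

        void workerLoop()
        {
            for (;;)
            {
                meLock.lock();
                if (totalNumME <= numAcquiredME)
                {
                    meLock.unlock();         // all jobs claimed, back off
                    return;
                }
                int id = numAcquiredME++;    // claim one reference index
                meLock.unlock();

                doOneME(id);                 // singleMotionEstimation() analogue

                meLock.lock();
                numCompletedME++;
                meLock.unlock();
            }
        }

        void doOneME(int) {}                 // stub for the sketch
    };
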
/* Bi-directional prediction */
- if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
+ MotionData bidir[2];
+ uint32_t bidirCost = MAX_UINT;
+ int bidirBits = 0;
+
+ if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
+ cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
+ bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{
- bidir[0] = m_bestME[0];
- bidir[1] = m_bestME[1];
+ bidir[0] = bestME[0];
+ bidir[1] = bestME[1];
+
+ int satdCost;
- /* Generate reference subpels */
- PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
- PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
- Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
- predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
- pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
- int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
+ PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
+ Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ /* Generate reference subpels */
+ predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
+ predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
+
+ primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
+ bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
+
+ bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
- bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
+ bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{
/* Do not try zero MV if unidir motion predictors are beyond
mvmin <<= 2;
mvmax <<= 2;
- bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
- bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
}
if (bTryZero)
{
- // coincident blocks of the two reference pictures
- pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- intptr_t refStride = slice->m_mref[0][0].lumaStride;
+ /* coincident blocks of the two reference pictures */
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
- satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- MV mvp0 = m_bestME[0].mvp;
- int mvpIdx0 = m_bestME[0].mvpIdx;
- uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ intptr_t refStride = slice->m_mref[0][0].lumaStride;
- MV mvp1 = m_bestME[1].mvp;
- int mvpIdx1 = m_bestME[1].mvpIdx;
- uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+ primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
- uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
+ MV mvp0 = bestME[0].mvp;
+ int mvpIdx0 = bestME[0].mvpIdx;
+ uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
- if (bDistributed)
- {
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
- }
+ MV mvp1 = bestME[1].mvp;
+ int mvpIdx1 = bestME[1].mvpIdx;
+ uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+
+ uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
- checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
- checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
+ checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
+ checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
if (cost < bidirCost)
{
}
/* select best option and store into CU */
- if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
+ if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
{
cu.m_mergeFlag[m_puAbsPartIdx] = true;
cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
totalmebits += merge.bits;
}
- else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
+ else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
lastMode = 2;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
}
- else if (m_bestME[0].cost <= m_bestME[1].cost)
+ else if (bestME[0].cost <= bestME[1].cost)
{
lastMode = 0;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
- cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
- cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
+ cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+ cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[0].bits;
+ totalmebits += bestME[0].bits;
}
else
{
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
- cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
- cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
+ cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+ cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[1].bits;
+ totalmebits += bestME[1].bits;
}
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
}
interMode.sa8dBits += totalmebits;
// No residual coding : SKIP mode
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
// Chroma
part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
- interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
if (!cu.m_tqBypass[0])
{
uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
- cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
- cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
/* Consider the RD cost of not signaling any residual */
m_entropyCoder.load(m_rqt[depth].cur);
uint32_t coeffBits, bits;
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
{
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
/* Merge/Skip */
m_entropyCoder.resetBits();
uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
bits = m_entropyCoder.getNumberOfWrittenBits();
coeffBits = bits - mvBits;
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
- bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
if (m_rdCost.m_psyRd)
interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
updateModeCost(interMode);
}
-void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
-{
- CUData& cu = mode.cu;
-
- m_quant.setQPforQuant(mode.cu);
-
- if (cu.m_predMode[0] == MODE_INTER)
- {
- uint32_t tuDepthRange[2];
- cu.getInterTUQtDepthRange(tuDepthRange, 0);
-
- residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
- if (cu.getQtRootCbf(0))
- mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
- else
- {
- mode.reconYuv.copyFromYuv(mode.predYuv);
- if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
- cu.setSkipFlagSubParts(true);
- }
- }
- else if (cu.m_predMode[0] == MODE_INTRA)
- {
- uint32_t tuDepthRange[2];
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
- residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
- getBestIntraModeChroma(mode, cuGeom);
- residualQTIntraChroma(mode, cuGeom, 0, 0);
- mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
- }
-}
-
-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
uint32_t tuDepth = depth - cu.m_cuDepth[0];
bool bCheckFull = log2TrSize <= depthRange[1];
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+ if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
bCheckFull = false;
if (bCheckFull)
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
const Yuv* fencYuv = mode.fencYuv;
- int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = resiYuv.m_size;
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSigY)
cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
- pixel* fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
+ const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
if (numSigU)
{
}
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
- pixel* fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
+ const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
if (numSigV)
{
{
X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
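+ // number of 4x4 partition units per quadrant of the split TU ('*' binds tighter than '<<')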
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; i++)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
+ for (uint32_t i = 0; i < 4 * qNumParts; i++)
{
cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
}
}
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
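+/* Estimated RD cost of signaling cbf = 0 for one component: the current
+ * distortion plus the estimated bits of a zero cbf flag (psy-rd aware) */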
+uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
+{
+ uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
+
+ if (m_rdCost.m_psyRd)
+ return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
+ else
+ return m_rdCost.calcRdCost(dist, nullBits);
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
uint32_t log2TrSize = g_maxLog2CUSize - depth;
bool bCheckSplit = log2TrSize > depthRange[0];
bool bCheckFull = log2TrSize <= depthRange[1];
+ bool bSplitPresentFlag = bCheckSplit && bCheckFull;
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
+ if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
bCheckFull = false;
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSizeC < 2)
{
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
// code full block
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
- uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
- int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
+ int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
-
- uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
-
- if (bCodeChroma)
- {
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
-
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-
- cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
- m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
-
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
- resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
- cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
-
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
- if (cbfFlag[chromaId][tuIterator.section])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-
- uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ if (bSplitPresentFlag && log2TrSize > depthRange[0])
+ m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
+ fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsPrev = newBits;
- }
- while (tuIterator.isNextSection());
- }
- }
+ // The luma cbf flag is no longer coded here; its CABAC context differs per depth,
+ // so for analysis it is valid to code the coefficients first and the cbfs afterwards.
+// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
+ if (cbfFlag[TEXT_LUMA][0])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- const uint32_t numCoeffY = 1 << (log2TrSize * 2);
- const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
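+ // luma coefficient bits alone: bits written so far minus the subdiv-flag bits already counted in fullCost.bits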
+ uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
if (m_rdCost.m_psyRd)
psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
- int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
if (cbfFlag[TEXT_LUMA][0])
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
+ // Non-zero cost calculation for luma; this is an approximation.
+ // The correct cbf is coded later, after comparison against the null cost.
const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
- uint32_t nonZeroPsyEnergyY = 0;
+ uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+ uint32_t nonZeroPsyEnergyY = 0;
+ uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
+ {
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+ singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+ }
+ else
+ singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
if (cu.m_tqBypass[0])
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
else
{
- uint64_t singleCostY = 0;
- if (m_rdCost.m_psyRd)
- singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
- else
- singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostY = 0;
- if (m_rdCost.m_psyRd)
- nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
+ // Zero-cost (cbf = 0) calculation for luma; like the initial cost calculation,
+ // this is an approximation. Previously the bit counter was reset and a zero cbf
+ // was coded; now the zero cbf is estimated without writing to the bitstream,
+ // keeping m_fracBits unchanged. The same applies to chroma.
+ uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+
if (nullCostY < singleCostY)
{
cbfFlag[TEXT_LUMA][0] = 0;
+ singleBits[TEXT_LUMA][0] = 0;
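+ // cbf is forced to zero, so clear the reconstructed residual as well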
+ primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
#endif
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = nullCostY;
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
else
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = singleCostY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
}
}
- else if (checkTransformSkipY)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
+ if (checkTransformSkipY)
+ minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+ primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
- singleDistComp[TEXT_LUMA][0] = distY;
- singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
- if (!cbfFlag[TEXT_LUMA][0])
- primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t distC = 0, psyEnergyC = 0;
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ do
+ {
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+ if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+ m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- if (cbfFlag[chromaId][tuIterator.section])
- {
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
- log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
- uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
- const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
- uint32_t nonZeroPsyEnergyC = 0;
- if (m_rdCost.m_psyRd)
- nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-
- if (cu.m_tqBypass[0])
- {
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
- }
- else
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
+ resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+ numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+ cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+
+ // Chroma cbf flags are no longer coded here (see the luma note above).
+// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
+ if (cbfFlag[chromaId][tuIterator.section])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+ uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ singleBitsPrev = newBits;
+
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+
+ if (cbfFlag[chromaId][tuIterator.section])
{
- uint64_t singleCostC = 0;
+ m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
+ log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
+
+ // Non-zero cost calculation for chroma, same approach as for luma; this is an approximation.
+ // The correct cbf is coded later, after comparison against the null cost.
+ uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+ uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+ uint32_t nonZeroPsyEnergyC = 0;
+ uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ {
+ nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ }
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostC = 0;
- if (m_rdCost.m_psyRd)
- nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+
+ if (cu.m_tqBypass[0])
+ {
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ }
else
- nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
- if (nullCostC < singleCostC)
{
- cbfFlag[chromaId][tuIterator.section] = 0;
+ // Zero-cost (cbf = 0) calculation for chroma; this is an approximation.
+ uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
+
+ if (nullCostC < singleCostC)
+ {
+ cbfFlag[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
+ primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = nullCostC;
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
else
{
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = singleCostC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
}
}
}
- else if (checkTransformSkipC)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
- else
- minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
- }
-
- singleDistComp[chromaId][tuIterator.section] = distC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
-
- if (!cbfFlag[chromaId][tuIterator.section])
+ if (checkTransformSkipC)
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
+ }
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
if (numSigTSkipY)
{
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
+ m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
else
{
- singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
- singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
bestTransformMode[TEXT_LUMA][0] = 1;
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
- primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
+ primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
m_entropyCoder.resetBits();
- singleBitsComp[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
if (numSigTSkipC)
{
- m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
- singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
+ nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
}
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
else
{
- singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
bestTransformMode[chromaId][tuIterator.section] = 1;
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
- primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
+ primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
}
+ // Previously both cbfs and coefficients were coded here, after the distortion was
+ // computed above. Now only the cbfs are coded, since the coefficients were already
+ // coded above; their bit cost was collected and added to the cbf bit count. Testing
+ // shows the coding order makes no difference, though it is an open question whether
+ // the original context should be loaded, as is done below.
m_entropyCoder.load(m_rqt[depth].rqtRoot);
-
m_entropyCoder.resetBits();
- if (log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
-
+ // Encode cbf flags
if (bCodeChroma)
{
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
if (!splitIntoSubTUs)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
else
{
offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
}
}
}
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
- if (bCodeChroma)
- {
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- if (!splitIntoSubTUs)
- {
- if (cbfFlag[chromaId][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
- }
- else
- {
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- if (cbfFlag[chromaId][subTU])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
- }
- }
- }
+ uint32_t coeffBits = singleBits[TEXT_LUMA][0];
+ for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
+ {
+ coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
+ coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.distortion += singleDistComp[TEXT_LUMA][0];
- fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
+ // In split mode only coeffBits are needed, because chroma cbfs are coded differently
+ // from luma: when any of the four split blocks has a non-zero cbf, a cbf of 1 is
+ // coded first, followed by each split block's individual cbf value. This is not
+ // known until all four split blocks have been analyzed, so only the individual
+ // coefficient bits are collected here.
+ fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+
+ fullCost.distortion += singleDist[TEXT_LUMA][0];
+ fullCost.energy += singlePsyEnergy[TEXT_LUMA][0]; // TODO: check whether chroma psy energy should also be added
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
- fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
- fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
if (m_rdCost.m_psyRd)
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else
}
Cost splitCost;
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+ {
+ // The subdiv flag can be coded at the start of analysis of the split blocks.
+ m_entropyCoder.resetBits();
+ m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+ splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ }
+
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; ++i)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
+ for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
+ // Previously both cbfs and coefficients were coded here for the split blocks. Since
+ // the coefficient bits of each individual block have already been collected, only
+ // the cbf values are coded now. As noted above, chroma cbfs are coded differently
+ // from luma. One open question remains: the coefficients may be coded in the context
+ // of, say, depth 2 while the cbfs are coded in the context of depth 0.
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
- encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
-
- splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
+ uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+ splitCost.bits += splitCbfBits;
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
-
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
+ if (!splitIntoSubTUs)
{
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
-
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
X265_CHECK(bCheckFull, "check-full must be set\n");
if (bCodeChroma)
{
- uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
-
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (!splitIntoSubTUs)
{
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
- {
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
- if (splitIntoSubTUs)
- {
- uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
- cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
- else
- cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
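+ // 4:2:2: fold the combined cbf of both sub-TUs into each sub-TU's cbf before storing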
+ offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
+ offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
outCosts.energy += fullCost.energy;
}
-void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
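+/* Recursively code the cbf flags of an inter residual quad-tree; coefficient
+ * bits are accounted for separately during analysis */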
+void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
{
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
- X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
+ X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
- const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
- const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
- const bool bSubdiv = curTuDepth != tuDepth;
+ const uint32_t tuDepth = depth - cu.m_cuDepth[0];
+ const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx];
const uint32_t log2TrSize = g_maxLog2CUSize - depth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ if (!(log2TrSize - m_hChromaShift < 2))
+ {
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
+ }
+ else
+ {
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
+ }
- if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
+ if (!bSubdiv)
+ {
+ m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
+ }
+ else
+ {
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
+ }
+}
- bool mCodeAll = true;
- uint32_t trWidthC = 1 << log2TrSizeC;
- uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
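+/* Code the coefficients of one component of the residual quad-tree; cbf and
+ * subdiv flags are handled by codeInterSubdivCbfQT() */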
+void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
+{
+ X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
+ X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");
- const uint32_t numPels = trWidthC * trHeightC;
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
+ const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
+ const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const bool bSubdiv = curTuDepth != tuDepth;
+ const uint32_t log2TrSize = g_maxLog2CUSize - depth;
- if (bSubdivAndCbf)
+ if (bSubdiv)
{
- const bool bFirstCbfOfCU = curTuDepth == 0;
- if (bFirstCbfOfCU || mCodeAll)
- {
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
- }
- else
+ if (cu.getCbf(absPartIdx, ttype, curTuDepth))
{
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
}
+ return;
}
-
- if (!bSubdiv)
+ else
{
+ const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+
// Luma
const uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
// Chroma
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSize == 2 && m_csp != X265_CSP_I444)
{
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC++;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
- if (bSubdivAndCbf)
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
- else
+ if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+
+ if (bCodeChroma)
{
- if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
+ coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
- if (bCodeChroma)
+ if (!splitIntoSubTUs)
{
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
- coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
-
- if (!splitIntoSubTUs)
+ if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
+ if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ }
+ else
+ {
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ uint32_t subTUSize = 1 << (log2TrSizeC * 2);
+ if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
{
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
}
- else
+ if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
{
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
- }
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
- }
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
}
}
}
}
- else
- {
- if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
- {
- const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; ++i)
- encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
- }
- }
}
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
const uint32_t curTrMode = depth - cu.m_cuDepth[0];
const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const uint32_t log2TrSize = g_maxLog2CUSize - depth;
if (curTrMode < tuDepth)
{
- uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
return;
}
- const uint32_t log2TrSize = g_maxLog2CUSize - depth;
const uint32_t qtLayer = log2TrSize - 2;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
#include "entropy.h"
#include "motion.h"
-#define MVP_IDX_BITS 1
-#define NUM_LAYERS 4
-
namespace x265 {
// private namespace
Yuv bidirPredYuv[2];
};
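+/* per-list result of motion estimation for one PU; moved out of Search so a
+ * Mode can carry the best unidir candidate for each inter partition */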
+struct MotionData
+{
+ MV mv;
+ MV mvp;
+ int mvpIdx;
+ int ref;
+ uint32_t cost;
+ int bits;
+ bool costZero;
+};
+
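+/* all state for one candidate coding mode of a CU: its CUData, prediction and
+ * reconstruction buffers, saved entropy contexts, and accumulated cost totals */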
+struct Mode
+{
+ CUData cu;
+ const Yuv* fencYuv;
+ Yuv predYuv;
+ Yuv reconYuv;
+ Entropy contexts;
+
+ enum { MAX_INTER_PARTS = 2 };
+
+ MotionData bestME[MAX_INTER_PARTS][2];
+ MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
+
+ uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits)
+ uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits)
+ uint32_t sa8dBits; // signal bits used in sa8dCost calculation
+ uint32_t psyEnergy; // sum of partition psycho-visual energy difference
+ uint32_t distortion; // sum of partition SSE distortion
+ uint32_t totalBits; // sum of partition bits (mv + coeff)
+ uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
+ uint32_t coeffBits; // Texture bits (DCT Coeffs)
+
+ void initCosts()
+ {
+ rdCost = 0;
+ sa8dCost = 0;
+ sa8dBits = 0;
+ psyEnergy = 0;
+ distortion = 0;
+ totalBits = 0;
+ mvBits = 0;
+ coeffBits = 0;
+ }
+
+ void addSubCosts(const Mode& subMode)
+ {
+ rdCost += subMode.rdCost;
+ sa8dCost += subMode.sa8dCost;
+ sa8dBits += subMode.sa8dBits;
+ psyEnergy += subMode.psyEnergy;
+ distortion += subMode.distortion;
+ totalBits += subMode.totalBits;
+ mvBits += subMode.mvBits;
+ coeffBits += subMode.coeffBits;
+ }
+};
+
inline int getTUBits(int idx, int numIdx)
{
return idx + (idx < numIdx - 1);
uint32_t m_numLayers;
uint32_t m_refLagPixels;
- struct Mode
- {
- CUData cu;
- const Yuv* fencYuv;
- Yuv predYuv;
- Yuv reconYuv;
- Entropy contexts;
-
- uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits)
- uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits)
- uint32_t sa8dBits; // signal bits used in sa8dCost calculation
- uint32_t psyEnergy; // sum of partition psycho-visual energy difference
- uint32_t distortion; // sum of partition SSE distortion
- uint32_t totalBits; // sum of partition bits (mv + coeff)
- uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
- uint32_t coeffBits; // Texture bits (DCT Coeffs)
-
- void initCosts()
- {
- rdCost = 0;
- sa8dCost = 0;
- sa8dBits = 0;
- psyEnergy = 0;
- distortion = 0;
- totalBits = 0;
- mvBits = 0;
- coeffBits = 0;
- }
-
- void addSubCosts(const Mode& subMode)
- {
- rdCost += subMode.rdCost;
- sa8dCost += subMode.sa8dCost;
- sa8dBits += subMode.sa8dBits;
- psyEnergy += subMode.psyEnergy;
- distortion += subMode.distortion;
- totalBits += subMode.totalBits;
- mvBits += subMode.mvBits;
- coeffBits += subMode.coeffBits;
- }
- };
-
- struct MotionData
- {
- MV mv;
- MV mvp;
- int mvpIdx;
- int ref;
- uint32_t cost;
- int bits;
- };
-
Search();
~Search();
// full RD search of intra modes. if sharedModes is not NULL, it directly uses them
void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes);
+ // select best intra mode using only sa8d costs, cannot measure NxN intra
+ void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+ // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode
+ void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+
// estimate inter prediction (non-skip)
bool predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma);
void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
void encodeResAndCalcRdSkipCU(Mode& interMode);
- void generateCoeffRecon(Mode& mode, const CUGeom& cuGeom);
- void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]);
+ // encode residual without rd-cost
+ void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]);
+ void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]);
+ void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx);
- uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
+ // pick best chroma mode from the available modes using just sa8d costs
+ void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
protected:
/* motion estimation distribution */
ThreadLocalData* m_tld;
- CUData* m_curMECu;
+ Mode* m_curInterMode;
const CUGeom* m_curGeom;
int m_curPart;
- MotionData m_bestME[2];
uint32_t m_listSelBits[3];
int m_totalNumME;
volatile int m_numAcquiredME;
volatile int m_numCompletedME;
Event m_meCompletionEvent;
- Lock m_outputLock;
+ Lock m_meLock;
bool m_bJobsQueued;
- void singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref);
+ void singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth);
// RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
- uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes);
+ uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
// RDO select best chroma mode from luma; result is fully encoded chroma. chroma distortion is returned
uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
- void codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height);
- void codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype);
+ void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
+ void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]);
+ void codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
struct Cost
{
Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
};
- void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]);
+ uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+ void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
- void encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]);
+ // estimate bit cost of residual QT
+ void encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, TextType ttype, const uint32_t depthRange[2]);
// generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
- void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, uint32_t depthRange[2]);
- void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& costs);
- void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx);
+ void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
+ void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
+ void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
// generate chroma prediction, generate residual and recon
- uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
- uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
- void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad);
-
- void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]);
- void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx);
+ uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
+ uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+ void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
- void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx);
+ // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
+ void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
struct MergeData
{
/* intra helper functions */
enum { MAX_RD_INTRA_MODES = 16 };
static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
- void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
+
+ // get most probable luma modes for CU part, and bit cost of all non-MPM modes
+ uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
};
/* Called by API thread */
void Lookahead::addPicture(Frame *curFrame, int sliceType)
{
- PicYuv *orig = curFrame->m_origPicYuv;
+ PicYuv *orig = curFrame->m_fencPic;
curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
/* Called by pool worker threads */
bool Lookahead::findJob(int)
{
- if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
+ if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0)
{
m_inputQueueLock.acquire();
slicetypeDecide();
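// The new claim pattern, sketched with std::atomic instead of the ATOMIC_DEC
// macro (assumed here to return the post-decrement value): only the thread
// that drops the counter to exactly zero performs the slicetype decision.
//
//   #include <atomic>
//   std::atomic<int> ready(1);  // set/incremented by the producer when
//                               // work is queued (assumption)
//   bool tryClaim(std::atomic<int>& r)
//   {
//       return r.load() > 0 && r.fetch_sub(1) == 1;  // old value 1 -> now 0
//   }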
/* called by API thread or worker thread with inputQueueLock acquired */
void Lookahead::slicetypeDecide()
{
+ ProfileScopeEvent(slicetypeDecideEV);
+
ScopedLock lock(m_decideLock);
Lowres *frames[X265_LOOKAHEAD_MAX];
list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
brefs++;
}
-
/* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
if (m_param->rc.rateControlMode != X265_RC_CQP)
{
void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
{
int prevNonB = 0, curNonB = 1, idx = 0;
- bool isNextNonB = false;
-
while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
curNonB++;
-
int nextNonB = keyframe ? prevNonB : curNonB;
- int nextB = keyframe ? prevNonB + 1 : curNonB + 1;
-
+ int nextB = prevNonB + 1;
+ int nextBRef = 0;
+ int miniGopEnd = keyframe ? prevNonB : curNonB;
while (curNonB < numFrames + !keyframe)
{
/* P/I cost: This shouldn't include the cost of nextNonB */
int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+ /* Save the nextNonB Cost in each B frame of the current miniGop */
+ if (curNonB > miniGopEnd)
+ {
+ for (int j = nextB; j < miniGopEnd; j++)
+ {
+ frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
+ frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
+ }
+ }
idx++;
}
/* Handle the B-frames: coded order */
- for (int i = prevNonB + 1; i < curNonB; i++, idx++)
- {
- frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i);
- frames[nextNonB]->plannedType[idx] = X265_TYPE_B;
- }
+ if (m_param->bBPyramid && curNonB - prevNonB > 1)
+ nextBRef = (prevNonB + curNonB + 1) / 2;
- for (int i = nextB; i <= curNonB; i++)
+ for (int i = prevNonB + 1; i < curNonB; i++, idx++)
{
- for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++)
+ int64_t satdCost = 0; int type = X265_TYPE_B;
+ if (nextBRef)
{
- if (j == curNonB)
+ if (i == nextBRef)
{
- if (isNextNonB)
- {
- int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
- frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB);
- frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType;
- }
+ satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
+ type = X265_TYPE_BREF;
}
+ else if (i < nextBRef)
+ satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i);
else
- {
- frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j);
- frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B;
- }
+ satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
}
- if (i == curNonB && !isNextNonB)
- isNextNonB = true;
- }
+ else
+ satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
+ frames[nextNonB]->plannedSatd[idx] = satdCost;
+ frames[nextNonB]->plannedType[idx] = type;
+ /* Save the nextB Cost in each B frame of the current miniGop */
+ for (int j = nextB; j < miniGopEnd; j++)
+ {
+ if (nextBRef && i == nextBRef)
+ break;
+ if (j >= i && j != nextBRef)
+ continue;
+ frames[j]->plannedSatd[frames[j]->indB] = satdCost;
+ frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
+ }
+ }
prevNonB = curNonB;
curNonB++;
while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
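// Worked example of the new B-pyramid cost planning (indices hypothetical):
// with prevNonB = 0 and curNonB = 5,
//
//   nextBRef = (0 + 5 + 1) / 2 = 3
//   i = 1, 2 -> vbvFrameCost(frames, 0, 3, i)  // predicted from P0 and the BRef
//   i = 3    -> vbvFrameCost(frames, 0, 5, 3)  // the BRef itself, X265_TYPE_BREF
//   i = 4    -> vbvFrameCost(frames, 3, 5, 4)  // predicted from the BRef and P5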
if (m_param->bEnableWeightedPred)
{
- PicYuv *orig = curFrame->m_origPicYuv;
+ PicYuv *orig = curFrame->m_fencPic;
m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
}
- m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0];
+ m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
m_weightedRef.isLowres = true;
m_weightedRef.isWeighted = false;
for (int i = 0; i < m_heightInCU; i++)
{
m_rows[i].init();
- m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
if (!fenc->bIntraCalculated)
fenc->rowSatds[0][0][i] = 0;
fenc->rowSatds[b - p0][p1 - b][i] = 0;
{
Lowres *fenc = frames[b];
Lowres *ref = frames[p0];
- pixel *src = ref->fpelPlane;
+ pixel *src = ref->fpelPlane[0];
intptr_t stride = fenc->lumaStride;
if (wp)
primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
scale, round << correction, denom + correction, offset);
- src = m_weightedRef.fpelPlane;
+ src = m_weightedRef.fpelPlane[0];
}
uint32_t cost = 0;
{
for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
{
- int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride);
+ int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
cost += X265_MIN(satd, fenc->intraCost[mb]);
}
}
void CostEstimate::processRow(int row, int /*threadId*/)
{
+ ProfileScopeEvent(costEstimateRow);
+
int realrow = m_heightInCU - 1 - row;
Lowres **frames = m_curframes;
ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
- m_me.setSourcePU(pelOffset, cuSize, cuSize);
+ m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
int lowresPenalty = 4;
}
if (bBidir)
{
- pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+ ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+ ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
- pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+ ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
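// ALIGN_VAR_32 guarantees 32-byte alignment so the SIMD satd/pixelavg
// primitives can use aligned loads on these stack buffers. A sketch of the
// gcc/clang form of the macro (compiler-dependent, shown as an assumption):
//
//   #define ALIGN_VAR_32(T, var)  T var __attribute__((aligned(32)))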
// Copy Left
for (int i = 0; i < cuSize + 1; i++)
- {
left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
- }
for (int i = 0; i < cuSize; i++)
{
// generate 35 intra predictions into m_predictions
pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
- int icost = m_me.COST_MAX, cost;
+ int icost = m_me.COST_MAX;
primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
- cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+ int cost = m_me.bufSATD(m_predictions, cuSize);
if (cost < icost)
icost = cost;
pixel *above = (cuSize >= 8) ? above1 : above0;
pixel *left = (cuSize >= 8) ? left1 : left0;
primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
- cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+ cost = m_me.bufSATD(m_predictions, cuSize);
if (cost < icost)
icost = cost;
primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
// calculate satd costs, keep least cost
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
- primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
+ primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
int acost = m_me.COST_MAX;
uint32_t mode, lowmode = 4;
if (mode < 18)
cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
}
for (uint32_t dist = 2; dist >= 1; dist--)
if (mode < 18)
cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
mode = lowmode + dist;
if (mode < 18)
cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
}
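// The angular search is coarse-to-fine: every other angular mode is measured
// first, then the neighbours of the best coarse mode at distance 2 and 1 are
// tried. A sketch of the pattern, assuming a satdCost(mode) helper:
//
//   uint32_t low = 4;                      // best mode so far
//   for (uint32_t m = 5; m < 35; m += 2)   // coarse pass
//   {
//       uint32_t c = satdCost(m);
//       COPY2_IF_LT(acost, c, low, m);
//   }
//   for (uint32_t d = 2; d >= 1; d--)      // refine around the winner
//   {
//       uint32_t c = satdCost(low - d);
//       COPY2_IF_LT(acost, c, low, low - d);
//       c = satdCost(low + d);
//       COPY2_IF_LT(acost, c, low, low + d);
//   }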
if (acost < icost)
const int intraPenalty = 5 * m_lookAheadLambda;
icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
fenc->intraCost[cuXY] = icost;
+ fenc->intraMode[cuXY] = (uint8_t)lowmode;
int icostAq = icost;
if (bFrameScoreCU)
{
EstimateRow()
{
m_me.setQP(X265_LOOKAHEAD_QP);
- m_me.setSearchMethod(X265_HEX_SEARCH);
- m_me.setSubpelRefine(1);
+ m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
m_predictions = X265_MALLOC(pixel, 35 * 8 * 8);
m_merange = 16;
m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
{
WeightParam wp[2][MAX_NUM_REF][3];
- PicYuv *fencPic = frame.m_origPicYuv;
+ PicYuv *fencPic = frame.m_fencPic;
Lowres& fenc = frame.m_lowres;
Cache cache;
if (!refFrame->m_bChromaExtended)
{
refFrame->m_bChromaExtended = true;
- PicYuv *refPic = refFrame->m_origPicYuv;
+ PicYuv *refPic = refFrame->m_fencPic;
int width = refPic->m_picWidth >> cache.hshift;
int height = refPic->m_picHeight >> cache.vshift;
extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
case 1:
orig = fencPic->m_picOrg[1];
stride = fencPic->m_strideC;
- fref = refFrame->m_origPicYuv->m_picOrg[1];
+ fref = refFrame->m_fencPic->m_picOrg[1];
/* Clamp the chroma dimensions to the nearest multiple of
* 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
break;
case 2:
- fref = refFrame->m_origPicYuv->m_picOrg[2];
+ fref = refFrame->m_fencPic->m_picOrg[2];
orig = fencPic->m_picOrg[2];
stride = fencPic->m_strideC;
width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
ipfilterharness.cpp ipfilterharness.h
intrapredharness.cpp intrapredharness.h)
target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
-
-add_executable(PoolTest testpool.cpp)
-target_link_libraries(PoolTest x265-static ${PLATFORM_LIBS})
{
for (int i = 0; i < INPUT_SIZE; i++)
pixel_buff[i] = rand() % PIXEL_MAX;
-
- initROM();
}
bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
{
- if (opt.chroma_p2s[csp])
+ if (opt.chroma[csp].p2s)
{
- if (!check_IPFilter_primitive(ref.chroma_p2s[csp], opt.chroma_p2s[csp], 1, csp))
+ if (!check_IPFilter_primitive(ref.chroma[csp].p2s, opt.chroma[csp].p2s, 1, csp))
{
printf("chroma_p2s[%s]", x265_source_csp_names[csp]);
return false;
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
{
printf("= Color Space %s =\n", x265_source_csp_names[csp]);
- if (opt.chroma_p2s[csp])
+ if (opt.chroma[csp].p2s)
{
printf("chroma_p2s\t");
- REPORT_SPEEDUP(opt.chroma_p2s[csp], ref.chroma_p2s[csp],
+ REPORT_SPEEDUP(opt.chroma[csp].p2s, ref.chroma[csp].p2s,
pixel_buff, srcStride, IPF_vec_output_s, width, height);
}
for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
int_test_buff[0][i] = rand() % PIXEL_MAX;
int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
- int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX);
+ short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
short_test_buff[1][i] = -PIXEL_MAX;
int_test_buff[1][i] = -PIXEL_MAX;
int_idct_test_buff[1][i] = SHORT_MIN;
- int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX;
+ short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
short_test_buff[2][i] = PIXEL_MAX;
int_test_buff[2][i] = PIXEL_MAX;
int_idct_test_buff[2][i] = SHORT_MAX;
- int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX;
+ short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
mbuf1[i] = rand() & PIXEL_MAX;
mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width)
{
int j = 0;
- intptr_t cmp_size = sizeof(int) * width * width;
+ intptr_t cmp_size = sizeof(short) * width * width;
for (int i = 0; i < ITERS; i++)
{
int index = rand() % TEST_CASES;
- ref(short_test_buff[index] + j, mintbuf3, width);
- checked(opt, short_test_buff[index] + j, mintbuf4, width);
+ ref(short_test_buff[index] + j, mshortbuf2, width);
+ checked(opt, short_test_buff[index] + j, mshortbuf3, width);
- if (memcmp(mintbuf3, mintbuf4, cmp_size))
+ if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
reportfail();
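// All of these checks share one shape: run the C reference and the optimized
// kernel on identical random inputs, then memcmp the outputs (the harness's
// checked() wrapper adds extra validation around the optimized call). A
// minimal sketch of the pattern:
//
//   for (int i = 0; i < ITERS; i++)
//   {
//       ref(input, refOut, width);                // C reference
//       opt(input, optOut, width);                // assembly under test
//       if (memcmp(refOut, optOut, sizeof(int16_t) * width * width))
//           return false;                         // mismatch -> kernel is wrong
//   }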
{
int index = rand() % TEST_CASES;
- ref(int_idct_test_buff[index] + j, mshortbuf2, width);
- checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width);
+ ref(short_test_buff[index] + j, mshortbuf2, width);
+ checked(opt, short_test_buff[index] + j, mshortbuf3, width);
if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
- ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift);
- checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift);
+ ref(short_test_buff[index] + j, mshortbuf2, width * height, scale, shift);
+ checked(opt, short_test_buff[index] + j, mshortbuf3, width * height, scale, shift);
- if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width))
+ if (memcmp(mshortbuf2, mshortbuf3, sizeof(int16_t) * height * width))
return false;
reportfail();
for (int i = 0; i < ITERS; i++)
{
+ memset(mshortbuf2, 0, MAX_TU_SIZE * sizeof(int16_t));
+ memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t));
+
int log2TrSize = (rand() % 4) + 2;
int width = (1 << log2TrSize);
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
- int cmp_size = sizeof(int) * height * width;
+ int cmp_size = sizeof(int16_t) * height * width;
int index1 = rand() % TEST_CASES;
- ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
- checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
+ ref(short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf2, width * height, per, shift);
+ checked(opt, short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf3, width * height, per, shift);
- if (memcmp(mintbuf1, mintbuf2, cmp_size))
+ if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
reportfail();
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
- optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
+ refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
+ optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
if (memcmp(mintbuf1, mintbuf3, cmp_size))
return false;
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
- optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
+ refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
+ optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
int log2TrSize = s + 2;
int num = 1 << (log2TrSize * 2);
int cmp_size = sizeof(int) * num;
+ int cmp_short = sizeof(short) * num;
for (int i = 0; i < ITERS; i++)
{
int index = rand() % TEST_CASES;
- ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
- checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
+ ref(short_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
+ checked(opt, short_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
- if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, cmp_size))
+ if (memcmp(short_denoise_test_buff1[index] + j, short_denoise_test_buff2[index] + j, cmp_short))
return false;
if (memcmp(mubuf1, mubuf2, cmp_size))
if (opt.dct[value])
{
printf("%s\t", dctInfo[value].name);
- REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width);
+ REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mshortbuf2, dctInfo[value].width);
}
}
if (opt.idct[value])
{
printf("%s\t", idctInfo[value].name);
- REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width);
+ REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mshortbuf3, mshortbuf2, idctInfo[value].width);
}
}
if (opt.dequant_normal)
{
printf("dequant_normal\t");
- REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1);
+ REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
}
if (opt.dequant_scaling)
{
printf("dequant_scaling\t");
- REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
+ REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mshortbuf2, 32 * 32, 5, 1);
}
if (opt.quant)
{
printf("quant\t\t");
- REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
+ REPORT_SPEEDUP(opt.quant, ref.quant, short_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
}
if (opt.nquant)
{
printf("nquant\t\t");
- REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
+ REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
}
if (opt.count_nonzero)
if (opt.denoiseDct)
{
printf("denoiseDct\t");
- REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
+ REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, short_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
}
}
uint32_t mubuf2[MAX_TU_SIZE];
uint16_t mushortbuf1[MAX_TU_SIZE];
- int int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
- int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
+ int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
+ int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
return true;
}
-bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)
+bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
- ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
return true;
}
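// For orientation, a scalar model of the renamed primitive (illustrative
// only; the real C reference is instantiated per block size, so it takes no
// explicit size argument): cpy2Dto1D_shl gathers a strided 2D block into a
// contiguous 1D array, shifting each coefficient left as it copies.
//
//   static void cpy2Dto1D_shl_c(int16_t* dst, const int16_t* src,
//                               intptr_t srcStride, int shift, int size)
//   {
//       for (int y = 0; y < size; y++, src += srcStride, dst += size)
//           for (int x = 0; x < size; x++)
//               dst[x] = (int16_t)(src[x] << shift);
//   }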
-bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt)
-{
- ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
- int j = 0;
- intptr_t stride = STRIDE;
- for (int i = 0; i < ITERS; i++)
- {
- int shift = (rand() % 7 + 1);
-
- int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
-bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
-{
- ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
- memset(ref_dest, 0xCD, sizeof(ref_dest));
- memset(opt_dest, 0xCD, sizeof(opt_dest));
-
- int j = 0;
- intptr_t stride = STRIDE;
- for (int i = 0; i < ITERS; i++)
- {
- int shift = (rand() % 7 + 1);
-
- int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
-bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
+bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
- ref(ref_dest, int_test_buff[index] + j, stride, shift);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
return true;
}
-bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
+bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
return true;
}
-bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
+bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
}
}
- if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
{
- if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
+ if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i]))
{
- printf("cvt16to32_shr failed!\n");
+ printf("cpy2Dto1D_shl failed!\n");
return false;
}
}
- if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
{
- if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
+ if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i]))
{
- printf("cvt32to16_shl failed!\n");
+ printf("cpy2Dto1D_shr failed!\n");
return false;
}
}
- if ((i < BLOCK_64x64) && opt.copy_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
{
- if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
+ if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i]))
{
- printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
+ printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
return false;
}
}
- }
-
- if (opt.cvt32to16_shr)
- {
- if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
{
- printf("cvt32to16 failed!\n");
- return false;
- }
- }
-
- if (opt.cvt16to32_shl)
- {
- if (!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl))
- {
- printf("cvt16to32_shl failed!\n");
- return false;
+ if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i]))
+ {
+ printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
}
}
}
}
- if (opt.frame_init_lowres_core)
+ if (opt.frameInitLowres)
{
- if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core))
+ if (!check_downscale_t(ref.frameInitLowres, opt.frameInitLowres))
{
printf("downscale failed!\n");
return false;
}
}
- if (opt.copy_shr)
- {
- if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
- {
- printf("copy_shr failed!\n");
- return false;
- }
- }
-
return true;
}
REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
}
- if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
{
- HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
+ HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
}
- if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
{
- HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
+ HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3);
}
- if ((i < BLOCK_64x64) && opt.copy_cnt[i])
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
{
- HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
+ HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64);
}
- if ((i < BLOCK_64x64) && opt.copy_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
{
- HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
+ HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64);
}
- }
-
- if (opt.cvt32to16_shr)
- {
- HEADER0("cvt32to16_shr");
- REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64);
- }
-
- if (opt.cvt16to32_shl)
- {
- HEADER0("cvt16to32_shl");
- REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64);
+ if ((i < BLOCK_64x64) && opt.copy_cnt[i])
+ {
+ HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
+ }
}
if (opt.weight_pp)
REPORT_SPEEDUP(opt.weight_sp, ref.weight_sp, (int16_t*)sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100);
}
- if (opt.frame_init_lowres_core)
+ if (opt.frameInitLowres)
{
HEADER0("downscale");
- REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
+ REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
}
if (opt.scale1D_128to64)
HEADER0("planecopy_cp");
REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
}
-
- if (opt.copy_shr)
- {
- HEADER0("copy_shr");
- REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
- }
-
}
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
bool check_downscale_t(downscale_t ref, downscale_t opt);
- bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
- bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt);
- bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
- bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
+ bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
+ bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
+ bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+ bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
- bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
- bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
bool check_pixel_var(var_t ref, var_t opt);
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
+++ /dev/null
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Steve Borho <steve@borho.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com
- *****************************************************************************/
-
-#include "common.h"
-#include "threadpool.h"
-#include "wavefront.h"
-#include "threading.h"
-#include "md5.h"
-#include "PPA/ppa.h"
-
-#include <sstream>
-#include <iostream>
-
-using namespace x265;
-
-struct CUData
-{
- CUData()
- {
- memset(digest, 0, sizeof(digest));
- }
-
- unsigned char digest[16];
-};
-
-struct RowData
-{
- RowData() : active(false), curCol(0) {}
-
- Lock lock;
- volatile bool active;
- volatile int curCol;
-};
-
-// Create a fake frame class with manufactured data in each CU block. We
-// need to create an MD5 hash such that each CU's hash includes the hashes
-// of the blocks that would have HEVC data dependencies (left, top-left,
-// top, top-right). This will give us one deterministic output hash. We
-// then generate the same hash using the thread pool and wave-front parallelism
-// to verify the thread-pool behavior and the wave-front schedule data
-// structures.
-class MD5Frame : public WaveFront
-{
-private:
-
- CUData *cu;
- RowData *row;
- int numrows;
- int numcols;
- Event complete;
-
-public:
-
- MD5Frame(ThreadPool *pool) : WaveFront(pool), cu(0), row(0) {}
-
- virtual ~MD5Frame()
- {
- // ensure no threads are lingering on FindJob() before allowing
- // this object's vtable to be destroyed
- JobProvider::flush();
-
- delete[] this->cu;
- delete[] this->row;
- }
-
- void initialize(int cols, int rows);
-
- void encode();
-
- void processRow(int row, int threadid);
-};
-
-void MD5Frame::initialize(int cols, int rows)
-{
- this->cu = new CUData[rows * cols];
- this->row = new RowData[rows];
- this->numrows = rows;
- this->numcols = cols;
-
- if (!this->WaveFront::init(rows))
- {
- assert(!"Unable to initialize job queue");
- }
-}
-
-void MD5Frame::encode()
-{
- this->JobProvider::enqueue();
-
- this->WaveFront::enqueueRow(0);
-
- // NOTE: When EnableRow after enqueueRow at first row, we'd better call pokeIdleThread, it will release a thread to do job
- this->WaveFront::enableRow(0);
- this->m_pool->pokeIdleThread();
-
- this->complete.wait();
-
- this->JobProvider::dequeue();
-
- unsigned int *outdigest = (unsigned int*)this->cu[this->numrows * this->numcols - 1].digest;
-
- std::stringstream ss;
-
- for (int i = 0; i < 4; i++)
- {
- ss << std::hex << outdigest[i];
- }
-
- if (ss.str().compare("da667b741a7a9d0ee862158da2dd1882"))
- std::cout << "Bad hash: " << ss.str() << std::endl;
-}
-
-void MD5Frame::processRow(int rownum, int)
-{
- // Called by worker thread
- RowData &curRow = this->row[rownum];
-
- assert(rownum < this->numrows && rownum >= 0);
- assert(curRow.curCol < this->numcols);
-
- while (curRow.curCol < this->numcols)
- {
- int id = rownum * this->numcols + curRow.curCol;
- CUData &curCTU = this->cu[id];
- MD5 hash;
-
- // * Fake CTU processing *
- PPAStartCpuEventFunc(encode_block);
- memset(curCTU.digest, id, sizeof(curCTU.digest));
- hash.update(curCTU.digest, sizeof(curCTU.digest));
- if (curRow.curCol > 0)
- hash.update(this->cu[id - 1].digest, sizeof(curCTU.digest));
-
- if (rownum > 0)
- {
- if (curRow.curCol > 0)
- hash.update(this->cu[id - this->numcols - 1].digest, sizeof(curCTU.digest));
-
- hash.update(this->cu[id - this->numcols].digest, sizeof(curCTU.digest));
- if (curRow.curCol < this->numcols - 1)
- hash.update(this->cu[id - this->numcols + 1].digest, sizeof(curCTU.digest));
- }
-
- hash.finalize(curCTU.digest);
- PPAStopCpuEventFunc(encode_block);
-
- curRow.curCol++;
-
- if (curRow.curCol >= 2 && rownum < this->numrows - 1)
- {
- ScopedLock below(this->row[rownum + 1].lock);
-
- if (this->row[rownum + 1].active == false &&
- this->row[rownum + 1].curCol + 2 <= curRow.curCol)
- {
- // set active indicator so row is only enqueued once
- // row stays marked active until blocked or done
- this->row[rownum + 1].active = true;
- this->WaveFront::enqueueRow(rownum + 1);
- this->WaveFront::enableRow(rownum + 1);
- }
- }
-
- ScopedLock self(curRow.lock);
-
- if (rownum > 0 &&
- curRow.curCol < this->numcols - 1 &&
- this->row[rownum - 1].curCol < curRow.curCol + 2)
- {
- // row is blocked, quit job
- curRow.active = false;
- return;
- }
- }
-
- // * Row completed *
-
- if (rownum == this->numrows - 1)
- this->complete.trigger();
-}
-
-int main(int, char **)
-{
- ThreadPool *pool;
-
- PPA_INIT();
-
- pool = ThreadPool::allocThreadPool(1);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
- pool = ThreadPool::allocThreadPool(2);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
- pool = ThreadPool::allocThreadPool(4);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
- pool = ThreadPool::allocThreadPool(8);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
-
- return 0;
-}
/* Visual Leak Detector */
#include <vld.h>
#endif
-#include "PPA/ppa.h"
#include <signal.h>
#include <errno.h>
{ "aq-strength", required_argument, NULL, 0 },
{ "ipratio", required_argument, NULL, 0 },
{ "pbratio", required_argument, NULL, 0 },
+ { "qcomp", required_argument, NULL, 0 },
+ { "qpstep", required_argument, NULL, 0 },
+ { "ratetol", required_argument, NULL, 0 },
+ { "cplxblur", required_argument, NULL, 0 },
+ { "qblur", required_argument, NULL, 0 },
{ "cbqpoffs", required_argument, NULL, 0 },
{ "crqpoffs", required_argument, NULL, 0 },
{ "rd", required_argument, NULL, 0 },
{ "no-lossless", no_argument, NULL, 0 },
{ "no-signhide", no_argument, NULL, 0 },
{ "signhide", no_argument, NULL, 0 },
- { "no-lft", no_argument, NULL, 0 },
- { "lft", no_argument, NULL, 0 },
+ { "no-lft", no_argument, NULL, 0 }, /* DEPRECATED */
+ { "lft", no_argument, NULL, 0 }, /* DEPRECATED */
+ { "no-deblock", no_argument, NULL, 0 },
+ { "deblock", required_argument, NULL, 0 },
{ "no-sao", no_argument, NULL, 0 },
{ "sao", no_argument, NULL, 0 },
{ "no-sao-non-deblock", no_argument, NULL, 0 },
{ "lambda-file", required_argument, NULL, 0 },
{ "b-intra", no_argument, NULL, 0 },
{ "no-b-intra", no_argument, NULL, 0 },
- { "nr", required_argument, NULL, 0 },
+ { "nr-intra", required_argument, NULL, 0 },
+ { "nr-inter", required_argument, NULL, 0 },
{ "stats", required_argument, NULL, 0 },
{ "pass", required_argument, NULL, 0 },
{ "slow-firstpass", no_argument, NULL, 0 },
void showHelp(x265_param *param);
bool parse(int argc, char **argv, x265_param* param);
bool parseQPFile(x265_picture &pic_org);
- void readAnalysisFile(x265_picture* pic, x265_param*);
- void writeAnalysisFile(x265_picture* pic, x265_param*);
bool validateFanout(x265_param*);
};
void CLIOptions::writeNALs(const x265_nal* nal, uint32_t nalcount)
{
- PPAScopeEvent(bitstream_write);
+ ProfileScopeEvent(bitstreamWrite);
for (uint32_t i = 0; i < nalcount; i++)
{
bitstreamFile.write((const char*)nal->payload, nal->sizeBytes);
void CLIOptions::showHelp(x265_param *param)
{
+ int level = param->logLevel;
x265_param_default(param);
printVersion(param);
-#define H0 printf
#define OPT(value) (value ? "enabled" : "disabled")
+#define H0 printf
+#define H1 if (level >= X265_LOG_DEBUG) printf
+
H0("\nSyntax: x265 [options] infile [-o] outfile\n");
H0(" infile can be YUV or Y4M\n");
H0(" outfile is raw HEVC bitstream\n");
H0(" --log-level <string> Logging level: none error warning info debug full. Default %s\n", logLevelNames[param->logLevel + 1]);
H0(" --no-progress Disable CLI progress reports\n");
H0(" --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats));
- H0(" --csv <filename> Comma separated log file, log level >= 3 frame log, else one line per run\n");
+ H1(" --csv <filename> Comma separated log file, log level >= 3 frame log, else one line per run\n");
H0("\nInput Options:\n");
H0(" --input <filename> Raw YUV or Y4M input file name. `-` for stdin\n");
- H0(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
+ H1(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
H0(" --fps <float|rational> Source frame rate (float or num/denom), auto-detected if Y4M\n");
H0(" --input-res WxH Source picture size [w x h], auto-detected if Y4M\n");
- H0(" --input-depth <integer> Bit-depth of input file. Default 8\n");
- H0(" --input-csp <string> Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
+ H1(" --input-depth <integer> Bit-depth of input file. Default 8\n");
+ H1(" --input-csp <string> Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
H0("-f/--frames <integer> Maximum number of frames to encode. Default all\n");
H0(" --seek <integer> First frame to encode\n");
- H0(" --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive\n");
- H0(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+ H1(" --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive\n");
+ H1(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n");
H0("\nQuality reporting metrics:\n");
H0(" --[no-]ssim Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
H0(" --[no-]psnr Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
H0("-p/--preset <string> Trade off performance for compression efficiency. Default medium\n");
H0(" ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
H0("-t/--tune <string> Tune the settings for a particular type of source or situation:\n");
- H0(" psnr, ssim, zerolatency, or fastdecode\n");
+ H0(" psnr, ssim, grain, zerolatency, fastdecode or cbr\n");
H0("\nQuad-Tree size and depth:\n");
- H0("-s/--ctu <64|32|16> Maximum CU size (default: 64x64). Default %d\n", param->maxCUSize);
+ H0("-s/--ctu <64|32|16> Maximum CU size (WxH). Default %d\n", param->maxCUSize);
H0(" --tu-intra-depth <integer> Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
H0(" --tu-inter-depth <integer> Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
- H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
- H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
H0("\nAnalysis:\n");
H0(" --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
- H0(" --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %f\n", param->psyRd);
- H0(" --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %f\n", param->psyRdoq);
- H0(" --nr <integer> An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled\n");
- H0(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+ H0(" --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+ H0(" --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
- H0(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
+ H1(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
+ H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+ H1(" --nr-intra <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
+ H1(" --nr-inter <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
H0("\nCoding tools:\n");
H0("-w/--[no-]weightp Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
H0(" --[no-]weightb Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
H0(" --[no-]cu-lossless Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
H0(" --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
- H0(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
+ H1(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
H0("\nTemporal / motion search options:\n");
H0(" --me <string> Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
H0("-m/--subme <integer> Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
H0(" --merange <integer> Motion search range. Default %d\n", param->searchRange);
H0(" --max-merge <1..5> Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
- H0(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+ H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
+ H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+ H1(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
H0("\nSpatial / intra options:\n");
H0(" --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
H0(" --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
H0(" --scenecut <integer> How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
H0(" --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
H0(" --bframes <integer> Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
- H0(" --bframe-bias <integer> Bias towards B frame decisions. Default %d\n", param->bFrameBias);
+ H1(" --bframe-bias <integer> Bias towards B frame decisions. Default %d\n", param->bFrameBias);
H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
H0(" --[no-]b-pyramid Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
H0(" --ref <integer> max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
- H0(" --qpfile <string> Force frametypes and QPs for some or all frames\n");
- H0(" Format of each line: framenumber frametype QP\n");
- H0(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
- H0(" QPs are restricted by qpmin/qpmax.\n");
- H0("\nRate control, Quantization:\n");
+ H1(" --qpfile <string> Force frametypes and QPs for some or all frames\n");
+ H1(" Format of each line: framenumber frametype QP\n");
+ H1(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
+ H1(" QPs are restricted by qpmin/qpmax.\n");
+ H0("\nRate control, Adaptive Quantization:\n");
H0(" --bitrate <integer> Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
- H0("-q/--qp <integer> QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
- H0(" --crf <float> Quality-based VBR (0-51). Default %f\n", param->rc.rfConstant);
- H0(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
- H0(" --crf-max <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
- H0(" May cause VBV underflows!\n");
- H0(" --crf-min <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
- H0(" this specifies a minimum rate factor value for encode!\n");
+ H1("-q/--qp <integer> QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
+ H0(" --crf <float> Quality-based VBR (0-51). Default %.1f\n", param->rc.rfConstant);
+ H1(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
+ H1(" --crf-max <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
+ H1(" May cause VBV underflows!\n");
+ H1(" --crf-min <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
+ H1(" this specifies a minimum rate factor value for encode!\n");
H0(" --vbv-maxrate <integer> Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
H0(" --vbv-bufsize <integer> Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
- H0(" --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %f\n", param->rc.vbvBufferInit);
- H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
- H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default %f\n", param->rc.aqStrength);
- H0(" --[no-]cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
- H0(" --ipratio <float> QP factor between I and P. Default %f\n", param->rc.ipFactor);
- H0(" --pbratio <float> QP factor between P and B. Default %f\n", param->rc.pbFactor);
- H0(" --cbqpoffs <integer> Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
- H0(" --crqpoffs <integer> Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
- H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
+ H0(" --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
H0(" --pass Multi pass rate control.\n"
" - 1 : First pass, creates stats file\n"
" - 2 : Last pass, does not overwrite stats file\n"
" - 3 : Nth pass, overwrites stats file\n");
+ H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
H0(" --analysis-mode <string|int> save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
H0(" --analysis-file <filename> Specify file name used for either dumping or reading analysis data.\n");
- H0(" --scaling-list <string> Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
- H0(" --lambda-file <string> Specify a file containing replacement values for the lambda tables\n");
- H0(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
- H0(" Blank lines and lines starting with hash(#) are ignored\n");
- H0(" Comma is considered to be white-space\n");
+ H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
+ H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
+ H0(" --[no-]cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
+ H1(" --ipratio <float> QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
+ H1(" --pbratio <float> QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
+ H1(" --qcomp <float> Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
+ H1(" --ratetol <float> Degree of rate fluctuation that can be tolerated. Default %.2f\n", param->rc.rateTolerance);
+ H1(" --cbqpoffs <integer> Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
+ H1(" --crqpoffs <integer> Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
+ H1(" --scaling-list <string> Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
+ H1(" --lambda-file <string> Specify a file containing replacement values for the lambda tables\n");
+ H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
+ H1(" Blank lines and lines starting with hash(#) are ignored\n");
+    H1("                                 Commas are treated as white-space\n");
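+    /* Illustrative two-pass usage for the rate control options above (file
+     * names hypothetical): pass one writes the stats file, pass two reads it:
+     *   x265 --pass 1 --bitrate 2000 --stats x265_2pass.log -o /dev/null in.y4m
+     *   x265 --pass 2 --bitrate 2000 --stats x265_2pass.log -o out.hevc in.y4m */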
H0("\nLoop filters (deblock and SAO):\n");
- H0(" --[no-]lft Enable Deblocking Loop Filter. Default %s\n", OPT(param->bEnableLoopFilter));
+    H0("   --[no-]deblock                Enable Deblocking Loop Filter, optionally specify tC:Beta offsets. Default %s\n", OPT(param->bEnableLoopFilter));
H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
- H0(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
+ H1(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
H0("\nVUI options:\n");
H0(" --sar <width:height|int> Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
H0(" Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
H0(" 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
H0(" 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
- H0(" --crop-rect <string> Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
- H0(" --overscan <string> Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
+ H1(" --crop-rect <string> Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
+ H1(" --overscan <string> Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
H0(" --videoformat <string> Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
     H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited. Default limited\n");
H0(" --colorprim <string> Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n");
H0(" --transfer <string> Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
H0(" smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
H0(" bt2020-10, bt2020-12. Default undef\n");
- H0(" --colormatrix <string> Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
- H0(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
- H0(" --chromaloc <integer> Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
+ H1(" --colormatrix <string> Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
+ H1(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
+    H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default %d\n", param->vui.chromaSampleLocTypeTopField);
H0("\nBitstream options:\n");
H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
- H0(" --[no-]hrd Enable HRD parameters signalling. Default %s\n", OPT(param->bEmitHRDSEI));
+ H0(" --[no-]hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
- H0(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
- H0("\nReconstructed video options (debugging):\n");
- H0("-r/--recon <filename> Reconstructed raw image YUV or Y4M output file name\n");
- H0(" --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
+ H1(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
+ H1("\nReconstructed video options (debugging):\n");
+ H1("-r/--recon <filename> Reconstructed raw image YUV or Y4M output file name\n");
+ H1(" --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
#undef OPT
#undef H0
- printf("\n\nFull documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
+#undef H1
+
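+    /* The H1 entries above are expected to print only when the configured log
+     * level is at least X265_LOG_DEBUG; below that, hint at the full listing */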
+ if (level < X265_LOG_DEBUG)
+ printf("\nUse --log-level full --help for a full listing\n");
+ printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
exit(0);
}
const char *preset = NULL;
const char *tune = NULL;
const char *profile = NULL;
- const char *analysisfn = "x265_analysis.dat";
if (argc <= 1)
{
OPT("profile") profile = optarg; /* handled last */
OPT("preset") /* handled above */;
OPT("tune") /* handled above */;
- OPT("analysis-file") analysisfn = optarg;
OPT("qpfile")
{
this->qpfile = fopen(optarg, "rb");
x265_log(NULL, X265_LOG_ERROR, "failed to open bitstream file <%s> for writing\n", bitstreamfn);
return true;
}
-
- if (param->analysisMode)
- {
- const char *mode = param->analysisMode == X265_ANALYSIS_SAVE ? "wb" : "rb";
- this->analysisFile = fopen(analysisfn, mode);
- if (!this->analysisFile)
- {
- x265_log(NULL, X265_LOG_ERROR, "failed to open analysis file %s\n", analysisfn);
- return true;
- }
- }
-
return false;
}
-bool CLIOptions::validateFanout(x265_param *param)
-{
-#define CMP_OPT_FANOUT(opt, param_val)\
- {\
- bErr = 0;\
- p = strstr(paramBuf, opt "=");\
- char* q = strstr(paramBuf, "no-"opt);\
- if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
- bErr = 1;\
- else if (!param_val && !q)\
- bErr = 1;\
- else if (param_val && (q || !strstr(paramBuf, opt)))\
- bErr = 1;\
- if (bErr)\
- {\
- x265_log(param, X265_LOG_ERROR, "different " opt " setting than given in analysis file (%d vs %d)\n", param_val, i);\
- X265_FREE(paramBuf);\
- return false;\
- }\
- }
-
- char *p = NULL, *paramBuf;
- int i, j;
- uint32_t k , l;
- bool bErr = false;
-
- paramBuf = X265_MALLOC(char, MAXPARAMSIZE);
- if (!paramBuf)
- return false;
-
- fread(paramBuf, 1, MAXPARAMSIZE, this->analysisFile);
-
- /* check whether fanout options are compatible */
- if (strncmp(paramBuf, "#options:", 9))
- {
- x265_log(param, X265_LOG_ERROR, "options list in analysis file is not valid\n");
- X265_FREE(paramBuf);
- return false;
- }
-
- char* buf = strchr(paramBuf, '\n');
- if (!buf)
- {
- x265_log(param, X265_LOG_ERROR, "Malformed analysis file\n");
- X265_FREE(paramBuf);
- return false;
- }
- *buf = '\0';
- fseek(this->analysisFile, (int)strlen(paramBuf) + 1, SEEK_SET);
-
- if (sscanf(paramBuf, "#options: %dx%d", &i, &j) != 2)
- {
- x265_log(param, X265_LOG_ERROR, "Resolution specified in analysis file is not valid\n");
- X265_FREE(paramBuf);
- return false;
- }
- if ((p = strstr(paramBuf, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
- {
- x265_log(param, X265_LOG_ERROR, "fps specified in analysis file is not valid\n");
- X265_FREE(paramBuf);
- return false;
- }
- if (k != param->fpsNum || l != param->fpsDenom)
- {
- x265_log(param, X265_LOG_ERROR, "fps mismatch than given in analysis file (%u/%u vs %u/%u)\n",
- param->fpsNum, param->fpsDenom, k, l);
- X265_FREE(paramBuf);
- return false;
- }
-
- CMP_OPT_FANOUT("bitdepth", param->internalBitDepth);
- CMP_OPT_FANOUT("weightp", param->bEnableWeightedPred);
- CMP_OPT_FANOUT("bframes", param->bframes);
- CMP_OPT_FANOUT("b-pyramid", param->bBPyramid);
- CMP_OPT_FANOUT("b-adapt", param->bFrameAdaptive);
- CMP_OPT_FANOUT("open-gop", param->bOpenGOP);
- CMP_OPT_FANOUT("keyint", param->keyframeMax);
- CMP_OPT_FANOUT("min-keyint", param->keyframeMin);
- CMP_OPT_FANOUT("scenecut", param->scenecutThreshold);
- CMP_OPT_FANOUT("ctu", (int)param->maxCUSize);
- CMP_OPT_FANOUT("ref", param->maxNumReferences);
- CMP_OPT_FANOUT("rc-lookahead", param->lookaheadDepth);
-
-#undef CMP_OPT_FANOUT
-
- X265_FREE(paramBuf);
- return true;
-}
-
-void CLIOptions::readAnalysisFile(x265_picture* pic, x265_param* p)
-{
- int poc, width, height;
- uint32_t numPart, numCU;
- fread(&width, sizeof(int), 1, this->analysisFile);
- fread(&height, sizeof(int), 1, this->analysisFile);
- fread(&poc, sizeof(int), 1, this->analysisFile);
- fread(&pic->sliceType, sizeof(int), 1, this->analysisFile);
- fread(&numCU, sizeof(int), 1, this->analysisFile);
- fread(&numPart, sizeof(int), 1, this->analysisFile);
-
- if (poc != pic->poc || width != p->sourceWidth || height != p->sourceHeight)
- {
- x265_log(NULL, X265_LOG_WARNING, "Error in reading intra-inter data.\n");
- x265_free_analysis_data(pic);
- return;
- }
-
- fread(pic->analysisData.intraData->depth,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->modes,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->partSizes,
- sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->poc,
- sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->cuAddr,
- sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile);
-}
-
-void CLIOptions::writeAnalysisFile(x265_picture* pic, x265_param *p)
-{
- uint64_t seekTo = pic->poc * this->analysisRecordSize + this->analysisHeaderSize;
- fseeko(this->analysisFile, seekTo, SEEK_SET);
- fwrite(&p->sourceWidth, sizeof(int), 1, this->analysisFile);
- fwrite(&p->sourceHeight, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->poc, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->sliceType, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->analysisData.numCUsInFrame, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->analysisData.numPartitions, sizeof(int), 1, this->analysisFile);
-
- fwrite(pic->analysisData.intraData->depth,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->modes,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->partSizes,
- sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->poc, sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->cuAddr, sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile);
-}
-
bool CLIOptions::parseQPFile(x265_picture &pic_org)
{
int32_t num = -1, qp, ret;
// This uses Microsoft's proprietary WCHAR type, but this only builds on Windows to start with
VLDSetReportOptions(VLD_OPT_REPORT_TO_DEBUGGER | VLD_OPT_REPORT_TO_FILE, L"x265_leaks.txt");
#endif
- PPA_INIT();
+ PROFILE_INIT();
x265_param *param = x265_param_alloc();
CLIOptions cliopt;
x265_picture pic_orig, pic_out;
x265_picture *pic_in = &pic_orig;
- x265_picture *pic_recon = cliopt.recon ? &pic_out : NULL;
+    /* Allocate recon picture if --recon was given or analysis mode is enabled */
+ x265_picture *pic_recon = (cliopt.recon || !!param->analysisMode) ? &pic_out : NULL;
uint32_t inFrameCount = 0;
uint32_t outFrameCount = 0;
x265_nal *p_nal;
x265_picture_init(param, pic_in);
- if (param->analysisMode && !pic_recon)
- {
- x265_log(NULL, X265_LOG_ERROR, "Must specify recon with analysis-mode option.\n");
- goto fail;
- }
if (param->analysisMode)
{
- if (param->analysisMode == X265_ANALYSIS_SAVE)
+ if (param->bDistributeModeAnalysis || param->bDistributeMotionEstimation)
{
- char *p = x265_param2string(param);
- if (!p)
- {
- x265_log(NULL, X265_LOG_ERROR, "analysis: buffer allocation failure, aborting");
- goto fail;
- }
- uint32_t numCU = pic_in->analysisData.numCUsInFrame;
- uint32_t numPart = pic_in->analysisData.numPartitions;
-
- cliopt.analysisRecordSize = ((sizeof(int) * 4 + sizeof(uint32_t) * 2) + sizeof(x265_inter_data) * numCU * 85 +
- sizeof(uint8_t) * 2 * numPart * numCU + sizeof(char) * numPart * numCU + sizeof(int) * numCU + sizeof(uint32_t) * numCU);
-
- fprintf(cliopt.analysisFile, "#options: %s\n", p);
- cliopt.analysisHeaderSize = ftell(cliopt.analysisFile);
- X265_FREE(p);
- }
- else
- {
- if (!cliopt.validateFanout(param))
- goto fail;
+ x265_log(NULL, X265_LOG_ERROR, "Analysis load/save options incompatible with pmode/pme");
+ goto fail;
}
}
ditherImage(*pic_in, param->sourceWidth, param->sourceHeight, errorBuf, X265_DEPTH);
pic_in->bitDepth = X265_DEPTH;
}
- if (param->analysisMode)
- {
- x265_alloc_analysis_data(pic_in);
-
- if (param->analysisMode == X265_ANALYSIS_LOAD)
- cliopt.readAnalysisFile(pic_in, param);
- }
}
int numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, pic_in, pic_recon);
break;
}
outFrameCount += numEncoded;
- if (numEncoded && pic_recon)
- {
- cliopt.recon->writePicture(pic_out);
- if (param->analysisMode == X265_ANALYSIS_SAVE)
- cliopt.writeAnalysisFile(pic_recon, param);
- if (param->analysisMode)
- x265_free_analysis_data(pic_recon);
- }
+ if (numEncoded && pic_recon && cliopt.recon)
+ cliopt.recon->writePicture(pic_out);
if (nal)
cliopt.writeNALs(p_nal, nal);
{
uint32_t numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon);
outFrameCount += numEncoded;
- if (numEncoded && pic_recon)
- {
+ if (numEncoded && pic_recon && cliopt.recon)
cliopt.recon->writePicture(pic_out);
- if (param->analysisMode == X265_ANALYSIS_SAVE)
- cliopt.writeAnalysisFile(pic_recon, param);
- if (param->analysisMode)
- x265_free_analysis_data(pic_recon);
- }
-
if (nal)
cliopt.writeNALs(p_nal, nal);
 x265_picture_init
 x265_picture_alloc
 x265_picture_free
-x265_alloc_analysis_data
-x265_free_analysis_data
 x265_param_apply_profile
 x265_max_bit_depth
 x265_version_str
uint8_t* payload;
} x265_nal;
-/* Stores inter (motion estimation) analysis data for a single frame */
-typedef struct x265_inter_data
-{
- uint32_t zOrder;
- int ref[2];
- int costZero[2];
- int16_t mvx[2];
- int16_t mvy[2];
- uint32_t depth;
- int poc;
- uint32_t cuAddr;
-} x265_inter_data;
-
-/* Stores intra (motion estimation) analysis data for a single frame */
-typedef struct x265_intra_data
-{
- uint8_t* depth;
- uint8_t* modes;
- char* partSizes;
- int* poc;
- uint32_t* cuAddr;
-} x265_intra_data;
-
/* Stores all analysis data for a single frame */
typedef struct x265_analysis_data
{
- x265_inter_data* interData;
- x265_intra_data* intraData;
+ uint32_t frameRecordSize;
+ int32_t poc;
+ int32_t sliceType;
uint32_t numCUsInFrame;
uint32_t numPartitions;
+ void* interData;
+ void* intraData;
} x265_analysis_data;
/* Used to pass pictures into the encoder, and to get picture data back out of
#define X265_ANALYSIS_OFF 0
#define X265_ANALYSIS_SAVE 1
#define X265_ANALYSIS_LOAD 2
-
typedef struct
{
int planes;
* per-slice statistics to this log file in encode order. Otherwise the
* encoder will emit per-stream statistics into the log file when
* x265_encoder_log is called (presumably at the end of the encode) */
- const char *csvfn;
+ char *csvfn;
/* Enable the generation of SEI messages for each encoded frame containing
* the hashes of the three reconstructed picture planes. Most decoders will
/* The additional depth the residual quadtree is allowed to recurse beyond
* the coding quadtree, for inter coded blocks. This must be between 1 and
- * 3. The higher the value the more efficiently the residual can be
+ * 4. The higher the value the more efficiently the residual can be
* compressed by the DCT transforms, at the expense of much more compute */
uint32_t tuQTMaxInterDepth;
/* The additional depth the residual quadtree is allowed to recurse beyond
* the coding quadtree, for intra coded blocks. This must be between 1 and
- * 3. The higher the value the more efficiently the residual can be
+ * 4. The higher the value the more efficiently the residual can be
* compressed by the DCT transforms, at the expense of much more compute */
uint32_t tuQTMaxIntraDepth;
/* Enable the use of `coded block flags` (flags set to true when a residual
* has been coded for a given block) to avoid intra analysis in likely skip
- * blocks. Default is disabled */
+ * blocks. Only applicable in RD levels 5 and 6. Default is disabled */
int bEnableCbfFastMode;
/* Enable early skip decisions to avoid intra and inter analysis in likely
* buffer and use this analysis information to reduce the amount of work
* the encoder must perform. Default X265_ANALYSIS_OFF */
int analysisMode;
+ /* Filename for analysisMode save/load. Default name is "x265_analysis.dat" */
+ char* analysisFileName;
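+
+    /* Usage sketch (file name illustrative): a first encode could set
+     *   analysisMode = X265_ANALYSIS_SAVE and analysisFileName = "x265_analysis.dat"
+     * and a later encode of the same source would pass X265_ANALYSIS_LOAD with
+     * the same file name to reuse the saved analysis */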
/*== Coding tools ==*/
-
/* Enable the implicit signaling of the sign bit of the last coefficient of
* each transform unit. This saves one bit per TU at the expense of figuring
* out which coefficient can be toggled with the least distortion.
/* Enable the deblocking loop filter, which improves visual quality by
* reducing blocking effects at block edges, particularly at lower bitrates
* or higher QP. When enabled it adds another CU row of reference lag,
- * reducing frame parallelism effectiveness. Default is enabled */
+ * reducing frame parallelism effectiveness. Default is enabled */
int bEnableLoopFilter;
+    /* Deblocking filter tC offset in the range [-6, 6]: -6 gives the lightest
+     * filtering, 6 the strongest. This is the coded div2 value; the actual
+     * offset applied by the filter is double this value */
+ int deblockingFilterTCOffset;
+
+    /* Deblocking filter Beta offset in the range [-6, 6]: -6 gives the lightest
+     * filtering, 6 the strongest. This is the coded div2 value; the actual
+     * offset applied by the filter is double this value */
+ int deblockingFilterBetaOffset;
+
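+    /* Worked example (CLI syntax as described for --deblock, values
+     * illustrative): --deblock=-2:-2 stores -2 in both fields; since these are
+     * div2 values, the filter effectively applies offsets of -4 */
+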
/* Enable the Sample Adaptive Offset loop filter, which reduces distortion
* effects by adjusting reconstructed sample values based on histogram
* analysis to better approximate the original samples. When enabled it adds
* regardless of this setting. */
int bIntraInBFrames;
- /* An integer value in range of 100 to 1000, which denotes strength of noise
- * reduction */
- int noiseReduction;
+    /* An integer value in the range 0 to 2000, denoting the strength of noise
+     * reduction applied to intra CUs. 0 means disabled */
+ int noiseReductionIntra;
+
+    /* An integer value in the range 0 to 2000, denoting the strength of noise
+     * reduction applied to inter CUs. 0 means disabled */
+ int noiseReductionInter;
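+
+    /* Sketch (values illustrative): splitting the fields lets an application
+     * denoise only intra blocks, e.g. noiseReductionIntra = 500 while
+     * noiseReductionInter remains 0 (disabled) */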
/* The lossless flag enables true lossless coding, by bypassing scaling,
* transform, quantization and in-loop filter processes. This is used for
int bitrate;
/* The degree of rate fluctuation that x265 tolerates. Rate tolerance is used
- * alongwith overflow (difference between actual and target bitrate), to adjust
+     * along with overflow (the difference between actual and target bitrate) to adjust
* qp. Default is 1.0 */
double rateTolerance;
double rfConstant;
/* Enable adaptive quantization. This mode distributes available bits between all
- * macroblocks of a frame, assigning more bits to low complexity areas. Turning
+ * CTUs of a frame, assigning more bits to low complexity areas. Turning
* this ON will usually affect PSNR negatively, however SSIM and visual quality
- * generally improves. Default: X265_AQ_AUTO_VARIANCE */
+     * generally improve. Default: X265_AQ_VARIANCE */
int aqMode;
- /* Sets the strength of AQ bias towards low detail macroblocks. Valid only if
+ /* Sets the strength of AQ bias towards low detail CTUs. Valid only if
* AQ is enabled. Default value: 1.0. Acceptable values between 0.0 and 3.0 */
double aqStrength;
/* In CRF mode, minimum CRF as caused by VBV */
double rfConstantMin;
- /* Two pass (INCOMPLETE) */
+ /* Multi-pass encoding */
/* Enable writing the stats in a multipass encode to the stat output file */
int bStatWrite;
/* Enable loading data from the stat input file in a multi pass encode */
int bStatRead;
- /* Filename of the 2pass output/input stats file */
+    /* Filename of the 2-pass output/input stats file; if unspecified, the
+     * encoder defaults to x265_2pass.log */
char* statFileName;
/* temporally blur quants */
* special in any way, but using this method together with x265_param_free()
* and x265_param_parse() to set values by name allows the application to treat
* x265_param as an opaque data struct for version safety */
-x265_param *x265_param_alloc();
+x265_param *x265_param_alloc(void);
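+
+/* Minimal usage sketch for the pattern described above (option name and value
+ * illustrative):
+ *   x265_param *p = x265_param_alloc();
+ *   x265_param_default(p);
+ *   x265_param_parse(p, "bframes", "4");
+ *   ...
+ *   x265_param_free(p);
+ */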
/* x265_param_free:
* Use x265_param_free() to release storage for an x265_param instance
* 100 times faster than placebo!
*
* Currently available tunings are: */
-static const char * const x265_tune_names[] = { "psnr", "ssim", "zerolatency", "fastdecode", 0 };
+static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "cbr", 0 };
/* returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
* special in any way, but using this method together with x265_picture_free()
* and x265_picture_init() allows some version safety. New picture fields will
* always be added to the end of x265_picture */
-x265_picture *x265_picture_alloc();
+x265_picture *x265_picture_alloc(void);
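+
+/* Minimal usage sketch, mirroring the allocation pattern described above:
+ *   x265_picture *pic = x265_picture_alloc();
+ *   x265_picture_init(param, pic);
+ *   ...
+ *   x265_picture_free(pic);
+ */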
/* x265_picture_free:
* Use x265_picture_free() to release storage for an x265_picture instance
* allocated by x265_picture_alloc() */
void x265_picture_free(x265_picture *);
-
-/* x265_alloc_analysis_data:
- * Allocate memory to hold analysis data, returns 0 on success else negative */
-int x265_alloc_analysis_data(x265_picture*);
-
-/* x265_free_analysis_data:
- * Use x265_free_analysis_data to release storage of members allocated by
- * x265_alloc_analysis_data */
-void x265_free_analysis_data(x265_picture*);
-
/***
* Initialize an x265_picture structure to default values. It sets the pixel
* depth and color space to the encoder's internal values and sets the slice