From: Jérôme Benoit <jerome.benoit@piment-noir.org>
Date: Wed, 14 Jan 2015 16:51:29 +0000 (+0100)
Subject: Imported Upstream version 1.4+222+hg5f9f7194267b
X-Git-Tag: upstream/1.4+222+hg5f9f7194267b^0
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=b53f7c52d8280ab63876efd6eb292c21430ac607;p=deb_x265.git

Imported Upstream version 1.4+222+hg5f9f7194267b
---

diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index a8da9d5..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-doc/uncrustify/uncrustify.exe
-build/
-*.rej
-*.orig
-*.hevc
-*.yuv
-*.y4m
-*.out
-*.swp
-.DS_Store
-.pc
diff --git a/.hg_archival.txt b/.hg_archival.txt
deleted file mode 100644
index 39aab44..0000000
--- a/.hg_archival.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 5e604833c5aa605d0b6efbe5234492b5e7d8ac61
-branch: stable
-tag: 1.4
diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index 87f1042..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,11 +0,0 @@
-syntax: glob
-doc/uncrustify/uncrustify.exe
-build/
-**.rej
-**.orig
-**.hevc
-**.yuv
-**.y4m
-**.out
-**.swp
-.DS_Store
diff --git a/.hgtags b/.hgtags
deleted file mode 100644
index 42d4ebd..0000000
--- a/.hgtags
+++ /dev/null
@@ -1,17 +0,0 @@
-681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD1
-3ec4837e6f6c7159f438e1f537dff117c93ee139 LASTKNOWNGOOD2
-9a6800e84295db446fdce2e7f27059ec8ae838a7 LASTKNOWNGOOD
-99fab2ef92be051cd3b3b2d817064cead282b42c 0.1
-b3471d9009f5cd487b23c8c61a6bfff8980e54f2 0.2
-3767fbfa970ff4b2dc2e8647db0274168727147e 0.3
-2ba6ec553f218d2b06ad803b87d6ec751fd639f7 0.4
-93707bc4fccdaa89a1f2da11db8808ca912a691c 0.4.1
-69acb3cb777f977f5edde908069ac565915dd366 0.5
-b970ffbdd696e3ce45c93b315902eb6366ff085e 0.6
-d24e2a8c4326b0cd01bfa6c414c5378481af9018 0.7
-527d03c56d6860dc979ddea1196f7e94d13d3e82 0.8
-82bbd2bf3b49ba086be0f0922f91fe0084896351 0.9
-cea97c4d79456842e00ade6be6fd5ec34610e5f8 1.0
-ae9609aeebdc3271114168ece003679e9b1dca1b 1.1
-d6257335c5370ee54317a0426a12c1f0724b18b9 1.2
-c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3
diff --git a/ChangeLog b/ChangeLog
index 80323fb..8aa1413 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,1639 @@
+2014-12-23  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/rdcost.h, source/encoder/search.cpp:
+	rdcost: unify scaleChromaDist*()
+	[5f9f7194267b] [tip]
+
+2014-12-23  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/encoder/encoder.cpp:
+	encoder: allocate memory for inter and intra analysis data based on
+	slicetype
+	[9fdab427a191]
+
+	* source/encoder/analysis.cpp, source/encoder/analysis.h:
+	analysis: remove redundant argument in compressIntraCU
+	[c4ec3f22846b]
+
+2014-12-20  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/search.cpp:
+	fix 4:4:4 rd<=1
+	[8d2f418829c8]
+
+2014-12-18  David T Yuen  <dtyx265@gmail.com>
+
+	* source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm,
+	source/common/x86/dct8.h:
+	asm: idct[8x8] sse2 12232 -> 3500 over c code 3550 -> 3500 over
+	intrinsic
+	[7b816fdb393d]
+
+2014-12-17  Steve Borho  <steve@borho.org>
+
+	* source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp:
+	ppa: emit one event per CTU for more clarity, disable frame threads
+	events
+
+	The frame threads are generally uninteresting when WPP is in use
+	[78ae7996a1ce]
+
+	* source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp,
+	source/encoder/framefilter.cpp, source/encoder/slicetype.cpp,
+	source/x265.cpp:
+	ppa: refine event names
+
+	Drop the unused names, remove uninteresting events. Try to cover the
+	main thread pool tasks and the frame encoder times.
+	[6cbd7d26b2a1]
+
+	* source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h:
+	ppa: simplify interfaces, enforce coding style
+	[952a2a361fcb]
+
+	* source/common/common.h, source/encoder/analysis.cpp,
+	source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp,
+	source/x265.cpp:
+	ppa: minimize code foot-print of profiling events
+
+	This will allow us to add support for more profiling systems without
+	littering the code
+	[3315d6c0ced1]
+
+	* doc/reST/cli.rst, source/x265.h:
+	doc: improve documentation for --stats and multi-pass in general
+	[42fb030a4c43]
+
+2014-12-16  Min Chen  <chenm003@163.com>
+
+	* source/encoder/nal.cpp:
+	fix: output wrong WppEntryOffset when emulating start code at end of
+	WPP row
+	[295d033cb091]
+
+2014-12-16  Aasaipriya Chandran  <aasaipriya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_hpp[16x16] for colorspace i420 in avx2 improve
+	1540c->969c
+	[775ebb4694ad]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_hpp[32x32] for colorspace i420 in avx2 improve
+	6189c->3537c
+	[619c0e654f5b]
+
+2014-12-13  Steve Borho  <steve@borho.org>
+
+	* source/encoder/api.cpp, source/encoder/encoder.cpp,
+	source/encoder/encoder.h:
+	encoder: combine create() and init() functions
+
+	They were always called back-to-back() and their functionality was
+	non-distinct. It also now checks for abort errors at startup and
+	returns a NULL from the encoder open function (early aborts are
+	usually malloc failures)
+	[6ba7be7b1697]
+
+	* source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake:
+	cmake: eoln and white-space fixes, slight refactor
+	[ee36b6311aaf]
+
+2014-12-12  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.h:
+	analysis: typo
+	[d00a5b93c07e]
+
+	* source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake:
+	cmake: allow position independent code to be generally configurable
+	(fixes #91)
+
+	Allow the builder to over-ride the default
+	[afdcb68dace4]
+
+2014-12-11  Steve Borho  <steve@borho.org>
+
+	* source/encoder/entropy.cpp, source/encoder/entropy.h:
+	entropy: add methods to estimate CU mode decision costs
+	[e0374c37e745]
+
+2014-12-12  Steve Borho  <steve@borho.org>
+
+	* source/common/pixel.cpp:
+	pixel: nits
+	[750839e8e0cf]
+
+	* doc/reST/cli.rst, source/common/param.cpp, source/x265.h:
+	api: change default AQ mode to 1
+
+	We've received a lot of feedback that AQ mode 2 is often
+	problematic, but AQ mode 1 is generally safe and useful.
+	[cbf5cad2e12b]
+
+2014-12-12  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vps[4x4] in avx2: improve 337c->219c
+	[6f770a6b24f0]
+
+2014-12-11  Steve Borho  <steve@borho.org>
+
+	* build/README.txt:
+	build: update README to not be so specific about yasm 1.2.0
+	[b1c2ef980dfe]
+
+2014-12-10  Steve Borho  <steve@borho.org>
+
+	* source/encoder/reference.cpp:
+	reference: avoid weighting pixels when plane is unweighted
+
+	Just because the luma plane is weighted does not mean either of the
+	chroma planes are also weighted. If the weight parameters for a
+	given plane are not present, then just directly use the un-weighted
+	reference plane.
+	[ae50be4c3a6e]
+
+2014-12-11  Aasaipriya Chandran  <aasaipriya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_hpp[4x4] for colorspace i420 in avx2 improve 217c->192c
+	[667e4ea0899f]
+
+2014-12-10  Steve Borho  <steve@borho.org>
+
+	* doc/reST/cli.rst:
+	doc: describe what happens when psy-rd is too high for bitrate
+	[9c3b478a60b2]
+
+2014-12-10  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_vpp[32x32] for colorspace i420 in avx2: improve
+	3881c->1933c
+	[04d145864dd6]
+
+2014-12-10  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.cpp:
+	analysis: avoid redundant MC work
+	[9e244ebe21d2]
+
+	* source/encoder/analysis.cpp:
+	analysis: fix chroma predictions for 2Nx2N bidir at zero mv
+
+	Valgrind discovered that the chroma predictions were not in fact
+	predicted
+	[0dc816f49c01]
+
+	* source/x265.h:
+	api: add some blank lines
+	[ab1e1e0ca75c]
+
+2014-12-09  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h:
+	asm: chroma_vpp[4x4] for colorspace i422 in avx2: improve 228c->184c
+	[5f16dc82652a]
+
+2014-12-10  Steve Borho  <steve@borho.org>
+
+	* source/common/lowres.cpp, source/common/lowres.h,
+	source/encoder/frameencoder.cpp, source/encoder/motion.cpp,
+	source/encoder/reference.cpp, source/encoder/reference.h,
+	source/encoder/slicetype.cpp:
+	reference: weight chroma planes of reference pictures if using
+	chroma satd
+	[6c32c8d4e0a1]
+
+2014-12-08  Steve Borho  <steve@borho.org>
+
+	* doc/reST/cli.rst, source/encoder/analysis.cpp,
+	source/encoder/frameencoder.cpp, source/encoder/motion.cpp,
+	source/encoder/motion.h, source/encoder/search.cpp,
+	source/encoder/slicetype.cpp:
+	motion: chroma ME [CHANGES OUTPUTS]
+
+	include chroma distortion in satd decisions when --subme > 2 and
+	chroma blocks are multiples of 4x4
+
+	This required making the MotionEstimate class more aware of PicYuv
+	and its indexing scheme so that it could find the correct chroma
+	pixels to interpolate. This allowed me to merge the setSourcePlane()
+	method into the lookahead's version of setSourcePU.
+
+	This requires further work. The Reference class needs to generate
+	weighted chroma planes if subpel refine will use chroma residual
+	cost. Until this is fixed, the chroma subpel steps will use
+	unweighted reference pixels.
+	[afd5620c77a4]
+
+2014-12-09  Steve Borho  <steve@borho.org>
+
+	* source/common/pixel.cpp, source/common/primitives.cpp:
+	primitives: use NULL chroma satd func pointers for blocks not
+	capable of satd
+
+	If the block is not a multiple of 4x4, then chroma satd measurements
+	are not possible, so we will disable chroma residual measurements
+	for these block sizes (and thus only measure luma residual)
+	[4c97d85c8488]
+
+	* source/common/primitives.cpp:
+	primitives: use luma satd functions for chroma, where applicable
+
+	The commented lines should be considered TODO items for the assembly
+	team
+	[29489f2fc2c7]
+
+	* source/common/pixel.cpp, source/common/primitives.h:
+	primitives: add a chroma satd table that is indexed by luma
+	partition
+
+	There are a number of chroma partitions that have dimensions of 2 or
+	6 and those cannot use satd (which is 4x4 based), so we degrade them
+	down to SAD which makes me unhappy.
+	[47c490836fd8]
+
+2014-12-08  Steve Borho  <steve@borho.org>
+
+	* source/common/lowres.h, source/encoder/reference.cpp,
+	source/encoder/reference.h:
+	reference: move reconPic pointer to base class so it is available to
+	ME
+	[dd55fd39745c]
+
+	* source/encoder/motion.cpp:
+	motion: sync argument names between the header and the cpp file
+	[e2b958539e6a]
+
+	* source/common/yuv.cpp:
+	yuv: fix size check in copyFromYuv
+
+	The target buffer needs to be as large as or larger than the source.
+	The fact that this check has never failed tells me all users of this
+	function have equal sized arguments.
+	[15be837edb36]
+
+	* source/encoder/search.cpp:
+	search: rename index variable to puIdx for consistency
+	[1cab6a4c0ab8]
+
+	* source/common/yuv.cpp, source/common/yuv.h,
+	source/encoder/analysis.cpp, source/encoder/motion.cpp,
+	source/encoder/motion.h, source/encoder/search.cpp:
+	motion: add a version of setSourcePU which can accept fenc from
+	another Yuv
+
+	The analysis code has already gone through the trouble of loading
+	the CU's fenc pixels from the source picture into a much smaller Yuv
+	buffer with small strides. This allows us to avoid accessing the
+	fenc PicYuv in a performance critical portion of the encoder.
+
+	We utilize the Yuv class to copy the PU, since it already has logic
+	for calculating part offsets for luma and chroma
+	[1d1f803a3eec]
+
+	* source/encoder/motion.cpp, source/encoder/motion.h,
+	source/encoder/search.cpp, source/encoder/slicetype.cpp,
+	source/encoder/slicetype.h:
+	motion: use Yuv instance to hold fenc PU pixels (preparing for
+	chroma ME)
+
+	This required making an init function which accepts the encoder
+	color space. We use 4:0:0 for lookahead since it does not keep
+	chroma planes. Note that I explicitly renamed this Yuv instance
+	fencPUYuv to make sure people understand it is not a duplicate of
+	the fencYuv kept by the Analysis structure; it will often be a sub-
+	partition of the CU fenc yuv.
+	[e640c8461495]
+
+	* source/encoder/slicetype.cpp:
+	slicetype: cleanups - use bufSATD method where applicable
+	[b5b05c94ae7c]
+
+	* source/common/yuv.cpp:
+	yuv: plumb in support for mono-chrome YUV buffers
+
+	The need for this will be obvious in the next commit
+	[5a44d694ed9b]
+
+2014-12-09  Aasaipriya Chandran  <aasaipriya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_hpp[8x8] for colorspace i420 in avx2 improve 530c->373c
+	[88498ec9b10b]
+
+2014-12-08  Steve Borho  <steve@borho.org>
+
+	* source/common/x86/asm-primitives.cpp:
+	asm: fix x86 link errors
+	[b376435b31c1]
+
+2014-12-09  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_vpp[16x16] for colorspace i420 in avx2: improve
+	998c->978c
+	[d042d1ea2d69]
+
+2014-12-05  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: chroma_vpp[8x8] for colorspace i420 in avx2: improve 338c->269c
+	[fee9fb1f9762]
+
+2014-12-06  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.h, source/encoder/analysis.cpp,
+	source/encoder/entropy.cpp, source/encoder/entropy.h,
+	source/encoder/search.cpp, source/encoder/search.h:
+	refine tuDepth related
+	[53f7efef5ebd]
+
+2014-12-05  Steve Borho  <steve@borho.org>
+
+	* source/cmake/version.cmake:
+	cmake: do not use a cache string for version found in hg_archive.txt
+	(refs #84)
+
+	This was not passing the tagged version number to version.cpp
+	[35d086074bb5]
+
+2014-12-04  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/encoder/ratecontrol.cpp:
+	rc : fix bug in deciding qp for first frame in CRF
+	[1458ad34157c]
+
+	* source/encoder/rdcost.h, source/encoder/sao.cpp:
+	rc: fix chroma qp and chroma lambda derivations.
+
+	fix the chroma qp values for Main10 profile, derive chroma qp from
+	luma qp values according to the HEVC spec. improves quality at high
+	qps.
+	[a1e76461c0d4]
+
+2014-12-05  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/encoder/analysis.cpp:
+	analysis: comments
+	[4ae9691c1a23]
+
+2014-12-05  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/analysis.cpp:
+	fix chroma distortion for 4:2:2
+	[42df5c8bdb25]
+
+2014-12-04  Steve Borho  <steve@borho.org>
+
+	* source/encoder/CMakeLists.txt:
+	cmake: disable idiotic uninitialized local variable warnings from VC
+
+	If the compiler is not going to make any minimal attempt to figure
+	out if a variable was initialized, I am not going to make any
+	attempt to look at their stupid warnings.
+	[c9fd35f97e6d]
+
+2014-12-04  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h:
+	asm: chroma_vpp[4x4] for colorspace i420 in avx2: improve 228c->184c
+	[23e637065aec]
+
+2014-12-04  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.cpp, source/encoder/analysis.h:
+	analysis: cache m_bChromaSa8d and reduce redundant work
+
+	Renamed some 'part' variables to 'puIdx' to avoid variable shadow
+	warnings and for consistency with search.cpp
+	[cc327e846dac]
+
+2014-12-04  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/encoder/analysis.cpp:
+	analysis: add chroma distortion to rdLevels 3 and 4
+
+	At these rdLevels, inter/bidir and merge candidate decisions were
+	being taken based on luma sa8dCost only. This will increase bitrate
+	and lower ssim slightly, with better subjective quality.
+
+	Also fixed some naming nits.
+	[1d2a11f6a33f]
+
+	* doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp,
+	source/encoder/frameencoder.cpp, source/encoder/search.cpp,
+	source/x265.cpp, source/x265.h:
+	noiseReduction: allow separate strengths to be specified for intra
+	and inter CUs
+	[ec06f5878e8b]
+
+2014-12-04  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/common/x86/asm-primitives.cpp:
+	primitives: fix build error in refactor of chroma p2s primitive.
+	[511dde5ac1de]
+
+2014-12-03  Steve Borho  <steve@borho.org>
+
+	* source/common/ipfilter.cpp, source/common/lowres.cpp,
+	source/common/pixel.cpp, source/common/predict.cpp,
+	source/common/primitives.cpp, source/common/primitives.h,
+	source/common/quant.cpp, source/common/shortyuv.cpp,
+	source/common/x86/asm-primitives.cpp, source/common/yuv.cpp,
+	source/encoder/search.cpp, source/test/ipfilterharness.cpp,
+	source/test/pixelharness.cpp:
+	primitives: cleanup EncoderPrimitives, refactor chroma p2s primitive
+
+	No behavior changes
+	[b1b5f06fe9ce]
+
+	* source/common/pixel.cpp, source/common/primitives.h:
+	primitives: remove unused chroma lowres primitive
+	[bfeee4ac5463]
+
+	* source/encoder/search.cpp:
+	search: avoid AMVP selection if both MVs are the same
+
+	This is a simple work avoidance optimization, should have no effect
+	on outputs
+	[2f66c3284c35]
+
+	* source/common/CMakeLists.txt, source/common/primitives.cpp:
+	cmake: remove buggy workarounds for partial SIMD support (fixes #92)
+
+	In the past, there were a number of primitives written in SIMD
+	intrinsics that could work without compiling with YASM. Most of
+	those are now gone, and we generally require YASM for SIMD support.
+	This commit remoes support for using the few remaining SIMD
+	intrinsics without having YASM to provide implementations of
+	x265_emms(), x265_cpu_cpuid(), etc. Fixing a bug in the process.
+	[d7b5e73fc91a]
+
+	* doc/reST/cli.rst:
+	doc: fix typo (closes #83)
+	[7192725cbb0a]
+
+	* doc/reST/cli.rst, source/common/param.cpp, source/x265.cpp,
+	source/x265.h:
+	param: allow NR values from 1..99, clarify docs (closes #87)
+	[21b869f9f706]
+
+	* doc/reST/Makefile, doc/reST/conf.py, doc/reST/x265.rst:
+	doc: add support for reST generated man-pages (closes #89)
+
+	This patch was attached to issue #89 by djcj
+	[ff08fd9b294c]
+
+	* source/common/constants.cpp:
+	constants: adjust lambda tabels for 10bit encodes (fixes #55)
+
+	Since samples are 10bits, where two bits of extra resolution has
+	been added to add more granularity, distortion also has two extra
+	bits. A typical resolution for this problem is to down-shift
+	distortion by 2 bits everywhere, before adding lambda * bits to
+	calculate RD cost. Instead, we multiply lambda by 4 (essentially
+	shift it up by two bits) so distortion and lambda * bits are both at
+	the higher scale.
+
+	lambda2 uses the square of the up-shifted lambda, so it has the
+	doubled up-shift same as the squared distortion values used for RDO.
+
+	Example output change: ./x265
+	/Volumes/video/sintel_trailer_2k_480p24.y4m o.bin --ssim --no-info
+
+	Main: 195.67 kb/s, SSIM Mean Y: 0.9833338 (17.782 dB) Main10 before:
+	363.49 kb/s, SSIM Mean Y: 0.9888182 (19.515 dB) Main10 after: 206.54
+	kb/s, SSIM Mean Y: 0.9855121 (18.390 dB)
+	[014a1e0fb58b]
+
+2014-12-03  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/encoder/encoder.cpp:
+	encoder: fix binary mismatch for analysis load vs save with same
+	bitrate
+	[50d2b92ecc89]
+
+2014-12-02  Steve Borho  <steve@borho.org>
+
+	* Merge
+	[de54cffaecf2]
+
+2014-11-27  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c
+	[2e055cbc9046]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[12x16] in avx2: improve 1977c->1418c
+	[ef4ca8474f5c]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[24x32] in avx2: improve 5637c->3695c
+	[8aeeaf6950f7]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[48x64] in avx2: improve 21298c->14696c
+	[d97b1c9f5106]
+
+2014-12-02  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/x265.cpp:
+	x265: add ratetol to command line help
+	[f636a0aadd68]
+
+2014-12-01  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/CMakeLists.txt, source/encoder/encoder.cpp, source/x265.h:
+	encoder: free csv file name
+
+	Since strdup is used uniformly for filenames, csvfn cannot be const.
+	[bde1753de250]
+
+2014-11-27  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c
+	[5ee693e4b5fa]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[12x16] in avx2: improve 1977c->1418c
+	[e280ce2e5076]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[24x32] in avx2: improve 5637c->3695c
+	[e1ca311bbb5b]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[48x64] in avx2: improve 21298c->14696c
+	[984271a3aae9]
+
+2014-11-30  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/x265.cpp:
+	x265: remove validateFanout
+	[d9f835ddd112]
+
+2014-11-27  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/pixel.cpp, source/common/primitives.h,
+	source/common/quant.cpp, source/common/x86/asm-primitives.cpp,
+	source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h,
+	source/test/pixelharness.cpp, source/test/pixelharness.h:
+	primitives: refactor tskip related
+	[90401d77a05d]
+
+2014-11-28  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/dct.cpp, source/common/quant.h,
+	source/common/x86/dct8.asm, source/common/x86/dct8.h,
+	source/encoder/search.cpp:
+	nits
+	[e2db5f3c6df8]
+
+2014-11-28  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/common/param.cpp:
+	param: disable b-intra in B frames when tune grain is true.
+	[d32249002258]
+
+2014-11-25  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/encoder/encoder.h:
+	encoder: make all member fields public
+	[af6b68f0feaa]
+
+2014-11-26  Steve Borho  <steve@borho.org>
+
+	* doc/reST/cli.rst, doc/reST/presets.rst:
+	doc: restructure documentation with better grouping, improve cross-
+	refs
+	[dfe0803ae6be]
+
+	* doc/reST/introduction.rst:
+	doc: fix a sphinx build warning
+	[f488b394693b]
+
+	* doc/reST/presets.rst:
+	doc: improve readability of film grain section
+	[03bd64057e72]
+
+	* doc/reST/cli.rst, doc/reST/presets.rst:
+	doc: add cbr to the list of tunings, add helpful cross-refs
+	[071dbe651364]
+
+2014-11-27  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/CMakeLists.txt, source/common/param.cpp,
+	source/encoder/ratecontrol.cpp, source/x265.cpp, source/x265.h:
+	rc: introduce cli option to tune for cbr.
+	[8e602ed5ca4c]
+
+2014-11-25  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/encoder/ratecontrol.cpp:
+	rc: improve the frame size planning with respect to vbv buffer
+	occupancy and the lookahead window.
+	[2870269cdd60]
+
+	* source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h:
+	rc: adjust qp for B frames from ABR feedback in case of CBR.
+
+	limits the bitrate fluctuation for CBR with respect to the target
+	bitrate set.
+	[576c675adf92]
+
+	* source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h:
+	rc: limit bit amortization in ABR to longer sequences
+	[11342c8376dd]
+
+2014-11-26  Steve Borho  <steve@borho.org>
+
+	* source/encoder/ratecontrol.cpp:
+	rc: use c-style typecasts
+	[c67b4f3a5e3c]
+
+2014-11-19  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/encoder/ratecontrol.cpp:
+	rc: tune midframe vbv logic for B frames
+	[8f5fa9538e13]
+
+2014-11-21  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/encoder/slicetype.cpp:
+	slicetype: fix vbv lookahead data collection for all frames within
+	the lookahead window.
+	[52246e09727d]
+
+2014-11-26  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_hpp[8x8, 8x16, 8x32] in avx2: improve 623c->523c,
+	1384c->1083c, 2555c->2058c
+	[01d82aa06285]
+
+2014-11-26  Aasaipriya Chandran  <aasaipriya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	Luma_hpp[48x64] avx2 asm code : improved 25053c->17882c
+	[bb7303bb00d1]
+
+2014-11-26  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_hpp[8x4] in avx2: improve 357c->261c
+	[a88ddc970748]
+
+2014-11-26  Aasaipriya Chandran  <aasaipriya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	Luma_hpp[32x8 , 32x16 , 32x24 , 32x32 , 32x64] avx2 asm code:
+	improved 2032c->1556c, 4238c->3014c, 6696c->4801c, 8697c->6433c,
+	16823c->12297c
+	[b0153f354186]
+
+2014-11-26  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[64x16] in avx2: improve 7245c->4910c
+	[5700875b428f]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[64x32, 64x48, 64x64] in avx2: improve 14150c->9810c,
+	21132c->14684c, 28663c->19616c
+	[db518f7c8474]
+
+2014-11-25  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[32x8] in avx2: improve 2047c->1472c
+	[d57c28a3010b]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[32x24] in avx2: improve 5562c->3899c
+	[dedc5a8589a6]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[32x16] in avx2: improve 3808c->2491c
+	[3db00b06aea6]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[32x32, 32x64] in avx2: improve 7247c->4909c,
+	14365c->9774c
+	[adf15e303c37]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[16x32, 16x64] in avx2: improve 3875c->2463c,
+	7499c->4894c
+	[45456cd145d8]
+
+2014-11-25  Aasaipriya Chandran  <aasaipriya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: avx2 for Luma_hpp[16x4, 16x8, 16x12, 16x16 , 16x32, 16x64]
+
+	619c->458c, 1174c->812c, 1694c->1112c, 2291c->1535c, 4846c->3207c,
+	9294c->6104c
+	[d11d3120361f]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: avx2 for luma_hpp[64x64, 64x48, 64x32, 64x16]
+
+	33137c->22606c , 24826c->17202c , 16726c->11560c , 7830c->5534c
+	[1e8a0f1e0889]
+
+2014-11-22  Steve Borho  <steve@borho.org>
+
+	* source/encoder/frameencoder.cpp, source/encoder/frameencoder.h:
+	frameencoder: do not use bitmaps for framefilter if not WPP
+
+	The non-WPP row loop wants to do frame filter work in between each
+	row, with a m_filterRowDelay lag. If we use the functions which
+	update the bitmap, it would allow a worker thread to process a
+	filter row before it was ready. In short, the non-WPP path was never
+	intended to work in the presence of a thread pool. This was causing
+	crashes when --no-wpp --pmode||--pme was used.
+	[8011e2a68b88]
+
+2014-11-24  Steve Borho  <steve@borho.org>
+
+	* source/encoder/frameencoder.cpp:
+	frameencoder: release row lock while waiting during VBV restarts
+
+	This fixes what appears to have been an old deadlock bug that has
+	just recently become very reproducible
+	[82f6e4847d57]
+
+2014-11-21  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[16x4] in avx2: improve 734c->497c
+	[3c6f703f94ea]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[16x8] in avx2: improve 1195c->745c
+	[fc83cf5299ae]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[16x12] in avx2: improve 1644c->1018c
+	[65017182318c]
+
+2014-11-21  Praveen Tiwari  <Praveen Tiwari>
+
+	* source/common/dct.cpp:
+	idct32_c: C code optimization
+	[346fccbba4de]
+
+	* source/common/dct.cpp:
+	idct16_c: optimization
+	[388c893d3825]
+
+	* source/common/dct.cpp:
+	idct8_c: optimization
+	[f7d7c480b85d]
+
+	* source/common/dct.cpp:
+	idct4_c: optimization
+	[69a472a77b49]
+
+	* source/common/dct.cpp:
+	dct32_c: optimization
+	[a60dfb900169]
+
+	* source/common/dct.cpp:
+	dct16_c: optimization
+	[7e94ea285179]
+
+	* source/common/dct.cpp:
+	dct8_c: optimization
+	[d426e93e240c]
+
+	* source/common/dct.cpp:
+	dct4_c: C code optimization
+	[d4376e113855]
+
+	* source/common/dct.cpp:
+	idst4_c: optimization
+	[8f373c20bc41]
+
+	* source/common/dct.cpp:
+	dst4_c: optimization
+	[49b66c57972d]
+
+2014-11-21  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/pixel.cpp, source/common/x86/asm-primitives.cpp,
+	source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h:
+	fix copy16to16_shl
+	[5a8da9cb52e8]
+
+2014-11-20  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.cpp, source/encoder/analysis.h,
+	source/encoder/search.cpp, source/encoder/search.h:
+	analysis: explicit locking for pmode and pme parameters
+
+	We've found a repro case involving --no-wpp --pmode --pme --preset
+	slower where time starved worker threads get stuck in the findJob()
+	routine and pushed off the CPU in the mean time the master thread
+	moves on to another CU. This caused very hard to reproduce crashes.
+	[2f8df4c972b9]
+
+2014-11-20  David T Yuen  <dtyx265@gmail.com>
+
+	* source/common/vec/dct-sse3.cpp:
+	Updated intrinsic of idct8 sse3 for new input format
+	[2abf89f5c4f2]
+
+2014-11-20  Divya Manivannan  <divya@multicorewareinc.com>
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[16x16] in avx2: improve 2141c->1284c
+	[2a2142982602]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[8x4] in avx2: improve 498c->257c
+	[c2fd1b7d5d99]
+
+	* source/common/x86/asm-primitives.cpp,
+	source/common/x86/ipfilter8.asm:
+	asm: luma_vpp[8x8] in avx2: improve 701c->387c
+	[562c43f738e4]
+
+2014-11-20  Steve Borho  <steve@borho.org>
+
+	* source/encoder/encoder.cpp:
+	encoder: nits and alloc zero fix
+
+	intraData needs to be zerod on allocation else if one of the later
+	allocs failed some of the pointers will be uninitialized and passed
+	to X265_FREE()
+	[80dcd3dfb805]
+
+2014-11-20  Praveen Tiwari  <Praveen Tiwari>
+
+	* source/common/dct.cpp:
+	Fix for C code mismatch
+
+	This patch is for fix the the binary mismatch in encoded output
+	introduced during refactorizaton of the transform/quant path.
+	Basically it is original version of code to make sure all valid
+	inputs are copied in input buffer, in other hand it is not fully
+	optimized code but this patch is quick fix for the problem and allow
+	us to optimze one function at a time.
+	[1d17ec0cb954]
+
+2014-11-20  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/pixel.cpp:
+	fix for old gcc
+	[ed587d360b97]
+
+2014-11-20  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* build/icl32/build-all.bat, build/icl32/make-makefile.bat,
+	build/icl64/build-all.bat, build/icl64/make-makefile.bat:
+	build: remove icl32 and icl64 scripts
+
+	Typical Windows ICL users link with Visual Studio
+	[3649fabf90d3]
+
+2014-11-20  Praveen Tiwari  <Praveen Tiwari>
+
+	* source/common/x86/ipfilter8.asm:
+	luma_hpp[4x4]: AVX2 asm code bug fix
+	[4b637cb9b792]
+
+2014-11-20  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/encoder/encoder.cpp:
+	encoder: fix analysis file read
+	[0c25a6eac0ca]
+
+2014-11-20  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/analysis.cpp:
+	fix for rd=0
+	[b33cbe130c63]
+
+	* source/common/cudata.cpp, source/common/cudata.h,
+	source/encoder/analysis.cpp, source/encoder/frameencoder.cpp,
+	source/encoder/search.cpp:
+	replace char to int8_t, where it should be signed char
+	[14a8bb7bbcab]
+
+2014-11-19  Praveen Tiwari  <Praveen Tiwari>
+
+	* source/common/x86/asm-primitives.cpp:
+	disable denoiseDct asm code until fixed for Mac OS
+	[f236adb703f5]
+
+2014-11-16  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/dct.cpp, source/common/ipfilter.cpp,
+	source/common/picyuv.h, source/common/pixel.cpp,
+	source/common/predict.cpp, source/common/primitives.h,
+	source/common/quant.cpp, source/common/quant.h,
+	source/common/shortyuv.cpp, source/common/vec/dct-sse3.cpp,
+	source/common/vec/dct-ssse3.cpp, source/common/x86/blockcopy8.h,
+	source/common/x86/dct8.h, source/common/x86/ipfilter8.h,
+	source/common/x86/mc.h, source/common/x86/pixel-util.h,
+	source/common/x86/pixel.h, source/common/yuv.cpp,
+	source/encoder/analysis.cpp, source/encoder/rdcost.h,
+	source/encoder/search.cpp:
+	primitives: clarify constness
+	[99b5cebf8193]
+
+2014-11-18  Steve Borho  <steve@borho.org>
+
+	* source/common/dct.cpp:
+	dct: fix gcc warnings
+	[34cb58c53859]
+
+2014-11-18  Praveen Tiwari  <Praveen Tiwari>
+
+	* source/common/dct.cpp, source/common/pixel.cpp,
+	source/common/primitives.h, source/common/quant.cpp,
+	source/common/quant.h, source/common/vec/dct-sse3.cpp,
+	source/common/vec/dct-sse41.cpp, source/common/vec/dct-ssse3.cpp,
+	source/common/x86/asm-primitives.cpp,
+	source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h,
+	source/common/x86/dct8.asm, source/common/x86/dct8.h,
+	source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm,
+	source/test/mbdstharness.cpp, source/test/mbdstharness.h,
+	source/test/pixelharness.cpp, source/test/pixelharness.h:
+	refactorizaton of the transform/quant path.
+
+	This patch involves scaling down the DCT/IDCT coefficients from
+	int32_t to int16_t as they can be accommodated on int16_t without
+	any introduction of encode error, this allows us to clean up lots of
+	DCT/IDCT intermediate buffers, optimize enode efficiency for
+	different cli options including noise reduction by reducing data
+	movement operations, accommodating more number of coefficients in a
+	single register for SIMD operations. This patch include all
+	necessary changes for the transfor/quant path including unit test
+	code.
+	[8bee552a1964]
+
+2014-11-19  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/common.h:
+	fseeko for mingw32
+	[cb9bb697fcaa]
+
+2014-11-19  Steve Borho  <steve@borho.org>
+
+	* source/common/threading.h:
+	threadind: fixes for VC11 Win32 includes, prune two unused functions
+	[2b830f08d948]
+
+2014-11-18  Steve Borho  <steve@borho.org>
+
+	* source/common/wavefront.cpp:
+	wavefront: fix msvc warning
+
+	warning C4800: 'unsigned long' : forcing value to bool 'true' or
+	'false' (performance warning)
+	[e29c618cd9a7]
+
+	* source/common/param.cpp, source/common/quant.cpp,
+	source/common/threading.h, source/common/threadpool.cpp,
+	source/common/wavefront.cpp, source/common/wavefront.h,
+	source/common/winxp.h, source/encoder/entropy.cpp,
+	source/encoder/slicetype.cpp:
+	threading: use 32bit atomic integer operations exclusively
+
+	The 32bit operations have better portability and have less onerous
+	alignment restrictions.
+	[814b687db30e]
+
+	* source/common/constants.cpp, source/common/constants.h,
+	source/common/primitives.cpp, source/encoder/api.cpp,
+	source/test/intrapredharness.cpp:
+	constants: remove init/destroyROM functions
+	[d3389bb9efd0]
+
+	* source/x265.h:
+	api: fix range limit docs for RQT limit params
+	[d059cfa88f1a]
+
+	* source/encoder/frameencoder.cpp:
+	frameencoder: white-space nits
+	[29a374b62920]
+
+	* source/encoder/analysis.cpp:
+	analysis: drop MATCH_NON_PMODE macro
+
+	this was a debugging feature, it's not being tested which means it
+	will get broken and so it's best just to keep the code clean
+	[dc61091d5cc4]
+
+	* source/common/threading.cpp:
+	threading: don't use this->
+
+	We don't do this anywhere else
+	[3731d9bc7b88]
+
+	* source/common/threading.cpp, source/common/threading.h,
+	source/common/threadpool.cpp, source/common/threadpool.h:
+	threading: copyright comment format nits
+
+	be consistent with our other files
+	[a7b9b90e1bdd]
+
+	* source/common/param.cpp:
+	param: use strdup() on input strings uniformly
+	[ad532c30bc95]
+
+2014-11-18  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/encoder/encoder.cpp:
+	encoder: init filename to NULL
+	[2f0062f0791b]
+
+2014-11-17  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/common/common.h, source/encoder/analysis.cpp,
+	source/encoder/search.cpp:
+	search: fix binary mismatch and inconsistent crash for share inter
+	information
+	[854fcbb50220]
+
+	* source/encoder/encoder.cpp:
+	encoder: force slicetype using analysis file
+	[05d824463602]
+
+2014-11-17  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.cpp, source/common/lowres.h,
+	source/common/mv.h, source/encoder/bitcost.h,
+	source/encoder/motion.cpp, source/encoder/motion.h,
+	source/encoder/slicetype.cpp:
+	modify MV default constructor to do nothing
+	[7a1ec67bd004]
+
+2014-11-17  Aarthi Thirumalai  <Aarthi Thirumalai>
+
+	* source/encoder/ratecontrol.cpp:
+	vbv: tune vbv predictors for better mapping of predicted bits to
+	encoded bits
+	[27d36c4b4a27]
+
+2014-11-16  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/encoder/analysis.cpp, source/encoder/analysis.h:
+	analysis: cleanups, init pointers, variable names are made self-
+	explanatory
+	[ed2ba7a90567]
+
+2014-11-12  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/encoder/analysis.cpp:
+	analysis: fix binary mismatch for share intra save and load mode
+	with same cli
+	[10b8d3fbe408]
+
+2014-11-14  Steve Borho  <steve@borho.org>
+
+	* source/x265.cpp:
+	cli: fix analysis filename argument
+
+	This showed up as a GCC warning about an unused variable, but having
+	the arg handled here prevented the org from being passed to
+	x265_param_parse()
+	[8191e0d02455]
+
+	* source/encoder/encoder.cpp:
+	encoder: add prefix to FREAD and FWRITE macros to avoid MacOSX macro
+	conflict
+
+	/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform
+	/Developer/SDKs/MacOSX10.10.sdk/usr/include/sys/fcntl.h:111:9: note:
+	previous definition is here #define FWRITE 0x0002
+	[b617dca5ce12]
+
+	* source/common/common.h, source/common/frame.h,
+	source/encoder/encoder.h:
+	common: move analysis reuse structs to common.h
+
+	files in common/ shouldn't include encoder.h
+	[72f1222903a3]
+
+2014-11-14  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/analysis.cpp:
+	analysis: encodeResidue() directly write to reconPic
+	[c3096034934f]
+
+2014-11-14  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/CMakeLists.txt, source/common/common.h,
+	source/common/frame.h, source/common/param.cpp,
+	source/encoder/analysis.cpp, source/encoder/analysis.h,
+	source/encoder/api.cpp, source/encoder/encoder.cpp,
+	source/encoder/encoder.h, source/x265.cpp, source/x265.def.in,
+	source/x265.h:
+	analysis save/load: refactor full implementation
+
+	1. Move analysis inter/intra data into encoder 2. Encoder allocates
+	and frees memory for x265 analysis, remove api calls 3. Inter and
+	intra data allocated based on sliceType only 4. frame record size is
+	now variable
+	[58c2e06c2e4a]
+
+2014-11-13  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/analysis.cpp:
+	analysis: don't add the cost of picture boundary CU to avgCost
+	[CHANGES OUTPUT]
+	[64314f8061f1]
+
+2014-11-13  Steve Borho  <steve@borho.org>
+
+	* source/cmake/FindVLD.cmake:
+	cmake: hack to avoid escaping problems in cmake 3.1 parser
+
+	Fix suggested by Mario *LigH* RohkrÃ¤mer
+	[17f2fb0996db]
+
+2014-11-13  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.cpp, source/encoder/analysis.cpp,
+	source/encoder/entropy.cpp, source/encoder/entropy.h,
+	source/encoder/sao.cpp:
+	nits
+	[03974d78f241]
+
+2014-11-12  Steve Borho  <steve@borho.org>
+
+	* source/encoder/rdcost.h:
+	rdcost: lower the psy-rd scale factor for I slices to 96/256
+
+	Based on Santhoshini's testing, this is better at preventing
+	artifacts
+	[18aefbde72ab]
+
+2014-11-11  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.cpp, source/common/cudata.h,
+	source/encoder/frameencoder.cpp, source/encoder/frameencoder.h:
+	refine initializeGeoms()
+	[98fb658f3229]
+
+2014-11-11  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.cpp:
+	analysis: fix bidir non-determinism in --pmode --rd 5
+	[306ef9782a30]
+
+	* source/common/param.cpp, source/encoder/analysis.cpp,
+	source/encoder/search.cpp, source/encoder/search.h:
+	Merge
+	[fa2fedd97ff2]
+
+2014-11-10  Steve Borho  <steve@borho.org>
+
+	* source/common/quant.cpp:
+	quant: allow --nr in all slice types evenly
+	[38fa64a5c51c]
+
+2014-11-06  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/common/common.h, source/common/quant.cpp,
+	source/common/quant.h:
+	noiseReduction: apply only for I and P, move NoiseReduction to
+	quant.h
+
+	This doubles the number of quant nr categories; intra blocks now use
+	the lower half.
+	[ed89e58b44e8]
+
+2014-11-10  Steve Borho  <steve@borho.org>
+
+	* doc/reST/cli.rst, source/common/param.cpp:
+	param: raise --nr limit to 2000
+	[27f293dd9eee]
+
+	* doc/reST/presets.rst, source/common/param.cpp:
+	param: remove --b-intra from --tune grain, document rdoq restriction
+	[64ccc616be33]
+
+2014-11-09  Steve Borho  <steve@borho.org>
+
+	* source/encoder/rdcost.h:
+	rdcost: experimental slice-type based psy-rd scale factor
+	[4f3fd7ab8868]
+
+2014-11-08  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.cpp, source/encoder/analysis.h,
+	source/encoder/search.cpp:
+	analysis: RDO based BIDIR decisions
+
+	At RD 0, 1, and 2, this changes 2Nx2N bidir from a SATD decision to
+	an SA8D decision.
+
+	At RD 3 and 4, if the bidir SA8D cost is within 17/16 of the best
+	inter cost, then it makes an RDO decision between bestInter and
+	Bidir (allowing psy-rd to influence the decision, which is the whole
+	point)
+
+	At RD 5 and 6, 2Nx2N BIDIR is yet another RD choice at the same
+	level as 2Nx2N inter and rect and amp. (psy) RDO picks the best mode
+	for each block.
+	[4c6c28cc93d9]
+
+2014-11-11  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/x265.cpp:
+	x265: more meaningful error messages in analysis
+	[838e41fb256b]
+
+	* source/encoder/api.cpp:
+	api: cleanup
+	[3c01e8881946]
+
+	* source/encoder/api.cpp:
+	api: replace analysis data with pre defined constant
+	[b4effa4dd53b]
+
+	* source/x265.cpp:
+	x265: create and initialise recon object if analysis mode is enabled
+	[47b290236ca3]
+
+	* source/common/param.cpp:
+	param: add default value to analysis mode
+	[5c397e744cfd]
+
+2014-11-11  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/encoder/analysis.cpp, source/encoder/api.cpp,
+	source/x265.cpp, source/x265.h:
+	x265: remove redundant variables from intra and inter analysis
+	structure
+	[ad5177c86756]
+
+	* source/encoder/analysis.cpp, source/encoder/search.cpp,
+	source/encoder/search.h, source/x265.h:
+	analysis: Dump best MV statistics and re-use this for analysis load
+	mode
+
+	This patch fixes a bug in inter slices in analysis=load|save mode.
+	Inter data for all partitions is now saved correctly.
+	[c8004323493e]
+
+2014-11-10  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.cpp, source/common/cudata.h,
+	source/common/deblock.cpp, source/encoder/analysis.cpp,
+	source/encoder/entropy.cpp, source/encoder/frameencoder.cpp,
+	source/encoder/search.cpp:
+	cleanup SIZE_NONE. empty CU has MODE_NONE.
+	[32513a4c3bd4]
+
+2014-11-09  Steve Borho  <steve@borho.org>
+
+	* source/encoder/search.cpp:
+	search: fixup
+	[1e04e178a349]
+
+2014-11-08  Steve Borho  <steve@borho.org>
+
+	* source/encoder/reference.cpp, source/encoder/reference.h,
+	source/encoder/search.cpp:
+	reference: add methods for querying CU/PU pointers
+	[9687a9d1205a]
+
+	* source/encoder/analysis.cpp:
+	analysis: delay initialization of prediction cu in RD 5 and 6
+	[b9147e641ce6]
+
+2014-11-09  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/analysis.cpp:
+	fix typo
+	[3dc9857c59d3]
+
+2014-11-08  Steve Borho  <steve@borho.org>
+
+	* source/encoder/analysis.cpp:
+	analysis: delay initialization of prediction cu until just before
+	use
+
+	This avoids initializing CUs that may never be used because of
+	various early-outs
+	[3f2d68368554]
+
+	* source/encoder/search.cpp, source/encoder/search.h:
+	search: keep AMVP candidates in mode structure
+
+	This fixes some work replication in --pme and will also make
+	handling BIDIR as a seperate prediction easier.
+	[6124c837b3ab]
+
+	* source/encoder/motion.h, source/encoder/search.cpp,
+	source/encoder/slicetype.h:
+	motion: remove trivial set methods; make some members public
+	[53c146f7eb9f]
+
+2014-11-07  Steve Borho  <steve@borho.org>
+
+	* source/encoder/frameencoder.cpp:
+	nr: fix denoise offset memcopy size
+	[0912563c4ac1]
+
+	* source/encoder/entropy.h:
+	entropy: pass context model (state) to bitsCodeBin as uint32_t
+
+	Should be slightly more efficient
+	[a67b848d6c04]
+
+	* source/encoder/entropy.cpp:
+	entropy: nit
+	[b55799a2f5ad]
+
+	* source/encoder/entropy.cpp:
+	entropy: ensure X265_CHECK() has braces
+	[0fd8e0c5272a]
+
+	* source/encoder/entropy.cpp, source/encoder/entropy.h:
+	entropy: inline methods which mapped to encodeBin() calls
+	[640d2936e699]
+
+	* source/encoder/entropy.cpp, source/encoder/entropy.h:
+	entropy: inline bit counting functions
+	[ca7873cab172]
+
+	* source/encoder/entropy.cpp:
+	entropy: use bitsCodeBin in intra mode bit estimate functions
+	[84fc74874406]
+
+	* source/encoder/entropy.cpp, source/encoder/entropy.h:
+	entropy: rename encodeBinContext to bitsCodeBin, make const
+
+	The function is not modifying the context, so there is no need to
+	pass as a reference, and the function can be const. Also, group the
+	bit counting RDO functions together
+	[a1ee9422183b]
+
+	* source/encoder/entropy.cpp:
+	entropy: white-space nits
+	[429742055057]
+
+2014-11-07  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/search.cpp:
+	fix bug in 522baf03fbbd
+	[f2130a4dc876]
+
+2014-11-07  Deepthi Nandakumar  <deepthi@multicorewareinc.com>
+
+	* source/encoder/search.cpp:
+	search: fix warnings
+	[7338b1f1f43d]
+
+2014-11-07  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/encoder/analysis.cpp:
+	fix typo
+	[4f034e3adef8]
+
+2014-11-05  Ashok Kumar Mishra  <ashok@multicorewareinc.com>
+
+	* source/encoder/entropy.cpp, source/encoder/entropy.h,
+	source/encoder/search.cpp, source/encoder/search.h:
+	[REVIEW PATCH/OUTPUT CHANGED]search: removed multiple encode
+	Coefficients from estimateResidualQT()
+
+	Tried to remove multiple encode coefficients from
+	estimateResidualQT() function. Coefficients are encoded in three
+	stages: Once for calculation of distortion and twice for split and
+	unsplit block cost calculation. I have given comments where I have
+	changed the code.
+	[eb5a9eb03dd6]
+
+	* source/encoder/search.cpp, source/encoder/search.h:
+	search: made a function for null cost calculation in
+	xEstimateResidualQT()
+	[522baf03fbbd]
+
+	* source/encoder/search.cpp, source/encoder/search.h:
+	search: made separate functions for encoding cbfs in
+	xEstimateResidualQT()
+	[0b7c709335b2]
+
+2014-11-07  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.cpp, source/common/cudata.h:
+	cudata: remove default argument
+	[bc4f3dab51db]
+
+2014-11-06  Steve Borho  <steve@borho.org>
+
+	* doc/reST/presets.rst:
+	doc: fix sub-title depth
+
+	Single dash was already used by a higher section
+	[0ebd0b00bf9b]
+
+	* doc/reST/cli.rst, doc/reST/presets.rst, source/common/param.cpp:
+	param: add --tune grain
+	[ec5588025568]
+
+	* source/encoder/search.cpp:
+	search: ugly bias hack for bidir with psy-rd
+	[e33e09549c0c]
+
+	* doc/reST/cli.rst:
+	docs: document RC params, at least minimally
+	[beac946dac85]
+
+	* source/x265.h:
+	api: cleanup comments
+	[8ceaab303bfa]
+
+	* source/x265.cpp:
+	cli: cleanup CLI help, add 'verbose' tier
+
+	Remove a lot of uncommon features from the initial help output,
+	require
+	--log-level debug --help to see it all
+	[f599a4df57ac]
+
+	* source/common/param.cpp, source/x265.cpp:
+	api: expose rate control params via x265_param_parse() and CLI
+
+	Adds range checks for qCompress, which has documented limits. The
+	others have very minimal explanations; so I'm not adding them to the
+	CLI help. Users should not touch them unless they know what they are
+	doing.
+
+	Note this commit doesn't bump X265_BUILD since no new params were
+	added.
+	[b37cda5d3092]
+
+2014-11-05  Steve Borho  <steve@borho.org>
+
+	* source/common/deblock.cpp:
+	deblock: fix type conversion warnings
+	[4a3997fd4fc1]
+
+2014-11-05  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/deblock.cpp, source/common/deblock.h,
+	source/common/quant.cpp, source/common/slice.h,
+	source/encoder/encoder.cpp, source/encoder/entropy.cpp,
+	source/encoder/framefilter.cpp, source/encoder/rdcost.h,
+	source/encoder/sao.cpp:
+	refine deblocking filter
+	[65e14d5a5728]
+
+2014-11-04  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/x265.cpp:
+	cli: bug fix for validatefanout param analysis-mode=save and load
+	[2a8f3d5820a6]
+
+2014-11-04  gopi jayaraman  <gopi@multicorewareinc.com>
+
+	* source/encoder/encoder.cpp:
+	encoder: use 6 frameNumThreads for cpucount 32 and above
+	[0dcc6a1d8f02]
+
+2014-11-04  Steve Borho  <steve@borho.org>
+
+	* source/x265.h:
+	api: add void to functions that take no parameters for -Wstrict-
+	prototypes
+	[0d44fcb269a6]
+
+	* source/common/deblock.cpp, source/common/frame.cpp,
+	source/common/frame.h, source/common/framedata.h,
+	source/common/predict.cpp, source/encoder/analysis.cpp,
+	source/encoder/dpb.cpp, source/encoder/encoder.cpp,
+	source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp,
+	source/encoder/ratecontrol.cpp, source/encoder/sao.cpp,
+	source/encoder/search.cpp, source/encoder/slicetype.cpp,
+	source/encoder/weightPrediction.cpp:
+	frame: rename m_reconPicYuv -> m_reconPic, m_origPicYuv -> m_fencPic
+
+	the fooPicYuv names were potentially confusing, preferred names:
+	PicYuv* fooPic; Yuv* fooYuv;
+	[67bf055c13d5]
+
+	* source/encoder/motion.cpp, source/encoder/motion.h:
+	motion: remove unused sa8d pointer and bufSA8D method
+	[59a08101dfc6]
+
+2014-11-04  Gopu Govindaswamy  <gopu@multicorewareinc.com>
+
+	* source/common/cudata.h, source/encoder/analysis.cpp,
+	source/encoder/analysis.h, source/encoder/api.cpp,
+	source/encoder/encoder.cpp, source/encoder/search.cpp,
+	source/encoder/search.h, source/x265.cpp, source/x265.h:
+	search: dump and share the best motion statistics for inter(P&B)
+	slices
+	[d5f6133b99d4]
+
+2014-11-03  Steve Borho  <steve@borho.org>
+
+	* doc/reST/cli.rst:
+	docs: fix reST parsing issues
+	[a8ec469d7fb1]
+
+2014-11-03  Min Chen  <chenm003@163.com>
+
+	* source/common/primitives.h, source/common/x86/pixel-util.h,
+	source/common/x86/pixel-util8.asm:
+	cleanup: remove unused asm calcrecon
+	[5637b495e2e1]
+
+	* source/common/x86/ipfilter8.asm:
+	asm: fix typo error in interp_8tap_vert_pp_4x4_avx2
+	[ee88b63aced0]
+
+2014-11-03  Satoshi Nakagawa  <nakagawa424@oki.com>
+
+	* source/common/cudata.cpp, source/common/cudata.h,
+	source/common/quant.cpp, source/encoder/analysis.cpp,
+	source/encoder/entropy.cpp, source/encoder/frameencoder.cpp,
+	source/encoder/search.cpp:
+	cleanup CUData::m_skipFlag
+	[2e60f3b81981]
+
+2014-10-31  Steve Borho  <steve@borho.org>
+
+	* source/encoder/encoder.cpp:
+	encoder: make it clear that --fast-cbf is innefective at lower rd
+	levels
+
+	This begs the question of whether the feature should exist, or
+	whether it should be added to the lower RD levels
+	[eebb372eec89]
+
+	* source/common/param.cpp:
+	param: show options using their CLI / param_parse names
+	[c32a733a819b]
+
+2014-10-30  Steve Borho  <steve@borho.org>
+
+	* .hgtags:
+	remove dead non-release tags
+
+	anyone interested in archeology can still find them; there's no
+	sense to keep them on the tip since we stopped tracking last known
+	good more than a year ago
+	[75cb2ab1ecec]
+
 2014-10-31  Steve Borho  <steve@borho.org>
 
+	* source/encoder/encoder.cpp:
+	Merge with stable
+	[ae8a661acdc4]
+
+	* .hgtags:
+	Added tag 1.4 for changeset 5e604833c5aa
+	[d2db9c1ab44b] <stable>
+
 	* source/encoder/encoder.cpp:
 	encoder: emit an Active Parameter Sets SEI in stream headers if
 	interlaced
diff --git a/build/README.txt b/build/README.txt
index c087349..d131884 100644
--- a/build/README.txt
+++ b/build/README.txt
@@ -11,26 +11,27 @@ Note: MSVC12 requires cmake 2.8.11 or later
 
 1. Yasm 1.2.0 or later, to compile assembly primitives (performance)
 
-   For Windows, download
-   http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win32.exe or
-   http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win64.exe
-   depending on your O/S and copy the EXE into C:\Windows or somewhere else
-   in your %PATH% that a 32-bit app (cmake) can find it. If it is not in the
-   path, you must manually tell cmake where to find it.
+   For Windows, download the latest yasm executable
+   http://yasm.tortall.net/Download.html and copy the EXE into
+   C:\Windows or somewhere else in your %PATH% that a 32-bit app (cmake)
+   can find it. If it is not in the path, you must manually tell cmake
+   where to find it.  Note: you do not need the vsyasm packages, x265
+   does not use them.  You only need the yasm executable itself.
 
-   For Linux, yasm-1.2.0 is likely too new to be packaged for your system so you
-   will need get http://www.tortall.net/projects/yasm/releases/yasm-1.2.0.tar.gz
-   compile, and install it.
+   On Linux, the packaged yasm may be older than 1.2, in which case
+   so you will need get the latest source and build it yourself.
 
    Once YASM is properly installed, run cmake to regenerate projects. If you
    do not see the below line in the cmake output, YASM is not in the PATH.
 
-   -- Found Yasm 1.2.0 to build assembly primitives
+   -- Found Yasm 1.3.0 to build assembly primitives
 
-   Now build the encoder and run x265 -V. If you see "assembly" on this
-   line, you have YASM properly installed:
+   Now build the encoder and run x265 -V:
 
-   x265 [info]: performance primitives: intrinsic assembly
+   x265 [info]: using cpu capabilities: MMX, SSE2, ...
+
+   If cpu capabilities line says 'none!', then the encoder was built
+   without yasm.
 
 2. VisualLeakDetector (Windows Only)
 
diff --git a/build/icl32/build-all.bat b/build/icl32/build-all.bat
deleted file mode 100644
index cbe9a59..0000000
--- a/build/icl32/build-all.bat
+++ /dev/null
@@ -1,14 +0,0 @@
-@echo off
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  exit 1
-)
-if not exist Makefile (
-  call make-makefile.bat
-)
-if exist Makefile (
-  call "%ICL%\bin\compilervars.bat" ia32
-  nmake
-)
diff --git a/build/icl32/make-makefile.bat b/build/icl32/make-makefile.bat
deleted file mode 100644
index 799344e..0000000
--- a/build/icl32/make-makefile.bat
+++ /dev/null
@@ -1,15 +0,0 @@
-@echo off
-::
-:: run this batch file to create an Intel C++ 2013 NMake makefile for this project.
-:: See the cmake documentation for other generator targets
-::
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  exit 1
-)
-call "%ICL%\bin\compilervars.bat" ia32
-set CC=icl
-set CXX=icl
-cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source
diff --git a/build/icl64/build-all.bat b/build/icl64/build-all.bat
deleted file mode 100644
index d1d6b8d..0000000
--- a/build/icl64/build-all.bat
+++ /dev/null
@@ -1,14 +0,0 @@
-@echo off
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  exit 1
-)
-if not exist Makefile (
-  call make-makefile.bat
-)
-if exist Makefile (
-  call "%ICL%\bin\compilervars.bat" intel64
-  nmake
-)
diff --git a/build/icl64/make-makefile.bat b/build/icl64/make-makefile.bat
deleted file mode 100644
index 2d3f629..0000000
--- a/build/icl64/make-makefile.bat
+++ /dev/null
@@ -1,17 +0,0 @@
-@echo off
-::
-:: run this batch file to create an Intel C++ 2013 NMake makefile for this project.
-:: See the cmake documentation for other generator targets
-::
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
-  msg "%username%" "Intel C++ 2013 not detected"
-  pause
-  exit 1
-)
-call "%ICL%\bin\compilervars.bat" intel64
-set CC=icl
-set CXX=icl
-cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source
-pause
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index eaa41cc..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,131 +0,0 @@
-x265 (1.4-4~ubuntu1) trusty; urgency=medium
-
-  * Support for ARMv7 with NEON extensions
-
- -- JÃ©rÃ´me Benoit <jerome.benoit@piment-noir.org>  Tue, 09 Dec 2014 22:19:18 +0100
-
-x265 (1.4-3~trusty) trusty; urgency=low
-
-  * Upstream bugfixes:
-    * cli: bug fix for validatefanout param analysis-mode=save and load
-    * docs: fix reST parsing issues
-
- -- Marshall Banana <djcj@gmx.de>  Wed, 05 Nov 2014 01:29:32 +0100
-
-x265 (1.4-2~trusty2) trusty; urgency=low
-
-  * Install documentation in x265-doc package to avoid package conflicts
-
- -- Marshall Banana <djcj@gmx.de>  Sat, 01 Nov 2014 02:49:51 +0100
-
-x265 (1.4-1~trusty) trusty; urgency=low
-
-  * New upstream release
-
- -- Marshall Banana <djcj@gmx.de>  Sat, 01 Nov 2014 00:20:42 +0100
-
-x265 (1.3-4~trusty) trusty; urgency=low
-
-  * Update manpage
-
- -- Marshall Banana <djcj@gmx.de>  Wed, 01 Oct 2014 18:09:33 +0200
-
-x265 (1.3-3~trusty) trusty; urgency=low
-
-  * Rename x265-10b to x265-10bit
-  * Provide x265-16bit via symbolic link
-
- -- Marshall Banana <djcj@gmx.de>  Wed, 01 Oct 2014 17:56:41 +0200
-
-x265 (1.3-2~trusty) trusty; urgency=low
-
-  * Add doc-base control file
-
- -- Marshall Banana <djcj@gmx.de>  Fri, 05 Sep 2014 04:07:20 +0200
-
-x265 (1.3-1~trusty) trusty; urgency=low
-
-  * New upstream release
-
- -- Marshall Banana <djcj@gmx.de>  Fri, 22 Aug 2014 20:30:50 +0200
-
-x265 (1.2+510-hg2bdcfcc1bb33-1~trusty) trusty; urgency=low
-
-  * Current snapshot
-
- -- Marshall Banana <djcj@gmx.de>  Mon, 11 Aug 2014 12:51:05 +0200
-
-x265 (1.2-3~trusty) trusty; urgency=low
-
-  * Build static library from different object files
-
- -- Marshall Banana <djcj@gmx.de>  Mon, 21 Jul 2014 05:29:20 +0200
-
-x265 (1.2-2~trusty1) trusty; urgency=low
-
-  * Provide separate optimized shared libraries for i686
-
- -- Marshall Banana <djcj@gmx.de>  Fri, 11 Jul 2014 20:36:37 +0200
-
-x265 (1.2-1~trusty) trusty; urgency=low
-
-  * New upstream version
-  * Update patch
-  * Update man page
-  * Install upstream changelog
-
- -- Marshall Banana <djcj@gmx.de>  Thu, 10 Jul 2014 19:40:33 +0200
-
-x265 (1.1-4~trusty) trusty; urgency=low
-
-  * Install 10bit binary to make usage of 10bit library possible.
-
- -- Marshall Banana <djcj@gmx.de>  Tue, 17 Jun 2014 10:53:50 +0200
-
-x265 (1.1-3~trusty) trusty; urgency=low
-
-  * Don't rename 10 bit library
-
- -- Marshall Banana <djcj@gmx.de>  Fri, 13 Jun 2014 16:43:34 +0200
-
-x265 (1.1-2~trusty) trusty; urgency=low
-
-  * Build less packages
-
- -- Marshall Banana <djcj@gmx.de>  Wed, 11 Jun 2014 03:47:39 +0200
-
-x265 (1.1-1~trusty) trusty; urgency=low
-
-  * New upstream version
-
- -- Marshall Banana <djcj@gmx.de>  Fri, 13 Jun 2014 16:42:16 +0200
-
-x265 (1.1) unstable; urgency=low
-
-  * New upstream version
-
- -- Marshall Banana <djcj@gmx.de>  Sat, 07 Jun 2014 17:44:06 +0200
-
-x265 (1.0+5-dcf74ea39e31) unstable; urgency=low
-
-  * New upstream version
-
- -- Marshall Banana <djcj@gmx.de>  Sun, 04 May 2014 19:07:30 +0100
-
-x265 (0.9+114-c630b0b393ee) unstable; urgency=low
-
-  * New upstream version
-
- -- Marshall Banana <djcj@gmx.de>  Fri, 04 Apr 2014 01:45:30 +0100
-
-x265 (0.8+52-93861c42b879) unstable; urgency=low
-
-  * New upstream version
-
- -- Marshall Banana <djcj@gmx.de>  Sat, 08 Mar 2014 10:08:00 +0100
-
-x265 (0.7+216-591ca91f0501) unstable; urgency=low
-
-  * Initial upload
-
- -- Marshall Banana <djcj@gmx.de>  Wed, 19 Feb 2014 21:30:00 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/confflags b/debian/confflags
deleted file mode 100644
index 714d8c1..0000000
--- a/debian/confflags
+++ /dev/null
@@ -1,45 +0,0 @@
-libdir := lib/$(DEB_HOST_MULTIARCH)
-
-
-common_confflags := \
-	-DCMAKE_INSTALL_PREFIX=/usr \
-	-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-	-DCMAKE_VERBOSE_MAKEFILE=ON
-
-8bit_confflags := \
-	$(common_confflags) \
-	-DLIB_INSTALL_DIR=$(libdir)
-
-10bit_confflags := \
-	$(common_confflags) \
-	-DBIN_INSTALL_DIR=$(libdir)/x265-10bit \
-	-DLIB_INSTALL_DIR=$(libdir)/x265-10bit \
-	-DHIGH_BIT_DEPTH=ON
-
-
-static_confflags := \
-	-DCMAKE_INSTALL_PREFIX=/usr \
-	-DCMAKE_BUILD_TYPE=Release \
-	-DCMAKE_VERBOSE_MAKEFILE=ON \
-	-DENABLE_CLI=OFF \
-	-DENABLE_SHARED=OFF
-
-8bit_static_confflags  := \
-	$(static_confflags) \
-	-DLIB_INSTALL_DIR=$(libdir)
-
-10bit_static_confflags := \
-	$(static_confflags) \
-	-DLIB_INSTALL_DIR=$(libdir)/x265-10bit \
-	-DHIGH_BIT_DEPTH=ON
-
-
-# disable assembly on x86 and arm
-ifneq (,$(filter i386 i486 i586 i686 pentium arm,$(DEB_HOST_GNU_CPU)))
-noasm = -DENABLE_ASSEMBLY=OFF -DCMAKE_CXX_FLAGS='-DX86_64=0'
-8bit_confflags  += $(noasm)
-10bit_confflags += $(noasm)
-8bit_static_confflags  += $(noasm)
-10bit_static_confflags += $(noasm)
-endif
-
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 35db2b2..0000000
--- a/debian/control
+++ /dev/null
@@ -1,73 +0,0 @@
-Source: x265
-Section: libs
-Priority: optional
-Maintainer: Marshall Banana <djcj@gmx.de>
-Homepage: https://bitbucket.org/multicoreware/x265/wiki/Home
-Standards-Version: 3.9.5
-Build-Depends:
- debhelper (>= 9),
- cmake (>= 2.8.8),
- python3-sphinx,
- yasm (>= 1.2.0) [any-i386]
-Vcs-Browser: https://bitbucket.org/multicoreware/x265/src
-
-Package: x265
-Architecture: any
-Section: video
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - frontend binary
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-35
-Architecture: any
-Pre-Depends:
- ${misc:Pre-Depends}
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - runtime files
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-dev
-Architecture: any
-Section: libdevel
-Depends:
- ${misc:Depends},
- libx265-35 (= ${binary:Version})
-Description: H.265/HEVC video encoder - development files
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the static library and
- headers used to build programs that use libx265-35.
-
-Package: x265-doc
-Architecture: all
-Section: doc
-Depends:
- ${misc:Depends},
- libjs-jquery (>= 1.4),
- libjs-underscore
-Description: x265 documentation
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the x265 documentation.
-
-Package: x265-dbg
-Architecture: any
-Section: debug
-Priority: extra
-Depends:
- ${misc:Depends},
- x265 (= ${binary:Version}),
- libx265-35 (= ${binary:Version})
-Description: debugging symbols for x265 and libx265
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the debugging symbols for x265.
diff --git a/debian/control.in b/debian/control.in
deleted file mode 100644
index 8703763..0000000
--- a/debian/control.in
+++ /dev/null
@@ -1,73 +0,0 @@
-Source: x265
-Section: libs
-Priority: optional
-Maintainer: Marshall Banana <djcj@gmx.de>
-Homepage: https://bitbucket.org/multicoreware/x265/wiki/Home
-Standards-Version: 3.9.5
-Build-Depends:
- debhelper (>= 9),
- cmake (>= 2.8.8),
- python3-sphinx,
- yasm (>= 1.2.0) [any-i386]
-Vcs-Browser: https://bitbucket.org/multicoreware/x265/src
-
-Package: x265
-Architecture: any
-Section: video
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - frontend binary
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-@API@
-Architecture: any
-Pre-Depends:
- ${misc:Pre-Depends}
-Depends:
- ${misc:Depends},
- ${shlibs:Depends}
-Description: H.265/HEVC video encoder - runtime files
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
-
-Package: libx265-dev
-Architecture: any
-Section: libdevel
-Depends:
- ${misc:Depends},
- libx265-@API@ (= ${binary:Version})
-Description: H.265/HEVC video encoder - development files
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the static library and
- headers used to build programs that use libx265-@API@.
-
-Package: x265-doc
-Architecture: all
-Section: doc
-Depends:
- ${misc:Depends},
- libjs-jquery (>= 1.4),
- libjs-underscore
-Description: x265 documentation
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the x265 documentation.
-
-Package: x265-dbg
-Architecture: any
-Section: debug
-Priority: extra
-Depends:
- ${misc:Depends},
- x265 (= ${binary:Version}),
- libx265-@API@ (= ${binary:Version})
-Description: debugging symbols for x265 and libx265
- library for encoding video using the High
- Efficiency Video Coding (HEVC/H.265) standard.
- .
- This package contains the debugging symbols for x265.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index bd5f670..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,115 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: x265
-Upstream-Contact: Steve Borho <steve@borho.org>
-Source: https://bitbucket.org/multicoreware/x265/wiki/Home
-
-
-Files: *
-Copyright: 2013-2014 x265 project
-License: GPL-2.0+
-
-Files: source/common/const-a.asm
-       source/common/cpu-a.asm
-       source/common/intrapred.h
-       source/common/mc-a*.asm
-       source/common/pixel.h
-       source/common/pixel-32.asm
-       source/common/pixel-a.asm
-       source/common/sad*.asm
-       source/common/ssd.asm
-       source/common/x86inc.asm
-       source/test/checkasm-a.asm
-Copyright: 2003-2014 x264 project
-License: GPL-2.0+
-
-Files: source/common/x86util.asm
-Copyright: 2008-2013 x264 project
-License: ISC
-
-Files: source/compat/getopt/*
-Copyright: 1987-2001 Free Software Foundation, Inc.
-License: LGPL-2.1+
-
-Files: source/Lib/*
-Copyright: 2010-2013 ITU/ISO/IEC
-License: BSD-3-clause
-
-Files: debian/*
-Copyright: 2014 djcj <djcj@gmx.de>
-License: ISC
-
-
-License: GPL-2.0+
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- .
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program.  If not, see <http://www.gnu.org/licenses/>.
- .
- On Debian GNU/Linux systems, the complete text of the GNU General Public
- License version 2 can be found in '/usr/share/common-licenses/GPL-2'.
-
-
-License: ISC
- Permission to use, copy, modify, and/or distribute this software for any
- purpose with or without fee is hereby granted, provided that the above
- copyright notice and this permission notice appear in all copies.
- .
- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-
-License: LGPL-2.1+
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- .
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
- .
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
- .
- On Debian systems, the complete text of the GNU Lesser General
- Public License version 3 can be found in '/usr/share/common-licenses/LGPL-2.1'.
-
-
-License: BSD-3-clause
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- .
-  * Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-  * Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
-    be used to endorse or promote products derived from this software without
-    specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
- BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/debian/getapi.sh b/debian/getapi.sh
deleted file mode 100755
index c23fd26..0000000
--- a/debian/getapi.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-egrep 'set\(X265_BUILD ' source/CMakeLists.txt | sed -e 's/set(X265_BUILD //; s/)//g'
diff --git a/debian/libx265-dev.install b/debian/libx265-dev.install
deleted file mode 100644
index f58613d..0000000
--- a/debian/libx265-dev.install
+++ /dev/null
@@ -1,6 +0,0 @@
-usr/include
-usr/lib/*/*.a
-usr/lib/*/*.so
-usr/lib/*/pkgconfig
-usr/lib/*/x265-10bit/*.a
-usr/lib/*/x265-10bit/*.so
diff --git a/debian/libx265N.install b/debian/libx265N.install
deleted file mode 100644
index 3d608f1..0000000
--- a/debian/libx265N.install
+++ /dev/null
@@ -1,2 +0,0 @@
-usr/lib/*/*.so.*
-usr/lib/*/x265-10bit/*.so.*
diff --git a/debian/patches/armv7l-support.patch b/debian/patches/armv7l-support.patch
deleted file mode 100644
index b56a92e..0000000
--- a/debian/patches/armv7l-support.patch
+++ /dev/null
@@ -1,43 +0,0 @@
---- a/source/CMakeLists.txt
-+++ b/source/CMakeLists.txt
-@@ -49,9 +49,13 @@ if("${SYSPROC}" STREQUAL "" OR X86MATCH
-         message(STATUS "Detected x86 system processor")
-     endif()
- elseif(${SYSPROC} STREQUAL "armv6l")
--    message(STATUS "Detected ARM target processor")
--    set(ARM 1)
-+    message(STATUS "Detected ARMV6 target processor")
-+    set(ARMV6 1)
-     add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
-+elseif(${SYSPROC} STREQUAL "armv7l")
-+    message(STATUS "Detected ARMV7 target processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=1)
- else()
-     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
-     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -129,9 +133,12 @@ if(GCC)
-     if(X86 AND NOT X64)
-         add_definitions(-march=i686)
-     endif()
--    if(ARM)
-+    if(ARMV6)
-         add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
-     endif()
-+    if(ARMV7)
-+        add_definitions(-fPIC -march=armv7 -mfloat-abi=hard -mfpu=neon)
-+    endif()
-     check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) 
-     check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) 
-     if (CC_HAS_NO_ARRAY_BOUNDS)
---- a/source/common/cpu.cpp
-+++ b/source/common/cpu.cpp
-@@ -356,7 +356,7 @@ uint32_t cpu_detect(void)
-     // which may result in incorrect detection and the counters stuck enabled.
-     // right now Apple does not seem to support performance counters for this test
- #ifndef __MACH__
--    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
-+    //flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
- #endif
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #endif // if HAVE_ARMV6
diff --git a/debian/patches/bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch b/debian/patches/bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch
deleted file mode 100644
index 1048433..0000000
--- a/debian/patches/bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch
+++ /dev/null
@@ -1,16 +0,0 @@
---- a/source/x265.cpp
-+++ b/source/x265.cpp
-@@ -772,12 +772,9 @@
-     {\
-         bErr = 0;\
-         p = strstr(paramBuf, opt "=");\
--        char* q = strstr(paramBuf, "no-"opt);\
-         if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
-             bErr = 1;\
--        else if (!param_val && !q)\
--            bErr = 1;\
--        else if (param_val && (q || !strstr(paramBuf, opt)))\
-+        else if (param_val && strstr(paramBuf, "no-"opt))\
-             bErr = 1;\
-         if (bErr)\
-         {\
diff --git a/debian/patches/cpu-detection.patch b/debian/patches/cpu-detection.patch
deleted file mode 100644
index 1a441ac..0000000
--- a/debian/patches/cpu-detection.patch
+++ /dev/null
@@ -1,18 +0,0 @@
---- a/source/CMakeLists.txt
-+++ b/source/CMakeLists.txt
-@@ -39,12 +39,14 @@
- set(X86_ALIASES x86 i386 i686 x86_64 amd64)
- list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
- if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
--    message(STATUS "Detected x86 target processor")
-     set(X86 1)
-     add_definitions(-DX265_ARCH_X86=1)
-     if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
-+        message(STATUS "Detected x86_64 system processor")
-         set(X64 1)
-         add_definitions(-DX86_64=1)
-+    else()
-+        message(STATUS "Detected x86 system processor")
-     endif()
- elseif(${SYSPROC} STREQUAL "armv6l")
-     message(STATUS "Detected ARM target processor")
diff --git a/debian/patches/fix-reST-parsing-issues-in-docs.patch b/debian/patches/fix-reST-parsing-issues-in-docs.patch
deleted file mode 100644
index 9658aff..0000000
--- a/debian/patches/fix-reST-parsing-issues-in-docs.patch
+++ /dev/null
@@ -1,31 +0,0 @@
---- a/doc/reST/cli.rst
-+++ b/doc/reST/cli.rst
-@@ -63,7 +63,7 @@
- 	is used for WPP and for distributed analysis and motion search:
- 	:option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
- 
--	If :option:`--threads`=1 is specified, then no thread pool is
-+	If :option:`--threads` 1 is specified, then no thread pool is
- 	created. When no thread pool is created, all the thread pool
- 	features are implicitly disabled. If all the pool features are
- 	disabled by the user, then the pool is implicitly disabled.
-@@ -904,8 +904,8 @@
- 	between 0 and 1, or in kbits. In other words these two option pairs
- 	are equivalent::
- 
--	:option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900
--	:option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9
-+	--vbv-bufsize 1000 --vbv-init 900
-+	--vbv-bufsize 1000 --vbv-init 0.9
- 
- 	Default 0.9
- 
-@@ -1206,7 +1206,7 @@
- .. option:: --aud, --no-aud
- 
- 	Emit an access unit delimiter NAL at the start of each slice access
--	unit. If option:`--repeat-headers` is not enabled (indicating the
-+	unit. If :option:`--repeat-headers` is not enabled (indicating the
- 	user will be writing headers manually at the start of the stream)
- 	the very first AUD will be skipped since it cannot be placed at the
- 	start of the access unit, where it belongs. Default disabled
diff --git a/debian/patches/make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch b/debian/patches/make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch
deleted file mode 100644
index d1ee9cb..0000000
--- a/debian/patches/make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch
+++ /dev/null
@@ -1,15 +0,0 @@
---- a/source/encoder/encoder.cpp
-+++ b/source/encoder/encoder.cpp
-@@ -1330,6 +1330,12 @@
-         p->bBPyramid = 0;
- 
-     /* Disable features which are not supported by the current RD level */
-+    if (p->rdLevel < 5)
-+    {
-+        if (p->bEnableCbfFastMode)      /* impossible */
-+            x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n");
-+        p->bEnableCbfFastMode = 0;
-+    }
-     if (p->rdLevel < 4)
-     {
-         if (p->psyRdoq > 0)             /* impossible */
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 1282f51..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,7 +0,0 @@
-version.patch
-cpu-detection.patch
-show-options-using-their-param_parse-names.patch
-make_it_clear_that_--fast-cbf_is_innefective_at_lower_rd_levels.patch
-bug_fix_for_validatefanout_param_analysis-mode_save_and_load.patch
-fix-reST-parsing-issues-in-docs.patch
-armv7l-support.patch
diff --git a/debian/patches/show-options-using-their-param_parse-names.patch b/debian/patches/show-options-using-their-param_parse-names.patch
deleted file mode 100644
index a6cba2d..0000000
--- a/debian/patches/show-options-using-their-param_parse-names.patch
+++ /dev/null
@@ -1,13 +0,0 @@
---- a/source/common/param.cpp
-+++ b/source/common/param.cpp
-@@ -1152,8 +1152,8 @@
-         fprintf(stderr, "psy-rd=%.2lf ", param->psyRd);
-     if (param->psyRdoq > 0.)
-         fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq);
--    TOOLOPT(param->bEnableEarlySkip, "esd");
--    TOOLOPT(param->bEnableCbfFastMode, "cfm");
-+    TOOLOPT(param->bEnableEarlySkip, "early-skip");
-+    TOOLOPT(param->bEnableCbfFastMode, "fast-cbf");
-     if (param->noiseReduction)
-         fprintf(stderr, "nr=%d ", param->noiseReduction);
-     TOOLOPT(param->bEnableLoopFilter, "lft");
diff --git a/debian/patches/version.patch b/debian/patches/version.patch
deleted file mode 100644
index 639310c..0000000
--- a/debian/patches/version.patch
+++ /dev/null
@@ -1,13 +0,0 @@
---- a/source/cmake/version.cmake
-+++ b/source/cmake/version.cmake
-@@ -6,8 +6,8 @@
- find_package(Git QUIET) # present in 2.8.8
- 
- # defaults, in case everything below fails
--set(X265_VERSION "unknown")
--set(X265_LATEST_TAG "0.0")
-+set(X265_VERSION "1.4")
-+set(X265_LATEST_TAG "1.4")
- set(X265_TAG_DISTANCE "0")
- 
- if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 33f9531..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/make -f
-
-DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
-DEB_HOST_GNU_CPU   ?= $(shell dpkg-architecture -qDEB_HOST_GNU_CPU)
-DEB_HOST_GNU_TYPE  ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
-API ?= $(shell debian/getapi.sh)
-LIB  = $(CURDIR)/debian/install/usr/lib/$(DEB_HOST_MULTIARCH)
-
-include debian/confflags
-
-builddir     = compiling
-x265-clean   = rm -rf $(builddir) && mkdir $(builddir)
-x265-install = $(MAKE) -C $(builddir) install DESTDIR=$(CURDIR)/debian/install
-x265-build   = dh_auto_build -D$(builddir)
-test-build   = \
-	$(builddir)/x265 --pass 1 --bitrate 10 -o /dev/null debian/test.y4m && \
-	$(builddir)/x265 --pass 2 --bitrate 10 -o test.hevc debian/test.y4m
-
-
-%:
-	dh ${@} --parallel
-
-override_dh_auto_clean:
-	dh_auto_clean
-	rm -rf $(builddir) doc/reST/build debian/install debian/test.y4m
-
-override_dh_auto_build:
-	sed -e 's/@API@/$(API)/g' debian/control.in > debian/control
-	cp -f debian/libx265N.install debian/libx265-$(API).install
-	unxz -fk debian/test.y4m.xz
-# build x265 8-bit
-	mkdir $(builddir)
-	cd $(builddir) && cmake $(8bit_confflags) ../source
-	$(x265-build)
-	$(x265-install)
-# test x265 8-bit
-#	$(test-build)
-# build x265 10-bit
-	$(x265-clean)
-	cd $(builddir) && cmake $(10bit_confflags) ../source
-	$(x265-build)
-	$(x265-install)
-	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
-		debian/x265-10bit.in > $(builddir)/x265-10bit
-	install -c -D -m755 $(builddir)/x265-10bit $(CURDIR)/debian/install/usr/bin
-# test x265 10-bit
-#	$(test-build)
-# build x265 8-bit static
-	$(x265-clean)
-	rm -f $(LIB)/libx265.a
-	cd $(builddir) && cmake $(8bit_static_confflags) ../source
-	$(x265-build)
-	install -c -D -m644 $(builddir)/libx265.a $(LIB)
-# build x265 10-bit static
-	$(x265-clean)
-	rm -f $(LIB)/x265-10bit/libx265.a
-	cd $(builddir) && cmake $(10bit_static_confflags) ../source
-	$(x265-build)
-	install -c -D -m644 $(builddir)/libx265.a $(LIB)/x265-10bit
-# build documentation
-	$(MAKE) -C doc/reST pickle html
-
-override_dh_install:
-	dh_install --list-missing --sourcedir=$(CURDIR)/debian/install
-
-override_dh_installchangelogs:
-	dh_installchangelogs -px265-doc -plibx265-$(API) ChangeLog
-
-override_dh_installdocs:
-	dh_installdocs -plibx265-$(API) -px265-doc -X.buildinfo
-	dh_installdocs -px265 -px265-dbg -plibx265-dev --link-doc=libx265-$(API)
-
-override_dh_strip:
-	dh_strip --dbg-package=x265-dbg
-
-override_dh_builddeb:
-	dh_builddeb -- -Zxz -z9
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/source/include-binaries b/debian/source/include-binaries
deleted file mode 100644
index c489363..0000000
--- a/debian/source/include-binaries
+++ /dev/null
@@ -1 +0,0 @@
-debian/test.y4m.xz
diff --git a/debian/source/options b/debian/source/options
deleted file mode 100644
index 74a452b..0000000
--- a/debian/source/options
+++ /dev/null
@@ -1,2 +0,0 @@
-compression = "xz"
-compression-level = 9
diff --git a/debian/test.y4m.xz b/debian/test.y4m.xz
deleted file mode 100644
index 28d3c1c..0000000
Binary files a/debian/test.y4m.xz and /dev/null differ
diff --git a/debian/upstream-changelog_help.txt b/debian/upstream-changelog_help.txt
deleted file mode 100644
index 7539445..0000000
--- a/debian/upstream-changelog_help.txt
+++ /dev/null
@@ -1 +0,0 @@
-hg log --style changelog > ChangeLog
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 4867948..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-https://bitbucket.org/multicoreware/x265/get/([0-9.]+)\.tar\.(?:xz|bz2|gz)
\ No newline at end of file
diff --git a/debian/x265-10bit.in b/debian/x265-10bit.in
deleted file mode 100644
index 18b1fb7..0000000
--- a/debian/x265-10bit.in
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-LIBX265_PATH=/usr/lib/@DEB_HOST_MULTIARCH@/x265-10bit
-LD_LIBRARY_PATH="$LIBX265_PATH:$LD_LIBRARY_PATH" $LIBX265_PATH/x265 "$@"
diff --git a/debian/x265-doc.doc-base b/debian/x265-doc.doc-base
deleted file mode 100644
index 904a43c..0000000
--- a/debian/x265-doc.doc-base
+++ /dev/null
@@ -1,10 +0,0 @@
-Document: x265
-Title: x265 Documentation
-Author: Steve Borho <steve@borho.org>
-Abstract: This is the official documentation x265,
- a free H.265/HEVC video encoder.
-Section: Video
-
-Format: HTML
-Index: /usr/share/doc/x265-doc/html/index.html
-Files: /usr/share/doc/x265-doc/html/*.html
diff --git a/debian/x265-doc.docs b/debian/x265-doc.docs
deleted file mode 100644
index b05b5c7..0000000
--- a/debian/x265-doc.docs
+++ /dev/null
@@ -1,2 +0,0 @@
-doc/intra
-doc/reST/build/html
diff --git a/debian/x265-doc.links b/debian/x265-doc.links
deleted file mode 100644
index 29586c4..0000000
--- a/debian/x265-doc.links
+++ /dev/null
@@ -1,2 +0,0 @@
-/usr/share/javascript/jquery/jquery.js           /usr/share/doc/x265-doc/html/_static/jquery.js
-/usr/share/javascript/underscore/underscore.js   /usr/share/doc/x265-doc/html/_static/underscore.js
diff --git a/debian/x265-help.txt b/debian/x265-help.txt
deleted file mode 100644
index dda07f7..0000000
--- a/debian/x265-help.txt
+++ /dev/null
@@ -1,174 +0,0 @@
-
-Syntax: x265 [options] infile [-o] outfile
-    infile can be YUV or Y4M
-    outfile is raw HEVC bitstream
-
-Executable Options:
--h/--help                        Show this help text and exit
--V/--version                     Show version info and exit
-
-Output Options:
--o/--output <filename>           Bitstream output file name
-   --log-level <string>          Logging level: none error warning info debug full. Default info
-   --no-progress                 Disable CLI progress reports
-   --[no-]cu-stats               Enable logging stats about distribution of cu across all modes. Default disabled
-   --csv <filename>              Comma separated log file, log level >= 3 frame log, else one line per run
-
-Input Options:
-   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin
-   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension
-   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M
-   --input-res WxH               Source picture size [w x h], auto-detected if Y4M
-   --input-depth <integer>       Bit-depth of input file. Default 8
-   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420
--f/--frames <integer>            Maximum number of frames to encode. Default all
-   --seek <integer>              First frame to encode
-   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive
-   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled
-
-Quality reporting metrics:
-   --[no-]ssim                   Enable reporting SSIM metric scores. Default disabled
-   --[no-]psnr                   Enable reporting PSNR metric scores. Default disabled
-
-Profile, Level, Tier:
-   --profile <string>            Enforce an encode profile: main, main10, mainstillpicture
-   --level-idc <integer|float>   Force a minumum required decoder level (as '5.0' or '50')
-   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level
-
-Threading, performance:
-   --threads <integer>           Number of threads for thread pool (0: detect CPU core count, default)
--F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count
-   --[no-]wpp                    Enable Wavefront Parallel Processing. Default enabled
-   --[no-]pmode                  Parallel mode analysis. Default disabled
-   --[no-]pme                    Parallel motion estimation. Default disabled
-   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto
-
-Presets:
--p/--preset <string>             Trade off performance for compression efficiency. Default medium
-                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo
--t/--tune <string>               Tune the settings for a particular type of source or situation:
-                                 psnr, ssim, zerolatency, or fastdecode
-
-Quad-Tree size and depth:
--s/--ctu <64|32|16>              Maximum CU size (default: 64x64). Default 64
-   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default 1
-   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default 1
-   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default disabled
-   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default disabled
-
-Analysis:
-   --rd <0..6>                   Level of RD in mode decision 0:least....6:full RDO. Default 3
-   --psy-rd <0..2.0>             Strength of psycho-visual rate distortion optimization, 0 to disable. Default 0.000000
-   --psy-rdoq <0..50.0>          Strength of psycho-visual optimization in quantization, 0 to disable. Default 0.000000
-   --nr <integer>                An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled
-   --[no-]tskip-fast             Enable fast intra transform skipping. Default disabled
-   --[no-]early-skip             Enable early SKIP detection. Default disabled
-   --[no-]fast-cbf               Enable early outs based on whether residual is coded. Default disabled
-
-Coding tools:
--w/--[no-]weightp                Enable weighted prediction in P slices. Default enabled
-   --[no-]weightb                Enable weighted prediction in B slices. Default disabled
-   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default disabled
-   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default enabled
-   --[no-]tskip                  Enable intra 4x4 transform skipping. Default disabled
-
-Temporal / motion search options:
-   --me <string>                 Motion search method dia hex umh star full. Default 1
--m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default 2 
-   --merange <integer>           Motion search range. Default 57
-   --max-merge <1..5>            Maximum number of merge candidates. Default 2
-   --[no-]temporal-mvp           Enable temporal MV predictors. Default enabled
-
-Spatial / intra options:
-   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default enabled
-   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default disabled
-   --[no-]b-intra                Enable intra in B frames in veryslow presets. Default disabled
-   --[no-]fast-intra             Enable faster search method for angular intra predictions. Default disabled
-   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default 0
-
-Slice decision options:
-   --[no-]open-gop               Enable open-GOP, allows I slices to be non-IDR. Default enabled
--I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default 250
--i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto
-   --no-scenecut                 Disable adaptive I-frame decision
-   --scenecut <integer>          How aggressively to insert extra I-frames. Default 40
-   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default 20
-   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default 4
-   --bframe-bias <integer>       Bias towards B frame decisions. Default 0
-   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default 2
-   --[no-]b-pyramid              Use B-frames as references. Default enabled
-   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default 3
-   --qpfile <string>             Force frametypes and QPs for some or all frames
-                                 Format of each line: framenumber frametype QP
-                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.
-                                 QPs are restricted by qpmin/qpmax.
-
-Rate control, Quantization:
-   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default 0
--q/--qp <integer>                QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs
-   --crf <float>                 Quality-based VBR (0-51). Default 28.000000
-   --[no-]lossless               Enable lossless: bypass transform, quant and loop filters globally. Default disabled
-   --crf-max <float>             With CRF+VBV, limit RF to this value. Default 0.000000
-                                 May cause VBV underflows!
-   --crf-min <float>             With CRF+VBV, limit RF to this value. Default 0.000000
-                                 this specifies a minimum rate factor value for encode!
-   --vbv-maxrate <integer>       Max local bitrate (kbit/s). Default 0
-   --vbv-bufsize <integer>       Set size of the VBV buffer (kbit). Default 0
-   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default 0.900000
-   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default 2
-   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default 1.000000
-   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default enabled
-   --ipratio <float>             QP factor between I and P. Default 1.400000
-   --pbratio <float>             QP factor between P and B. Default 1.300000
-   --cbqpoffs <integer>          Chroma Cb QP Offset. Default 0
-   --crqpoffs <integer>          Chroma Cr QP Offset. Default 0
-   --stats                       Filename for stats file in multipass pass rate control. Default x265_2pass.log
-   --pass                        Multi pass rate control.
-                                   - 1 : First pass, creates stats file
-                                   - 2 : Last pass, does not overwrite stats file
-                                   - 3 : Nth pass, overwrites stats file
-   --[no-]slow-firstpass         Enable a slow first pass in a multipass rate control mode. Default disabled
-   --analysis-mode <string|int>  save - Dump analysis info into file, load - Load analysis buffers from the file. Default 0
-   --analysis-file <filename>    Specify file name used for either dumping or reading analysis data.
-   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off
-   --lambda-file <string>        Specify a file containing replacement values for the lambda tables
-                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table
-                                 Blank lines and lines starting with hash(#) are ignored
-                                 Comma is considered to be white-space
-
-Loop filters (deblock and SAO):
-   --[no-]lft                    Enable Deblocking Loop Filter. Default enabled
-   --[no-]sao                    Enable Sample Adaptive Offset. Default enabled
-   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default disabled
-
-VUI options:
-   --sar <width:height|int>      Sample Aspect Ratio, the ratio of width to height of an individual pixel.
-                                 Choose from 0=undef, 1=1:1("square"), 2=12:11, 3=10:11, 4=16:11,
-                                 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,
-                                 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default 0
-   --crop-rect <string>          Add 'left,top,right,bottom' to the bitstream-level cropping rectangle
-   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef
-   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef
-   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited
-   --colorprim <string>          Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,
-                                 smpte240m, film, bt2020. Default undef
-   --transfer <string>           Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,
-                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,
-                                 bt2020-10, bt2020-12. Default undef
-   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,
-                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef
-   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of 0
-
-Bitstream options:
-   --[no-]info                   Emit SEI identifying encoder and parameters. Default enabled
-   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default disabled
-   --[no-]hrd                    Enable HRD parameters signalling. Default disabled
-   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default disabled
-   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default 0
-
-Reconstructed video options (debugging):
--r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name
-   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M
-
-
-Full documentation may be found at http://x265.readthedocs.org/en/default/cli.html
diff --git a/debian/x265.1 b/debian/x265.1
deleted file mode 100644
index c388850..0000000
--- a/debian/x265.1
+++ /dev/null
@@ -1,472 +0,0 @@
-.TH X265 "1" "AUGUST 2014" "v1.4" "User Commands"
-.SH NAME
-x265 \- H.265/HEVC video encoder
-
-.SH SYNOPSIS
-.B x265 \fR[options] \fIinfile \fR[\-o] \fIoutfile
-.br
-Bit depth: 8
-.PP
-.B x265\-10bit \fR[options] \fIinfile \fR[\-o] \fIoutfile
-.br
-Bit depth: 10
-.PP
-infile can be YUV or Y4M
-.br
-outfile is raw HEVC bitstream
-
-.SH DESCRIPTION
-Increasing demand for high definition and ultra\-high definition video,
-along with an increasing desire for video on demand has led to
-exponential growth in demand for bandwidth and storage requirements.
-These challenges can be met by the new High Efficiency Video Coding
-(HEVC) standard, also known as H.265. The x265 HEVC encoder project was
-launched by MulticoreWare in 2013, aiming to provide the most efficient,
-highest performance HEVC video encoder.
-.SS About HEVC
-The High Efficiency Video Coding (HEVC) was developed by the ISO/IEC
-Moving Picture Experts Group (MPEG) and ITU\-T Video Coding Experts Group
-(VCEG), through their Joint Collaborative Team on Video Coding (JCT\-VC).
-HEVC is also known as ISO/IEC 23008\-2 MPEG\-H Part 2 and ITU\-T H.265.
-HEVC provides superior video quality and up to twice the data
-compression as the previous standard (H.264/MPEG\-4 AVC).  HEVC can
-support 8K Ultra High Definition video, with a picture size up to
-8192x4320 pixels.
-.SS About x265
-The primary objective of x265 is to become the best H.265/HEVC encoder
-available anywhere, offering the highest compression efficiency and the
-highest performance on a wide variety of hardware platforms. The x265
-encoder is available as an open source library, published under the
-GPLv2 license. It is also available under a commercial license, enabling
-commercial companies to utilize and distribute x265 in their solutions
-without being subject to the restrictions of the GPL license.
-
-.SH OPTIONS
-.TP
-\fB\-h/\-\-help
-Show this help text and exit
-.TP
-\fB\-V/\-\-version
-Show version info and exit
-
-.SS "Output Options:"
-.TP
-\fB\-\-output <filename>
-Bitstream output file name
-.TP
-\fB\-\-log\-level <string>
-Logging level: none error warning info debug full. Default info
-.TP
-\fB\-\-no\-progress
-Disable CLI progress reports
-.TP
-\fB\-\-[no\-]cu\-stats
-Enable logging stats about distribution of cu across all modes. Default disabled
-.TP
-\fB\-\-csv <filename>
-Comma separated log file, log level >= 3 frame log, else one line per run
-
-.SS "Input Options:"
-.TP
-\fB\-\-input <filename>
-Raw YUV or Y4M input file name. `\-` for stdin
-.TP
-\fB\-\-y4m
-Force parsing of input stream as YUV4MPEG2 regardless of file extension
-.TP
-\fB\-\-fps <float|rational>
-Source frame rate (float or num/denom), auto\-detected if Y4M
-.TP
-\fB\-\-input\-res WxH
-Source picture size [w x h], auto\-detected if Y4M
-.TP
-\fB\-\-input\-depth <integer>
-Bit\-depth of input file. Default 8
-.TP
-\fB\-\-input\-csp <string>
-Source color space: i420, i444 or i422, auto\-detected if Y4M. Default: i420
-.TP
-\fB\-f/\-\-frames <integer>
-Maximum number of frames to encode. Default all
-.TP
-\fB\-\-seek <integer>
-First frame to encode
-.TP
-\fB\-\-[no\-]interlace <bff|tff>
-Indicate input pictures are interlace fields in temporal order. Default progressive
-.TP
-\fB\-\-dither
-Enable dither if downscaling to 8 bit pixels. Default disabled
-
-.SS "Quality reporting metrics:"
-.TP
-\fB\-\-[no\-]ssim
-Enable reporting SSIM metric scores. Default disabled
-.TP
-\fB\-\-[no\-]psnr
-Enable reporting PSNR metric scores. Default disabled
-
-.SS "Profile, Level, Tier:"
-.TP
-\fB\-\-profile <string>
-Enforce an encode profile: main, main10, mainstillpicture
-.TP
-\fB\-\-level\-idc <integer|float>
-Force a minumum required decoder level (as '5.0' or '50')
-.TP
-\fB\-\-[no\-]high\-tier
-If a decoder level is specified, this modifier selects High tier of that level
-
-.SS "Threading, performance:"
-.TP
-\fB\-\-threads <integer>
-Number of threads for thread pool (0: detect CPU core count, default)
-.TP
-\fB\-F/\-\-frame\-threads <integer>
-Number of concurrently encoded frames. 0: auto\-determined by core count
-.TP
-\fB\-\-[no\-]wpp
-Enable Wavefront Parallel Processing. Default enabled
-.TP
-\fB\-\-[no\-]pmode
-Parallel mode analysis. Default disabled
-.TP
-\fB\-\-[no\-]pme
-Parallel motion estimation. Default disabled
-.TP
-\fB\-\-[no\-]asm <bool|int|string>
-Override CPU detection. Default: auto
-
-.SS Presets:
-.TP
-\fB\-p/\-\-preset <string>
-Trade off performance for compression efficiency. Default medium
-.br
-ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo
-.TP
-\fB\-t/--tune <string>
-Tune the settings for a particular type of source or situation:"
-.br
-psnr, ssim, zerolatency, or fastdecode
-
-.SS "Quad-Tree size and depth:"
-.TP
-\fB\-s/\-\-ctu <64|32|16>
-Maximum CU size (default: 64x64). Default 64
-.TP
-\fB\-\-tu\-intra\-depth <integer>
-Max TU recursive depth for intra CUs. Default 1
-.TP
-\fB\-\-tu\-inter\-depth <integer>
-Max TU recursive depth for inter CUs. Default 1
-.TP
-\fB\-\-[no\-]rect
-Enable rectangular motion partitions Nx2N and 2NxN. Default disabled
-.TP
-\fB\-\-[no\-]amp
-Enable asymmetric motion partitions, requires \fB\-\-rect\fR. Default disabled
-.TP
-\fB\-\-rd <0..6>
-Level of RD in mode decision 0:least....6:full RDO. Default 3
-.TP
-\fB\-\-psy\-rd <0..2.0>
-Strength of psycho\-visual rate distortion optimization, 0 to disable. Default 0.000000
-.TP
-\fB\-\-psy\-rdoq <0..50.0>
-Strength of psycho\-visual optimization in quantization, 0 to disable. Default 0.000000
-.TP
-\fB\-\-nr <integer>
-An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled
-.TP
-\fB\-\-[no\-]tskip\-fast
-Enable fast intra transform skipping. Default disabled
-.TP
-\fB\-\-[no\-]early\-skip
-Enable early SKIP detection. Default disabled
-.TP
-\fB\-\-[no\-]fast\-cbf
-Enable early outs based on whether residual is coded. Default disabled
-
-.SS "Coding tools:"
-.TP
-\fB\-w/\-\-[no\-]weightp
-Enable weighted prediction in P slices. Default enabled
-.TP
-\fB\-\-[no\-]weightb
-Enable weighted prediction in B slices. Default disabled
-.TP
-\fB\-\-[no\-]cu-lossless
-Consider lossless mode in CU RDO decisions. Default disabled
-.TP
-\fB\-\-[no\-]signhide
-Hide sign bit of one coeff per TU (rdo). Default enabled
-.TP
-\fB\-\-[no\-]tskip
-Enable intra transform skipping. Default disabled
-
-.SS "Temporal / motion search options:"
-.TP
-\fB\-\-me <string>
-Motion search method dia hex umh star full. Default 1
-.TP
-\fB\-m/\-\-subme <integer>
-Amount of subpel refinement to perform (0:least .. 7:most). Default 2
-.TP
-\fB\-\-merange <integer>
-Motion search range. Default 57
-.TP
-\fB\-\-max\-merge <1..5>
-Maximum number of merge candidates. Default 2
-.TP
-\fB\-\-[no\-]temporal\-mvp
-Enable temporal MV predictors. Default enabled
-
-.SS "Spatial / intra options:"
-.TP
-\fB\-\-[no\-]strong\-intra\-smoothing
-Enable strong intra smoothing for 32x32 blocks. Default enabled
-.TP
-\fB\-\-[no\-]constrained\-intra
-Constrained intra prediction (use only intra coded reference pixels) Default disabled
-.TP
-\fB\-\-[no\-]b\-intra
-Enable intra in B frames in veryslow presets. Default disabled
-.TP
-\fB\-\-[no\-]fast\-intra
-Enable faster search method for angular intra predictions. Default disabled
-.TP
-\fB\-\-rdpenalty <0..2>
-penalty for 32x32 intra TU in non\-I slices. 0:disabled 1:RD\-penalty 2:maximum. Default 0
-
-.SS "Slice decision options:"
-.TP
-\fB\-\-[no\-]open\-gop
-Enable open\-GOP, allows I slices to be non\-IDR. Default enabled
-.TP
-\fB\-I/\-\-keyint <integer>
-Max IDR period in frames. \-1 for infinite\-gop. Default 250
-.TP
-\fB\-i/\-\-min\-keyint <integer>
-Scenecuts closer together than this are coded as I, not IDR. Default: auto
-.TP
-\fB\-\-no\-scenecut
-Disable adaptive I\-frame decision
-.TP
-\fB\-\-scenecut <integer>
-How aggressively to insert extra I\-frames. Default 40
-.TP
-\fB\-\-rc\-lookahead <integer>
-Number of frames for frame\-type lookahead (determines encoder latency) Default 20
-.TP
-\fB\-\-bframes <integer>
-Maximum number of consecutive b\-frames (now it only enables B GOP structure) Default 4
-.TP
-\fB\-\-bframe\-bias <integer>
-Bias towards B frame decisions. Default 0
-.TP
-\fB\-\-b\-adapt <0..2>
-0 \- none, 1 \- fast, 2 \- full (trellis) adaptive B frame scheduling. Default 2
-.TP
-\fB\-\-[no\-]b\-pyramid
-Use B\-frames as references. Default enabled
-.TP
-\fB\-\-ref <integer>
-max number of L0 references to be allowed (1 .. 16) Default 3
-.TP
-\fB\-\-qpfile <string>
-Force frametypes and QPs for some or all frames
-.br
-Format of each line: framenumber frametype QP
-.br
-QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.
-.br
-QPs are restricted by qpmin/qpmax.
-.PP
-
-.SS "Rate control, Quantization:"
-.TP
-\fB\-\-bitrate <integer>
-Target bitrate (kbps) for ABR (implied). Default 0
-.TP
-\fB\-\-crf <float>
-Quality\-based VBR (0\-51). Default 28.000000
-.TP
-\fB\-\-[no\-]lossless
-Enable lossless: bypass transform, quant and loop filters globally. Default disabled
-.TP
-\fB\-\-crf\-max <float>
-With CRF+VBV, limit RF to this value. Default 0.000000
-.br
-May cause VBV underflows!
-.TP
-\fB\-\-crf\-min <float>
-With CRF+VBV, limit RF to this value. Default 0.000000
-.br
-this specifies a minimum rate factor value for encode!
-.TP
-\fB\-\-vbv\-maxrate <integer>
-Max local bitrate (kbit/s). Default 0
-.TP
-\fB\-\-vbv\-bufsize <integer>
-Set size of the VBV buffer (kbit). Default 0
-.TP
-\fB\-\-vbv\-init <float>
-Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default 0.900000
-.TP
-\fB\-\-aq\-mode <integer>
-Mode for Adaptive Quantization \- 0:none 1:uniform AQ 2:auto variance. Default 2
-.TP
-\fB\-\-aq\-strength <float>
-Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default 1.000000
-.TP
-\fB\-\-[no\-]cutree
-Enable cutree for Adaptive Quantization. Default enabled
-.TP
-\fB\-\-ipratio <float>
-QP factor between I and P. Default 1.400000
-.TP
-\fB\-\-pbratio <float>
-QP factor between P and B. Default 1.300000
-.TP
-\fB\-\-cbqpoffs <integer>
-Chroma Cb QP Offset. Default 0
-.TP
-\fB\-\-crqpoffs <integer>
-Chroma Cr QP Offset. Default 0
-.TP
-\fB\-\-stats
-Filename for stats file in multipass pass rate control. Default x265_2pass.log
-.TP
-\fB\-\-pass
-Multi pass rate control.
-.br
-  \- 1 : First pass, creates stats file
-.br
-  \- 2 : Last pass, does not overwrite stats file
-.br
-  \- 3 : Nth pass, overwrites stats file
-.TP
-\fB\-\-[no\-]slow\-firstpass
-Enable a slow first pass in a multipass rate control mode. Default disabled
-.TP
-\fB\-\-analysis\-mode <string|int>
-save \- Dump analysis info into file, load \- Load analysis buffers from the file. Default 0
-.TP
-\fB\-\-analysis-file <filename>
-Specify file name used for either dumping or reading analysis data.
-.TP
-\fB\-\-scaling\-list <string>
-Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off
-.TP
-\fB\-\-lambda\-file <string>
-Specify a file containing replacement values for the lambda tables
-.br
-MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table
-.br
-Blank lines and lines starting with hash(#) are ignored
-.br
-Comma is considered to be white-space
-
-.SS "Loop filters (deblock and SAO):"
-.TP
-\fB\-\-[no\-]lft
-Enable Deblocking Loop Filter. Default enabled
-.TP
-\fB\-\-[no\-]sao
-Enable Sample Adaptive Offset. Default enabled
-.TP
-\fB\-\-[no\-]sao\-non\-deblock
-Use non\-deblocked pixels, else right/bottom boundary areas skipped. Default disabled
-
-.SS "VUI options:"
-.TP
-\fB\-\-sar <width:height|int>
-Sample Aspect Ratio, the ratio of width to height of an individual pixel.
-.br
-Choose from 0=undef, 1=1:1("square"), 2=12:11, 3=10:11, 4=16:11,
-5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,
-12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default 0
-.TP
-\fB\-\-crop\-rect <string>
-Add 'left,top,right,bottom' to the bitstream\-level cropping rectangle
-.TP
-\fB\-\-overscan <string>
-Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef
-.TP
-\fB\-\-videoformat <string>
-Specify video format from undef, component, pal, ntsc, secam, mac. Default undef
-.TP
-\fB\-\-range <string>
-Specify black level and range of luma and chroma signals as full or limited Default limited
-.TP
-\fB\-\-colorprim <string>
-Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,
-smpte240m, film, bt2020. Default undef
-.TP
-\fB\-\-transfer <string>
-Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,
-smpte240m, linear, log100, log316, iec61966\-2\-4, bt1361e, iec61966\-2\-1,
-bt2020\-10, bt2020\-12. Default undef
-.TP
-\fB\-\-colormatrix <string>
-Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,
-smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef
-.TP
-\fB\-\-chromaloc <integer>
-Specify chroma sample location (0 to 5). Default of 0
-
-.SS "Bitstream options:"
-.TP
-\fB\-\-[no\-]info
-Emit SEI identifying encoder and parameters. Default enabled
-.TP
-\fB\-\-[no\-]aud
-Emit access unit delimiters at the start of each access unit. Default disabled
-.TP
-\fB\-\-[no\-]hrd
-Enable HRD parameters signalling. Default disabled
-.TP
-\fB\-\-[no\-]repeat\-headers
-Emit SPS and PPS headers at each keyframe. Default disabled
-.TP
-\fB\-\-hash <integer>
-Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default 0
-
-.SS "Reconstructed video options (debugging):"
-.TP
-\fB\-r/\-\-recon <filename>
-Reconstructed raw image YUV or Y4M output file name
-.TP
-\fB\-\-recon\-depth <integer>
-Bit\-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M
-.SH COPYRIGHT
-Copyright \(co 2013\-2014  MulticoreWare, Inc.
-.PP
-The x265 software is owned and copyrighted by MulticoreWare, Inc.
-MulticoreWare is committed to offering the x265 software under the GNU
-GPL v2 license.  Companies who do not wish to integrate the x265
-Software in their products under the terms of the GPL license can
-contact MulticoreWare (\fIlicense@x265.com\fR) to obtain a commercial
-license agreement.  Companies who use x265 under the GPL may also wish
-to work with MulticoreWare to accelerate the development of specific
-features or optimized support for specific hardware or software
-platforms, or to contract for support.
-.PP
-The GNU GPL v2 license or the x265 commercial license agreement govern
-your rights to access the copyrighted x265 software source code, but do
-not cover any patents that may be applicable to the function of binary
-executable software created from the x265 source code.  You are
-responsible for understanding the laws in your country, and for
-licensing all applicable patent rights needed for use or distribution of
-software applications created from the x265 source code.  A good place
-to start is with the Motion Picture Experts Group \- Licensing Authority
-\- HEVC Licensing Program.
-.PP
-x265 is a registered trademark of MulticoreWare, Inc.  The x265 logo is
-a trademark of MulticoreWare, and may only be used with explicit written
-permission.  All rights reserved.
-
-.SH "SEE ALSO"
-.TP
-\fIhttp://x265.readthedocs.org/en/default/cli.html\fR
-online documentation
diff --git a/debian/x265.install b/debian/x265.install
deleted file mode 100644
index 3500295..0000000
--- a/debian/x265.install
+++ /dev/null
@@ -1,2 +0,0 @@
-usr/bin
-usr/lib/*/x265-10bit/x265
diff --git a/debian/x265.links b/debian/x265.links
deleted file mode 100644
index 740a041..0000000
--- a/debian/x265.links
+++ /dev/null
@@ -1,3 +0,0 @@
-/usr/bin/x265-10bit             /usr/bin/x265-16bit
-/usr/share/man/man1/x265.1.gz   /usr/share/man/man1/x265-10bit.1.gz
-/usr/share/man/man1/x265.1.gz   /usr/share/man/man1/x265-16bit.1.gz
diff --git a/debian/x265.manpages b/debian/x265.manpages
deleted file mode 100644
index 300fc8f..0000000
--- a/debian/x265.manpages
+++ /dev/null
@@ -1 +0,0 @@
-debian/x265.1
diff --git a/doc/reST/Makefile b/doc/reST/Makefile
index 6b1d44c..b2d1c3d 100644
--- a/doc/reST/Makefile
+++ b/doc/reST/Makefile
@@ -13,7 +13,7 @@ PAPEROPT_a4     = -D latex_paper_size=a4
 PAPEROPT_letter = -D latex_paper_size=letter
 ALLSPHINXOPTS   = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 
-.PHONY: help clean html web pickle htmlhelp qthelp qhc latex changes linkcheck
+.PHONY: help clean distclean html web pickle htmlhelp qthelp qhc latex changes linkcheck
 
 help:
 	@echo "Please use \`make <target>' where <target> is one of"
@@ -24,12 +24,16 @@ help:
 	@echo "  qthelp    to make HTML files and a qthelp project"
 	@echo "  qhc       to make QHC file"
 	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  man       to make manpages"
 	@echo "  changes   to make an overview over all changed/added/deprecated items"
 	@echo "  linkcheck to check all external links for integrity"
 
 clean:
 	-rm -rf build/*
 
+distclean: clean
+	-rmdir build/
+
 html:
 	mkdir -p build/html build/doctrees
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html
@@ -83,6 +87,14 @@ latex:
 	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
 	      "run these through (pdf)latex."
 
+man:
+	mkdir -p build/man build/doctrees
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) build/man
+	@echo
+	@echo "Build finished; the manpages are in build/man."
+	@echo "Run \`man -l build/man/x265.1' or \`man -l build/man/libx265.3'" \
+	      "to view them."
+
 changes:
 	mkdir -p build/changes build/doctrees
 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes
diff --git a/doc/reST/api.rst b/doc/reST/api.rst
index 214881a..f15212d 100644
--- a/doc/reST/api.rst
+++ b/doc/reST/api.rst
@@ -32,6 +32,12 @@ library was compiled (it will contain a value of 8 or 10). Further,
 x265 which was compiled, and **x265_build_info_str** is a pointer to a
 string identifying the compiler and build options.
 
+.. Note::
+
+	**x265_version_str** is only updated when **cmake** runs. If you are
+	making binaries for others to use, it is recommended to run
+	**cmake** prior to **make** in your build scripts.
+
 x265 will accept input pixels of any depth between 8 and 16 bits
 regardless of the depth of its internal pixels (8 or 10).  It will shift
 and mask input pixels as required to reach the internal depth. If
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index 324b83a..8740c8e 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -30,8 +30,8 @@ Generally, when an option expects a string value from a list of strings
 the user may specify the integer ordinal of the value they desire. ie:
 :option:`--log-level` 3 is equivalent to :option:`--log-level` debug.
 
-Standalone Executable Options
-=============================
+Executable Options
+==================
 
 .. option:: --help, -h
 
@@ -45,6 +45,59 @@ Standalone Executable Options
 
 	**CLI ONLY**
 
+Logging/Statistic Options
+=========================
+
+.. option:: --log-level <integer|string>
+
+	Logging level. Debug level enables per-frame QP, metric, and bitrate
+	logging. If a CSV file is being generated, debug level makes the log
+	be per-frame rather than per-encode. Full level enables hash and
+	weight logging. -1 disables all logging, except certain fatal
+	errors, and can be specified by the string "none".
+
+	0. error
+	1. warning
+	2. info **(default)**
+	3. debug
+	4. full
+
+.. option:: --no-progress
+
+	Disable periodic progress reports from the CLI
+
+	**CLI ONLY**
+
+.. option:: --csv <filename>
+
+	Writes encoding results to a comma separated value log file. Creates
+	the file if it doesnt already exist, else adds one line per run.  if
+	:option:`--log-level` is debug or above, it writes one line per
+	frame. Default none
+
+.. option:: --cu-stats, --no-cu-stats
+
+	Records statistics on how each CU was coded (split depths and other
+	mode decisions) and reports those statistics at the end of the
+	encode. Default disabled
+
+.. option:: --ssim, --no-ssim
+
+	Calculate and report Structural Similarity values. It is
+	recommended to use :option:`--tune` ssim if you are measuring ssim,
+	else the results should not be used for comparison purposes.
+	Default disabled
+
+.. option:: --psnr, --no-psnr
+
+	Calculate and report Peak Signal to Noise Ratio.  It is recommended
+	to use :option:`--tune` psnr if you are measuring PSNR, else the
+	results should not be used for comparison purposes.  Default
+	disabled
+
+Performance Options
+===================
+
 .. option:: --asm <integer:false:string>, --no-asm
 
 	x265 will use all detected CPU SIMD architectures by default. You can
@@ -57,13 +110,24 @@ Standalone Executable Options
 
 	One may also directly supply the CPU capability bitmap as an integer.
 
+.. option:: --frame-threads, -F <integer>
+
+	Number of concurrently encoded frames. Using a single frame thread
+	gives a slight improvement in compression, since the entire reference
+	frames are always available for motion compensation, but it has
+	severe performance implications. Default is an autodetected count
+	based on the number of CPU cores and whether WPP is enabled or not.
+
+	Over-allocation of frame threads will not improve performance, it
+	will generally just increase memory use.
+
 .. option:: --threads <integer>
 
 	Number of threads to allocate for the worker thread pool  This pool
 	is used for WPP and for distributed analysis and motion search:
 	:option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
 
-	If :option:`--threads`=1 is specified, then no thread pool is
+	If :option:`--threads` 1 is specified, then no thread pool is
 	created. When no thread pool is created, all the thread pool
 	features are implicitly disabled. If all the pool features are
 	disabled by the user, then the pool is implicitly disabled.
@@ -71,13 +135,24 @@ Standalone Executable Options
 	Default 0, one thread is allocated per detected hardware thread
 	(logical CPU cores)
 
+.. option:: --wpp, --no-wpp
+
+	Enable Wavefront Parallel Processing. The encoder may begin encoding
+	a row as soon as the row above it is at least two CTUs ahead in the
+	encode process. This gives a 3-5x gain in parallelism for about 1%
+	overhead in compression efficiency.
+
+	This feature is implicitly disabled when no thread pool is present.
+
+	Default: Enabled
+
 .. option:: --pmode, --no-pmode
 
 	Parallel mode decision, or distributed mode analysis. When enabled
 	the encoder will distribute the analysis work of each CU (merge,
 	inter, intra) across multiple worker threads. Only recommended if
 	x265 is not already saturating the CPU cores. In RD levels 3 and 4
-	it will be most effective if --rect was enabled. At RD levels 5 and
+	it will be most effective if --rect is enabled. At RD levels 5 and
 	6 there is generally always enough work to distribute to warrant the
 	overhead, assuming your CPUs are not already saturated.
 	
@@ -85,7 +160,8 @@ Standalone Executable Options
 	efficiency. In fact, since the modes are all measured in parallel it
 	makes certain early-outs impractical and thus you usually get
 	slightly better compression when it is enabled (at the expense of
-	not skipping improbable modes).
+	not skipping improbable modes). This bypassing of early-outs can
+	cause pmode to slow down encodes, especially at faster presets.
 
 	This feature is implicitly disabled when no thread pool is present.
 
@@ -113,7 +189,8 @@ Standalone Executable Options
 
 	Sets parameters to preselected values, trading off compression efficiency against 
 	encoding speed. These parameters are applied before all other input parameters are 
-	applied, and so you can override any parameters that these values control.
+	applied, and so you can override any parameters that these values control.  See
+	:ref:`presets <presets>` for more detail.
 
 	0. ultrafast
 	1. superfast
@@ -129,84 +206,18 @@ Standalone Executable Options
 .. option:: --tune, -t <string>
 
 	Tune the settings for a particular type of source or situation. The changes will
-	be applied after :option:`--preset` but before all other parameters. Default none
-
-	**Values:** psnr, ssim, zero-latency, fast-decode.
-
-.. option:: --frame-threads, -F <integer>
-
-	Number of concurrently encoded frames. Using a single frame thread
-	gives a slight improvement in compression, since the entire reference
-	frames are always available for motion compensation, but it has
-	severe performance implications. Default is an autodetected count
-	based on the number of CPU cores and whether WPP is enabled or not.
-
-	Over-allocation of frame threads will not improve performance, it
-	will generally just increase memory use.
-
-.. option:: --log-level <integer|string>
-
-	Logging level. Debug level enables per-frame QP, metric, and bitrate
-	logging. If a CSV file is being generated, debug level makes the log
-	be per-frame rather than per-encode. Full level enables hash and
-	weight logging. -1 disables all logging, except certain fatal
-	errors, and can be specified by the string "none".
-
-	0. error
-	1. warning
-	2. info **(default)**
-	3. debug
-	4. full
-
-.. option:: --csv <filename>
-
-	Writes encoding results to a comma separated value log file. Creates
-	the file if it doesnt already exist, else adds one line per run.  if
-	:option:`--log-level` is debug or above, it writes one line per
-	frame. Default none
-
-.. option:: --cu-stats, --no-cu-stats
-
-	Records statistics on how each CU was coded (split depths and other
-	mode decisions) and reports those statistics at the end of the
-	encode. Default disabled
-
-.. option:: --output, -o <filename>
-
-	Bitstream output file name. If there are two extra CLI options, the
-	first is implicitly the input filename and the second is the output
-	filename, making the :option:`--output` option optional.
-
-	The output file will always contain a raw HEVC bitstream, the CLI
-	does not support any container file formats.
-
-	**CLI ONLY**
-
-.. option:: --no-progress
+	be applied after :option:`--preset` but before all other parameters. Default none.
+	See :ref:`tunings <tunings>` for more detail.
 
-	Disable CLI periodic progress reports
+	**Values:** psnr, ssim, grain, zero-latency, fast-decode, cbr.
 
-	**CLI ONLY**
-
-Quality reporting metrics
+Input/Output File Options
 =========================
 
-.. option:: --ssim, --no-ssim
-
-	Calculate and report Structural Similarity values. It is
-	recommended to use :option:`--tune` ssim if you are measuring ssim,
-	else the results should not be used for comparison purposes.
-	Default disabled
-
-.. option:: --psnr, --no-psnr
-
-	Calculate and report Peak Signal to Noise Ratio.  It is recommended
-	to use :option:`--tune` psnr if you are measuring PSNR, else the
-	results should not be used for comparison purposes.  Default
-	disabled
-
-Input Options
-=============
+These options all describe the input video sequence or, in the case of
+:option:`--dither`, operations that are performed on the sequence prior
+to encode. All options dealing with files (names, formats, offsets or
+frame counts) are only applicable to the CLI application.
 
 .. option:: --input <filename>
 
@@ -242,21 +253,6 @@ Input Options
 
 	**CLI ONLY**
 
-.. option:: --nr <integer>
-
-	Noise reduction - an adaptive deadzone applied after DCT
-	(subtracting from DCT coefficients), before quantization, on inter
-	blocks. It does no pixel-level filtering, doesn't cross DCT block
-	boundaries, has no overlap, doesn't affect intra blocks. The higher
-	the strength value parameter, the more aggressively it will reduce
-	noise.
-
-	Enabling noise reduction will make outputs diverge between different
-	numbers of frame threads. Outputs will be deterministic but the
-	outputs of -F2 will no longer match the outputs of -F3, etc.
-
-	**Values:** any value in range of 100 to 1000. Default disabled.
-
 .. option:: --input-res <wxh>
 
 	YUV only: Source picture size [w x h]
@@ -285,8 +281,6 @@ Input Options
 
 .. option:: --interlaceMode <false|tff|bff>, --no-interlaceMode
 
-	**EXPERIMENTAL** Specify interlace type of source pictures. 
-	
 	0. progressive pictures **(default)**
 	1. top field first 
 	2. bottom field first
@@ -305,61 +299,20 @@ Input Options
 
 .. option:: --frames, -f <integer>
 
-	Number of frames to be encoded. Default 0 (all)
+	Number of frames of input sequence to be encoded. Default 0 (all)
 
 	**CLI ONLY**
 
-.. option:: --qpfile <filename>
-
-	Specify a text file which contains frametypes and QPs for some or
-	all frames. The format of each line is:
-
-	framenumber frametype QP
-
-	Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
-	**b** is an unreferenced B frame.  **I** is a keyframe (random
-	access point) while **i** is a I frame that is not a keyframe
-	(references are not broken).
-
-	Specifying QP (integer) is optional, and if specified they are
-	clamped within the encoder to qpmin/qpmax.
-
-.. option:: --scaling-list <filename>
-
-	Quantization scaling lists. HEVC supports 6 quantization scaling
-	lists to be defined; one each for Y, Cb, Cr for intra prediction and
-	one each for inter prediction.
-
-	x265 does not use scaling lists by default, but this can also be
-	made explicit by :option:`--scaling-list` *off*.
-
-	HEVC specifies a default set of scaling lists which may be enabled
-	without requiring them to be signaled in the SPS. Those scaling
-	lists can be enabled via :option:`--scaling-list` *default*.
-    
-	All other strings indicate a filename containing custom scaling
-	lists in the HM format. The encode will abort if the file is not
-	parsed correctly. Custom lists must be signaled in the SPS
+.. option:: --output, -o <filename>
 
-.. option:: --lambda-file <filename>
+	Bitstream output file name. If there are two extra CLI options, the
+	first is implicitly the input filename and the second is the output
+	filename, making the :option:`--output` option optional.
 
-	Specify a text file containing values for x265_lambda_tab and
-	x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
-	values.
-	
-	The text file syntax is simple. Comma is considered to be
-	white-space. All white-space is ignored. Lines must be less than 2k
-	bytes in length. Content following hash (#) characters are ignored.
-	The values read from the file are logged at :option:`--log-level`
-	debug.
+	The output file will always contain a raw HEVC bitstream, the CLI
+	does not support any container file formats.
 
-	Note that the lambda tables are process-global and so the new values
-	affect all encoders running in the same process. 
-	
-	Lambda values affect encoder mode decisions, the lower the lambda
-	the more bits it will try to spend on signaling information (motion
-	vectors and splits) and less on residual. This feature is intended
-	for experimentation.
+	**CLI ONLY**
 
 Profile, Level, Tier
 ====================
@@ -417,15 +370,41 @@ Profile, Level, Tier
 	parameters to meet those requirements but it will never raise
 	them.
 
-Quad-Tree analysis
-==================
+Mode decision / Analysis
+========================
 
-.. option:: --wpp, --no-wpp
+.. option:: --rd <0..6>
 
-	Enable Wavefront Parallel Processing. The encoder may begin encoding
-	a row as soon as the row above it is at least two CTUs ahead in the
-	encode process. This gives a 3-5x gain in parallelism for about 1%
-	overhead in compression efficiency. Default: Enabled
+	Level of RDO in mode decision. The higher the value, the more
+	exhaustive the analysis and the more rate distortion optimization is
+	used. The lower the value the faster the encode, the higher the
+	value the smaller the bitstream (in general). Default 3
+
+	Note that this table aims for accuracy, but is not necessarily our
+	final target behavior for each mode.
+
+	+-------+---------------------------------------------------------------+
+	| Level | Description                                                   |
+	+=======+===============================================================+
+	| 0     | sa8d mode and split decisions, intra w/ source pixels         |
+	+-------+---------------------------------------------------------------+
+	| 1     | recon generated (better intra), RDO merge/skip selection      |
+	+-------+---------------------------------------------------------------+
+	| 2     | RDO splits and merge/skip selection                           |
+	+-------+---------------------------------------------------------------+
+	| 3     | RDO mode and split decisions, chroma residual used for sa8d   |
+	+-------+---------------------------------------------------------------+
+	| 4     | Adds RDO Quant                                                |
+	+-------+---------------------------------------------------------------+
+	| 5     | Adds RDO prediction decisions                                 |
+	+-------+---------------------------------------------------------------+
+	| 6     | Currently same as 5                                           |
+	+-------+---------------------------------------------------------------+
+
+	**Range of values:** 0: least .. 6: full RDO analysis
+
+Options which affect the coding unit quad-tree, sometimes referred to as
+the prediction quad-tree.
 
 .. option:: --ctu, -s <64|32|16>
 
@@ -436,6 +415,108 @@ Quad-Tree analysis
 	and less frame parallelism as well. Because of this the faster
 	presets use a CU size of 32. Default: 64
 
+.. option:: --rect, --no-rect
+
+	Enable analysis of rectangular motion partitions Nx2N and 2NxN
+	(50/50 splits, two directions). Default disabled
+
+.. option:: --amp, --no-amp
+
+	Enable analysis of asymmetric motion partitions (75/25 splits, four
+	directions). At RD levels 0 through 4, AMP partitions are only
+	considered at CU sizes 32x32 and below. At RD levels 5 and 6, it
+	will only consider AMP partitions as merge candidates (no motion
+	search) at 64x64, and as merge or inter candidates below 64x64.
+
+	The AMP partitions which are searched are derived from the current
+	best inter partition. If Nx2N (vertical rectangular) is the best
+	current prediction, then left and right asymmetrical splits will be
+	evaluated. If 2NxN (horizontal rectangular) is the best current
+	prediction, then top and bottom asymmetrical splits will be
+	evaluated, If 2Nx2N is the best prediction, and the block is not a
+	merge/skip, then all four AMP partitions are evaluated.
+
+	This setting has no effect if rectangular partitions are disabled.
+	Default disabled
+
+.. option:: --early-skip, --no-early-skip
+
+	Measure full CU size (2Nx2N) merge candidates first; if no residual
+	is found the analysis is short circuited. Default disabled
+
+.. option:: --fast-cbf, --no-fast-cbf
+
+	Short circuit analysis if a prediction is found that does not set
+	the coded block flag (aka: no residual was encoded).  It prevents
+	the encoder from perhaps finding other predictions that also have no
+	residual but require less signaling bits or have less distortion.
+	Only applicable for RD levels 5 and 6. Default disabled
+
+.. option:: --fast-intra, --no-fast-intra
+
+	Perform an initial scan of every fifth intra angular mode, then
+	check modes +/- 2 distance from the best mode, then +/- 1 distance
+	from the best mode, effectively performing a gradient descent. When
+	enabled 10 modes in total are checked. When disabled all 33 angular
+	modes are checked.  Only applicable for :option:`--rd` levels 4 and
+	below (medium preset and faster).
+
+.. option:: --b-intra, --no-b-intra
+
+	Enables the evaluation of intra modes in B slices. Default disabled.
+
+.. option:: --cu-lossless, --no-cu-lossless
+
+	For each CU, evaluate lossless (transform and quant bypass) encode
+	of the best non-lossless mode option as a potential rate distortion
+	optimization. If the global option :option:`--lossless` has been
+	specified, all CUs will be encoded as lossless unconditionally
+	regardless of whether this option was enabled. Default disabled.
+
+	Only effective at RD levels 3 and above, which perform RDO mode
+	decisions.
+
+.. option:: --tskip, --no-tskip
+
+	Enable evaluation of transform skip (bypass DCT but still use
+	quantization) coding for 4x4 TU coded blocks.
+
+	Only effective at RD levels 3 and above, which perform RDO mode
+	decisions. Default disabled
+
+.. option:: --tskip-fast, --no-tskip-fast
+
+	Only evaluate transform skip for NxN intra predictions (4x4 blocks).
+	Only applicable if transform skip is enabled. For chroma, only
+	evaluate if luma used tskip. Inter block tskip analysis is
+	unmodified. Default disabled
+
+Analysis re-use options, to improve performance when encoding the same
+sequence multiple times (presumably at varying bitrates). The encoder
+will not reuse analysis if the resolution and slice type parameters do
+not match.
+
+.. option:: --analysis-mode <string|int>
+
+	Specify whether analysis information of each frame is output by encoder
+	or input for reuse. By reading the analysis data writen by an
+	earlier encode of the same sequence, substantial redundant work may
+	be avoided.
+
+	The following data may be stored and reused:
+	I frames   - split decisions and luma intra directions of all CUs.
+	P/B frames - motion vectors are dumped at each depth for all CUs.
+
+	**Values:** off(0), save(1): dump analysis data, load(2): read analysis data
+
+.. option:: --analysis-file <filename>
+
+	Specify a filename for analysis data (see :option:`--analysis-mode`)
+	If no filename is specified, x265_analysis.dat is used.
+
+Options which affect the transform unit quad-tree, sometimes referred to
+as the residual quad-tree (RQT).
+
 .. option:: --tu-intra-depth <1..4>
 
 	The transform unit (residual) quad-tree begins with the same depth
@@ -504,147 +585,44 @@ Temporal / motion search options
 	|  5 | 1          | 8         | 1          | 8         | true      |
 	+----+------------+-----------+------------+-----------+-----------+
 	|  6 | 2          | 8         | 1          | 8         | true      |
-	+----+------------+-----------+------------+-----------+-----------+
-	|  7 | 2          | 8         | 2          | 8         | true      |
-	+----+------------+-----------+------------+-----------+-----------+
-
-.. option:: --merange <integer>
-
-	Motion search range. Default 57
-
-	The default is derived from the default CTU size (64) minus the luma
-	interpolation half-length (4) minus maximum subpel distance (2)
-	minus one extra pixel just in case the hex search method is used. If
-	the search range were any larger than this, another CTU row of
-	latency would be required for reference frames.
-
-	**Range of values:** an integer from 0 to 32768
-
-.. option:: --max-merge <1..5>
-
-	Maximum number of neighbor (spatial and temporal) candidate blocks
-	that the encoder may consider for merging motion predictions. If a
-	merge candidate results in no residual, it is immediately selected
-	as a "skip".  Otherwise the merge candidates are tested as part of
-	motion estimation when searching for the least cost inter option.
-	The max candidate number is encoded in the SPS and determines the
-	bit cost of signaling merge CUs. Default 2
-
-.. option:: --temporal-mvp, --no-temporal-mvp
-
-	Enable temporal motion vector predictors in P and B slices.
-	This enables the use of the motion vector from the collocated block
-	in the previous frame to be used as a predictor. Default is enabled
-
-Spatial/intra options
-=====================
-
-.. option:: --rdpenalty <0..2>
-
-	When set to 1, transform units of size 32x32 are given a 4x bit cost
-	penalty compared to smaller transform units, in intra coded CUs in P
-	or B slices.
-
-	When set to 2, transform units of size 32x32 are not even attempted,
-	unless otherwise required by the maximum recursion depth.  For this
-	option to be effective with 32x32 intra CUs,
-	:option:`--tu-intra-depth` must be at least 2.  For it to be
-	effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
-	at least 3.
-
-	Note that in HEVC an intra transform unit (a block of the residual
-	quad-tree) is also a prediction unit, meaning that the intra
-	prediction signal is generated for each TU block, the residual
-	subtracted and then coded. The coding unit simply provides the
-	prediction modes that will be used when predicting all of the
-	transform units within the CU. This means that when you prevent
-	32x32 intra transform units, you are preventing 32x32 intra
-	predictions.
-
-	Default 0, disabled.
-
-	**Values:** 0:disabled 1:4x cost penalty 2:force splits
-
-.. option:: --b-intra, --no-b-intra
-
-	Enables the evaluation of intra modes in B slices. Default disabled.
-
-.. option:: --tskip, --no-tskip
-
-	Enable evaluation of transform skip (bypass DCT but still use
-	quantization) coding for 4x4 TU coded blocks.
-
-	Only effective at RD levels 3 and above, which perform RDO mode
-	decisions. Default disabled
-
-.. option:: --tskip-fast, --no-tskip-fast
-
-	Only evaluate transform skip for NxN intra predictions (4x4 blocks).
-	Only applicable if transform skip is enabled. For chroma, only
-	evaluate if luma used tskip. Inter block tskip analysis is
-	unmodified. Default disabled
-
-.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing
-
-	Enable strong intra smoothing for 32x32 intra blocks. Default enabled
-
-.. option:: --constrained-intra, --no-constrained-intra
-
-	Constrained intra prediction. When generating intra predictions for
-	blocks in inter slices, only intra-coded reference pixels are used.
-	Inter-coded reference pixels are replaced with intra-coded neighbor
-	pixels or default values. The general idea is to block the
-	propagation of reference errors that may have resulted from lossy
-	signals. Default disabled
-
-Mode decision / Analysis
-========================
-
-.. option:: --rect, --no-rect
-
-	Enable analysis of rectangular motion partitions Nx2N and 2NxN
-	(50/50 splits, two directions). Default disabled
-
-.. option:: --amp, --no-amp
-
-	Enable analysis of asymmetric motion partitions (75/25 splits, four
-	directions). At RD levels 0 through 4, AMP partitions are only
-	considered at CU sizes 32x32 and below. At RD levels 5 and 6, it
-	will only consider AMP partitions as merge candidates (no motion
-	search) at 64x64, and as merge or inter candidates below 64x64.
-
-	The AMP partitions which are searched are derived from the current
-	best inter partition. If Nx2N (vertical rectangular) is the best
-	current prediction, then left and right asymmetrical splits will be
-	evaluated. If 2NxN (horizontal rectangular) is the best current
-	prediction, then top and bottom asymmetrical splits will be
-	evaluated, If 2Nx2N is the best prediction, and the block is not a
-	merge/skip, then all four AMP partitions are evaluated.
+	+----+------------+-----------+------------+-----------+-----------+
+	|  7 | 2          | 8         | 2          | 8         | true      |
+	+----+------------+-----------+------------+-----------+-----------+
 
-	This setting has no effect if rectangular partitions are disabled.
-	Default disabled
+	At --subme values larger than 2, chroma residual cost is included
+	in all subpel refinement steps and chroma residual is included in
+	all motion estimation decisions (selecting the best reference
+	picture in each list, and chosing between merge, uni-directional
+	motion and bi-directional motion). The 'slow' preset is the first
+	preset to enable the use of chroma residual.
 
-.. option:: --early-skip, --no-early-skip
+.. option:: --merange <integer>
 
-	Measure full CU size (2Nx2N) merge candidates first; if no residual
-	is found the analysis is short circuited. Default disabled
+	Motion search range. Default 57
 
-.. option:: --fast-cbf, --no-fast-cbf
+	The default is derived from the default CTU size (64) minus the luma
+	interpolation half-length (4) minus maximum subpel distance (2)
+	minus one extra pixel just in case the hex search method is used. If
+	the search range were any larger than this, another CTU row of
+	latency would be required for reference frames.
 
-	Short circuit analysis if a prediction is found that does not set
-	the coded block flag (aka: no residual was encoded).  It prevents
-	the encoder from perhaps finding other predictions that also have no
-	residual but require less signaling bits or have less distortion.
-	Only applicable for RD levels 5 and 6. Default disabled
+	**Range of values:** an integer from 0 to 32768
 
-.. option:: --fast-intra, --no-fast-intra
+.. option:: --max-merge <1..5>
 
-	Perform an initial scan of every fifth intra angular mode, then
-	check modes +/- 2 distance from the best mode, then +/- 1 distance
-	from the best mode, effectively performing a gradient descent. When
-	enabled 10 modes in total are checked. When disabled all 33 angular
-	modes are checked.  Only applicable for :option:`--rd` levels 3 and
-	below (medium preset and faster).
+	Maximum number of neighbor (spatial and temporal) candidate blocks
+	that the encoder may consider for merging motion predictions. If a
+	merge candidate results in no residual, it is immediately selected
+	as a "skip".  Otherwise the merge candidates are tested as part of
+	motion estimation when searching for the least cost inter option.
+	The max candidate number is encoded in the SPS and determines the
+	bit cost of signaling merge CUs. Default 2
+
+.. option:: --temporal-mvp, --no-temporal-mvp
+
+	Enable temporal motion vector predictors in P and B slices.
+	This enables the use of the motion vector from the collocated block
+	in the previous frame to be used as a predictor. Default is enabled
 
 .. option:: --weightp, -w, --no-weightp
 
@@ -660,54 +638,48 @@ Mode decision / Analysis
 
 	Enable weighted prediction in B slices. Default disabled
 
-.. option:: --rd <0..6>
+Spatial/intra options
+=====================
 
-	Level of RDO in mode decision. The higher the value, the more
-	exhaustive the analysis and the more rate distortion optimization is
-	used. The lower the value the faster the encode, the higher the
-	value the smaller the bitstream (in general). Default 3
+.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing
 
-	Note that this table aims for accuracy, but is not necessarily our
-	final target behavior for each mode.
+	Enable strong intra smoothing for 32x32 intra blocks. Default enabled
 
-	+-------+---------------------------------------------------------------+
-	| Level | Description                                                   |
-	+=======+===============================================================+
-	| 0     | sa8d mode and split decisions, intra w/ source pixels         |
-	+-------+---------------------------------------------------------------+
-	| 1     | recon generated (better intra), RDO merge/skip selection      |
-	+-------+---------------------------------------------------------------+
-	| 2     | RDO splits and merge/skip selection                           |
-	+-------+---------------------------------------------------------------+
-	| 3     | RDO mode and split decisions                                  |
-	+-------+---------------------------------------------------------------+
-	| 4     | Adds RDO Quant                                                |
-	+-------+---------------------------------------------------------------+
-	| 5     | Adds RDO prediction decisions                                 |
-	+-------+---------------------------------------------------------------+
-	| 6     | Currently same as 5                                           |
-	+-------+---------------------------------------------------------------+
+.. option:: --constrained-intra, --no-constrained-intra
 
-	**Range of values:** 0: least .. 6: full RDO analysis
+	Constrained intra prediction. When generating intra predictions for
+	blocks in inter slices, only intra-coded reference pixels are used.
+	Inter-coded reference pixels are replaced with intra-coded neighbor
+	pixels or default values. The general idea is to block the
+	propagation of reference errors that may have resulted from lossy
+	signals. Default disabled
 
-.. option:: --cu-lossless, --no-cu-lossless
+.. option:: --rdpenalty <0..2>
 
-	For each CU, evaluate lossless (transform and quant bypass) encode
-	of the best non-lossless mode option as a potential rate distortion
-	optimization. If the global option :option:`--lossless` has been
-	specified, all CUs will be encoded as lossless unconditionally
-	regardless of whether this option was enabled. Default disabled.
+	When set to 1, transform units of size 32x32 are given a 4x bit cost
+	penalty compared to smaller transform units, in intra coded CUs in P
+	or B slices.
 
-	Only effective at RD levels 3 and above, which perform RDO mode
-	decisions.
+	When set to 2, transform units of size 32x32 are not even attempted,
+	unless otherwise required by the maximum recursion depth.  For this
+	option to be effective with 32x32 intra CUs,
+	:option:`--tu-intra-depth` must be at least 2.  For it to be
+	effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
+	at least 3.
 
-.. option:: --signhide, --no-signhide
+	Note that in HEVC an intra transform unit (a block of the residual
+	quad-tree) is also a prediction unit, meaning that the intra
+	prediction signal is generated for each TU block, the residual
+	subtracted and then coded. The coding unit simply provides the
+	prediction modes that will be used when predicting all of the
+	transform units within the CU. This means that when you prevent
+	32x32 intra transform units, you are preventing 32x32 intra
+	predictions.
+
+	Default 0, disabled.
+
+	**Values:** 0:disabled 1:4x cost penalty 2:force splits
 
-	Hide sign bit of one coeff per TU (rdo). The last sign is implied.
-	This requires analyzing all the coefficients to determine if a sign
-	must be toggled, and then to determine which one can be toggled with
-	the least amount of distortion. Default enabled
- 
 Psycho-visual options
 =====================
 
@@ -753,8 +725,16 @@ quality and begin introducing artifacts and increase bitrate, which may
 force rate control to increase global QP. Finding the optimal
 psycho-visual parameters for a given video requires experimentation. Our
 recommended defaults (1.0 for both) are generally on the low end of the
-spectrum. And generally the lower the bitrate, the lower the optimal
-psycho-visual settings.
+spectrum.
+
+The lower the bitrate, the lower the optimal psycho-visual settings. If
+the bitrate is too low for the psycho-visual settings, you will begin to
+see temporal artifacts (motion judder). This is caused when the encoder
+is forced to code skip blocks (no residual) in areas of difficult motion
+because it is the best option psycho-visually (they have great amounts
+of energy and no residual cost). One can lower psy-rd settings when
+judder is happening, and allow the encoder to use some blur in these
+areas of high motion.
 
 .. option:: --psy-rd <float>
 
@@ -880,9 +860,7 @@ Quality, rate control and rate distortion options
 .. option:: --crf-min <0..51.0>
 
 	Specify an lower limit to the rate factor which may be assigned to
-	any given frame (ensuring a min QP).  This is dangerous when CRF is
-	used in combination with VBV as it may result in buffer underruns.
-	Default disabled
+	any given frame (ensuring a min compression factor).
 
 .. option:: --vbv-bufsize <integer>
 
@@ -904,8 +882,8 @@ Quality, rate control and rate distortion options
 	between 0 and 1, or in kbits. In other words these two option pairs
 	are equivalent::
 
-	:option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900
-	:option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9
+	--vbv-bufsize 1000 --vbv-init 900
+	--vbv-bufsize 1000 --vbv-init 0.9
 
 	Default 0.9
 
@@ -923,18 +901,6 @@ Quality, rate control and rate distortion options
 
 	**Range of values:** an integer from 0 to 51
 
-.. option:: --ipratio <float>
-
-	QP ratio factor between I and P slices. This ratio is used in all of
-	the rate control modes. Some :option:`--tune` options may change the
-	default value. It is not typically manually specified. Default 1.4
-
-.. option:: --pbratio <float>
-
-	QP ratio factor between P and B slices. This ratio is used in all of
-	the rate control modes. Some :option:`--tune` options may change the
-	default value. It is not typically manually specified. Default 1.3
-
 .. option:: --lossless, --no-lossless
 
 	Enables true lossless coding by bypassing scaling, transform,
@@ -954,8 +920,8 @@ Quality, rate control and rate distortion options
 	and not enough in flat areas.
 
 	0. disabled
-	1. AQ enabled
-	2. AQ enabled with auto-variance **(default)**
+	1. AQ enabled **(default)**
+	2. AQ enabled with auto-variance
 
 .. option:: --aq-strength <float>
 
@@ -974,25 +940,23 @@ Quality, rate control and rate distortion options
 	less bits. This tends to improve detail in the backgrounds of video
 	with less detail in areas of high motion. Default enabled
 
-.. option:: --cbqpoffs <integer>
-
-	Offset of Cb chroma QP from the luma QP selected by rate control.
-	This is a general way to spend more or less bits on the chroma
-	channel.  Default 0
-
-	**Range of values:** -12 to 12
+.. option:: --nr-intra <integer>, --nr-inter <integer>
 
-.. option:: --crqpoffs <integer>
+	Noise reduction - an adaptive deadzone applied after DCT
+	(subtracting from DCT coefficients), before quantization.  It does
+	no pixel-level filtering, doesn't cross DCT block boundaries, has no
+	overlap, The higher the strength value parameter, the more
+	aggressively it will reduce noise.
 
-	Offset of Cr chroma QP from the luma QP selected by rate control.
-	This is a general way to spend more or less bits on the chroma
-	channel.  Default 0
+	Enabling noise reduction will make outputs diverge between different
+	numbers of frame threads. Outputs will be deterministic but the
+	outputs of -F2 will no longer match the outputs of -F3, etc.
 
-	**Range of values:**  -12 to 12
+	**Values:** any value in range of 0 to 2000. Default 0 (disabled).
 
 .. option:: --pass <integer>
 
-	Enable multipass rate control mode. Input is encoded multiple times,
+	Enable multi-pass rate control mode. Input is encoded multiple times,
 	storing the encoded information of each pass in a stats file from which
 	the consecutive pass tunes the qp of each frame to improve the quality
 	of the output. Default disabled
@@ -1003,12 +967,17 @@ Quality, rate control and rate distortion options
 
 	**Range of values:** 1 to 3
 
+.. option:: --stats <filename>
+
+	Specify file name of of the multi-pass stats file. If unspecified
+	the encoder will use x265_2pass.log
+
 .. option:: --slow-firstpass, --no-slow-firstpass
 
-	Enable a slow and more detailed first pass encode in Multipass rate
+	Enable a slow and more detailed first pass encode in multi-pass rate
 	control mode.  Speed of the first pass encode is slightly lesser and
 	quality midly improved when compared to the default settings in a
-	multipass encode. Default disabled (turbo mode enabled)
+	multi-pass encode. Default disabled (turbo mode enabled)
 
 	When **turbo** first pass is not disabled, these options are
 	set on the first pass to improve performance:
@@ -1023,30 +992,146 @@ Quality, rate control and rate distortion options
 	* :option:`--subme` = MIN(2, :option:`--subme`)
 	* :option:`--rd` = MIN(2, :option:`--rd`)
 
-.. option:: --analysis-mode <string|int>
+.. option:: --cbqpoffs <integer>
 
-	Specify whether analysis information of each frame is output by encoder
-	or input for reuse. By reading the analysis data writen by an
-	earlier encode of the same sequence, substantial redundant work may
-	be avoided.
+	Offset of Cb chroma QP from the luma QP selected by rate control.
+	This is a general way to spend more or less bits on the chroma
+	channel.  Default 0
 
-	The following data may be stored and reused:
-	I frames   - split decisions and luma intra directions of all CUs.
-	P/B frames - motion vectors are dumped at each depth for all CUs.
+	**Range of values:** -12 to 12
 
-	**Values:** off(0), save(1): dump analysis data, load(2): read analysis data
+.. option:: --crqpoffs <integer>
 
-.. option:: --analysis-file <filename>
+	Offset of Cr chroma QP from the luma QP selected by rate control.
+	This is a general way to spend more or less bits on the chroma
+	channel.  Default 0
 
-	Specify a filename for analysis data (see :option:`--analysis-mode`)
-	If no filename is specified, x265_analysis.dat is used.
+	**Range of values:**  -12 to 12
+
+.. option:: --ipratio <float>
+
+	QP ratio factor between I and P slices. This ratio is used in all of
+	the rate control modes. Some :option:`--tune` options may change the
+	default value. It is not typically manually specified. Default 1.4
+
+.. option:: --pbratio <float>
+
+	QP ratio factor between P and B slices. This ratio is used in all of
+	the rate control modes. Some :option:`--tune` options may change the
+	default value. It is not typically manually specified. Default 1.3
+
+.. option:: --qcomp <float>
+
+	qComp sets the quantizer curve compression factor. It weights the
+	frame quantizer based on the complexity of residual (measured by
+	lookahead).  Default value is 0.6. Increasing it to 1 will
+	effectively generate CQP
+
+.. option:: --qstep <integer>
+
+	The maximum single adjustment in QP allowed to rate control. Default
+	4
+
+.. option:: --ratetol <float>
+
+	The degree of rate fluctuation that x265 tolerates. Rate tolerance
+	is used along with overflow (difference between actual and target
+	bitrate), to adjust qp. Default is 1.0
+
+.. option:: --qblur <float>
+
+	Temporally blur quants. Default 0.5
+
+.. option:: --cplxblur <float>
+
+	temporally blur complexity. default 20
+
+Quantization Options
+====================
+
+Note that rate-distortion optimized quantization (RDOQ) is enabled
+implicitly at :option:`--rd` 4, 5, and 6 and disabled implicitly at all
+other levels.
+ 
+.. option:: --signhide, --no-signhide
+
+	Hide sign bit of one coeff per TU (rdo). The last sign is implied.
+	This requires analyzing all the coefficients to determine if a sign
+	must be toggled, and then to determine which one can be toggled with
+	the least amount of distortion. Default enabled
+
+.. option:: --qpfile <filename>
+
+	Specify a text file which contains frametypes and QPs for some or
+	all frames. The format of each line is:
+
+	framenumber frametype QP
+
+	Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
+	**b** is an unreferenced B frame.  **I** is a keyframe (random
+	access point) while **i** is a I frame that is not a keyframe
+	(references are not broken).
+
+	Specifying QP (integer) is optional, and if specified they are
+	clamped within the encoder to qpmin/qpmax.
+
+.. option:: --scaling-list <filename>
+
+	Quantization scaling lists. HEVC supports 6 quantization scaling
+	lists to be defined; one each for Y, Cb, Cr for intra prediction and
+	one each for inter prediction.
+
+	x265 does not use scaling lists by default, but this can also be
+	made explicit by :option:`--scaling-list` *off*.
+
+	HEVC specifies a default set of scaling lists which may be enabled
+	without requiring them to be signaled in the SPS. Those scaling
+	lists can be enabled via :option:`--scaling-list` *default*.
+    
+	All other strings indicate a filename containing custom scaling
+	lists in the HM format. The encode will abort if the file is not
+	parsed correctly. Custom lists must be signaled in the SPS
+
+.. option:: --lambda-file <filename>
+
+	Specify a text file containing values for x265_lambda_tab and
+	x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
+	values.
+	
+	The text file syntax is simple. Comma is considered to be
+	white-space. All white-space is ignored. Lines must be less than 2k
+	bytes in length. Content following hash (#) characters are ignored.
+	The values read from the file are logged at :option:`--log-level`
+	debug.
+
+	Note that the lambda tables are process-global and so the new values
+	affect all encoders running in the same process. 
+	
+	Lambda values affect encoder mode decisions, the lower the lambda
+	the more bits it will try to spend on signaling information (motion
+	vectors and splits) and less on residual. This feature is intended
+	for experimentation.
 
 Loop filters
 ============
 
-.. option:: --lft, --no-lft
+.. option:: --deblock=<int>:<int>, --no-deblock
+
+	Toggle deblocking loop filter, optionally specify deblocking
+	strength offsets.
+
+	<int>:<int> - parsed as tC offset and Beta offset
+	<int>,<int> - parsed as tC offset and Beta offset
+	<int>       - both tC and Beta offsets assigned the same value
+
+	If unspecified, the offsets default to 0. The offsets must be in a
+	range of -6 (lowest strength) to 6 (highest strength).
+
+	To disable the deblocking filter entirely, use --no-deblock or
+	--deblock=false. Default enabled, with both offsets defaulting to 0
 
-	Toggle deblocking loop filter, default enabled
+	If deblocking is disabled, or the offsets are non-zero, these
+	changes from the default configuration are signaled in the PPS.
 
 .. option:: --sao, --no-sao
 
@@ -1172,7 +1257,7 @@ VUI fields must be manually specified.
 	9. bt2020nc
 	10. bt2020c
 
-.. option:: --chromalocs <0..5>
+.. option:: --chromaloc <0..5>
 
 	Specify chroma sample location for 4:2:0 inputs. Consult the HEVC
 	specification for a description of these values. Default undefined
@@ -1206,7 +1291,7 @@ Bitstream options
 .. option:: --aud, --no-aud
 
 	Emit an access unit delimiter NAL at the start of each slice access
-	unit. If option:`--repeat-headers` is not enabled (indicating the
+	unit. If :option:`--repeat-headers` is not enabled (indicating the
 	user will be writing headers manually at the start of the stream)
 	the very first AUD will be skipped since it cannot be placed at the
 	start of the access unit, where it belongs. Default disabled
diff --git a/doc/reST/conf.py b/doc/reST/conf.py
index 561f7d0..eea837f 100644
--- a/doc/reST/conf.py
+++ b/doc/reST/conf.py
@@ -15,3 +15,12 @@ copyright = u'2014 MulticoreWare Inc'
 
 # -- Options for HTML output ---------------------------------------------------
 html_theme = "default"
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'libx265', 'Full x265 Documentation',
+    ['MulticoreWare Inc'], 3),
+    ('x265', 'x265', 'x265 CLI Documentation',
+    ['MulticoreWare Inc'], 1)
+]
diff --git a/doc/reST/introduction.rst b/doc/reST/introduction.rst
index 1d953f4..c503946 100644
--- a/doc/reST/introduction.rst
+++ b/doc/reST/introduction.rst
@@ -75,7 +75,7 @@ responsible for understanding the laws in your country, and for
 licensing all applicable patent rights needed for use or distribution of
 software applications created from the x265 source code.  A good place
 to start is with the `Motion Picture Experts Group - Licensing Authority
-- HEVC Licensing Program<http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
+- HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
 
 x265 is a registered trademark of MulticoreWare, Inc.  The x265 logo is
 a trademark of MulticoreWare, and may only be used with explicit written
diff --git a/doc/reST/presets.rst b/doc/reST/presets.rst
index 99085a2..63134ea 100644
--- a/doc/reST/presets.rst
+++ b/doc/reST/presets.rst
@@ -1,11 +1,11 @@
 Preset Options
 --------------
 
+.. _presets:
+
 Presets
 =======
 
-.. _preset-tune-ref:
-
 x265 has a number of predefined :option:`--preset` options that make
 trade-offs between encode speed (encoded frames per second) and
 compression efficiency (quality per bit in the bitstream).  The default
@@ -66,7 +66,7 @@ The presets adjust encoder parameters to affect these trade-offs.
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
 | rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| lft          |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
+| deblock      |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
 | tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
@@ -75,6 +75,8 @@ The presets adjust encoder parameters to affect these trade-offs.
 
 Placebo mode enables transform-skip prediction evaluation.
 
+.. _tunings:
+
 Tuning
 ======
 
@@ -97,7 +99,46 @@ after the preset.
 +--------------+-----------------------------------------------------+
 | ssim         | enables adaptive quant auto-mode, disables psy-rd   |
 +--------------+-----------------------------------------------------+
+| grain        | improves retention of film grain. more below        |
++--------------+-----------------------------------------------------+
 | fastdecode   | no loop filters, no weighted pred, no intra in B    |
 +--------------+-----------------------------------------------------+
 | zerolatency  | no lookahead, no B frames, no cutree                |
 +--------------+-----------------------------------------------------+
+| cbr          | --pbratio 1.0 --ratetol 0.5                         |
++--------------+-----------------------------------------------------+
+
+
+Film Grain Retention
+~~~~~~~~~~~~~~~~~~~~
+
+:option:`--tune` grain tries to improve the retention of film grain in
+the reconstructed output. It helps rate distortion optimizations select
+modes which preserve high frequency noise:
+
+    * :option:`--psy-rd` 0.5
+    * :option:`--psy-rdoq` 30
+
+.. Note::
+
+    --psy-rdoq is only effective when RDOQuant is enabled, which is at
+    RD levels 4, 5, and 6 (presets slow and below).
+
+It lowers the strength of adaptive quantization, so residual energy can
+be more evenly distributed across the (noisy) picture:
+
+    * :option:`--aq-mode` 1
+    * :option:`--aq-strength` 0.3
+
+And it similarly tunes rate control to prevent the slice QP from
+swinging too wildly from frame to frame:
+
+    * :option:`--ipratio` 1.1
+    * :option:`--pbratio` 1.1
+    * :option:`--qcomp` 0.8
+
+And lastly it reduces the strength of deblocking to prevent grain being
+blurred on block boundaries:
+
+    * :option:`--deblock` -2
+
diff --git a/doc/reST/x265.rst b/doc/reST/x265.rst
new file mode 100644
index 0000000..32a416d
--- /dev/null
+++ b/doc/reST/x265.rst
@@ -0,0 +1,49 @@
+x265 CLI Documentation
+######################
+
+
+SYNOPSIS
+========
+
+**x265** [options] infile [-o] outfile
+
+Bit depth: 8
+
+
+**x265-10bit** [options] infile [-o] outfile
+
+Bit depth: 10
+
+
+infile can be YUV or Y4M
+
+outfile is raw HEVC bitstream
+
+
+DESCRIPTION
+===========
+
+.. toctree::
+   :maxdepth: 2
+
+   introduction
+
+
+OPTIONS
+=======
+
+.. toctree::
+   :maxdepth: 2
+
+   cli
+   presets
+   lossless
+
+
+SEE ALSO
+========
+
+**libx265**\(3)
+
+Online documentation: http://x265.readthedocs.org/en/default/cli.html
+
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index ba63f81..75b5eb4 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -21,7 +21,7 @@ include(CheckSymbolExists)
 include(CheckCXXCompilerFlag)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 35)
+set(X265_BUILD 40)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -63,6 +63,12 @@ if(UNIX)
     endif()
 endif(UNIX)
 
+if(X64 AND NOT WIN32)
+    option(ENABLE_PIC "Enable Position Independent Code" ON)
+else()
+    option(ENABLE_PIC "Enable Position Independent Code" OFF)
+endif(X64 AND NOT WIN32)
+
 # Compiler detection
 if(CMAKE_GENERATOR STREQUAL "Xcode")
   set(XCODE 1)
@@ -121,9 +127,9 @@ endif()
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
-    if(X64 AND NOT WIN32)
-        add_definitions(-fPIC)
-    endif(X64 AND NOT WIN32)
+    if(ENABLE_PIC)
+         add_definitions(-fPIC)
+    endif(ENABLE_PIC)
     if(X86 AND NOT X64)
         add_definitions(-march=i686)
     endif()
diff --git a/source/PPA/ppa.cpp b/source/PPA/ppa.cpp
index 607a946..9e924c9 100644
--- a/source/PPA/ppa.cpp
+++ b/source/PPA/ppa.cpp
@@ -41,8 +41,10 @@ typedef ppa::Base *(FUNC_PPALibInit)(const char **, int);
 typedef void (FUNC_PPALibRelease)(ppa::Base* &);
 }
 
+using namespace ppa;
+
 static FUNC_PPALibRelease *_pfuncPpaRelease;
-ppa::Base *ppabase;
+ppa::Base *ppa::ppabase;
 
 static void _ppaReleaseAtExit()
 {
diff --git a/source/PPA/ppa.h b/source/PPA/ppa.h
index 42f43b8..abb2a12 100644
--- a/source/PPA/ppa.h
+++ b/source/PPA/ppa.h
@@ -21,17 +21,8 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
-#ifndef _PPA_H_
-#define _PPA_H_
-
-#if !defined(ENABLE_PPA)
-
-#define PPA_INIT()
-#define PPAStartCpuEventFunc(e)
-#define PPAStopCpuEventFunc(e)
-#define PPAScopeEvent(e)
-
-#else
+#ifndef PPA_H
+#define PPA_H
 
 /* declare enum list of users CPU events */
 #define PPA_REGISTER_CPU_EVENT(x) x,
@@ -40,32 +31,13 @@ enum PPACpuEventEnum
 #include "ppaCPUEvents.h"
     PPACpuGroupNums
 };
-
 #undef PPA_REGISTER_CPU_EVENT
 
-#define PPA_INIT()               initializePPA()
-#define PPAStartCpuEventFunc(e)  if (ppabase) ppabase->triggerStartEvent(ppabase->getEventId(e))
-#define PPAStopCpuEventFunc(e)   if (ppabase) ppabase->triggerEndEvent(ppabase->getEventId(e))
-#define PPAScopeEvent(e)         _PPAScope __scope_(e)
-
 #include "ppaApi.h"
 
 void initializePPA();
-extern ppa::Base *ppabase;
-
-class _PPAScope
-{
-protected:
-
-    ppa::EventID m_id;
-
-public:
-
-    _PPAScope(int e) { if (ppabase) { m_id = ppabase->getEventId(e); ppabase->triggerStartEvent(m_id); } else m_id = 0; }
 
-    ~_PPAScope()     { if (ppabase) ppabase->triggerEndEvent(m_id); }
-};
-
-#endif // if !defined(ENABLE_PPA)
+#define PPA_INIT()               initializePPA()
+#define PPAScopeEvent(e)         ppa::ProfileScope ppaScope_(e)
 
-#endif /* _PPA_H_ */
+#endif /* PPA_H */
diff --git a/source/PPA/ppaApi.h b/source/PPA/ppaApi.h
index 149de6d..15fa76b 100644
--- a/source/PPA/ppaApi.h
+++ b/source/PPA/ppaApi.h
@@ -54,6 +54,17 @@ protected:
 
     virtual void init(const char **pNames, int eventCount) = 0;
 };
+
+extern ppa::Base *ppabase;
+
+struct ProfileScope
+{
+    ppa::EventID id;
+
+    ProfileScope(int e) { if (ppabase) { id = ppabase->getEventId(e); ppabase->triggerStartEvent(id); } else id = 0; }
+    ~ProfileScope()     { if (ppabase) ppabase->triggerEndEvent(id); }
+};
+
 }
 
 #endif //_PPA_API_H_
diff --git a/source/PPA/ppaCPUEvents.h b/source/PPA/ppaCPUEvents.h
index 1a47b39..203f055 100644
--- a/source/PPA/ppaCPUEvents.h
+++ b/source/PPA/ppaCPUEvents.h
@@ -1,25 +1,6 @@
-PPA_REGISTER_CPU_EVENT(encode_block)
-PPA_REGISTER_CPU_EVENT(bitstream_write)
-PPA_REGISTER_CPU_EVENT(DPB_prepareEncode)
-PPA_REGISTER_CPU_EVENT(FrameEncoder_compressFrame)
-PPA_REGISTER_CPU_EVENT(FrameEncoder_compressRows)
-PPA_REGISTER_CPU_EVENT(CompressCU)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth1)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth2)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth3)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth4)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth1)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth2)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth3)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth4)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth1)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth2)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth3)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth4)
-PPA_REGISTER_CPU_EVENT(CalcRDCostIntra)
-PPA_REGISTER_CPU_EVENT(Thread_ProcessRow)
-PPA_REGISTER_CPU_EVENT(Thread_compressCU)
-PPA_REGISTER_CPU_EVENT(Thread_encodeCU)
-PPA_REGISTER_CPU_EVENT(Thread_filterCU)
+PPA_REGISTER_CPU_EVENT(bitstreamWrite)
+PPA_REGISTER_CPU_EVENT(frameThread)
+PPA_REGISTER_CPU_EVENT(encodeCTU)
+PPA_REGISTER_CPU_EVENT(filterCTURow)
+PPA_REGISTER_CPU_EVENT(slicetypeDecideEV)
+PPA_REGISTER_CPU_EVENT(costEstimateRow)
diff --git a/source/cmake/CMakeASM_YASMInformation.cmake b/source/cmake/CMakeASM_YASMInformation.cmake
index 0af7c24..7a7586c 100644
--- a/source/cmake/CMakeASM_YASMInformation.cmake
+++ b/source/cmake/CMakeASM_YASMInformation.cmake
@@ -2,7 +2,10 @@ set(ASM_DIALECT "_YASM")
 set(CMAKE_ASM${ASM_DIALECT}_SOURCE_FILE_EXTENSIONS asm)
 
 if(X64)
-    list(APPEND ASM_FLAGS -DARCH_X86_64=1 -DPIC)
+    list(APPEND ASM_FLAGS -DARCH_X86_64=1)
+    if(ENABLE_PIC)
+        list(APPEND ASM_FLAGS -DPIC)
+    endif()
     if(APPLE)
         set(ARGS -f macho64 -m amd64 -DPREFIX)
     elseif(UNIX AND NOT CYGWIN)
diff --git a/source/cmake/FindVLD.cmake b/source/cmake/FindVLD.cmake
index 716625c..ece8bae 100644
--- a/source/cmake/FindVLD.cmake
+++ b/source/cmake/FindVLD.cmake
@@ -54,11 +54,14 @@ ELSEIF (CMAKE_SIZEOF_VOID_P EQUAL 8)
   LIST (APPEND _VLD_POSSIBLE_LIB_SUFFIXES lib/Win64)
 ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4)
 
+SET (PFILES "ProgramFiles")
+SET (PFILES_X86 "ProgramFiles(x86)") # hack to avoid escaping issues in cmake 3.1
+
 FIND_PATH (VLD_ROOT_DIR
   NAMES include/vld.h
   PATHS ENV VLDROOT
-        "$ENV{PROGRAMFILES}/Visual Leak Detector"
-        "$ENV{PROGRAMFILES(X86)}/Visual Leak Detector"
+        "$ENV{PFILES}/Visual Leak Detector"
+        "$ENV{PFILES_X86}/Visual Leak Detector"
         "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]"
         "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]"
   DOC "VLD root directory")
diff --git a/source/cmake/version.cmake b/source/cmake/version.cmake
index b6adfb9..0662cda 100644
--- a/source/cmake/version.cmake
+++ b/source/cmake/version.cmake
@@ -6,9 +6,9 @@ endif()
 find_package(Git QUIET) # present in 2.8.8
 
 # defaults, in case everything below fails
-set(X265_VERSION "unknown")
-set(X265_LATEST_TAG "0.0")
-set(X265_TAG_DISTANCE "0")
+set(X265_VERSION "1.4+222+hg5f9f7194267b")
+set(X265_LATEST_TAG "1.4")
+set(X265_TAG_DISTANCE "222")
 
 if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
     # read the lines of the archive summary file to extract the version
@@ -22,9 +22,9 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
         set(hg_${key} ${value})
     endforeach()
     if(DEFINED hg_tag)
-        set(X265_VERSION ${hg_tag} CACHE STRING "x265 version string.")
+        set(X265_VERSION ${hg_tag})
         set(X265_LATEST_TAG ${hg_tag})
-        set(X265_TAG_DISTANCE "0")
+        set(X265_TAG_DISTANCE "222")
     elseif(DEFINED hg_node)
         string(SUBSTRING "${hg_node}" 0 16 hg_id)
         set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}")
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 46929ca..4ead346 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -1,44 +1,46 @@
 # vim: syntax=cmake
-set(SSE3  vec/dct-sse3.cpp)
-set(SSSE3 vec/dct-ssse3.cpp)
-set(SSE41 vec/dct-sse41.cpp)
 
-if(MSVC AND X86)
-    set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
-    set(WARNDISABLE "/wd4100") # unreferenced formal parameter
-    if(INTEL_CXX)
-        add_definitions(/Qwd111) # statement is unreachable
-        add_definitions(/Qwd128) # loop is unreachable
-        add_definitions(/Qwd177) # declared function is unused
-        add_definitions(/Qwd185) # dynamic initialization in unreachable code
-        add_definitions(/Qwd280) # conditional expression is constant
-    endif()
-    if(X64)
-        set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}")
-    else()
-        # x64 implies SSE4, so only add /arch:SSE2 if building for Win32
-        set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
-    endif()
-endif()
-if(GCC AND X86)
-    if(CLANG)
-        # llvm intrinsic headers cause shadow warnings
-        set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
-    else()
-        set(WARNDISABLE "-Wno-unused-parameter")
-    endif()
-    if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3))
+if(ENABLE_ASSEMBLY)
+    set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+
+    set(SSE3  vec/dct-sse3.cpp)
+    set(SSSE3 vec/dct-ssse3.cpp)
+    set(SSE41 vec/dct-sse41.cpp)
+
+    if(MSVC AND X86)
         set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
-        set_source_files_properties(${SSE3}  PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3")
-        set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3")
-        set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1")
+        set(WARNDISABLE "/wd4100") # unreferenced formal parameter
+        if(INTEL_CXX)
+            add_definitions(/Qwd111) # statement is unreachable
+            add_definitions(/Qwd128) # loop is unreachable
+            add_definitions(/Qwd177) # declared function is unused
+            add_definitions(/Qwd185) # dynamic initialization in unreachable code
+            add_definitions(/Qwd280) # conditional expression is constant
+        endif()
+        if(X64)
+            set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}")
+        else()
+            # x64 implies SSE4, so only add /arch:SSE2 if building for Win32
+            set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
+        endif()
     endif()
-endif()
-set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
-source_group(Intrinsics FILES ${VEC_PRIMITIVES})
+    if(GCC AND X86)
+        if(CLANG)
+            # llvm intrinsic headers cause shadow warnings
+            set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
+        else()
+            set(WARNDISABLE "-Wno-unused-parameter")
+        endif()
+        if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3))
+            set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
+            set_source_files_properties(${SSE3}  PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3")
+            set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3")
+            set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1")
+        endif()
+    endif()
+    set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
+    source_group(Intrinsics FILES ${VEC_PRIMITIVES})
 
-if(ENABLE_ASSEMBLY)
-    set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
     set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
                mc-a2.asm pixel-util8.asm blockcopy8.asm
diff --git a/source/common/common.h b/source/common/common.h
index b447bb3..a0f8286 100644
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -41,6 +41,15 @@
 
 #include "x265.h"
 
+#if ENABLE_PPA
+#include "PPA/ppa.h"
+#define ProfileScopeEvent(x) PPAScopeEvent(x)
+#define PROFILE_INIT()       PPA_INIT()
+#else
+#define ProfileScopeEvent(x)
+#define PROFILE_INIT()
+#endif
+
 #define FENC_STRIDE 64
 #define NUM_INTRA_MODE 35
 
@@ -56,6 +65,10 @@ extern "C" intptr_t x265_stack_align(void (*func)(), ...);
 #define x265_stack_align(func, ...) func(__VA_ARGS__)
 #endif
 
+#if defined(__MINGW32__)
+#define fseeko fseeko64
+#endif
+
 #elif defined(_MSC_VER)
 
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
@@ -245,9 +258,6 @@ typedef int16_t  coeff_t;      // transform coefficient
 #define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
 #define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
 
-#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
-#define MAX_NUM_TR_CATEGORIES    8                         /* 32, 16, 8, 4 transform categories each for luma and chroma */
-
 #define COEF_REMAIN_BIN_REDUCTION   3 // indicates the level at which the VLC
                                       // transitions from Golomb-Rice to TU+EG(k)
 
@@ -297,21 +307,12 @@ typedef int16_t  coeff_t;      // transform coefficient
 
 #define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422)
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
+#define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
 
 namespace x265 {
 
 enum { SAO_NUM_OFFSET = 4 };
 
-// NOTE: MUST be alignment to 16 or 32 bytes for asm code
-struct NoiseReduction
-{
-    /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
-     * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
-    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
-    uint32_t count[MAX_NUM_TR_CATEGORIES];
-};
-
 enum SaoMergeMode
 {
     SAO_MERGE_NONE,
@@ -358,6 +359,20 @@ struct SAOParam
     }
 };
 
+/* Stores inter (motion estimation) analysis data for a single frame */
+struct analysis_inter_data
+{
+    int      ref;
+};
+
+/* Stores intra analysis data for a single frame. This struct needs better packing */
+struct analysis_intra_data
+{
+    uint8_t*  depth;
+    uint8_t*  modes;
+    char*     partSizes;
+};
+
 enum TextType
 {
     TEXT_LUMA     = 0,  // luma
diff --git a/source/common/constants.cpp b/source/common/constants.cpp
index 4252cb4..749d888 100644
--- a/source/common/constants.cpp
+++ b/source/common/constants.cpp
@@ -27,21 +27,46 @@
 
 namespace x265 {
 
-static int initialized /* = 0 */;
-
-// initialize ROM variables
-void initROM()
+#if HIGH_BIT_DEPTH
+// lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8));
+double x265_lambda_tab[QP_MAX_MAX + 1] =
 {
-    if (ATOMIC_CAS32(&initialized, 0, 1) == 1)
-        return;
-}
+    1.0000, 1.1225, 1.2599, 1.4142, 1.5874, 
+    1.7818, 2.0000, 2.2449, 2.5198, 2.8284, 
+    3.1748, 3.5636, 4.0000, 4.4898, 5.0397, 
+    5.6569, 6.3496, 7.1272, 8.0000, 8.9797, 
+    10.0794, 11.3137, 12.6992, 14.2544, 16.0000, 
+    17.9594, 20.1587, 22.6274, 25.3984, 28.5088, 
+    32.0000, 35.9188, 40.3175, 45.2548, 50.7968, 
+    57.0175, 64.0000, 71.8376, 80.6349, 90.5097, 
+    101.5937, 114.0350, 128.0000, 143.6751, 161.2699, 
+    181.0193, 203.1873, 228.0701, 256.0000, 287.3503, 
+    322.5398, 362.0387, 406.3747, 456.1401, 512.0000, 
+    574.7006, 645.0796, 724.0773, 812.7493, 912.2803, 
+    1024.0000, 1149.4011, 1290.1592, 1448.1547, 1625.4987, 
+    1824.5606, 2048.0000, 2298.8023, 2580.3183, 2896.3094,
+};
 
-void destroyROM()
+// lambda2 = pow(lambda, 2) * scale (0.85);
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
 {
-    if (ATOMIC_CAS32(&initialized, 1, 0) == 0)
-        return;
-}
+    0.8500, 1.0709, 1.3493, 1.7000, 2.1419, 
+    2.6986, 3.4000, 4.2837, 5.3972, 6.8000, 
+    8.5675, 10.7943, 13.6000, 17.1349, 21.5887, 
+    27.2000, 34.2699, 43.1773, 54.4000, 68.5397, 
+    86.3546, 108.8000, 137.0794, 172.7092, 217.6000, 
+    274.1588, 345.4185, 435.2000, 548.3176, 690.8369, 
+    870.4000, 1096.6353, 1381.6739, 1740.8000, 2193.2706, 
+    2763.3478, 3481.6000, 4386.5411, 5526.6955, 6963.2000, 
+    8773.0823, 11053.3910, 13926.4000, 17546.1645, 22106.7820, 
+    27852.8000, 35092.3290, 44213.5640, 55705.6000, 70184.6580, 
+    88427.1280, 111411.2000, 140369.3161, 176854.2561, 222822.4000, 
+    280738.6321, 353708.5122, 445644.8000, 561477.2643, 707417.0243, 
+    891289.6000, 1122954.5286, 1414834.0486, 1782579.2000, 2245909.0572, 
+    2829668.0973, 3565158.4000, 4491818.1144, 5659336.1946, 7130316.8000, 
+};
 
+#else /* !HIGH_BIT_DEPTH */
 
 // lambda = pow(2, (double)q / 6 - 2);
 double x265_lambda_tab[QP_MAX_MAX + 1] =
@@ -81,6 +106,8 @@ double x265_lambda2_tab[QP_MAX_MAX + 1] =
     176854.2222, 222822.4000, 280738.6627, 353708.5368, 445644.7459
 };
 
+#endif
+
 const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
 {
        16,    20,    25,    32,    40,    50,
diff --git a/source/common/constants.h b/source/common/constants.h
index 9db47db..fa82b80 100644
--- a/source/common/constants.h
+++ b/source/common/constants.h
@@ -29,9 +29,6 @@
 namespace x265 {
 // private namespace
 
-void initROM();
-void destroyROM();
-
 void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx);
 void initRasterToZscan(uint32_t maxFullDepth);
 
diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp
index d28e005..c2e3c23 100644
--- a/source/common/cudata.cpp
+++ b/source/common/cudata.cpp
@@ -227,16 +227,15 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp,
     /* Each CU's data is layed out sequentially within the charMemBlock */
     uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
 
-    m_qp          = (char*)charBuf; charBuf += m_numPartitions;
+    m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
     m_log2CUSize         = charBuf; charBuf += m_numPartitions;
-    m_partSize           = charBuf; charBuf += m_numPartitions;
-    m_predMode           = charBuf; charBuf += m_numPartitions;
     m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
     m_tqBypass           = charBuf; charBuf += m_numPartitions;
-    m_refIdx[0]   = (char*)charBuf; charBuf += m_numPartitions;
-    m_refIdx[1]   = (char*)charBuf; charBuf += m_numPartitions;
+    m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+    m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
     m_cuDepth            = charBuf; charBuf += m_numPartitions;
-    m_skipFlag           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+    m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+    m_partSize           = charBuf; charBuf += m_numPartitions;
     m_mergeFlag          = charBuf; charBuf += m_numPartitions;
     m_interDir           = charBuf; charBuf += m_numPartitions;
     m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
@@ -278,8 +277,6 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
     m_partSet(m_log2CUSize,   (uint8_t)g_maxLog2CUSize);
-    m_partSet(m_partSize,     (uint8_t)SIZE_NONE);
-    m_partSet(m_predMode,     (uint8_t)MODE_NONE);
     m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
     if (m_slice->m_sliceType != I_SLICE)
@@ -291,7 +288,7 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
 
     /* initialize the remaining CU data in one memset */
-    memset(m_cuDepth, 0, (BytesPerPartition - 8) * m_numPartitions);
+    memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
 
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
@@ -318,8 +315,6 @@ void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom)
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)ctu.m_qp[0]);
     m_partSet(m_log2CUSize,   (uint8_t)cuGeom.log2CUSize);
-    m_partSet(m_partSize,     (uint8_t)SIZE_NONE);
-    m_partSet(m_predMode,     (uint8_t)MODE_NONE);
     m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
     m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
@@ -327,7 +322,7 @@ void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom)
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
 
     /* initialize the remaining CU data in one memset */
-    memset(m_skipFlag, 0, (BytesPerPartition - 9) * m_numPartitions);
+    memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
 }
 
 /* Copy the results of a sub-part (split) CU to the parent CU */
@@ -339,14 +334,13 @@ void CUData::copyPartFrom(const CUData& subCU, const CUGeom& childGeom, uint32_t
 
     m_subPartCopy((uint8_t*)m_qp + offset, (uint8_t*)subCU.m_qp);
     m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize);
-    m_subPartCopy(m_partSize + offset, subCU.m_partSize);
-    m_subPartCopy(m_predMode + offset, subCU.m_predMode);
     m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir);
     m_subPartCopy(m_tqBypass + offset, subCU.m_tqBypass);
     m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]);
     m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]);
     m_subPartCopy(m_cuDepth + offset, subCU.m_cuDepth);
-    m_subPartCopy(m_skipFlag + offset, subCU.m_skipFlag);
+    m_subPartCopy(m_predMode + offset, subCU.m_predMode);
+    m_subPartCopy(m_partSize + offset, subCU.m_partSize);
     m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag);
     m_subPartCopy(m_interDir + offset, subCU.m_interDir);
     m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
@@ -410,7 +404,7 @@ void CUData::initLosslessCU(const CUData& cu, const CUGeom& cuGeom)
     m_partSet(m_tqBypass, true);
 
     /* clear residual coding flags */
-    m_partSet(m_skipFlag, 0);
+    m_partSet(m_predMode, cu.m_predMode[0] & (MODE_INTRA | MODE_INTER));
     m_partSet(m_tuDepth, 0);
     m_partSet(m_transformSkip[0], 0);
     m_partSet(m_transformSkip[1], 0);
@@ -427,14 +421,13 @@ void CUData::copyToPic(uint32_t depth) const
 
     m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp);
     m_partCopy(ctu.m_log2CUSize + m_absIdxInCTU, m_log2CUSize);
-    m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
-    m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
     m_partCopy(ctu.m_lumaIntraDir + m_absIdxInCTU, m_lumaIntraDir);
     m_partCopy(ctu.m_tqBypass + m_absIdxInCTU, m_tqBypass);
     m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]);
     m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]);
     m_partCopy(ctu.m_cuDepth + m_absIdxInCTU, m_cuDepth);
-    m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag);
+    m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
+    m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
     m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag);
     m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir);
     m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]);
@@ -477,13 +470,13 @@ void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
     /* copy out all prediction info for this part */
     m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
     m_partCopy(m_log2CUSize,   ctu.m_log2CUSize + m_absIdxInCTU);
-    m_partCopy(m_partSize,     ctu.m_partSize + m_absIdxInCTU);
-    m_partCopy(m_predMode,     ctu.m_predMode + m_absIdxInCTU);
     m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
     m_partCopy(m_tqBypass,     ctu.m_tqBypass + m_absIdxInCTU);
     m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU);
     m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU);
     m_partCopy(m_cuDepth,      ctu.m_cuDepth + m_absIdxInCTU);
+    m_partSet(m_predMode, ctu.m_predMode[m_absIdxInCTU] & (MODE_INTRA | MODE_INTER)); /* clear skip flag */
+    m_partCopy(m_partSize,     ctu.m_partSize + m_absIdxInCTU);
     m_partCopy(m_mergeFlag,    ctu.m_mergeFlag + m_absIdxInCTU);
     m_partCopy(m_interDir,     ctu.m_interDir + m_absIdxInCTU);
     m_partCopy(m_mvpIdx[0],    ctu.m_mvpIdx[0] + m_absIdxInCTU);
@@ -496,7 +489,6 @@ void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
     memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
 
     /* clear residual coding flags */
-    m_partSet(m_skipFlag, 0);
     m_partSet(m_tuDepth, 0);
     m_partSet(m_transformSkip[0], 0);
     m_partSet(m_transformSkip[1], 0);
@@ -515,7 +507,7 @@ void CUData::updatePic(uint32_t depth) const
     m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]);
     m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
     m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
-    m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag);
+    m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
     m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth);
     m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]);
     m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
@@ -552,7 +544,7 @@ const CUData* CUData::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx)
     return m_cuLeft;
 }
 
-const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary) const
+const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const
 {
     uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
 
@@ -563,15 +555,10 @@ const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx
         if (isEqualRow(absPartIdx, absZorderCUIdx, s_numPartInCUSize))
             return m_encData->getPicCTU(m_cuAddr);
         else
-        {
             aPartUnitIdx -= m_absIdxInCTU;
-            return this;
-        }
+        return this;
     }
 
-    if (planarAtCTUBoundary)
-        return NULL;
-
     aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize];
     return m_cuAbove;
 }
@@ -785,7 +772,7 @@ const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdx
 }
 
 /* Get reference QP from left QpMinCu or latest coded QP */
-char CUData::getRefQP(uint32_t curAbsIdxInCTU) const
+int8_t CUData::getRefQP(uint32_t curAbsIdxInCTU) const
 {
     uint32_t lPartIdx = 0, aPartIdx = 0;
     const CUData* cULeft = getQpMinCuLeft(lPartIdx, m_absIdxInCTU + curAbsIdxInCTU);
@@ -807,7 +794,7 @@ int CUData::getLastValidPartIdx(int absPartIdx) const
     return lastValidPartIdx;
 }
 
-char CUData::getLastCodedQP(uint32_t absPartIdx) const
+int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const
 {
     uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
     int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask);
@@ -821,7 +808,7 @@ char CUData::getLastCodedQP(uint32_t absPartIdx) const
         else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth)))
             return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS);
         else
-            return (char)m_slice->m_sliceQp;
+            return (int8_t)m_slice->m_sliceQp;
     }
 }
 
@@ -859,7 +846,7 @@ int CUData::getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred
     leftIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX;
 
     // Get intra direction of above PU
-    tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx, true);
+    tempCU = g_zscanToPelY[m_absIdxInCTU + absPartIdx] > 0 ? getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx) : NULL;
 
     aboveIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX;
 
@@ -912,7 +899,7 @@ uint32_t CUData::getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const
 void CUData::getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const
 {
     uint32_t log2CUSize = m_log2CUSize[absPartIdx];
-    uint32_t splitFlag = m_partSize[absPartIdx] == SIZE_NxN;
+    uint32_t splitFlag = m_partSize[absPartIdx] != SIZE_2Nx2N;
 
     tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
     tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
@@ -949,7 +936,7 @@ uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const
     return ctx;
 }
 
-bool CUData::setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth)
+bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth)
 {
     uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1);
     uint32_t curPartNumQ = curPartNumb >> 2;
@@ -1224,7 +1211,7 @@ void CUData::setPUMv(int list, const MV& mv, int absPartIdx, int puIdx)
     setAllPU(m_mv[list], mv, absPartIdx, puIdx);
 }
 
-void CUData::setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx)
+void CUData::setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx)
 {
     setAllPU(m_refIdx[list], refIdx, absPartIdx, puIdx);
 }
@@ -1250,7 +1237,7 @@ void CUData::getMvField(const CUData* cu, uint32_t absPartIdx, int picList, MVFi
     else
     {
         // OUT OF BOUNDARY
-        outMvField.mv.word = 0;
+        outMvField.mv = 0;
         outMvField.refIdx = REF_NOT_VALID;
     }
 }
@@ -1412,6 +1399,8 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
 
     for (uint32_t i = 0; i < maxNumMergeCand; ++i)
     {
+        mvFieldNeighbours[i][0].mv = 0;
+        mvFieldNeighbours[i][1].mv = 0;
         mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
         mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID;
     }
@@ -1441,7 +1430,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
     bool isAvailableA1 = cuLeft &&
         cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) &&
         !(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == SIZE_nRx2N)) &&
-        !cuLeft->isIntra(leftPartIdx);
+        cuLeft->isInter(leftPartIdx);
     if (isAvailableA1)
     {
         // get Inter Dir
@@ -1465,7 +1454,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
     bool isAvailableB1 = cuAbove &&
         cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) &&
         !(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) &&
-        !cuAbove->isIntra(abovePartIdx);
+        cuAbove->isInter(abovePartIdx);
     if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx)))
     {
         // get Inter Dir
@@ -1486,7 +1475,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
     const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT);
     bool isAvailableB0 = cuAboveRight &&
         cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) &&
-        !cuAboveRight->isIntra(aboveRightPartIdx);
+        cuAboveRight->isInter(aboveRightPartIdx);
     if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx)))
     {
         // get Inter Dir
@@ -1507,7 +1496,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
     const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB);
     bool isAvailableA0 = cuLeftBottom &&
         cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) &&
-        !cuLeftBottom->isIntra(leftBottomPartIdx);
+        cuLeftBottom->isInter(leftBottomPartIdx);
     if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx)))
     {
         // get Inter Dir
@@ -1530,7 +1519,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
         const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr);
         bool isAvailableB2 = cuAboveLeft &&
             cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) &&
-            !cuAboveLeft->isIntra(aboveLeftPartIdx);
+            cuAboveLeft->isInter(aboveLeftPartIdx);
         if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAboveLeft, aboveLeftPartIdx))
             && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx)))
         {
@@ -1659,7 +1648,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
     while (count < maxNumMergeCand)
     {
         interDirNeighbours[count] = 1;
-        mvFieldNeighbours[count][0].mv.word = 0;
+        mvFieldNeighbours[count][0].mv = 0;
         mvFieldNeighbours[count][0].refIdx = r;
 
         if (isInterB)
@@ -1966,26 +1955,18 @@ bool CUData::addMVPCandOrder(MV& outMV, int picList, int refIdx, uint32_t partUn
 
 bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const
 {
-    uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
-
-    int colRefPicList;
-    int colPOC, colRefPOC, curPOC, curRefPOC;
-    MV colmv;
-
-    // use coldir.
-    Frame *colPic = m_slice->m_refPicList[m_slice->isInterB() ? 1 - m_slice->m_colFromL0Flag : 0][m_slice->m_colRefIdx];
-    CUData *colCU = colPic->m_encData->getPicCTU(cuAddr);
+    const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+    const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
 
-    if (colCU->m_partSize[partUnitIdx] == SIZE_NONE)
+    if (colCU->m_predMode[partUnitIdx] == MODE_NONE)
         return false;
 
-    curPOC = m_slice->m_poc;
-    colPOC = colCU->m_slice->m_poc;
+    uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
 
     if (colCU->isIntra(absPartAddr))
         return false;
 
-    colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag;
+    int colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag;
 
     int colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr];
 
@@ -1999,9 +1980,12 @@ bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int p
     }
 
     // Scale the vector
-    colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx];
-    colmv = colCU->m_mv[colRefPicList][absPartAddr];
-    curRefPOC = m_slice->m_refPOCList[picList][outRefIdx];
+    int colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx];
+    int colPOC = colCU->m_slice->m_poc;
+    MV colmv = colCU->m_mv[colRefPicList][absPartAddr];
+
+    int curRefPOC = m_slice->m_refPOCList[picList][outRefIdx];
+    int curPOC = m_slice->m_poc;
 
     scaleMvByPOCDist(outMV, colmv, curPOC, curRefPOC, colPOC, colRefPOC);
     return true;
@@ -2096,7 +2080,7 @@ void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uin
 
 #define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag))
 
-void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const
+void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS])
 {
     // Initialize the coding blocks inside the CTB
     for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--)
@@ -2111,10 +2095,10 @@ void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUS
                 uint32_t depthIdx = g_depthScanIdx[sbY][sbX];
                 uint32_t cuIdx = rangeCUIdx + depthIdx;
                 uint32_t childIdx = rangeCUIdx + sbWidth * sbWidth + (depthIdx << 2);
-                uint32_t px = m_cuPelX + sbX * blockSize;
-                uint32_t py = m_cuPelY + sbY * blockSize;
-                int32_t presentFlag = px < picWidth && py < picHeight;
-                int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > picWidth || py + blockSize > picHeight);
+                uint32_t px = sbX * blockSize;
+                uint32_t py = sbY * blockSize;
+                int32_t presentFlag = px < ctuWidth && py < ctuHeight;
+                int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > ctuWidth || py + blockSize > ctuHeight);
                 
                 /* Offset of the luma CU in the X, Y direction in terms of pixels from the CTU origin */
                 uint32_t xOffset = (sbX * blockSize) >> 3;
diff --git a/source/common/cudata.h b/source/common/cudata.h
index 7f735d6..e5d88cf 100644
--- a/source/common/cudata.h
+++ b/source/common/cudata.h
@@ -46,14 +46,15 @@ enum PartSize
     SIZE_2NxnD, // asymmetric motion partition, 2Nx(3N/2) + 2Nx( N/2)
     SIZE_nLx2N, // asymmetric motion partition, ( N/2)x2N + (3N/2)x2N
     SIZE_nRx2N, // asymmetric motion partition, (3N/2)x2N + ( N/2)x2N
-    SIZE_NONE = 15
+    NUM_SIZES
 };
 
 enum PredMode
 {
-    MODE_INTER,
-    MODE_INTRA,
-    MODE_NONE = 15
+    MODE_NONE  = 0,
+    MODE_INTER = (1 << 0),
+    MODE_INTRA = (1 << 1),
+    MODE_SKIP  = (1 << 2) | MODE_INTER
 };
 
 // motion vector predictor direction used in AMVP
@@ -126,15 +127,14 @@ public:
     int           m_vChromaShift;
 
     /* Per-part data, stored contiguously */
-    char*         m_qp;               // array of QP values
+    int8_t*       m_qp;               // array of QP values
     uint8_t*      m_log2CUSize;       // array of cu log2Size TODO: seems redundant to depth
-    uint8_t*      m_partSize;         // array of partition sizes
-    uint8_t*      m_predMode;         // array of prediction modes
     uint8_t*      m_lumaIntraDir;     // array of intra directions (luma)
     uint8_t*      m_tqBypass;         // array of CU lossless flags
-    char*         m_refIdx[2];        // array of motion reference indices per list
+    int8_t*       m_refIdx[2];        // array of motion reference indices per list
     uint8_t*      m_cuDepth;          // array of depths
-    uint8_t*      m_skipFlag;         // array of skip flags
+    uint8_t*      m_predMode;         // array of prediction modes
+    uint8_t*      m_partSize;         // array of partition sizes
     uint8_t*      m_mergeFlag;        // array of merge flags
     uint8_t*      m_interDir;         // array of inter directions
     uint8_t*      m_mvpIdx[2];        // array of motion vector predictor candidates or merge candidate indices [0]
@@ -142,7 +142,7 @@ public:
     uint8_t*      m_transformSkip[3]; // array of transform skipping flags per plane
     uint8_t*      m_cbf[3];           // array of coded block flags (CBF) per plane
     uint8_t*      m_chromaIntraDir;   // array of intra directions (chroma)
-    enum { BytesPerPartition = 22 };  // combined sizeof() of all per-part data
+    enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
 
     coeff_t*      m_trCoeff[3];       // transformed coefficient buffer per plane
 
@@ -158,7 +158,7 @@ public:
     CUData();
 
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
-    void     calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const;
+    static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
 
     void     initCTU(const Frame& frame, uint32_t cuAddr, int qp);
     void     initSubCU(const CUData& ctu, const CUGeom& cuGeom);
@@ -173,12 +173,11 @@ public:
     void     updatePic(uint32_t depth) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
-    void     setSkipFlagSubParts(uint8_t skipFlag) { m_partSet(m_skipFlag, skipFlag); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
     void     clearCbf()                            { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
 
     /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
-    void     setQPSubParts(char qp, uint32_t absPartIdx, uint32_t depth)                      { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
+    void     setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth)                    { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
     void     setTUDepthSubParts(uint8_t tuDepth, uint32_t absPartIdx, uint32_t depth)         { s_partSet[depth](m_tuDepth + absPartIdx, tuDepth); }
     void     setLumaIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth)        { s_partSet[depth](m_lumaIntraDir + absPartIdx, dir); }
     void     setChromIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth)       { s_partSet[depth](m_chromaIntraDir + absPartIdx, dir); }
@@ -187,15 +186,15 @@ public:
     void     setTransformSkipSubParts(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_transformSkip[ttype] + absPartIdx, tskip); }
     void     setTransformSkipPartRange(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t coveredPartIdxes) { memset(m_transformSkip[ttype] + absPartIdx, tskip, coveredPartIdxes); }
 
-    bool     setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth);
+    bool     setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth);
 
     void     setPUInterDir(uint8_t dir, uint32_t absPartIdx, uint32_t puIdx);
     void     setPUMv(int list, const MV& mv, int absPartIdx, int puIdx);
-    void     setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx);
+    void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
 
-    uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t trDepth) const { return (m_cbf[ttype][absPartIdx] >> trDepth) & 0x1; }
+    uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
     uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
-    char     getRefQP(uint32_t currAbsIdxInCTU) const;
+    int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const;
     void     clipMv(MV& outMV) const;
     int      fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const;
@@ -204,7 +203,8 @@ public:
 
     uint32_t getNumPartInter() const              { return nbPartsTable[(int)m_partSize[0]]; }
     bool     isIntra(uint32_t absPartIdx) const   { return m_predMode[absPartIdx] == MODE_INTRA; }
-    bool     isSkipped(uint32_t absPartIdx) const { return !!m_skipFlag[absPartIdx]; }
+    bool     isInter(uint32_t absPartIdx) const   { return !!(m_predMode[absPartIdx] & MODE_INTER); }
+    bool     isSkipped(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_SKIP; }
     bool     isBipredRestriction() const          { return m_log2CUSize[0] == 3 && m_partSize[0] != SIZE_2Nx2N; }
 
     void     getPartIndexAndSize(uint32_t puIdx, uint32_t& absPartIdx, int& puWidth, int& puHeight) const;
@@ -221,7 +221,7 @@ public:
     void     getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const;
 
     const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const;
-    const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary = false) const;
+    const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const;
     const CUData* getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx) const;
     const CUData* getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx) const;
     const CUData* getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx) const;
@@ -237,7 +237,7 @@ protected:
     template<typename T>
     void setAllPU(T *p, const T& val, int absPartIdx, int puIdx);
 
-    char getLastCodedQP(uint32_t absPartIdx) const;
+    int8_t getLastCodedQP(uint32_t absPartIdx) const;
     int  getLastValidPartIdx(int absPartIdx) const;
 
     bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const;
diff --git a/source/common/dct.cpp b/source/common/dct.cpp
index 714006e..09cf829 100644
--- a/source/common/dct.cpp
+++ b/source/common/dct.cpp
@@ -41,7 +41,7 @@ namespace {
 
 // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
 // give identical results
-void fastForwardDst(int16_t *block, int16_t *coeff, int shift)  // input block, output coeff
+void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
 {
     int c[4];
     int rnd_factor = 1 << (shift - 1);
@@ -61,7 +61,7 @@ void fastForwardDst(int16_t *block, int16_t *coeff, int shift)  // input block,
     }
 }
 
-void inversedst(int16_t *tmp, int16_t *block, int shift)  // input tmp, output block
+void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
 {
     int i, c[4];
     int rnd_factor = 1 << (shift - 1);
@@ -81,7 +81,7 @@ void inversedst(int16_t *tmp, int16_t *block, int shift)  // input tmp, output b
     }
 }
 
-void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[8], O[8];
@@ -134,7 +134,7 @@ void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[16], O[16];
@@ -203,7 +203,7 @@ void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[4], O[4];
@@ -240,7 +240,7 @@ void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j;
     int E[2], O[2];
@@ -265,7 +265,7 @@ void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[4], O[4];
@@ -301,7 +301,7 @@ void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[8], O[8];
@@ -352,7 +352,7 @@ void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[16], O[16];
@@ -416,7 +416,7 @@ void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j;
     int E[2], O[2];
@@ -440,7 +440,7 @@ void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
     }
 }
 
-void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 1 + X265_DEPTH - 8;
     const int shift_2nd = 8;
@@ -450,25 +450,14 @@ void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
     }
 
     fastForwardDst(block, coef, shift_1st);
-    fastForwardDst(coef, block, shift_2nd);
-
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    fastForwardDst(coef, dst, shift_2nd);
 }
 
-void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 1 + X265_DEPTH - 8;
     const int shift_2nd = 8;
@@ -478,24 +467,14 @@ void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
     }
 
     partialButterfly4(block, coef, shift_1st, 4);
-    partialButterfly4(coef, block, shift_2nd, 4);
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly4(coef, dst, shift_2nd, 4);
 }
 
-void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 2 + X265_DEPTH - 8;
     const int shift_2nd = 9;
@@ -505,25 +484,14 @@ void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
 
     for (int i = 0; i < 8; i++)
     {
-        memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
+        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
     }
 
     partialButterfly8(block, coef, shift_1st, 8);
-    partialButterfly8(coef, block, shift_2nd, 8);
-
-#define N (8)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly8(coef, dst, shift_2nd, 8);
 }
 
-void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 3 + X265_DEPTH - 8;
     const int shift_2nd = 10;
@@ -533,25 +501,14 @@ void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
 
     for (int i = 0; i < 16; i++)
     {
-        memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
+        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
     }
 
     partialButterfly16(block, coef, shift_1st, 16);
-    partialButterfly16(coef, block, shift_2nd, 16);
-
-#define N (16)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly16(coef, dst, shift_2nd, 16);
 }
 
-void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 4 + X265_DEPTH - 8;
     const int shift_2nd = 11;
@@ -561,25 +518,14 @@ void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
 
     for (int i = 0; i < 32; i++)
     {
-        memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
+        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
     }
 
     partialButterfly32(block, coef, shift_1st, 32);
-    partialButterfly32(coef, block, shift_2nd, 32);
-
-#define N (32)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly32(coef, dst, shift_2nd, 32);
 }
 
-void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -587,27 +533,16 @@ void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
     ALIGN_VAR_32(int16_t, coef[4 * 4]);
     ALIGN_VAR_32(int16_t, block[4 * 4]);
 
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
+    inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
     inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
     }
 }
 
-void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -615,27 +550,16 @@ void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
     ALIGN_VAR_32(int16_t, coef[4 * 4]);
     ALIGN_VAR_32(int16_t, block[4 * 4]);
 
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
+    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
     partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
     }
 }
 
-void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -643,26 +567,16 @@ void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
     ALIGN_VAR_32(int16_t, coef[8 * 8]);
     ALIGN_VAR_32(int16_t, block[8 * 8]);
 
-#define N (8)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse8(block, coef, shift_1st, 8);
+    partialButterflyInverse8(src, coef, shift_1st, 8);
     partialButterflyInverse8(coef, block, shift_2nd, 8);
+
     for (int i = 0; i < 8; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
     }
 }
 
-void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -670,26 +584,16 @@ void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
     ALIGN_VAR_32(int16_t, coef[16 * 16]);
     ALIGN_VAR_32(int16_t, block[16 * 16]);
 
-#define N (16)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse16(block, coef, shift_1st, 16);
+    partialButterflyInverse16(src, coef, shift_1st, 16);
     partialButterflyInverse16(coef, block, shift_2nd, 16);
+
     for (int i = 0; i < 16; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
     }
 }
 
-void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -697,27 +601,16 @@ void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
     ALIGN_VAR_32(int16_t, coef[32 * 32]);
     ALIGN_VAR_32(int16_t, block[32 * 32]);
 
-#define N (32)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse32(block, coef, shift_1st, 32);
+    partialButterflyInverse32(src, coef, shift_1st, 32);
     partialButterflyInverse32(coef, block, shift_2nd, 32);
 
     for (int i = 0; i < 32; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
     }
 }
 
-void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
 {
 #if HIGH_BIT_DEPTH
     X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
@@ -737,11 +630,11 @@ void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scal
     for (int n = 0; n < num; n++)
     {
         coeffQ = (quantCoef[n] * scale + add) >> shift;
-        coef[n] = Clip3(-32768, 32767, coeffQ);
+        coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
     }
 }
 
-void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
 {
     X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
 
@@ -756,7 +649,7 @@ void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int
         for (int n = 0; n < num; n++)
         {
             coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
-            coef[n] = Clip3(-32768, 32767, coeffQ);
+            coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
         }
     }
     else
@@ -764,12 +657,12 @@ void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int
         for (int n = 0; n < num; n++)
         {
             coeffQ   = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
-            coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
+            coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ << (per - shift));
         }
     }
 }
 
-uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
 {
     X265_CHECK(qBits >= 8, "qBits less than 8\n");
     X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
@@ -793,7 +686,7 @@ uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* q
     return numSig;
 }
 
-uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
 {
     X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
     X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
@@ -817,7 +710,7 @@ uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits,
     return numSig;
 }
 
-int  count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
+int  count_nonzero_c(const int16_t* quantCoeff, int numCoeff)
 {
     X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
     X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
@@ -833,22 +726,22 @@ int  count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
 }
 
 template<int trSize>
-uint32_t copy_count(int16_t* coeff, int16_t* residual, intptr_t stride)
+uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
 {
     uint32_t numSig = 0;
     for (int k = 0; k < trSize; k++)
     {
         for (int j = 0; j < trSize; j++)
         {
-            coeff[k * trSize + j] = residual[k * stride + j];
-            numSig += (residual[k * stride + j] != 0);
+            coeff[k * trSize + j] = residual[k * resiStride + j];
+            numSig += (residual[k * resiStride + j] != 0);
         }
     }
 
     return numSig;
 }
 
-void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
+void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
 {
     for (int i = 0; i < numCoeff; i++)
     {
@@ -857,7 +750,7 @@ void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numC
         level = (level + sign) ^ sign;
         resSum[i] += level;
         level -= offset[i];
-        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
+        dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign);
     }
 }
 
diff --git a/source/common/deblock.cpp b/source/common/deblock.cpp
index c9a2731..7369787 100644
--- a/source/common/deblock.cpp
+++ b/source/common/deblock.cpp
@@ -33,20 +33,44 @@ using namespace x265;
 #define DEBLOCK_SMALLEST_BLOCK  8
 #define DEFAULT_INTRA_TC_OFFSET 2
 
-void Deblock::deblockCTU(CUData* cu, int32_t dir)
+void Deblock::deblockCTU(const CUData* ctu, int32_t dir)
 {
-    uint8_t blockingStrength[MAX_NUM_PARTITIONS];
+    uint8_t blockStrength[MAX_NUM_PARTITIONS];
 
-    memset(blockingStrength, 0, sizeof(uint8_t) * m_numPartitions);
+    memset(blockStrength, 0, sizeof(uint8_t) * m_numPartitions);
 
-    deblockCU(cu, 0, 0, dir, blockingStrength);
+    deblockCU(ctu, 0, 0, dir, blockStrength);
+}
+
+static inline uint8_t bsCuEdge(const CUData* cu, uint32_t absPartIdx, int32_t dir)
+{
+    if (dir == Deblock::EDGE_VER)
+    {
+        if (cu->m_cuPelX + g_zscanToPelX[absPartIdx] > 0)
+        {
+            uint32_t    tempPartIdx;
+            const CUData* tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
+            return tempCU ? 2 : 0;
+        }
+    }
+    else
+    {
+        if (cu->m_cuPelY + g_zscanToPelY[absPartIdx] > 0)
+        {
+            uint32_t    tempPartIdx;
+            const CUData* tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
+            return tempCU ? 2 : 0;
+        }
+    }
+
+    return 0;
 }
 
 /* Deblocking filter process in CU-based (the same function as conventional's)
  * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */
-void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockingStrength[])
+void Deblock::deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[])
 {
-    if (cu->m_partSize[absPartIdx] == SIZE_NONE)
+    if (cu->m_predMode[absPartIdx] == MODE_NONE)
         return;
 
     uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
@@ -60,23 +84,21 @@ void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const i
         uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY;
         for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
             if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
-                deblockCU(cu, absPartIdx, depth + 1, dir, blockingStrength);
+                deblockCU(cu, absPartIdx, depth + 1, dir, blockStrength);
         return;
     }
 
-    const uint32_t widthInBaseUnits = sps.numPartInCUSize >> depth;
-    Param params;
-    setLoopfilterParam(cu, absPartIdx, &params);
-    setEdgefilterPU(cu, absPartIdx, dir, blockingStrength, widthInBaseUnits);
-    setEdgefilterTU(cu, absPartIdx, depth, dir, blockingStrength);
-    setEdgefilterMultiple(cu, absPartIdx, dir, 0, (dir == EDGE_VER ? params.leftEdge : params.topEdge), blockingStrength, widthInBaseUnits);
+    const uint32_t numUnits  = sps.numPartInCUSize >> depth;
+    setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits);
+    setEdgefilterTU(cu, absPartIdx, depth, dir, blockStrength);
+    setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
 
     for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++)
     {
         uint32_t bsCheck = !(partIdx & (1 << dir));
 
-        if (bsCheck && blockingStrength[partIdx])
-            getBoundaryStrengthSingle(cu, dir, partIdx, blockingStrength);
+        if (bsCheck && blockStrength[partIdx])
+            blockStrength[partIdx] = getBoundaryStrength(cu, dir, partIdx, blockStrength);
     }
 
     const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE;
@@ -87,34 +109,33 @@ void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const i
         
     for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
     {
-        edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockingStrength);
+        edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
         if (!((e0 + e) & chromaMask))
-            edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockingStrength);
+            edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
     }
 }
 
-static inline uint32_t calcBsIdx(CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
+static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
 {
-    uint32_t ctuWidthInBaseUnits = cu->m_slice->m_sps->numPartInCUSize;
+    uint32_t numPartInCUSize = cu->m_slice->m_sps->numPartInCUSize;
 
     if (dir)
-        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * ctuWidthInBaseUnits + baseUnitIdx];
+        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numPartInCUSize + baseUnitIdx];
     else
-        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * ctuWidthInBaseUnits + edgeIdx];
+        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numPartInCUSize + edgeIdx];
 }
 
-void Deblock::setEdgefilterMultiple(CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
+void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
 {
-    const uint32_t numElem = widthInBaseUnits;
-    X265_CHECK(numElem > 0, "numElem edge filter check\n");
-    for (uint32_t i = 0; i < numElem; i++)
+    X265_CHECK(numUnits > 0, "numUnits edge filter check\n");
+    for (uint32_t i = 0; i < numUnits; i++)
     {
         const uint32_t bsidx = calcBsIdx(cu, scanIdx, dir, edgeIdx, i);
-        blockingStrength[bsidx] = value;
+        blockStrength[bsidx] = value;
     }
 }
 
-void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[])
+void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[])
 {
     if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth)
     {
@@ -122,47 +143,47 @@ void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, i
         const uint32_t qNumParts   = curNumParts >> 2;
 
         for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
-            setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockingStrength);
+            setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockStrength);
         return;
     }
 
-    uint32_t widthInBaseUnits  = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
-    setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockingStrength, widthInBaseUnits);
+    uint32_t numUnits  = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
+    setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits);
 }
 
-void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
+void Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits)
 {
-    const uint32_t hWidthInBaseUnits = widthInBaseUnits >> 1;
-    const uint32_t qWidthInBaseUnits = widthInBaseUnits >> 2;
+    const uint32_t hNumUnits = numUnits >> 1;
+    const uint32_t qNumUnits = numUnits >> 2;
 
     switch (cu->m_partSize[absPartIdx])
     {
     case SIZE_2NxN:
         if (EDGE_HOR == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+            setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_Nx2N:
         if (EDGE_VER == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+            setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_NxN:
-        setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+        setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_2NxnU:
         if (EDGE_HOR == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+            setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_nLx2N:
         if (EDGE_VER == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+            setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_2NxnD:
         if (EDGE_HOR == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+            setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_nRx2N:
         if (EDGE_VER == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+            setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
         break;
 
     case SIZE_2Nx2N:
@@ -171,151 +192,65 @@ void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint
     }
 }
 
-void Deblock::setLoopfilterParam(CUData* cu, uint32_t absPartIdx, Param *params)
+uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[])
 {
-    uint32_t x = cu->m_cuPelX + g_zscanToPelX[absPartIdx];
-    uint32_t y = cu->m_cuPelY + g_zscanToPelY[absPartIdx];
-
-    const CUData* tempCU;
-    uint32_t    tempPartIdx;
+    // Calculate block index
+    uint32_t partP;
+    const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
 
-    if (!x)
-        params->leftEdge = 0;
-    else
-    {
-        tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
-        if (tempCU)
-            params->leftEdge = 2;
-        else
-            params->leftEdge = 0;
-    }
+    // Set BS for Intra MB : BS = 2
+    if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
+        return 2;
 
-    if (!y)
-        params->topEdge = 0;
-    else
-    {
-        tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
-        if (tempCU)
-            params->topEdge = 2;
-        else
-            params->topEdge = 0;
-    }
-}
+    // Set BS for not Intra MB : BS = 1 or 0
+    if (blockStrength[partQ] > 1 &&
+        (cuQ->getCbf(partQ, TEXT_LUMA, cuQ->m_tuDepth[partQ]) ||
+         cuP->getCbf(partP, TEXT_LUMA, cuP->m_tuDepth[partP])))
+        return 1;
 
-void Deblock::getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t absPartIdx, uint8_t blockingStrength[])
-{
-    const Slice* const slice = cu->m_slice;
-    const uint32_t partQ = absPartIdx;
-    CUData* const cuQ = cu;
+    static const MV zeroMv(0, 0);
+    const Slice* const sliceQ = cuQ->m_slice;
+    const Slice* const sliceP = cuP->m_slice;
 
-    uint32_t partP;
-    const CUData* cuP;
-    uint8_t bs = 0;
+    const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
+    const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
+    const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
+    const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
 
-    // Calculate block index
-    if (dir == EDGE_VER)
-        cuP = cuQ->getPULeft(partP, partQ);
-    else // (dir == EDGE_HOR)
-        cuP = cuQ->getPUAbove(partP, partQ);
+    if (sliceQ->isInterP() && sliceP->isInterP())
+    {
+        return ((refP0 != refQ0) ||
+                (abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4)) ? 1 : 0;
+    }
 
-    // Set BS for Intra MB : BS = 4 or 3
-    if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
-        bs = 2;
+    // (sliceQ->isInterB() || sliceP->isInterB())
+    const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
+    const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
+    const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
+    const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
 
-    // Set BS for not Intra MB : BS = 2 or 1 or 0
-    if (!cuP->isIntra(partP) && !cuQ->isIntra(partQ))
+    if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
     {
-        uint32_t nsPartQ = partQ;
-        uint32_t nsPartP = partP;
-
-        if (blockingStrength[absPartIdx] > 1 &&
-            (cuQ->getCbf(nsPartQ, TEXT_LUMA, cuQ->m_tuDepth[nsPartQ]) ||
-             cuP->getCbf(nsPartP, TEXT_LUMA, cuP->m_tuDepth[nsPartP])))
-            bs = 1;
-        else
+        if (refP0 != refP1) // Different L0 & L1
         {
-            if (dir == EDGE_HOR)
-                cuP = cuQ->getPUAbove(partP, partQ);
-
-            if (slice->isInterB() || cuP->m_slice->isInterB())
-            {
-                int32_t refIdx;
-                Frame *refP0, *refP1, *refQ0, *refQ1;
-                refIdx = cuP->m_refIdx[0][partP];
-                refP0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
-                refIdx = cuP->m_refIdx[1][partP];
-                refP1 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[1][refIdx];
-                refIdx = cuQ->m_refIdx[0][partQ];
-                refQ0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
-                refIdx = cuQ->m_refIdx[1][partQ];
-                refQ1 = (refIdx < 0) ? NULL : slice->m_refPicList[1][refIdx];
-
-                MV mvp0 = cuP->m_mv[0][partP];
-                MV mvp1 = cuP->m_mv[1][partP];
-                MV mvq0 = cuQ->m_mv[0][partQ];
-                MV mvq1 = cuQ->m_mv[1][partQ];
-
-                if (!refP0) mvp0 = 0;
-                if (!refP1) mvp1 = 0;
-                if (!refQ0) mvq0 = 0;
-                if (!refQ1) mvq1 = 0;
-
-                if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
-                {
-                    if (refP0 != refP1) // Different L0 & L1
-                    {
-                        if (refP0 == refQ0)
-                        {
-                            bs  = ((abs(mvq0.x - mvp0.x) >= 4) ||
-                                   (abs(mvq0.y - mvp0.y) >= 4) ||
-                                   (abs(mvq1.x - mvp1.x) >= 4) ||
-                                   (abs(mvq1.y - mvp1.y) >= 4)) ? 1 : 0;
-                        }
-                        else
-                        {
-                            bs  = ((abs(mvq1.x - mvp0.x) >= 4) ||
-                                   (abs(mvq1.y - mvp0.y) >= 4) ||
-                                   (abs(mvq0.x - mvp1.x) >= 4) ||
-                                   (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
-                        }
-                    }
-                    else // Same L0 & L1
-                    {
-                        bs  = ((abs(mvq0.x - mvp0.x) >= 4) ||
-                               (abs(mvq0.y - mvp0.y) >= 4) ||
-                               (abs(mvq1.x - mvp1.x) >= 4) ||
-                               (abs(mvq1.y - mvp1.y) >= 4)) &&
-                              ((abs(mvq1.x - mvp0.x) >= 4) ||
-                               (abs(mvq1.y - mvp0.y) >= 4) ||
-                               (abs(mvq0.x - mvp1.x) >= 4) ||
-                               (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
-                    }
-                }
-                else // for all different Ref_Idx
-                    bs = 1;
-            }
-            else // slice->isInterP()
-            {
-                int32_t refIdx;
-                Frame *refp0, *refq0;
-                refIdx = cuP->m_refIdx[0][partP];
-                refp0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
-                refIdx = cuQ->m_refIdx[0][partQ];
-                refq0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
-                MV mvp0 = cuP->m_mv[0][partP];
-                MV mvq0 = cuQ->m_mv[0][partQ];
-
-                if (!refp0) mvp0 = 0;
-                if (!refq0) mvq0 = 0;
-
-                bs = ((refp0 != refq0) ||
-                      (abs(mvq0.x - mvp0.x) >= 4) ||
-                      (abs(mvq0.y - mvp0.y) >= 4)) ? 1 : 0;
-            }
+            if (refP0 == refQ0)
+                return ((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) ||
+                        (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) ? 1 : 0;
+            else
+                return ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) ||
+                        (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4)) ? 1 : 0;
+        }
+        else // Same L0 & L1
+        {
+            return (((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) ||
+                     (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) &&
+                    ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) ||
+                     (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4))) ? 1 : 0;
         }
     }
-
-    blockingStrength[absPartIdx] = bs;
+        
+    // for all different Ref_Idx
+    return 1;
 }
 
 static inline int32_t calcDP(pixel* src, intptr_t offset)
@@ -340,46 +275,45 @@ static inline bool useStrongFiltering(intptr_t offset, int32_t beta, int32_t tc,
 }
 
 /* Deblocking for the luminance component with strong or weak filter
- * \param src            pointer to picture data
- * \param offset         offset value for picture data
- * \param tc             tc value
- * \param partPNoFilter  indicator to disable filtering on partP
- * \param partQNoFilter  indicator to disable filtering on partQ
- * \param filterSecondP  decision weak filter/no filter for partP
- * \param filterSecondQ  decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
+ * \param src     pointer to picture data
+ * \param offset  offset value for picture data
+ * \param tc      tc value
+ * \param maskP   indicator to enable filtering on partP
+ * \param maskQ   indicator to enable filtering on partQ
+ * \param maskP1  decision weak filter/no filter for partP
+ * \param maskQ1  decision weak filter/no filter for partQ */
+static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
 {
+    int32_t tc2 = 2 * tc;
+    int32_t tcP = (tc2 & maskP);
+    int32_t tcQ = (tc2 & maskQ);
     for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
     {
         int16_t m4  = (int16_t)src[0];
         int16_t m3  = (int16_t)src[-offset];
         int16_t m5  = (int16_t)src[offset];
         int16_t m2  = (int16_t)src[-offset * 2];
-        int32_t tc2 = 2 * tc;
-        if (!partPNoFilter)
-        {
-            int16_t m1  = (int16_t)src[-offset * 3];
-            int16_t m0  = (int16_t)src[-offset * 4];
-            src[-offset * 3] = (pixel)(Clip3(-tc2, tc2, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-            src[-offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
-            src[-offset]     = (pixel)(Clip3(-tc2, tc2, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-        }
-        if (!partQNoFilter)
-        {
-            int16_t m6  = (int16_t)src[offset * 2];
-            int16_t m7  = (int16_t)src[offset * 3];
-            src[0]           = (pixel)(Clip3(-tc2, tc2, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-            src[offset]      = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
-            src[offset * 2]  = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-        }
+        int16_t m6  = (int16_t)src[offset * 2];
+        int16_t m1  = (int16_t)src[-offset * 3];
+        int16_t m7  = (int16_t)src[offset * 3];
+        int16_t m0  = (int16_t)src[-offset * 4];
+        src[-offset * 3] = (pixel)(Clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+        src[-offset * 2] = (pixel)(Clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
+        src[-offset]     = (pixel)(Clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+        src[0]           = (pixel)(Clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+        src[offset]      = (pixel)(Clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+        src[offset * 2]  = (pixel)(Clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
     }
 }
 
 /* Weak filter */
-static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter,
-                                 bool filterSecondP, bool filterSecondQ)
+static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
+                                 int32_t maskP1, int32_t maskQ1)
 {
     int32_t thrCut = tc * 10;
+    int32_t tc2 = tc >> 1;
+    maskP1 &= maskP;
+    maskQ1 &= maskQ;
 
     for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
     {
@@ -394,38 +328,31 @@ static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset,
         {
             delta = Clip3(-tc, tc, delta);
 
-            int32_t tc2 = tc >> 1;
-            if (!partPNoFilter)
+            src[-offset] = Clip(m3 + (delta & maskP));
+            src[0] = Clip(m4 - (delta & maskQ));
+            if (maskP1)
             {
-                src[-offset] = Clip(m3 + delta);
-                if (filterSecondP)
-                {
-                    int16_t m1  = (int16_t)src[-offset * 3];
-                    int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
-                    src[-offset * 2] = Clip(m2 + delta1);
-                }
+                int16_t m1  = (int16_t)src[-offset * 3];
+                int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
+                src[-offset * 2] = Clip(m2 + delta1);
             }
-            if (!partQNoFilter)
+            if (maskQ1)
             {
-                src[0] = Clip(m4 - delta);
-                if (filterSecondQ)
-                {
-                    int16_t m6  = (int16_t)src[offset * 2];
-                    int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
-                    src[offset] = Clip(m5 + delta2);
-                }
+                int16_t m6  = (int16_t)src[offset * 2];
+                int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
+                src[offset] = Clip(m5 + delta2);
             }
         }
     }
 }
 
 /* Deblocking of one line/column for the chrominance component
- * \param src            pointer to picture data
- * \param offset         offset value for picture data
- * \param tc             tc value
- * \param partPNoFilter  indicator to disable filtering on partP
- * \param partQNoFilter  indicator to disable filtering on partQ */
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
+ * \param src     pointer to picture data
+ * \param offset  offset value for picture data
+ * \param tc      tc value
+ * \param maskP   indicator to disable filtering on partP
+ * \param maskQ   indicator to disable filtering on partQ */
+static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
 {
     for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
     {
@@ -435,31 +362,25 @@ static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset
         int16_t m2  = (int16_t)src[-offset * 2];
 
         int32_t delta = Clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
-        if (!partPNoFilter)
-            src[-offset] = Clip(m3 + delta);
-        if (!partQNoFilter)
-            src[0] = Clip(m4 - delta);
+        src[-offset] = Clip(m3 + (delta & maskP));
+        src[0] = Clip(m4 - (delta & maskQ));
     }
 }
 
-void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
+void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
 {
-    PicYuv* reconYuv = cu->m_encData->m_reconPicYuv;
-    pixel* src = reconYuv->getLumaAddr(cu->m_cuAddr, absPartIdx);
-
-    intptr_t stride = reconYuv->m_stride;
-    uint32_t numParts = cu->m_slice->m_sps->numPartInCUSize >> depth;
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
+    pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
+    intptr_t stride = reconPic->m_stride;
+    const PPS* pps = cuQ->m_slice->m_pps;
 
     intptr_t offset, srcStep;
 
-    bool  partPNoFilter = false;
-    bool  partQNoFilter = false;
-    uint32_t  partP = 0;
-    uint32_t  partQ = 0;
-    const CUData* cuP = cu;
-    const CUData* cuQ = cu;
-    int32_t betaOffset = cuQ->m_slice->m_pps->deblockingFilterBetaOffsetDiv2 << 1;
-    int32_t tcOffset = cuQ->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;
+    int32_t maskP = -1;
+    int32_t maskQ = -1;
+    int32_t betaOffset = pps->deblockingFilterBetaOffsetDiv2 << 1;
+    int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1;
+    bool bCheckNoFilter = pps->bTransquantBypassEnabled;
 
     if (dir == EDGE_VER)
     {
@@ -474,106 +395,103 @@ void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, in
         src += (edge << LOG2_UNIT_SIZE) * stride;
     }
 
-    for (uint32_t idx = 0; idx < numParts; idx++)
+    uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> depth;
+    for (uint32_t idx = 0; idx < numUnits; idx++)
     {
-        uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
-        uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx);
-        uint32_t bs = blockingStrength[bsAbsIdx];
-        if (bs)
-        {
-            int32_t qpQ = cu->m_qp[bsAbsIdx];
-            partQ = bsAbsIdx;
+        uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx);
+        uint32_t bs = blockStrength[partQ];
 
-            // Derive neighboring PU index
-            if (dir == EDGE_VER)
-                cuP = cuQ->getPULeft(partP, partQ);
-            else // (dir == EDGE_HOR)
-                cuP = cuQ->getPUAbove(partP, partQ);
+        if (!bs)
+            continue;
 
-            int32_t qpP = cuP->m_qp[partP];
-            int32_t qp = (qpP + qpQ + 1) >> 1;
+        int32_t qpQ = cuQ->m_qp[partQ];
 
-            int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
+        // Derive neighboring PU index
+        uint32_t partP;
+        const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
 
-            const int32_t bitdepthShift = X265_DEPTH - 8;
-            int32_t beta = s_betaTable[indexB] << bitdepthShift;
+        int32_t qpP = cuP->m_qp[partP];
+        int32_t qp = (qpP + qpQ + 1) >> 1;
 
-            int32_t dp0 = calcDP(src + srcStep * (unitOffset + 0), offset);
-            int32_t dq0 = calcDQ(src + srcStep * (unitOffset + 0), offset);
-            int32_t dp3 = calcDP(src + srcStep * (unitOffset + 3), offset);
-            int32_t dq3 = calcDQ(src + srcStep * (unitOffset + 3), offset);
-            int32_t d0 = dp0 + dq0;
-            int32_t d3 = dp3 + dq3;
+        int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
 
-            int32_t d =  d0 + d3;
+        const int32_t bitdepthShift = X265_DEPTH - 8;
+        int32_t beta = s_betaTable[indexB] << bitdepthShift;
 
-            if (d < beta)
-            {
-                if (cu->m_slice->m_pps->bTransquantBypassEnabled)
-                {
-                    // check if each of PUs is lossless coded
-                    partPNoFilter = !!cuP->m_tqBypass[partP];
-                    partQNoFilter = !!cuQ->m_tqBypass[partQ];
-                }
-
-                int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
-                int32_t tc = s_tcTable[indexTC] << bitdepthShift;
-
-                bool sw = (2 * d0 < (beta >> 2) &&
-                           2 * d3 < (beta >> 2) &&
-                           useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 0)) &&
-                           useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 3)));
-
-                if (sw)
-                    pelFilterLumaStrong(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
-                else
-                {
-                    int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
-                    int32_t dp = dp0 + dp3;
-                    int32_t dq = dq0 + dq3;
-                    bool filterP = (dp < sideThreshold);
-                    bool filterQ = (dq < sideThreshold);
-
-                    pelFilterLuma(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter, filterP, filterQ);
-                }
-            }
+        intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE;
+        int32_t dp0 = calcDP(src + unitOffset              , offset);
+        int32_t dq0 = calcDQ(src + unitOffset              , offset);
+        int32_t dp3 = calcDP(src + unitOffset + srcStep * 3, offset);
+        int32_t dq3 = calcDQ(src + unitOffset + srcStep * 3, offset);
+        int32_t d0 = dp0 + dq0;
+        int32_t d3 = dp3 + dq3;
+
+        int32_t d =  d0 + d3;
+
+        if (d >= beta)
+            continue;
+
+        if (bCheckNoFilter)
+        {
+            // check if each of PUs is lossless coded
+            maskP = (cuP->m_tqBypass[partP] ? 0 : -1);
+            maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1);
+        }
+
+        int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
+        int32_t tc = s_tcTable[indexTC] << bitdepthShift;
+
+        bool sw = (2 * d0 < (beta >> 2) &&
+                   2 * d3 < (beta >> 2) &&
+                   useStrongFiltering(offset, beta, tc, src + unitOffset              ) &&
+                   useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
+
+        if (sw)
+            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+        else
+        {
+            int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
+            int32_t dp = dp0 + dp3;
+            int32_t dq = dq0 + dq3;
+            int32_t maskP1 = (dp < sideThreshold ? -1 : 0);
+            int32_t maskQ1 = (dq < sideThreshold ? -1 : 0);
+
+            pelFilterLuma(src + unitOffset, srcStep, offset, tc, maskP, maskQ, maskP1, maskQ1);
         }
     }
 }
 
-void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
+void Deblock::edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
 {
-    int32_t chFmt = cu->m_chromaFormat, chromaShift;
+    int32_t chFmt = cuQ->m_chromaFormat, chromaShift;
     intptr_t offset, srcStep;
+    const PPS* pps = cuQ->m_slice->m_pps;
 
-    bool partPNoFilter = false;
-    bool partQNoFilter = false;
-    uint32_t partP;
-    uint32_t partQ;
-    const CUData* cuP;
-    const CUData* cuQ = cu;
-    int32_t tcOffset = cu->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;
+    int32_t maskP = -1;
+    int32_t maskQ = -1;
+    int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1;
 
     X265_CHECK(((dir == EDGE_VER)
-                ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cu->m_hChromaShift)
-                : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cu->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
+                ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_hChromaShift)
+                : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
                "invalid edge\n");
 
-    PicYuv* reconPic = cu->m_encData->m_reconPicYuv;
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
     intptr_t stride = reconPic->m_strideC;
-    intptr_t srcOffset = reconPic->getChromaAddrOffset(cu->m_cuAddr, absPartIdx);
+    intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
+    bool bCheckNoFilter = pps->bTransquantBypassEnabled;
 
     if (dir == EDGE_VER)
     {
-        chromaShift = cu->m_vChromaShift;
-        srcOffset += (edge << (LOG2_UNIT_SIZE - cu->m_hChromaShift));
+        chromaShift = cuQ->m_vChromaShift;
+        srcOffset += (edge << (LOG2_UNIT_SIZE - cuQ->m_hChromaShift));
         offset     = 1;
         srcStep    = stride;
     }
     else // (dir == EDGE_HOR)
     {
-        chromaShift = cu->m_hChromaShift;
-        srcOffset += edge * stride << (LOG2_UNIT_SIZE - cu->m_vChromaShift);
+        chromaShift = cuQ->m_hChromaShift;
+        srcOffset += edge * stride << (LOG2_UNIT_SIZE - cuQ->m_vChromaShift);
         offset     = stride;
         srcStep    = 1;
     }
@@ -582,53 +500,50 @@ void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth,
     srcChroma[0] = reconPic->m_picOrg[1] + srcOffset;
     srcChroma[1] = reconPic->m_picOrg[2] + srcOffset;
 
-    uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
+    uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
 
     for (uint32_t idx = 0; idx < numUnits; idx++)
     {
-        uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
-        uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx << chromaShift);
-        uint32_t bs = blockingStrength[bsAbsIdx];
+        uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift);
+        uint32_t bs = blockStrength[partQ];
 
-        if (bs > 1)
-        {
-            int32_t qpQ = cu->m_qp[bsAbsIdx];
-            partQ = bsAbsIdx;
+        if (bs <= 1)
+            continue;
 
-            // Derive neighboring PU index
-            if (dir == EDGE_VER)
-                cuP = cuQ->getPULeft(partP, partQ);
-            else // (dir == EDGE_HOR)
-                cuP = cuQ->getPUAbove(partP, partQ);
+        int32_t qpQ = cuQ->m_qp[partQ];
 
-            int32_t qpP = cuP->m_qp[partP];
+        // Derive neighboring PU index
+        uint32_t partP;
+        const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
 
-            if (cu->m_slice->m_pps->bTransquantBypassEnabled)
-            {
-                // check if each of PUs is lossless coded
-                partPNoFilter = !!cuP->m_tqBypass[partP];
-                partQNoFilter = !!cuQ->m_tqBypass[partQ];
-            }
+        int32_t qpP = cuP->m_qp[partP];
 
-            for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
+        if (bCheckNoFilter)
+        {
+            // check if each of PUs is lossless coded
+            maskP = (cuP->m_tqBypass[partP] ? 0 : -1);
+            maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1);
+        }
+
+        intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE;
+        for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
+        {
+            int32_t chromaQPOffset  = pps->chromaQpOffset[chromaIdx];
+            int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset;
+            if (qp >= 30)
             {
-                int32_t chromaQPOffset  = !chromaIdx ? cu->m_slice->m_pps->chromaCbQpOffset : cu->m_slice->m_pps->chromaCrQpOffset;
-                int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset;
-                if (qp >= 30)
-                {
-                    if (chFmt == X265_CSP_I420)
-                        qp = g_chromaScale[qp];
-                    else
-                        qp = X265_MIN(qp, 51);
-                }
-
-                int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
-                const int32_t bitdepthShift = X265_DEPTH - 8;
-                int32_t tc = s_tcTable[indexTC] << bitdepthShift;
-                pixel* srcC = srcChroma[chromaIdx];
-
-                pelFilterChroma(srcC + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
+                if (chFmt == X265_CSP_I420)
+                    qp = g_chromaScale[qp];
+                else
+                    qp = X265_MIN(qp, 51);
             }
+
+            int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
+            const int32_t bitdepthShift = X265_DEPTH - 8;
+            int32_t tc = s_tcTable[indexTC] << bitdepthShift;
+            pixel* srcC = srcChroma[chromaIdx];
+
+            pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
         }
     }
 }
diff --git a/source/common/deblock.h b/source/common/deblock.h
index 4bdfeff..f872625 100644
--- a/source/common/deblock.h
+++ b/source/common/deblock.h
@@ -42,31 +42,24 @@ public:
 
     void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); }
 
-    void deblockCTU(CUData* cu, int32_t dir);
+    void deblockCTU(const CUData* ctu, int32_t dir);
 
 protected:
 
     // CU-level deblocking function
-    void deblockCU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, const int32_t Edge, uint8_t blockingStrength[]);
-
-    struct Param
-    {
-        uint8_t leftEdge;
-        uint8_t topEdge;
-    };
+    void deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[]);
 
     // set filtering functions
-    void setLoopfilterParam(CUData* cu, uint32_t absZOrderIdx, Param *params);
-    void setEdgefilterTU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[]);
-    void setEdgefilterPU(CUData* cu, uint32_t absZOrderIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits);
-    void setEdgefilterMultiple(CUData* cu, uint32_t absZOrderIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits);
+    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[]);
+    void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
+    void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
 
     // get filtering functions
-    void getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t partIdx, uint8_t blockingStrength[]);
+    uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
 
     // filter luma/chroma functions
-    void edgeFilterLuma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]);
-    void edgeFilterChroma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]);
+    void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+    void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
 
     static const uint8_t s_tcTable[54];
     static const uint8_t s_betaTable[52];
diff --git a/source/common/frame.cpp b/source/common/frame.cpp
index 8ae912f..9c7abee 100644
--- a/source/common/frame.cpp
+++ b/source/common/frame.cpp
@@ -34,7 +34,7 @@ Frame::Frame()
     m_reconRowCount.set(0);
     m_countRefEncoders = 0;
     m_encData = NULL;
-    m_reconPicYuv = NULL;
+    m_reconPic = NULL;
     m_next = NULL;
     m_prev = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
@@ -42,26 +42,26 @@ Frame::Frame()
 
 bool Frame::create(x265_param *param)
 {
-    m_origPicYuv = new PicYuv;
+    m_fencPic = new PicYuv;
 
-    return m_origPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
-           m_lowres.create(m_origPicYuv, param->bframes, !!param->rc.aqMode);
+    return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
+           m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
 }
 
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 {
     m_encData = new FrameData;
-    m_reconPicYuv = new PicYuv;
-    m_encData->m_reconPicYuv = m_reconPicYuv;
-    bool ok = m_encData->create(param, sps) && m_reconPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+    m_reconPic = new PicYuv;
+    m_encData->m_reconPic = m_reconPic;
+    bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
     {
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
          * end of the picture accessing uninitialized pixels */
         int maxHeight = sps.numCuInHeight * g_maxCUSize;
-        memset(m_reconPicYuv->m_picOrg[0], 0, m_reconPicYuv->m_stride * maxHeight);
-        memset(m_reconPicYuv->m_picOrg[1], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift));
-        memset(m_reconPicYuv->m_picOrg[2], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift));
+        memset(m_reconPic->m_picOrg[0], 0, m_reconPic->m_stride * maxHeight);
+        memset(m_reconPic->m_picOrg[1], 0, m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+        memset(m_reconPic->m_picOrg[2], 0, m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
     }
     return ok;
 }
@@ -70,7 +70,7 @@ bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 void Frame::reinit(const SPS& sps)
 {
     m_bChromaExtended = false;
-    m_reconPicYuv = m_encData->m_reconPicYuv;
+    m_reconPic = m_encData->m_reconPic;
     m_encData->reinit(sps);
 }
 
@@ -83,18 +83,18 @@ void Frame::destroy()
         m_encData = NULL;
     }
 
-    if (m_origPicYuv)
+    if (m_fencPic)
     {
-        m_origPicYuv->destroy();
-        delete m_origPicYuv;
-        m_origPicYuv = NULL;
+        m_fencPic->destroy();
+        delete m_fencPic;
+        m_fencPic = NULL;
     }
 
-    if (m_reconPicYuv)
+    if (m_reconPic)
     {
-        m_reconPicYuv->destroy();
-        delete m_reconPicYuv;
-        m_reconPicYuv = NULL;
+        m_reconPic->destroy();
+        delete m_reconPic;
+        m_reconPic = NULL;
     }
 
     m_lowres.destroy();
diff --git a/source/common/frame.h b/source/common/frame.h
index 0fae62a..d023946 100644
--- a/source/common/frame.h
+++ b/source/common/frame.h
@@ -43,30 +43,29 @@ public:
 
     /* These two items will be NULL until the Frame begins to be encoded, at which point
      * it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
-    FrameData*        m_encData;
-    PicYuv*           m_reconPicYuv;
+    FrameData*             m_encData;
+    PicYuv*                m_reconPic;
 
     /* Data associated with x265_picture */
-    PicYuv*           m_origPicYuv;
-    int               m_poc;
-    int64_t           m_pts;                // user provided presentation time stamp
-    int64_t           m_reorderedPts;
-    int64_t           m_dts;
-    int32_t           m_forceqp;            // Force to use the qp specified in qp file
-    x265_intra_data*  m_intraData;
-    x265_inter_data*  m_interData;
-    void*             m_userData;           // user provided pointer passed in with this picture
+    PicYuv*                m_fencPic;
+    int                    m_poc;
+    int64_t                m_pts;                // user provided presentation time stamp
+    int64_t                m_reorderedPts;
+    int64_t                m_dts;
+    int32_t                m_forceqp;            // Force to use the qp specified in qp file
+    void*                  m_userData;           // user provided pointer passed in with this picture
 
-    Lowres            m_lowres;
-    bool              m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
+    Lowres                 m_lowres;
+    bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
 
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
-    ThreadSafeInteger m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
-    volatile uint32_t m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
+    ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
+    volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
 
-    Frame*            m_next;               // PicList doubly linked list pointers
-    Frame*            m_prev;
+    Frame*                 m_next;               // PicList doubly linked list pointers
+    Frame*                 m_prev;
 
+    x265_analysis_data     m_analysisData;
     Frame();
 
     bool create(x265_param *param);
diff --git a/source/common/framedata.h b/source/common/framedata.h
index f6ea9d4..92754ce 100644
--- a/source/common/framedata.h
+++ b/source/common/framedata.h
@@ -49,7 +49,7 @@ public:
     x265_param*    m_param;
 
     FrameData*     m_freeListNext;
-    PicYuv*        m_reconPicYuv;
+    PicYuv*        m_reconPic;
     bool           m_bHasReferences;   /* used during DPB/RPS updates */
     int            m_frameEncoderID;   /* the ID of the FrameEncoder encoding this frame */
 
diff --git a/source/common/ipfilter.cpp b/source/common/ipfilter.cpp
index 4982cba..8467654 100644
--- a/source/common/ipfilter.cpp
+++ b/source/common/ipfilter.cpp
@@ -35,7 +35,7 @@ using namespace x265;
 
 namespace {
 template<int dstStride>
-void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
 {
     int shift = IF_INTERNAL_PREC - X265_DEPTH;
     int row, col;
@@ -74,9 +74,9 @@ void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, in
 }
 
 template<int N, int width, int height>
-void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 {
-    int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
     int headRoom = IF_FILTER_PREC;
     int offset =  (1 << (headRoom - 1));
     uint16_t maxVal = (1 << X265_DEPTH) - 1;
@@ -115,9 +115,9 @@ void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstS
 }
 
 template<int N, int width, int height>
-void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 {
-    int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int shift = IF_FILTER_PREC - headRoom;
     int offset = -IF_INTERNAL_OFFS << shift;
@@ -160,9 +160,9 @@ void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t ds
 }
 
 template<int N, int width, int height>
-void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 {
-    int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
     int shift = IF_FILTER_PREC;
     int offset = 1 << (shift - 1);
     uint16_t maxVal = (1 << X265_DEPTH) - 1;
@@ -201,9 +201,9 @@ void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstSt
 }
 
 template<int N, int width, int height>
-void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 {
-    int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int shift = IF_FILTER_PREC - headRoom;
     int offset = -IF_INTERNAL_OFFS << shift;
@@ -239,13 +239,13 @@ void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dst
 }
 
 template<int N, int width, int height>
-void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 {
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int shift = IF_FILTER_PREC + headRoom;
     int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
     uint16_t maxVal = (1 << X265_DEPTH) - 1;
-    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+    const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
 
     src -= (N / 2 - 1) * srcStride;
 
@@ -282,9 +282,9 @@ void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dst
 }
 
 template<int N, int width, int height>
-void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 {
-    const int16_t *const c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+    const int16_t* c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
     int shift = IF_FILTER_PREC;
     int row, col;
 
@@ -317,13 +317,13 @@ void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t d
 }
 
 template<int N>
-void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
+void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int coeffIdx)
 {
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int shift = IF_FILTER_PREC + headRoom;
     int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
     uint16_t maxVal = (1 << X265_DEPTH) - 1;
-    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+    const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
 
     src -= (N / 2 - 1) * srcStride;
 
@@ -360,7 +360,7 @@ void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t
 }
 
 template<int N, int width, int height>
-void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
 {
     short immedVals[(64 + 8) * (64 + 8)];
 
@@ -509,9 +509,9 @@ void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
     CHROMA_444(16, 64);
     p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
 
-    p.chroma_p2s[X265_CSP_I444] = filterConvertPelToShort_c<MAX_CU_SIZE>;
-    p.chroma_p2s[X265_CSP_I420] = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
-    p.chroma_p2s[X265_CSP_I422] = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+    p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
+    p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+    p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
 
     p.extendRowBorder = extendCURowColBorder;
 }
diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
index fe4f7b9..50bbc89 100644
--- a/source/common/lowres.cpp
+++ b/source/common/lowres.cpp
@@ -69,6 +69,7 @@ bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled)
     lowresPlane[3] = buffer[3] + padoffset;
 
     CHECKED_MALLOC(intraCost, int32_t, cuCount);
+    CHECKED_MALLOC(intraMode, uint8_t, cuCount);
 
     for (int i = 0; i < bframes + 2; i++)
     {
@@ -99,6 +100,7 @@ void Lowres::destroy()
         X265_FREE(buffer[i]);
 
     X265_FREE(intraCost);
+    X265_FREE(intraMode);
 
     for (int i = 0; i < bframes + 2; i++)
     {
@@ -155,7 +157,7 @@ void Lowres::init(PicYuv *origPic, int poc, int type)
         intraMbs[i] = 0;
 
     /* downscale and generate 4 hpel planes for lookahead */
-    primitives.frame_init_lowres_core(origPic->m_picOrg[0],
+    primitives.frameInitLowres(origPic->m_picOrg[0],
                                       lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
                                       origPic->m_stride, lumaStride, width, lines);
 
@@ -164,5 +166,5 @@ void Lowres::init(PicYuv *origPic, int poc, int type)
     extendPicBorder(lowresPlane[1], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
     extendPicBorder(lowresPlane[2], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
     extendPicBorder(lowresPlane[3], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
-    fpelPlane = lowresPlane[0];
+    fpelPlane[0] = lowresPlane[0];
 }
diff --git a/source/common/lowres.h b/source/common/lowres.h
index b88ad3e..a206c96 100644
--- a/source/common/lowres.h
+++ b/source/common/lowres.h
@@ -26,27 +26,36 @@
 
 #include "primitives.h"
 #include "common.h"
+#include "picyuv.h"
 #include "mv.h"
 
 namespace x265 {
 // private namespace
 
-class PicYuv;
-
 struct ReferencePlanes
 {
     ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
 
-    pixel*   fpelPlane;
+    pixel*   fpelPlane[3];
     pixel*   lowresPlane[4];
+    PicYuv*  reconPic;
 
     bool     isWeighted;
     bool     isLowres;
+
     intptr_t lumaStride;
-    int      weight;
-    int      offset;
-    int      shift;
-    int      round;
+    intptr_t chromaStride;
+
+    struct {
+        int      weight;
+        int      offset;
+        int      shift;
+        int      round;
+    } w[3];
+
+    pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[0] + reconPic->m_cuOffsetY[ctuAddr] + reconPic->m_buOffsetY[absPartIdx]; }
+    pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx)   { return fpelPlane[1] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; }
+    pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx)   { return fpelPlane[2] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; }
 
     /* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels
      * in case QPEL is required.  Else it returns a pointer to the HPEL pixels */
@@ -56,11 +65,10 @@ struct ReferencePlanes
         {
             int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
             pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
-
-            MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
-            int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
-
-            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
+            int qmvx = qmv.x + (qmv.x & 1);
+            int qmvy = qmv.y + (qmv.y & 1);
+            int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
+            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
             primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
             return buf;
         }
@@ -79,9 +87,10 @@ struct ReferencePlanes
             ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
             int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
             pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
-            MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
-            int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
-            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
+            int qmvx = qmv.x + (qmv.x & 1);
+            int qmvy = qmv.y + (qmv.y & 1);
+            int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
+            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
             primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
             return comp(fenc, FENC_STRIDE, subpelbuf, 8);
         }
@@ -116,6 +125,7 @@ struct Lowres : public ReferencePlanes
     int32_t*  rowSatds[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2];
     int       intraMbs[X265_BFRAME_MAX + 2];
     int32_t*  intraCost;
+    uint8_t*  intraMode;
     int64_t   satdCost;
     uint16_t* lowresCostForRc;
     uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]);
diff --git a/source/common/mv.h b/source/common/mv.h
index 22a7073..dad3729 100644
--- a/source/common/mv.h
+++ b/source/common/mv.h
@@ -44,19 +44,19 @@ public:
         int32_t word;
     };
 
-    MV() : word(0)                             {}
-
+    MV()                                       {}
+    MV(int32_t w) : word(w)                    {}
     MV(int16_t _x, int16_t _y) : x(_x), y(_y)  {}
 
-    const MV& operator =(uint32_t w)           { word = w; return *this; }
+    MV& operator =(uint32_t w)                 { word = w; return *this; }
 
-    const MV& operator +=(const MV& other)     { x += other.x; y += other.y; return *this; }
+    MV& operator +=(const MV& other)           { x += other.x; y += other.y; return *this; }
 
-    const MV& operator -=(const MV& other)     { x -= other.x; y -= other.y; return *this; }
+    MV& operator -=(const MV& other)           { x -= other.x; y -= other.y; return *this; }
 
-    const MV& operator >>=(int i)              { x >>= i; y >>= i; return *this; }
+    MV& operator >>=(int i)                    { x >>= i; y >>= i; return *this; }
 
-    const MV& operator <<=(int i)              { x <<= i; y <<= i; return *this; }
+    MV& operator <<=(int i)                    { x <<= i; y <<= i; return *this; }
 
     MV operator >>(int i) const                { return MV(x >> i, y >> i); }
 
@@ -64,16 +64,18 @@ public:
 
     MV operator *(int16_t i) const             { return MV(x * i, y * i); }
 
-    const MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); }
+    MV operator -(const MV& other) const       { return MV(x - other.x, y - other.y); }
 
-    const MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); }
+    MV operator +(const MV& other) const       { return MV(x + other.x, y + other.y); }
 
     bool operator ==(const MV& other) const    { return word == other.word; }
 
     bool operator !=(const MV& other) const    { return word != other.word; }
 
+    bool operator !() const                    { return !word; }
+
     // Scale down a QPEL mv to FPEL mv, rounding up by one HPEL offset
-    MV roundToFPel() const                     { return MV(x + 2, y + 2) >> 2; }
+    MV roundToFPel() const                     { return MV((x + 2) >> 2, (y + 2) >> 2); }
 
     // Scale up an FPEL mv to QPEL by shifting up two bits
     MV toQPel() const                          { return *this << 2; }
diff --git a/source/common/param.cpp b/source/common/param.cpp
index af70058..2159fd9 100644
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -176,6 +176,8 @@ void x265_param_default(x265_param *param)
     param->rdPenalty = 0;
     param->psyRd = 0.0;
     param->psyRdoq = 0.0;
+    param->analysisMode = 0;
+    param->analysisFileName = NULL;
     param->bIntraInBFrames = 0;
     param->bLossless = 0;
     param->bCULossless = 0;
@@ -193,7 +195,7 @@ void x265_param_default(x265_param *param)
     param->rc.qpStep = 4;
     param->rc.rateControlMode = X265_RC_CRF;
     param->rc.qp = 32;
-    param->rc.aqMode = X265_AQ_AUTO_VARIANCE;
+    param->rc.aqMode = X265_AQ_VARIANCE;
     param->rc.aqStrength = 1.0;
     param->rc.cuTree = 1;
     param->rc.rfConstantMax = 0;
@@ -406,6 +408,24 @@ int x265_param_default_preset(x265_param *param, const char *preset, const char
             param->scenecutThreshold = 0;
             param->rc.cuTree = 0;
         }
+        else if (!strcmp(tune, "grain"))
+        {
+            param->deblockingFilterBetaOffset = -2;
+            param->deblockingFilterTCOffset = -2;
+            param->bIntraInBFrames = 0;
+            param->psyRdoq = 30;
+            param->psyRd = 0.5;
+            param->rc.ipFactor = 1.1;
+            param->rc.pbFactor = 1.1;
+            param->rc.aqMode = X265_AQ_VARIANCE;
+            param->rc.aqStrength = 0.3;
+            param->rc.qCompress = 0.8;
+        }
+        else if (!strcmp(tune, "cbr"))
+        {
+            param->rc.pbFactor = 1.0;
+            param->rc.rateTolerance = 0.5;
+        }
         else
             return -1;
     }
@@ -532,9 +552,6 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
             }
         }
     }
-    OPT("csv") p->csvfn = value;
-    OPT("scaling-list") p->scalingLists = value;
-    OPT("lambda-file") p->rc.lambdaFileName = value;
     OPT("threads") p->poolNumThreads = atoi(value);
     OPT("frame-threads") p->frameNumThreads = atoi(value);
     OPT("pmode") p->bDistributeModeAnalysis = atobool(value);
@@ -623,7 +640,22 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
     OPT("psy-rdoq") p->psyRdoq = atof(value);
     OPT("signhide") p->bEnableSignHiding = atobool(value);
     OPT("b-intra") p->bIntraInBFrames = atobool(value);
-    OPT("lft") p->bEnableLoopFilter = atobool(value);
+    OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */
+    OPT("deblock")
+    {
+        if (2 == sscanf(value, "%d:%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset) ||
+            2 == sscanf(value, "%d,%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset))
+        {
+            p->bEnableLoopFilter = true;
+        }
+        else if (sscanf(value, "%d", &p->deblockingFilterTCOffset))
+        {
+            p->bEnableLoopFilter = 1;
+            p->deblockingFilterBetaOffset = p->deblockingFilterTCOffset;
+        }
+        else
+            p->bEnableLoopFilter = atobool(value);
+    }
     OPT("sao") p->bEnableSAO = atobool(value);
     OPT("sao-non-deblock") p->bSaoNonDeblocked = atobool(value);
     OPT("ssim") p->bEnableSsim = atobool(value);
@@ -635,6 +667,11 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
     OPT("hrd") p->bEmitHRDSEI = atobool(value);
     OPT2("ipratio", "ip-factor") p->rc.ipFactor = atof(value);
     OPT2("pbratio", "pb-factor") p->rc.pbFactor = atof(value);
+    OPT("qcomp") p->rc.qCompress = atof(value);
+    OPT("qpstep") p->rc.qpStep = atoi(value);
+    OPT("ratetol") p->rc.rateTolerance = atof(value);
+    OPT("cplxblur") p->rc.complexityBlur = atof(value);
+    OPT("qblur") p->rc.qblur = atof(value);
     OPT("aq-mode") p->rc.aqMode = atoi(value);
     OPT("aq-strength") p->rc.aqStrength = atof(value);
     OPT("vbv-maxrate") p->rc.vbvMaxBitrate = atoi(value);
@@ -729,7 +766,8 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
                          &p->vui.defDispWinRightOffset,
                          &p->vui.defDispWinBottomOffset) != 4;
     }
-    OPT("nr") p->noiseReduction = atoi(value);
+    OPT("nr-intra") p->noiseReductionIntra = atoi(value);
+    OPT("nr-inter") p->noiseReductionInter = atoi(value);
     OPT("pass")
     {
         int pass = Clip3(0, 3, atoi(value));
@@ -737,6 +775,10 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
         p->rc.bStatRead = pass & 2;
     }
     OPT("stats") p->rc.statFileName = strdup(value);
+    OPT("csv") p->csvfn = strdup(value);
+    OPT("scaling-list") p->scalingLists = strdup(value);
+    OPT("lambda-file") p->rc.lambdaFileName = strdup(value);
+    OPT("analysis-file") p->analysisFileName = strdup(value);
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
@@ -960,6 +1002,10 @@ int x265_check_params(x265_param *param)
           "Aq-Mode is out of range");
     CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
           "Aq-Strength is out of range");
+    CHECK(param->deblockingFilterTCOffset < -6 || param->deblockingFilterTCOffset > 6,
+          "deblocking filter tC offset must be in the range of -6 to +6");
+    CHECK(param->deblockingFilterBetaOffset < -6 || param->deblockingFilterBetaOffset > 6,
+          "deblocking filter Beta offset must be in the range of -6 to +6");
     CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0");
     CHECK(param->psyRdoq < 0 || 50.0 < param->psyRdoq, "Psy-rdoq strength must be between 0 and 50.0");
     CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative");
@@ -1031,8 +1077,12 @@ int x265_check_params(x265_param *param)
           "Valid initial VBV buffer occupancy must be a fraction 0 - 1, or size in kbits");
     CHECK(param->rc.bitrate < 0,
           "Target bitrate can not be less than zero");
-    if (param->noiseReduction)
-        CHECK(100 > param->noiseReduction || param->noiseReduction > 1000, "Valid noise reduction range 100 - 1000");
+    CHECK(param->rc.qCompress < 0.5 || param->rc.qCompress > 1.0,
+          "qCompress must be between 0.5 and 1.0");
+    if (param->noiseReductionIntra)
+        CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
+    if (param->noiseReductionInter)
+        CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
     CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead,
           "Constant rate-factor is incompatible with 2pass");
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
@@ -1061,7 +1111,7 @@ int x265_set_globals(x265_param *param)
 {
     static int once /* = 0 */;
 
-    if (ATOMIC_CAS32(&once, 0, 1) == 1)
+    if (ATOMIC_INC(&once) > 1)
     {
         if (param->maxCUSize != g_maxCUSize)
         {
@@ -1152,11 +1202,19 @@ void x265_print_params(x265_param *param)
         fprintf(stderr, "psy-rd=%.2lf ", param->psyRd);
     if (param->psyRdoq > 0.)
         fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq);
-    TOOLOPT(param->bEnableEarlySkip, "esd");
-    TOOLOPT(param->bEnableCbfFastMode, "cfm");
-    if (param->noiseReduction)
-        fprintf(stderr, "nr=%d ", param->noiseReduction);
-    TOOLOPT(param->bEnableLoopFilter, "lft");
+    TOOLOPT(param->bEnableEarlySkip, "early-skip");
+    TOOLOPT(param->bEnableCbfFastMode, "fast-cbf");
+    if (param->noiseReductionIntra)
+        fprintf(stderr, "nr-intra=%d ", param->noiseReductionIntra);
+    if (param->noiseReductionInter)
+        fprintf(stderr, "nr-inter=%d ", param->noiseReductionInter);
+    if (param->bEnableLoopFilter)
+    {
+        if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset)
+            fprintf(stderr, "deblock(tC=%d:B=%d) ", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset);
+        else
+            TOOLOPT(param->bEnableLoopFilter, "deblock");
+    }
     if (param->bEnableSAO)
         fprintf(stderr, "sao%s ", param->bSaoNonDeblocked ? "-non-deblock" : "");
     TOOLOPT(param->bEnableSignHiding, "signhide");
diff --git a/source/common/picyuv.h b/source/common/picyuv.h
index 1e18d8c..a702517 100644
--- a/source/common/picyuv.h
+++ b/source/common/picyuv.h
@@ -76,12 +76,21 @@ public:
     pixel*  getCrAddr(uint32_t ctuAddr)                        { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; }
     pixel*  getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; }
     pixel*  getPlaneAddr(uint32_t plane, uint32_t ctuAddr)     { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); }
+    const pixel* getLumaAddr(uint32_t ctuAddr) const           { return m_picOrg[0] + m_cuOffsetY[ctuAddr]; }
+    const pixel* getCbAddr(uint32_t ctuAddr) const             { return m_picOrg[1] + m_cuOffsetC[ctuAddr]; }
+    const pixel* getCrAddr(uint32_t ctuAddr) const             { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; }
+    const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; }
+    const pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) const     { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); }
 
     /* get pointer to CU start address */
     pixel*  getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; }
     pixel*  getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx)   { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
     pixel*  getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx)   { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
     pixel*  getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+    const pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; }
+    const pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) const   { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+    const pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) const   { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+    const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
 };
 
 void updateChecksum(const pixel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, intptr_t stride, int row, uint32_t cuHeight);
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index 3e0530d..a56b8d7 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -32,32 +32,32 @@
 
 using namespace x265;
 
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
-    p.FUNC_PREFIX[LUMA_4x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x4]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x32]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
 
 #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
     p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4,  4>; \
@@ -90,16 +90,14 @@ namespace {
 // place functions in anonymous namespace (file static)
 
 template<int lx, int ly>
-int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     int sum = 0;
 
     for (int y = 0; y < ly; y++)
     {
         for (int x = 0; x < lx; x++)
-        {
             sum += abs(pix1[x] - pix2[x]);
-        }
 
         pix1 += stride_pix1;
         pix2 += stride_pix2;
@@ -109,16 +107,14 @@ int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
 }
 
 template<int lx, int ly>
-int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
 {
     int sum = 0;
 
     for (int y = 0; y < ly; y++)
     {
         for (int x = 0; x < lx; x++)
-        {
             sum += abs(pix1[x] - pix2[x]);
-        }
 
         pix1 += stride_pix1;
         pix2 += stride_pix2;
@@ -128,7 +124,7 @@ int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2
 }
 
 template<int lx, int ly>
-void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
+void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
 {
     res[0] = 0;
     res[1] = 0;
@@ -150,7 +146,7 @@ void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstr
 }
 
 template<int lx, int ly>
-void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
+void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
 {
     res[0] = 0;
     res[1] = 0;
@@ -175,17 +171,17 @@ void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, int
 }
 
 template<int lx, int ly, class T1, class T2>
-int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
+int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
     int sum = 0;
-    int iTemp;
+    int tmp;
 
     for (int y = 0; y < ly; y++)
     {
         for (int x = 0; x < lx; x++)
         {
-            iTemp = pix1[x] - pix2[x];
-            sum += (iTemp * iTemp);
+            tmp = pix1[x] - pix2[x];
+            sum += (tmp * tmp);
         }
 
         pix1 += stride_pix1;
@@ -217,7 +213,7 @@ inline sum2_t abs2(sum2_t a)
     return (a + s) ^ s;
 }
 
-int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     sum2_t tmp[4][2];
     sum2_t a0, a1, a2, a3, b0, b1;
@@ -245,7 +241,7 @@ int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix
     return (int)(sum >> 1);
 }
 
-int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
 {
     ssum2_t tmp[4][2];
     ssum2_t a0, a1, a2, a3, b0, b1;
@@ -274,7 +270,7 @@ int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride
 }
 
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
-int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     sum2_t tmp[4][4];
     sum2_t a0, a1, a2, a3;
@@ -300,41 +296,33 @@ int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix
 
 template<int w, int h>
 // calculate satd in blocks of 4x4
-int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     int satd = 0;
 
     for (int row = 0; row < h; row += 4)
-    {
         for (int col = 0; col < w; col += 4)
-        {
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                              pix2 + row * stride_pix2 + col, stride_pix2);
-        }
-    }
 
     return satd;
 }
 
 template<int w, int h>
 // calculate satd in blocks of 8x4
-int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     int satd = 0;
 
     for (int row = 0; row < h; row += 4)
-    {
         for (int col = 0; col < w; col += 8)
-        {
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                              pix2 + row * stride_pix2 + col, stride_pix2);
-        }
-    }
 
     return satd;
 }
 
-inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     sum2_t tmp[8][4];
     sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
@@ -371,12 +359,12 @@ inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
     return (int)sum;
 }
 
-int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
 
-inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
 {
     ssum2_t tmp[8][4];
     ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
@@ -413,12 +401,12 @@ inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_p
     return (int)sum;
 }
 
-int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
 {
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
 
-int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
         + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
@@ -432,159 +420,129 @@ int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
 
 template<int w, int h>
 // Calculate sa8d in blocks of 8x8
-int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int cost = 0;
 
     for (int y = 0; y < h; y += 8)
-    {
         for (int x = 0; x < w; x += 8)
-        {
             cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
-        }
-    }
 
     return cost;
 }
 
 template<int w, int h>
 // Calculate sa8d in blocks of 16x16
-int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int cost = 0;
 
     for (int y = 0; y < h; y += 16)
-    {
         for (int x = 0; x < w; x += 16)
-        {
             cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
-        }
-    }
 
     return cost;
 }
 
 template<int size>
-int pixel_ssd_s_c(short *a, intptr_t dstride)
+int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
 {
     int sum = 0;
     for (int y = 0; y < size; y++)
     {
         for (int x = 0; x < size; x++)
-        {
             sum += a[x] * a[x];
-        }
+
         a += dstride;
     }
     return sum;
 }
 
 template<int size>
-void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
+void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
 {
     for (int y = 0; y < size; y++)
-    {
         for (int x = 0; x < size; x++)
-        {
             dst[y * dstride + x] = val;
-        }
-    }
-}
-
-void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
-{
-    for (int i = 0; i < size; i++)
-    {
-        for (int j = 0; j < size; j++)
-        {
-            dst[i * size + j] = ((int)src[i * stride + j]) << shift;
-        }
-    }
 }
 
 template<int size>
-void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
 {
-    for (int i = 0; i < size; i++)
-    {
-        for (int j = 0; j < size; j++)
-        {
-            dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
-        }
-    }
-}
-
-void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
-{
-    int round = 1 << (shift - 1);
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+    X265_CHECK(shift >= 0, "invalid shift\n");
 
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[j] = (int16_t)((src[j] + round) >> shift);
-        }
+            dst[j] = src[j] << shift;
 
-        src += size;
-        dst += stride;
+        src += srcStride;
+        dst += size;
     }
 }
 
-void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
 {
-    int round = 1 << (shift - 1);
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
 
+    int16_t round = 1 << (shift - 1);
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[j] = (int16_t)((src[j] + round) >> shift);
-        }
+            dst[j] = (src[j] + round) >> shift;
 
-        src += size;
-        dst += stride;
+        src += srcStride;
+        dst += size;
     }
 }
 
 template<int size>
-void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 {
+    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift >= 0, "invalid shift\n");
+
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[j] = ((int16_t)src[j] << shift);
-        }
+            dst[j] = src[j] << shift;
 
         src += size;
-        dst += stride;
+        dst += dstStride;
     }
 }
 
 template<int size>
-void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 {
+    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
+
+    int16_t round = 1 << (shift - 1);
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[j] = (src[j] << shift);
-        }
+            dst[j] = (src[j] + round) >> shift;
 
         src += size;
-        dst += stride;
+        dst += dstStride;
     }
 }
 
 template<int blockSize>
-void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
+void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
 {
     for (int y = 0; y < blockSize; y++)
     {
         for (int x = 0; x < blockSize; x++)
-        {
             residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
-        }
 
         fenc += stride;
         residual += stride;
@@ -593,18 +551,14 @@ void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
 }
 
 template<int blockSize>
-void transpose(pixel* dst, pixel* src, intptr_t stride)
+void transpose(pixel* dst, const pixel* src, intptr_t stride)
 {
     for (int k = 0; k < blockSize; k++)
-    {
         for (int l = 0; l < blockSize; l++)
-        {
             dst[k * blockSize + l] = src[l * stride + k];
-        }
-    }
 }
 
-void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
 {
     int x, y;
 
@@ -622,7 +576,7 @@ void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStrid
     }
 }
 
-void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
+void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
 {
     int x, y;
 
@@ -646,14 +600,12 @@ void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height,
 }
 
 template<int lx, int ly>
-void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
+void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
 {
     for (int y = 0; y < ly; y++)
     {
         for (int x = 0; x < lx; x++)
-        {
             dst[x] = (src0[x] + src1[x] + 1) >> 1;
-        }
 
         src0 += sstride0;
         src1 += sstride1;
@@ -661,7 +613,7 @@ void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, p
     }
 }
 
-void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
 {
     int x;
 
@@ -675,9 +627,9 @@ void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
     }
 }
 
-void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
 {
-    int x, y;
+    uint32_t x, y;
 
     for (y = 0; y < 64; y += 2)
     {
@@ -694,13 +646,13 @@ void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
     }
 }
 
-void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                             intptr_t src_stride, intptr_t dst_stride, int width, int height)
 {
     for (int y = 0; y < height; y++)
     {
-        pixel *src1 = src0 + src_stride;
-        pixel *src2 = src1 + src_stride;
+        const pixel* src1 = src0 + src_stride;
+        const pixel* src2 = src1 + src_stride;
         for (int x = 0; x < width; x++)
         {
             // slower than naive bilinear, but matches asm
@@ -720,7 +672,7 @@ void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv,
 }
 
 /* structural similarity metric */
-void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
+void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
 {
     for (int z = 0; z < 2; z++)
     {
@@ -794,7 +746,7 @@ float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
 }
 
 template<int size>
-uint64_t pixel_var(pixel *pix, intptr_t i_stride)
+uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
 {
     uint32_t sum = 0, sqr = 0;
 
@@ -817,7 +769,7 @@ uint64_t pixel_var(pixel *pix, intptr_t i_stride)
 #endif
 
 template<int size>
-int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
+int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
 {
     static pixel zeroBuf[8] /* = { 0 } */;
 
@@ -850,7 +802,7 @@ int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
 }
 
 template<int size>
-int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
+int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
 {
     static int16_t zeroBuf[8] /* = { 0 } */;
 
@@ -882,28 +834,13 @@ int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstri
     }
 }
 
-void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
-                                    pixel *src,  intptr_t srcStride, int w, int h)
-{
-    for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
-    {
-        for (int x = 0; x < w; x++)
-        {
-            dstu[x] = src[2 * x];
-            dstv[x] = src[2 * x + 1];
-        }
-    }
-}
-
 template<int bx, int by>
-void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
 {
     for (int y = 0; y < by; y++)
     {
         for (int x = 0; x < bx; x++)
-        {
             a[x] = b[x];
-        }
 
         a += stridea;
         b += strideb;
@@ -911,14 +848,12 @@ void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
 }
 
 template<int bx, int by>
-void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
 {
     for (int y = 0; y < by; y++)
     {
         for (int x = 0; x < bx; x++)
-        {
             a[x] = b[x];
-        }
 
         a += stridea;
         b += strideb;
@@ -926,7 +861,7 @@ void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
 }
 
 template<int bx, int by>
-void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
 {
     for (int y = 0; y < by; y++)
     {
@@ -942,14 +877,12 @@ void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
 }
 
 template<int bx, int by>
-void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
 {
     for (int y = 0; y < by; y++)
     {
         for (int x = 0; x < bx; x++)
-        {
             a[x] = (int16_t)b[x];
-        }
 
         a += stridea;
         b += strideb;
@@ -957,14 +890,12 @@ void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
 }
 
 template<int bx, int by>
-void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
 {
     for (int y = 0; y < by; y++)
     {
         for (int x = 0; x < bx; x++)
-        {
             a[x] = (int16_t)(b0[x] - b1[x]);
-        }
 
         b0 += sstride0;
         b1 += sstride1;
@@ -973,14 +904,12 @@ void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t
 }
 
 template<int bx, int by>
-void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
 {
     for (int y = 0; y < by; y++)
     {
         for (int x = 0; x < bx; x++)
-        {
             a[x] = Clip(b0[x] + b1[x]);
-        }
 
         b0 += sstride0;
         b1 += sstride1;
@@ -989,7 +918,7 @@ void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t
 }
 
 template<int bx, int by>
-void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 {
     int shiftNum, offset;
 
@@ -1010,28 +939,24 @@ void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intpt
     }
 }
 
-void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
 {
     for (int r = 0; r < height; r++)
     {
         for (int c = 0; c < width; c++)
-        {
             dst[c] = ((pixel)src[c]) << shift;
-        }
 
         dst += dstStride;
         src += srcStride;
     }
 }
 
-void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 {
     for (int r = 0; r < height; r++)
     {
         for (int c = 0; c < width; c++)
-        {
             dst[c] = (pixel)((src[c] >> shift) & mask);
-        }
 
         dst += dstStride;
         src += srcStride;
@@ -1040,8 +965,8 @@ void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstS
 
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
-void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
-                             int32_t *invQscales, double *fpsFactor, int len)
+void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
+                             const int32_t* invQscales, const double* fpsFactor, int len)
 {
     double fps = *fpsFactor / 256;
 
@@ -1068,12 +993,12 @@ void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int mar
     primitives.extendRowBorder(pic, stride, width, height, marginX);
 
     /* copy top row to create above margin */
-    pixel *top = pic - marginX;
+    pixel* top = pic - marginX;
     for (int y = 0; y < marginY; y++)
         memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
 
     /* copy bottom row to create below margin */
-    pixel *bot = pic - marginX + (height - 1) * stride;
+    pixel* bot = pic - marginX + (height - 1) * stride;
     for (int y = 0; y < marginY; y++)
         memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
 }
@@ -1113,6 +1038,62 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
     p.satd[LUMA_64x16] = satd8<64, 16>;
     p.satd[LUMA_16x64] = satd8<16, 64>;
 
+    p.chroma[X265_CSP_I420].satd[CHROMA_2x2]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = satd_4x4;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = satd8<8, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
+
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x2]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_2x4]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = satd_8x4;
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = satd4<4, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = satd8<16, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = satd8<8, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
+
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x6]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_6x8]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x2]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_2x8]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = satd4<16, 4>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = satd4<4, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = satd8<32, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = satd8<8, 32>;
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4]   = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = satd4<4, 8>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = satd8<8, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = satd_4x4;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8]   = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = satd8<8, 8>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = satd4<4, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = satd8<8, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16]  = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = satd4<8, 4>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16]  = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = satd8<16, 8>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
+
 #define CHROMA_420(W, H) \
     p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H]  = addAvg<W, H>;         \
     p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
@@ -1121,13 +1102,14 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
     p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
 
 #define CHROMA_422(W, H) \
-    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>;         \
+    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H]  = addAvg<W, H>;         \
     p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
     p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
     p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
     p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
 
 #define CHROMA_444(W, H) \
+    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H]    = p.satd[LUMA_ ## W ## x ## H]; \
     p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
     p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
     p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
@@ -1157,8 +1139,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
     p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
     p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
 
-
-
     LUMA(4, 4);
     LUMA(8, 8);
     CHROMA_420(4, 4);
@@ -1278,9 +1258,9 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
     CHROMA_444(64, 16);
     CHROMA_444(16, 64);
 
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
 
     p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
     p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
@@ -1288,22 +1268,22 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
     p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
     p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
 
-    p.cvt16to32_shl = convert16to32_shl;
-    p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
-    p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
-    p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
-    p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
-    p.cvt32to16_shr = convert32to16_shr;
-    p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
-    p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
-    p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
-    p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
-    p.copy_shr = copy_shr;
-    p.copy_shl[BLOCK_4x4] = copy_shl<4>;
-    p.copy_shl[BLOCK_8x8] = copy_shl<8>;
-    p.copy_shl[BLOCK_16x16] = copy_shl<16>;
-    p.copy_shl[BLOCK_32x32] = copy_shl<32>;
+    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
+    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
+    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
+    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
+    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
+    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
+    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
+    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
+    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
+    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
+    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
+    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
+    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
+    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
+    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
+    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
 
     p.sa8d[BLOCK_4x4]   = satd_4x4;
     p.sa8d[BLOCK_8x8]   = sa8d_8x8;
@@ -1371,7 +1351,7 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
 
     p.scale1D_128to64 = scale1D_128to64;
     p.scale2D_64to32 = scale2D_64to32;
-    p.frame_init_lowres_core = frame_init_lowres_core;
+    p.frameInitLowres = frame_init_lowres_core;
     p.ssim_4x4x2_core = ssim_4x4x2_core;
     p.ssim_end_4 = ssim_end_4;
 
@@ -1379,7 +1359,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
     p.var[BLOCK_16x16] = pixel_var<16>;
     p.var[BLOCK_32x32] = pixel_var<32>;
     p.var[BLOCK_64x64] = pixel_var<64>;
-    p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.propagateCost = estimateCUPropagateCost;
diff --git a/source/common/predict.cpp b/source/common/predict.cpp
index a142c5a..6be4f19 100644
--- a/source/common/predict.cpp
+++ b/source/common/predict.cpp
@@ -83,7 +83,8 @@ void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, ui
 {
     int tuSize = 1 << log2TrSize;
 
-    pixel *refLft, *refAbv;
+    pixel* refLft;
+    pixel* refAbv;
 
     if (!(g_intraFilterFlags[dirMode] & tuSize))
     {
@@ -187,18 +188,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
             ShortYuv& shortYuv = m_predShortYuv[0];
 
             if (bLuma)
-                predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
             if (bChroma)
-                predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
 
             addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
             if (bChroma)
-                predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
         }
     }
     else
@@ -253,13 +254,13 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
 
             if (bLuma)
             {
-                predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
-                predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+                predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
             }
             if (bChroma)
             {
-                predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
-                predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+                predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
             }
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
@@ -277,18 +278,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
                 if (bChroma)
-                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
 
                 addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
                 if (bChroma)
-                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
             }
         }
         else
@@ -302,18 +303,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
                 if (bChroma)
-                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
 
                 addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
                 if (bChroma)
-                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
             }
         }
     }
@@ -321,13 +322,13 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
 
 void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
 {
-    pixel *dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
+    pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
     intptr_t dstStride = dstYuv.m_size;
 
     intptr_t srcStride = refPic.m_stride;
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
     int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
-    pixel* src = const_cast<PicYuv&>(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+    const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
 
     int xFrac = mv.x & 0x3;
     int yFrac = mv.y & 0x3;
@@ -350,12 +351,12 @@ void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv
 
 void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
 {
-    int16_t *dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
+    int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
     int dstStride = dstSYuv.m_size;
 
     intptr_t srcStride = refPic.m_stride;
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
-    pixel *src = const_cast<PicYuv&>(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+    const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
 
     int xFrac = mv.x & 0x3;
     int yFrac = mv.y & 0x3;
@@ -391,8 +392,8 @@ void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV&
 
     intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
 
-    pixel* refCb = const_cast<PicYuv&>(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
-    pixel* refCr = const_cast<PicYuv&>(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
 
     pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx);
     pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx);
@@ -441,8 +442,8 @@ void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, cons
 
     intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
 
-    pixel* refCb = const_cast<PicYuv&>(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
-    pixel* refCr = const_cast<PicYuv&>(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
 
     int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx);
     int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx);
@@ -459,8 +460,8 @@ void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, cons
 
     if (!(yFrac | xFrac))
     {
-        primitives.chroma_p2s[m_csp](refCb, refStride, dstCb, cxWidth, cxHeight);
-        primitives.chroma_p2s[m_csp](refCr, refStride, dstCr, cxWidth, cxHeight);
+        primitives.chroma[m_csp].p2s(refCb, refStride, dstCb, cxWidth, cxHeight);
+        primitives.chroma[m_csp].p2s(refCr, refStride, dstCr, cxWidth, cxHeight);
     }
     else if (!yFrac)
     {
@@ -492,20 +493,12 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv&
     int w0, w1, offset, shiftNum, shift, round;
     uint32_t src0Stride, src1Stride, dststride;
 
-    pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
-    pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
-    pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-
-    const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
-    const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
-    const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
-
-    const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
-    const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
-    const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
-
     if (bLuma)
     {
+        pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
+        const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
+        const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
+
         // Luma
         w0      = wp0[0].w;
         offset  = wp0[0].o + wp1[0].o;
@@ -542,6 +535,13 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv&
 
     if (bChroma)
     {
+        pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
+        pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
+        const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
+        const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
+        const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
+        const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
+
         // Chroma U
         w0      = wp0[1].w;
         offset  = wp0[1].o + wp1[1].o;
@@ -602,19 +602,14 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv&
 /* weighted averaging for uni-pred */
 void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
 {
-    pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
-    pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
-    pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-
-    const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
-    const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
-    const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
-
     int w0, offset, shiftNum, shift, round;
     uint32_t srcStride, dstStride;
 
     if (bLuma)
     {
+        pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
+        const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
+
         // Luma
         w0      = wp[0].w;
         offset  = wp[0].offset;
@@ -624,11 +619,16 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal
         srcStride = srcYuv.m_size;
         dstStride = predYuv.m_size;
 
-        primitives.weight_sp(const_cast<int16_t*>(srcY0), dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
+        primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
     }
 
     if (bChroma)
     {
+        pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
+        pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
+        const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
+        const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
+
         // Chroma U
         w0      = wp[1].w;
         offset  = wp[1].offset;
@@ -642,7 +642,7 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal
         uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift;
         uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift;
 
-        primitives.weight_sp(const_cast<int16_t*>(srcU0), dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
+        primitives.weight_sp(srcU0, dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
 
         // Chroma V
         w0     = wp[2].w;
@@ -650,7 +650,7 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal
         shift  = wp[2].shift + shiftNum;
         round  = shift ? (1 << (shift - 1)) : 0;
 
-        primitives.weight_sp(const_cast<int16_t*>(srcV0), dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
+        primitives.weight_sp(srcV0, dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
     }
 }
 
@@ -668,8 +668,8 @@ void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t ab
     int tuSize = intraNeighbors.tuSize;
     int tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPicYuv->m_stride;
+    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
 
     fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
 
@@ -744,8 +744,8 @@ void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint3
     initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
 
-    const pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPicYuv->m_strideC;
+    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
     pixel* adiRef = getAdiChromaBuf(chromaId, tuSize);
 
     fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
@@ -765,7 +765,7 @@ void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t
     }
 
     int   numIntraNeighbor = 0;
-    bool *bNeighborFlags = intraNeighbors->bNeighborFlags;
+    bool* bNeighborFlags = intraNeighbors->bNeighborFlags;
 
     uint32_t partIdxLT, partIdxRT, partIdxLB;
 
@@ -829,15 +829,15 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p
     }
     else // reference samples are partially available
     {
-        const bool *bNeighborFlags = intraNeighbors.bNeighborFlags;
-        const bool *pNeighborFlags;
+        const bool* bNeighborFlags = intraNeighbors.bNeighborFlags;
+        const bool* pNeighborFlags;
         int aboveUnits = intraNeighbors.aboveUnits;
         int leftUnits = intraNeighbors.leftUnits;
         int unitWidth = intraNeighbors.unitWidth;
         int unitHeight = intraNeighbors.unitHeight;
         int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth);
         pixel adiLineBuffer[5 * MAX_CU_SIZE];
-        pixel *adi;
+        pixel* adi;
 
         // Initialize
         for (int i = 0; i < totalSamples; i++)
@@ -893,7 +893,7 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p
             while (next < totalUnits && !bNeighborFlags[next])
                 next++;
 
-            pixel *pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
+            pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
             const pixel refSample = *pAdiLineNext;
             // Pad unavailable samples with new value
             int nextOrTop = X265_MIN(next, leftUnits);
@@ -959,12 +959,12 @@ bool Predict::isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT)
         return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
 }
 
-int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags)
+int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
     const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1;
     const uint32_t idxStep = 1;
-    bool *validFlagPtr = bValidFlags;
+    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
     for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
@@ -985,12 +985,12 @@ int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t par
     return numIntra;
 }
 
-int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags)
+int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
     const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1;
     const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
-    bool *validFlagPtr = bValidFlags;
+    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
     for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
@@ -1011,10 +1011,10 @@ int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t part
     return numIntra;
 }
 
-int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags)
+int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
 {
     const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1;
-    bool *validFlagPtr = bValidFlags;
+    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
     for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
@@ -1035,10 +1035,10 @@ int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_
     return numIntra;
 }
 
-int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags)
+int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
 {
     const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1;
-    bool *validFlagPtr = bValidFlags;
+    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
     for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp
index 7592d27..ebb8af6 100644
--- a/source/common/primitives.cpp
+++ b/source/common/primitives.cpp
@@ -75,7 +75,8 @@ void Setup_Alias_Primitives(EncoderPrimitives &p)
         p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
         p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
         p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
-        p.chroma[X265_CSP_I444].addAvg[i]  = p.luma_addAvg[i];
+        p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
+        p.chroma[X265_CSP_I444].satd[i] = p.satd[i];
     }
 
     for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
@@ -84,15 +85,6 @@ void Setup_Alias_Primitives(EncoderPrimitives &p)
         p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
     }
 
-    for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
-    {
-        int partL = partitionFromLog2Size(i + 2);
-        p.square_copy_pp[i] = p.luma_copy_pp[partL];
-        p.square_copy_ps[i] = p.luma_copy_ps[partL];
-        p.square_copy_sp[i] = p.luma_copy_sp[partL];
-        p.square_copy_ss[i] = p.luma_copy_ss[partL];
-    }
-
     primitives.sa8d[BLOCK_4x4]   = primitives.sa8d_inter[LUMA_4x4];
     primitives.sa8d[BLOCK_8x8]   = primitives.sa8d_inter[LUMA_8x8];
     primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
@@ -107,6 +99,52 @@ void Setup_Alias_Primitives(EncoderPrimitives &p)
     primitives.sa8d_inter[LUMA_16x4]  = primitives.satd[LUMA_16x4];
     primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12];
     primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16];
+
+    // Chroma SATD can often reuse luma primitives
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = primitives.satd[LUMA_4x4];
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = primitives.satd[LUMA_8x8];
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = primitives.satd[LUMA_16x16];
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = primitives.satd[LUMA_32x32];
+
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = primitives.satd[LUMA_8x4];
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = primitives.satd[LUMA_4x8];
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = primitives.satd[LUMA_16x8];
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = primitives.satd[LUMA_8x16];
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = primitives.satd[LUMA_32x16];
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = primitives.satd[LUMA_16x32];
+
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = primitives.satd[LUMA_16x12];
+    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = primitives.satd[LUMA_12x16];
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = primitives.satd[LUMA_16x4];
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = primitives.satd[LUMA_4x16];
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = primitives.satd[LUMA_32x24];
+    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = primitives.satd[LUMA_24x32];
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = primitives.satd[LUMA_32x8];
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = primitives.satd[LUMA_8x32];
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = primitives.satd[LUMA_4x8];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = primitives.satd[LUMA_8x16];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = primitives.satd[LUMA_16x32];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = primitives.satd[LUMA_32x64];
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = primitives.satd[LUMA_4x4];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = primitives.satd[LUMA_8x8];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = primitives.satd[LUMA_4x16];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = primitives.satd[LUMA_16x16];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = primitives.satd[LUMA_8x32];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = primitives.satd[LUMA_32x32];
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = primitives.satd[LUMA_16x64];
+
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = primitives.satd[LUMA_8x4];
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = primitives.satd[LUMA_16x8];
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16];
+    //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
 }
 }
 using namespace x265;
@@ -123,17 +161,15 @@ void x265_setup_primitives(x265_param *param, int cpuid)
     if (!primitives.sad[0])
     {
         Setup_C_Primitives(primitives);
-        Setup_Instrinsic_Primitives(primitives, cpuid);
 
 #if ENABLE_ASSEMBLY
+        Setup_Instrinsic_Primitives(primitives, cpuid);
         Setup_Assembly_Primitives(primitives, cpuid);
 #else
         x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
 #endif
 
         Setup_Alias_Primitives(primitives);
-
-        initROM();
     }
 
     if (param->logLevel >= X265_LOG_INFO)
@@ -169,74 +205,14 @@ void x265_setup_primitives(x265_param *param, int cpuid)
     }
 }
 
-#if !defined(ENABLE_ASSEMBLY)
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
+#if ENABLE_ASSEMBLY
+/* these functions are implemented in assembly. When assembly is not being
+ * compiled, they are unnecessary and can be NOPs */
+#else
 extern "C" {
-// the intrinsic primitives will not use MMX instructions, so if assembly
-// is disabled there should be no reason to use EMMS.
+int x265_cpu_cpuid_test(void) { return 0; }
 void x265_cpu_emms(void) {}
-
-#if defined(X265_ARCH_X86)
-
-#if defined(_MSC_VER)
-# pragma warning(disable: 4100)
-#elif defined(__GNUC__) || defined(__clang__)    // use inline assembly, Gnu/AT&T syntax
-# define __cpuidex(regsArray, level, index) \
-    __asm__ __volatile__ ("cpuid" \
-                          : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \
-                          : "0" (level), "2" (index));
-#else
-# error "compiler not supported"
-#endif
-
-int x265_cpu_cpuid_test(void)
-{
-    return 0;
+void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {}
+void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
 }
-
-void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
-{
-    int output[4];
-
-    __cpuidex(output, op, 0);
-    *eax = output[0];
-    *ebx = output[1];
-    *ecx = output[2];
-    *edx = output[3];
-}
-
-void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
-{
-    uint64_t out = 0;
-
-#if X265_ARCH_X86
-
-#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
-    // MSVC 2010 SP1 or later, or similar Intel release
-    out = _xgetbv(op);
-
-#elif defined(__GNUC__) || defined(__clang__)    // use inline assembly, Gnu/AT&T syntax
-
-    uint32_t a, d;
-    __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op) :);
-    *eax = a;
-    *edx = d;
-    return;
-
-#elif defined(_WIN64)      // On x64 with older compilers, this is impossible
-
-#endif // if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
-#endif // if x86
-
-    *eax = (uint32_t)out;
-    *edx = (uint32_t)(out >> 32);
-}
-
-#endif // X265_ARCH_X86
-}
-#endif // if !ENABLE_ASSEMBLY
+#endif
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 8300c21..0c93f98 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -132,162 +132,147 @@ inline int partitionFromLog2Size(int log2Size)
     return log2Size - 2;
 }
 
-typedef int  (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
-typedef int  (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
-typedef int  (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
-typedef int  (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride);
-typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
-typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
-typedef void (*blockcpy_sp_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
-typedef void (*blockcpy_sc_t)(int bx, int by, int16_t *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned
-typedef void (*pixelsub_ps_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
-typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
-
-typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
-typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
-
-typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
-typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
-typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
-typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
-typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
-typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
-typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
-
-typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
-typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
-typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
-
-typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
-typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
-
-typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
-typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
+typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef int  (*pixelcmp_sp_t)(const int16_t* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride);
+typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
+typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
+
+typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
+typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
+
+typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+
+typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
+typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
+
+typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
+typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+typedef int  (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
+
+typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc,
                             intptr_t src_stride, intptr_t dst_stride, int width, int height);
 typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
-typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]);
+typedef void (*ssim_4x4x2_core_t)(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
 typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width);
-typedef uint64_t (*var_t)(pixel *pix, intptr_t stride);
-typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h);
+typedef uint64_t (*var_t)(const pixel* pix, intptr_t stride);
+typedef void (*plane_copy_deinterleave_t)(pixel* dstu, intptr_t dstuStride, pixel* dstv, intptr_t dstvStride, const pixel* src, intptr_t srcStride, int w, int h);
 
-typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-typedef void (*filter_ps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_sp_t) (int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_ss_t) (int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
-typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+typedef void (*filter_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+typedef void (*filter_ps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
 
-typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
-typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
-typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-typedef void (*copy_ss_t)(int16_t *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
+typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
+typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 
-typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+typedef void (*pixel_sub_ps_t)(int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
-typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft);
-typedef void (*planecopy_cp_t) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
-typedef void (*planecopy_sp_t) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 
-typedef void (*cutree_propagate_cost) (int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len);
+typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
  * a vectorized primitive, or a C function. */
 struct EncoderPrimitives
 {
-    pixelcmp_t      sad[NUM_LUMA_PARTITIONS];        // Sum of Differences for each size
-    pixelcmp_x3_t   sad_x3[NUM_LUMA_PARTITIONS];     // Sum of Differences 3x for each size
-    pixelcmp_x4_t   sad_x4[NUM_LUMA_PARTITIONS];     // Sum of Differences 4x for each size
-    pixelcmp_t      sse_pp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
-    pixelcmp_ss_t   sse_ss[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, short) fenc alignment not assumed
-    pixelcmp_sp_t   sse_sp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, pixel) fenc alignment not assumed
-    pixel_ssd_s_t   ssd_s[NUM_SQUARE_BLOCKS - 1];    // Sum of Square Error (short) fenc alignment not assumed
-    pixelcmp_t      satd[NUM_LUMA_PARTITIONS];       // Sum of Transformed differences (HADAMARD)
-    pixelcmp_t      sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
-    pixelcmp_t      sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks
-    pixelcmp_t      psy_cost_pp[NUM_SQUARE_BLOCKS];     // difference in AC energy between two blocks
-    pixelcmp_ss_t   psy_cost_ss[NUM_SQUARE_BLOCKS];
-
-    blockfill_s_t   blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
-    cvt16to32_shl_t cvt16to32_shl;
-    cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
-    cvt32to16_shr_t cvt32to16_shr;
-    cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
-    copy_cnt_t      copy_cnt[NUM_SQUARE_BLOCKS - 1];
-    copy_shr_t      copy_shr;
-    copy_shl_t      copy_shl[NUM_SQUARE_BLOCKS - 1];
-
-    copy_pp_t       luma_copy_pp[NUM_LUMA_PARTITIONS];
-    copy_sp_t       luma_copy_sp[NUM_LUMA_PARTITIONS];
-    copy_ps_t       luma_copy_ps[NUM_LUMA_PARTITIONS];
-    copy_ss_t       luma_copy_ss[NUM_LUMA_PARTITIONS];
-    pixel_sub_ps_t  luma_sub_ps[NUM_SQUARE_BLOCKS];
-    pixel_add_ps_t  luma_add_ps[NUM_SQUARE_BLOCKS];
-    copy_pp_t       square_copy_pp[NUM_SQUARE_BLOCKS];
-    copy_sp_t       square_copy_sp[NUM_SQUARE_BLOCKS];
-    copy_ps_t       square_copy_ps[NUM_SQUARE_BLOCKS];
-    copy_ss_t       square_copy_ss[NUM_SQUARE_BLOCKS];
-
-    filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];
-    filter_hps_t    luma_hps[NUM_LUMA_PARTITIONS];
-    filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
-    filter_ps_t     luma_vps[NUM_LUMA_PARTITIONS];
-    filter_sp_t     luma_vsp[NUM_LUMA_PARTITIONS];
-    filter_ss_t     luma_vss[NUM_LUMA_PARTITIONS];
-    filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
-    filter_p2s_t    luma_p2s;
-    filter_p2s_t    chroma_p2s[X265_CSP_COUNT];
-
-    weightp_sp_t    weight_sp;
-    weightp_pp_t    weight_pp;
-    pixelavg_pp_t   pixelavg_pp[NUM_LUMA_PARTITIONS];
-    addAvg_t        luma_addAvg[NUM_LUMA_PARTITIONS];
-
-    intra_pred_t    intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
-    intra_allangs_t intra_pred_allangs[NUM_TR_SIZE];
-    scale_t         scale1D_128to64;
-    scale_t         scale2D_64to32;
-
-    dct_t           dct[NUM_DCTS];
-    idct_t          idct[NUM_IDCTS];
-    quant_t         quant;
-    nquant_t        nquant;
-    dequant_scaling_t dequant_scaling;
-    dequant_normal_t dequant_normal;
-    count_nonzero_t count_nonzero;
-    denoiseDct_t    denoiseDct;
-
-    calcresidual_t  calcresidual[NUM_SQUARE_BLOCKS];
-    transpose_t     transpose[NUM_SQUARE_BLOCKS];
-
-    var_t           var[NUM_SQUARE_BLOCKS];
-    ssim_4x4x2_core_t ssim_4x4x2_core;
-    ssim_end4_t     ssim_end_4;
-
-    downscale_t     frame_init_lowres_core;
-    plane_copy_deinterleave_t plane_copy_deinterleave_c;
-    extendCURowBorder_t extendRowBorder;
-    // sao primitives
-    saoCuOrgE0_t      saoCuOrgE0;
-    planecopy_cp_t    planecopy_cp;
-    planecopy_sp_t    planecopy_sp;
-
-    cutree_propagate_cost    propagateCost;
+    pixelcmp_t            sad[NUM_LUMA_PARTITIONS];        // Sum of Differences for each size
+    pixelcmp_x3_t         sad_x3[NUM_LUMA_PARTITIONS];     // Sum of Differences 3x for each size
+    pixelcmp_x4_t         sad_x4[NUM_LUMA_PARTITIONS];     // Sum of Differences 4x for each size
+    pixelcmp_t            sse_pp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
+    pixelcmp_ss_t         sse_ss[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, short) fenc alignment not assumed
+    pixelcmp_sp_t         sse_sp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, pixel) fenc alignment not assumed
+    pixel_ssd_s_t         ssd_s[NUM_SQUARE_BLOCKS - 1];    // Sum of Square Error (short) fenc alignment not assumed
+    pixelcmp_t            satd[NUM_LUMA_PARTITIONS];       // Sum of Transformed differences (HADAMARD)
+    pixelcmp_t            sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
+    pixelcmp_t            sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks
+    pixelcmp_t            psy_cost_pp[NUM_SQUARE_BLOCKS];  // difference in AC energy between two blocks
+    pixelcmp_ss_t         psy_cost_ss[NUM_SQUARE_BLOCKS];
+
+    dct_t                 dct[NUM_DCTS];
+    idct_t                idct[NUM_IDCTS];
+    quant_t               quant;
+    nquant_t              nquant;
+    dequant_scaling_t     dequant_scaling;
+    dequant_normal_t      dequant_normal;
+    count_nonzero_t       count_nonzero;
+    denoiseDct_t          denoiseDct;
+    calcresidual_t        calcresidual[NUM_SQUARE_BLOCKS];
+    blockfill_s_t         blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
+    cpy2Dto1D_shl_t       cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
+    cpy2Dto1D_shr_t       cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
+    cpy1Dto2D_shl_t       cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
+    cpy1Dto2D_shr_t       cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
+    copy_cnt_t            copy_cnt[NUM_SQUARE_BLOCKS - 1];
+
+    intra_pred_t          intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
+    intra_allangs_t       intra_pred_allangs[NUM_TR_SIZE];
+    transpose_t           transpose[NUM_SQUARE_BLOCKS];
+    scale_t               scale1D_128to64;
+    scale_t               scale2D_64to32;
+
+    var_t                 var[NUM_SQUARE_BLOCKS];
+    ssim_4x4x2_core_t     ssim_4x4x2_core;
+    ssim_end4_t           ssim_end_4;
+
+    saoCuOrgE0_t          saoCuOrgE0;
+
+    downscale_t           frameInitLowres;
+    cutree_propagate_cost propagateCost;
+
+    extendCURowBorder_t   extendRowBorder;
+    planecopy_cp_t        planecopy_cp;
+    planecopy_sp_t        planecopy_sp;
+
+    weightp_sp_t          weight_sp;
+    weightp_pp_t          weight_pp;
+    pixelavg_pp_t         pixelavg_pp[NUM_LUMA_PARTITIONS];
+    addAvg_t              luma_addAvg[NUM_LUMA_PARTITIONS];
+
+    filter_pp_t           luma_hpp[NUM_LUMA_PARTITIONS];
+    filter_hps_t          luma_hps[NUM_LUMA_PARTITIONS];
+    filter_pp_t           luma_vpp[NUM_LUMA_PARTITIONS];
+    filter_ps_t           luma_vps[NUM_LUMA_PARTITIONS];
+    filter_sp_t           luma_vsp[NUM_LUMA_PARTITIONS];
+    filter_ss_t           luma_vss[NUM_LUMA_PARTITIONS];
+    filter_hv_pp_t        luma_hvpp[NUM_LUMA_PARTITIONS];
+    filter_p2s_t          luma_p2s;
+
+    copy_pp_t             luma_copy_pp[NUM_LUMA_PARTITIONS];
+    copy_sp_t             luma_copy_sp[NUM_LUMA_PARTITIONS];
+    copy_ps_t             luma_copy_ps[NUM_LUMA_PARTITIONS];
+    copy_ss_t             luma_copy_ss[NUM_LUMA_PARTITIONS];
+    pixel_sub_ps_t        luma_sub_ps[NUM_SQUARE_BLOCKS];
+    pixel_add_ps_t        luma_add_ps[NUM_SQUARE_BLOCKS];
 
     struct
     {
+        pixelcmp_t      satd[NUM_LUMA_PARTITIONS];
         filter_pp_t     filter_vpp[NUM_LUMA_PARTITIONS];
         filter_ps_t     filter_vps[NUM_LUMA_PARTITIONS];
         filter_sp_t     filter_vsp[NUM_LUMA_PARTITIONS];
@@ -301,7 +286,8 @@ struct EncoderPrimitives
         copy_ss_t       copy_ss[NUM_LUMA_PARTITIONS];
         pixel_sub_ps_t  sub_ps[NUM_SQUARE_BLOCKS];
         pixel_add_ps_t  add_ps[NUM_SQUARE_BLOCKS];
-    } chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here
+        filter_p2s_t    p2s;
+    } chroma[X265_CSP_COUNT];
 };
 
 void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
diff --git a/source/common/quant.cpp b/source/common/quant.cpp
index 387962c..a6b50fd 100644
--- a/source/common/quant.cpp
+++ b/source/common/quant.cpp
@@ -50,7 +50,7 @@ inline int fastMin(int x, int y)
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
 }
 
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
 {
     X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
@@ -81,7 +81,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOne
 
             // NOTE: mapping to x86 hardware instruction BSR
             unsigned long size;
-            CLZ32(size, absLevel);
+            CLZ(size, absLevel);
             int egs = size * 2 + 1;
 
             rate += egs << 15;
@@ -106,7 +106,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOne
 }
 
 /* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
 {
     X265_CHECK(absLevel, "absLevel should not be zero\n");
 
@@ -135,7 +135,7 @@ inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *g
             if (symbol)
             {
                 unsigned long idx;
-                CLZ32(idx, symbol + 1);
+                CLZ(idx, symbol + 1);
                 length = idx;
             }
 
@@ -166,7 +166,7 @@ bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList,
     m_useRDOQ = useRDOQ;
     m_psyRdoqScale = (int64_t)(psyScale * 256.0);
     m_scalingList = &scalingList;
-    m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
+    m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
     m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
     m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
 
@@ -195,8 +195,8 @@ void Quant::setQPforQuant(const CUData& ctu)
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
     int qpy = ctu.m_qp[0];
     m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
-    setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat);
-    setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat);
+    setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+    setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
 }
 
 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
@@ -216,7 +216,7 @@ void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
 uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
 {
     const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
-    const uint16_t *scan = codeParams.scan;
+    const uint16_t* scan = codeParams.scan;
     bool lastCG = true;
 
     for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
@@ -322,58 +322,55 @@ uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSi
     return numSig;
 }
 
-uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride,
+uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
                              coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
 {
+    const uint32_t sizeIdx = log2TrSize - 2;
     if (cu.m_tqBypass[absPartIdx])
     {
         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
-        return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
+        return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
     }
 
     bool isLuma  = ttype == TEXT_LUMA;
     bool usePsy  = m_psyRdoqScale && isLuma && !useTransformSkip;
-    bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
-    int trSize = 1 << log2TrSize;
 
     X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
     if (useTransformSkip)
     {
 #if X265_DEPTH <= 10
-        primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+        X265_CHECK(transformShift >= 0, "invalid transformShift\n");
+        primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
 #else
         if (transformShift >= 0)
-            primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+            primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
         else
-        {
-            int shift = -transformShift;
-            int offset = (1 << (shift - 1));
-            primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
-        }
+            primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
 #endif
     }
     else
     {
-        const uint32_t sizeIdx = log2TrSize - 2;
+        bool isIntra = cu.isIntra(absPartIdx);
         int useDST = !sizeIdx && isLuma && isIntra;
         int index = DCT_4x4 + sizeIdx - useDST;
 
-        primitives.dct[index](residual, m_resiDctCoeff, stride);
+        primitives.dct[index](residual, m_resiDctCoeff, resiStride);
 
         /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
          * there is no risk of performing this DCT unnecessarily */
         if (usePsy)
         {
+            int trSize = 1 << log2TrSize;
             /* perform DCT on source pixels for psy-rdoq */
-            primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
+            primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
             primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
         }
 
-        if (m_nr && !isIntra)
+        if (m_nr)
         {
             /* denoise is not applied to intra residual, so DST can be ignored */
-            int cat = sizeIdx + 4 * !isLuma;
+            int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
             int numCoeff = 1 << (log2TrSize * 2);
             primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
             m_nr->count[cat]++;
@@ -389,7 +386,7 @@ uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16
         int scalingListType = ttype + (isLuma ? 3 : 0);
         int rem = m_qpParam[ttype].rem;
         int per = m_qpParam[ttype].per;
-        int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+        const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 
         int qbits = QUANT_SHIFT + per + transformShift;
         int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
@@ -408,12 +405,13 @@ uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16
     }
 }
 
-void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 {
+    const uint32_t sizeIdx = log2TrSize - 2;
     if (transQuantBypass)
     {
-        primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
+        primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
         return;
     }
 
@@ -427,7 +425,7 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s
     if (m_scalingList->m_bEnabled)
     {
         int scalingListType = (bIntra ? 0 : 3) + ttype;
-        int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+        const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
         primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
     }
     else
@@ -438,20 +436,18 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s
 
     if (useTransformSkip)
     {
-        int trSize = 1 << log2TrSize;
-
 #if X265_DEPTH <= 10
-        primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+        X265_CHECK(transformShift > 0, "invalid transformShift\n");
+        primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
 #else
         if (transformShift > 0)
-            primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+            primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
         else
-            primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
+            primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
 #endif
     }
     else
     {
-        const uint32_t sizeIdx = log2TrSize - 2;
         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
 
         X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
@@ -459,23 +455,23 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s
         // DC only
         if (numSig == 1 && coeff[0] != 0 && !useDST)
         {
-            const int shift_1st = 7;
+            const int shift_1st = 7 - 6;
             const int add_1st = 1 << (shift_1st - 1);
-            const int shift_2nd = 12 - (X265_DEPTH - 8);
+            const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
             const int add_2nd = 1 << (shift_2nd - 1);
 
-            int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
-            primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
+            int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
+            primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
             return;
         }
 
-        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
+        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
     }
 }
 
 /* Rate distortion optimized quantization for entropy coding engines using
  * probability models like CABAC */
-uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
 {
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
@@ -486,7 +482,7 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
     int per = m_qpParam[ttype].per;
     int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
     int add = (1 << (qbits - 1));
-    int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+    const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 
     int numCoeff = 1 << (log2TrSize * 2);
 
@@ -503,7 +499,7 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
     /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
      * scale applied that must be removed during unquant. Note that in real dequant there is clipping
      * at several stages. We skip the clipping for simplicity when measuring RD cost */
-    int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+    const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
     int scaleBits = SCALE_BITS - 2 * transformShift;
@@ -616,8 +612,8 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
                 // coefficient level estimation
                 const uint32_t oneCtx = 4 * ctxSet + c1;
                 const uint32_t absCtx = ctxSet + c2;
-                const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
-                const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
+                const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
+                const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
 
                 uint16_t level = 0;
                 uint32_t sigCoefBits = 0;
diff --git a/source/common/quant.h b/source/common/quant.h
index ac575f7..2801f00 100644
--- a/source/common/quant.h
+++ b/source/common/quant.h
@@ -58,6 +58,20 @@ struct QpParam
     }
 };
 
+#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
+#define MAX_NUM_TR_CATEGORIES    16                        /* 32, 16, 8, 4 transform categories each for luma and chroma */
+
+// NOTE: MUST be 16-byte aligned for asm code
+struct NoiseReduction
+{
+    /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
+     * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
+     * Intra 0..7 - Inter 8..15 */
+    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint32_t count[MAX_NUM_TR_CATEGORIES];
+};
+
 class Quant
 {
 protected:
@@ -69,8 +83,8 @@ protected:
 
     bool               m_useRDOQ;
     int64_t            m_psyRdoqScale;
-    int32_t*           m_resiDctCoeff;
-    int32_t*           m_fencDctCoeff;
+    int16_t*           m_resiDctCoeff;
+    int16_t*           m_fencDctCoeff;
     int16_t*           m_fencShortBuf;
 
     enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */
@@ -90,10 +104,10 @@ public:
     /* CU setup */
     void setQPforQuant(const CUData& ctu);
 
-    uint32_t transformNxN(CUData& cu, pixel *fenc, uint32_t fencstride, int16_t* residual, uint32_t stride, coeff_t* coeff,
+    uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
                           uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
 
-    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
 
     /* static methods shared with entropy.cpp */
@@ -107,7 +121,7 @@ protected:
 
     uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
 
-    uint32_t rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
+    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
     inline uint32_t getRateLast(uint32_t posx, uint32_t posy) const;
 };
 
diff --git a/source/common/shortyuv.cpp b/source/common/shortyuv.cpp
index 2a7e153..0b95f0e 100644
--- a/source/common/shortyuv.cpp
+++ b/source/common/shortyuv.cpp
@@ -84,7 +84,7 @@ void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_
     const int16_t* src = getLumaAddr(absPartIdx);
     int16_t* dst = dstYuv.getLumaAddr(absPartIdx);
 
-    primitives.square_copy_ss[log2Size - 2](dst, dstYuv.m_size, const_cast<int16_t*>(src), m_size);
+    primitives.luma_copy_ss[log2Size - 2](dst, dstYuv.m_size, src, m_size);
 }
 
 void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
@@ -92,7 +92,7 @@ void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log
     const int16_t* src = getLumaAddr(absPartIdx);
     pixel* dst = dstYuv.getLumaAddr(absPartIdx);
 
-    primitives.square_copy_sp[log2Size - 2](dst, dstYuv.m_size, const_cast<int16_t*>(src), m_size);
+    primitives.luma_copy_sp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
 }
 
 void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -103,8 +103,8 @@ void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint3
     int16_t* dstU = dstYuv.getCbAddr(absPartIdx);
     int16_t* dstV = dstYuv.getCrAddr(absPartIdx);
 
-    primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, const_cast<int16_t*>(srcU), m_csize);
-    primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, const_cast<int16_t*>(srcV), m_csize);
+    primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, srcV, m_csize);
 }
 
 void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -115,6 +115,6 @@ void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t l
     pixel* dstU = dstYuv.getCbAddr(absPartIdx);
     pixel* dstV = dstYuv.getCrAddr(absPartIdx);
 
-    primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, const_cast<int16_t*>(srcU), m_csize);
-    primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, const_cast<int16_t*>(srcV), m_csize);
+    primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, srcV, m_csize);
 }
diff --git a/source/common/slice.h b/source/common/slice.h
index bd0ba63..90712b9 100644
--- a/source/common/slice.h
+++ b/source/common/slice.h
@@ -242,8 +242,7 @@ struct PPS
 {
     uint32_t maxCuDQPDepth;
 
-    int      chromaCbQpOffset;       // use param
-    int      chromaCrQpOffset;       // use param
+    int      chromaQpOffset[2];      // use param
 
     bool     bUseWeightPred;         // use param
     bool     bUseWeightedBiPred;     // use param
@@ -334,6 +333,8 @@ public:
 
     void setRefPicList(PicList& picList);
 
+    const Frame* getRefPic(int list, int refIdx) const { return refIdx >= 0 ? m_refPicList[list][refIdx] : NULL; }
+
     bool getRapPicFlag() const
     {
         return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
diff --git a/source/common/threading.cpp b/source/common/threading.cpp
index cb50eb2..1d888ae 100644
--- a/source/common/threading.cpp
+++ b/source/common/threading.cpp
@@ -1,6 +1,4 @@
 /*****************************************************************************
- * x265: threading class and intrinsics
- *****************************************************************************
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
@@ -48,21 +46,21 @@ bool Thread::start()
 {
     DWORD threadId;
 
-    this->thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId);
+    thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId);
 
     return threadId > 0;
 }
 
 void Thread::stop()
 {
-    if (this->thread)
-        WaitForSingleObject(this->thread, INFINITE);
+    if (thread)
+        WaitForSingleObject(thread, INFINITE);
 }
 
 Thread::~Thread()
 {
-    if (this->thread)
-        CloseHandle(this->thread);
+    if (thread)
+        CloseHandle(thread);
 }
 
 #else /* POSIX / pthreads */
@@ -79,10 +77,9 @@ static void *ThreadShim(void *opaque)
 
 bool Thread::start()
 {
-    if (pthread_create(&this->thread, NULL, ThreadShim, this))
+    if (pthread_create(&thread, NULL, ThreadShim, this))
     {
-        this->thread = 0;
-
+        thread = 0;
         return false;
     }
 
@@ -91,8 +88,8 @@ bool Thread::start()
 
 void Thread::stop()
 {
-    if (this->thread)
-        pthread_join(this->thread, NULL);
+    if (thread)
+        pthread_join(thread, NULL);
 }
 
 Thread::~Thread() {}
@@ -101,6 +98,7 @@ Thread::~Thread() {}
 
 Thread::Thread()
 {
-    this->thread = 0;
+    thread = 0;
 }
+
 }
diff --git a/source/common/threading.h b/source/common/threading.h
index ef5642a..5f95782 100644
--- a/source/common/threading.h
+++ b/source/common/threading.h
@@ -1,6 +1,4 @@
 /*****************************************************************************
- * x265: threading class and intrinsics
- *****************************************************************************
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
@@ -49,11 +47,10 @@
 #include <sys/time.h>
 #include <unistd.h>
 
-#define CLZ32(id, x)                        id = (unsigned long)__builtin_clz(x) ^ 31
-#define CTZ64(id, x)                        id = (unsigned long)__builtin_ctzll(x)
-#define ATOMIC_OR(ptr, mask)                __sync_or_and_fetch(ptr, mask)
-#define ATOMIC_CAS(ptr, oldval, newval)     __sync_val_compare_and_swap(ptr, oldval, newval)
-#define ATOMIC_CAS32(ptr, oldval, newval)   __sync_val_compare_and_swap(ptr, oldval, newval)
+#define CLZ(id, x)                          id = (unsigned long)__builtin_clz(x) ^ 31
+#define CTZ(id, x)                          id = (unsigned long)__builtin_ctz(x)
+#define ATOMIC_OR(ptr, mask)                __sync_fetch_and_or(ptr, mask)
+#define ATOMIC_AND(ptr, mask)               __sync_fetch_and_and(ptr, mask)
 #define ATOMIC_INC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, 1)
 #define ATOMIC_DEC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, -1)
 #define GIVE_UP_TIME()                      usleep(0)
@@ -62,53 +59,12 @@
 
 #include <intrin.h>
 
-#if !_WIN64
-inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
-    uint32_t high32 = (uint32_t)(x64 >> 32);
-    uint32_t low32 = (uint32_t)x64;
-
-    if (high32)
-    {
-        _BitScanReverse(id, high32);
-        *id += 32;
-        return 1;
-    }
-    else if (low32)
-        return _BitScanReverse(id, low32);
-    else
-        return *id = 0;
-}
-
-inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
-    uint32_t high32 = (uint32_t)(x64 >> 32);
-    uint32_t low32 = (uint32_t)x64;
-
-    if (high32)
-    {
-        _BitScanForward(id, high32);
-        *id += 32;
-        return 1;
-    }
-    else if (low32)
-        return _BitScanForward(id, low32);
-    else
-        return *id = 0;
-}
-
-#endif // if !_WIN64
-
-#ifndef ATOMIC_OR
-#define ATOMIC_OR(ptr, mask)                InterlockedOr64((volatile LONG64*)ptr, mask)
-#endif
-
-#define CLZ32(id, x)                        _BitScanReverse(&id, x)
-#define CTZ64(id, x)                        _BitScanForward64(&id, x)
-#define ATOMIC_CAS(ptr, oldval, newval)     (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
-#define ATOMIC_CAS32(ptr, oldval, newval)   (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
+#define CLZ(id, x)                          _BitScanReverse(&id, x)
+#define CTZ(id, x)                          _BitScanForward(&id, x)
 #define ATOMIC_INC(ptr)                     InterlockedIncrement((volatile LONG*)ptr)
 #define ATOMIC_DEC(ptr)                     InterlockedDecrement((volatile LONG*)ptr)
+#define ATOMIC_OR(ptr, mask)                _InterlockedOr((volatile LONG*)ptr, (LONG)mask)
+#define ATOMIC_AND(ptr, mask)               _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
 #define GIVE_UP_TIME()                      Sleep(0)
 
 #endif // ifdef __GNUC__
diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp
index 8a2ab9d..d32e55f 100644
--- a/source/common/threadpool.cpp
+++ b/source/common/threadpool.cpp
@@ -1,6 +1,4 @@
 /*****************************************************************************
- * x265: singleton thread pool and interface classes
- *****************************************************************************
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
@@ -87,7 +85,7 @@ private:
     int          m_numThreads;
     int          m_numSleepMapWords;
     PoolThread  *m_threads;
-    volatile uint64_t *m_sleepMap;
+    volatile uint32_t *m_sleepMap;
 
     /* Lock for write access to the provider lists.  Threads are
      * always allowed to read m_firstProvider and follow the
@@ -174,8 +172,8 @@ void PoolThread::threadMain()
 
 void ThreadPoolImpl::markThreadAsleep(int id)
 {
-    int word = id >> 6;
-    uint64_t bit = 1LL << (id & 63);
+    int word = id >> 5;
+    uint32_t bit = 1 << (id & 31);
 
     ATOMIC_OR(&m_sleepMap[word], bit);
 }
@@ -186,16 +184,16 @@ void ThreadPoolImpl::pokeIdleThread()
      * not give up until a thread is awakened or all of them are awake */
     for (int i = 0; i < m_numSleepMapWords; i++)
     {
-        uint64_t oldval = m_sleepMap[i];
+        uint32_t oldval = m_sleepMap[i];
         while (oldval)
         {
             unsigned long id;
-            CTZ64(id, oldval);
+            CTZ(id, oldval);
 
-            uint64_t newval = oldval & ~(1LL << id);
-            if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval)
+            uint32_t bit = 1 << id;
+            if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit)
             {
-                m_threads[(i << 6) | id].poke();
+                m_threads[i * 32 + id].poke();
                 return;
             }
 
@@ -249,8 +247,8 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
     , m_firstProvider(NULL)
     , m_lastProvider(NULL)
 {
-    m_numSleepMapWords = (numThreads + 63) >> 6;
-    m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
+    m_numSleepMapWords = (numThreads + 31) >> 5;
+    m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords);
 
     char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
     m_threads = reinterpret_cast<PoolThread*>(buffer);
@@ -259,9 +257,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
     if (m_threads && m_sleepMap)
     {
         for (int i = 0; i < m_numSleepMapWords; i++)
-        {
             m_sleepMap[i] = 0;
-        }
 
         m_ok = true;
         int i;
@@ -277,9 +273,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
         }
 
         if (m_ok)
-        {
             waitForAllIdle();
-        }
         else
         {
             // stop threads that did start up
@@ -300,12 +294,10 @@ void ThreadPoolImpl::waitForAllIdle()
     int id = 0;
     do
     {
-        int word = id >> 6;
-        uint64_t bit = 1LL << (id & 63);
+        int word = id >> 5;
+        uint32_t bit = 1 << (id & 31);
         if (m_sleepMap[word] & bit)
-        {
             id++;
-        }
         else
         {
             GIVE_UP_TIME();
@@ -338,9 +330,7 @@ ThreadPoolImpl::~ThreadPoolImpl()
     {
         // cleanup thread handles
         for (int i = 0; i < m_numThreads; i++)
-        {
             m_threads[i].~PoolThread();
-        }
 
         X265_FREE(reinterpret_cast<char*>(m_threads));
     }
diff --git a/source/common/threadpool.h b/source/common/threadpool.h
index 7616670..2c192ca 100644
--- a/source/common/threadpool.h
+++ b/source/common/threadpool.h
@@ -1,6 +1,4 @@
 /*****************************************************************************
- * x265: singleton thread pool and interface classes
- *****************************************************************************
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
diff --git a/source/common/vec/dct-sse3.cpp b/source/common/vec/dct-sse3.cpp
index c435b52..53333e6 100644
--- a/source/common/vec/dct-sse3.cpp
+++ b/source/common/vec/dct-sse3.cpp
@@ -52,30 +52,22 @@ ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
     {  83,  36,  83,  36, 83,  36, 83,  36 },
     {  36, -83,  36, -83, 36, -83, 36, -83 }
 };
-void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
 {
     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
     __m128i T00, T01, T02, T03, T04, T05, T06, T07;
 
     m128iAdd = _mm_set1_epi32(64);
 
-    T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
-    m128iS1 = _mm_packs_epi32(T00, T01);
-    T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
-    m128iS3 = _mm_packs_epi32(T00, T01);
+    m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
+    m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
     m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
     E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
     m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
     E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
 
-    T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
-    m128iS5 = _mm_packs_epi32(T00, T01);
-    T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
-    m128iS7 = _mm_packs_epi32(T00, T01);
+    m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
+    m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
     m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
     E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
     m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
@@ -107,12 +99,8 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
 
     /*    -------     */
 
-    T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
-    m128iS0 = _mm_packs_epi32(T00, T01);
-    T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
-    m128iS4 = _mm_packs_epi32(T00, T01);
+    m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
+    m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
     m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
     EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
     m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
@@ -123,12 +111,8 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
 
     /*    -------     */
 
-    T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
-    m128iS2 = _mm_packs_epi32(T00, T01);
-    T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
-    T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
-    m128iS6 = _mm_packs_epi32(T00, T01);
+    m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
+    m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
     m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
     E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
     m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
@@ -305,7 +289,7 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
     _mm_storeh_pi((__m64*)&dst[7 * stride +  4], _mm_castsi128_ps(T11));
 }
 
-void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
 {
     const __m128i c16_p87_p90   = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
     const __m128i c16_p70_p80   = _mm_set1_epi32(0x00460050);
@@ -367,71 +351,22 @@ void idct16(int32_t *src, int16_t *dst, intptr_t stride)
     for (int i = 0; i < 2; i++)
     {
         const int offset = (i << 3);
-        __m128i T00, T01;
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
-        in00[i]  = _mm_packs_epi32(T00, T01);                       // [07 06 05 04 03 02 01 00]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
-        in01[i]  = _mm_packs_epi32(T00, T01);                           // [17 16 15 14 13 12 11 10]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
-        in02[i]  = _mm_packs_epi32(T00, T01);                       // [27 26 25 24 23 22 21 20]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
-        in03[i]  = _mm_packs_epi32(T00, T01);                       // [37 36 35 34 33 32 31 30]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
-        in04[i]  = _mm_packs_epi32(T00, T01);                       // [47 46 45 44 43 42 41 40]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
-        in05[i]  = _mm_packs_epi32(T00, T01);                       // [57 56 55 54 53 52 51 50]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
-        in06[i]  = _mm_packs_epi32(T00, T01);                       // [67 66 65 64 63 62 61 60]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
-        in07[i]  = _mm_packs_epi32(T00, T01);                       // [77 76 75 74 73 72 71 70]
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
-        in08[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
-        in09[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
-        in10[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
-        in11[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
-        in12[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
-        in13[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
-        in14[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
-        in15[i]  = _mm_packs_epi32(T00, T01);
+        in00[i]  = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]
+        in01[i]  = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]
+        in02[i]  = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]
+        in03[i]  = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]
+        in04[i]  = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]
+        in05[i]  = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]
+        in06[i]  = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]
+        in07[i]  = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]
+        in08[i]  = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
+        in09[i]  = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
+        in10[i]  = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
+        in11[i]  = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
+        in12[i]  = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
+        in13[i]  = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
+        in14[i]  = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
+        in15[i]  = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
     }
 
     for (int pass = 0; pass < 2; pass++)
@@ -716,7 +651,7 @@ void idct16(int32_t *src, int16_t *dst, intptr_t stride)
     _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
 }
 
-void idct32(int32_t *src, int16_t *dst, intptr_t stride)
+void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
 {
     //Odd
     const __m128i c16_p90_p90   = _mm_set1_epi32(0x005A005A); //column 0
@@ -909,135 +844,38 @@ void idct32(int32_t *src, int16_t *dst, intptr_t stride)
     for (int i = 0; i < 4; i++)
     {
         const int offset = (i << 3);
-        __m128i T00, T01;
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
-        in00[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
-        in01[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
-        in02[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
-        in03[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
-        in04[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
-        in05[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
-        in06[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
-        in07[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
-        in08[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
-        in09[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
-        in10[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
-        in11[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
-        in12[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
-        in13[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
-        in14[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
-        in15[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
-        in16[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
-        in17[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
-        in18[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
-        in19[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
-        in20[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
-        in21[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
-        in22[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
-        in23[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
-        in24[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
-        in25[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
-        in26[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
-        in27[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
-        in28[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
-        in29[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
-        in30[i]  = _mm_packs_epi32(T00, T01);
-
-        T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
-        in31[i]  = _mm_packs_epi32(T00, T01);
+        in00[i]  = _mm_loadu_si128((const __m128i*)&src[0  * 32 + offset]);
+        in01[i]  = _mm_loadu_si128((const __m128i*)&src[1  * 32 + offset]);
+        in02[i]  = _mm_loadu_si128((const __m128i*)&src[2  * 32 + offset]);
+        in03[i]  = _mm_loadu_si128((const __m128i*)&src[3  * 32 + offset]);
+        in04[i]  = _mm_loadu_si128((const __m128i*)&src[4  * 32 + offset]);
+        in05[i]  = _mm_loadu_si128((const __m128i*)&src[5  * 32 + offset]);
+        in06[i]  = _mm_loadu_si128((const __m128i*)&src[6  * 32 + offset]);
+        in07[i]  = _mm_loadu_si128((const __m128i*)&src[7  * 32 + offset]);
+        in08[i]  = _mm_loadu_si128((const __m128i*)&src[8  * 32 + offset]);
+        in09[i]  = _mm_loadu_si128((const __m128i*)&src[9  * 32 + offset]);
+        in10[i]  = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
+        in11[i]  = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
+        in12[i]  = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
+        in13[i]  = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
+        in14[i]  = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
+        in15[i]  = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
+        in16[i]  = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
+        in17[i]  = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
+        in18[i]  = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
+        in19[i]  = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
+        in20[i]  = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
+        in21[i]  = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
+        in22[i]  = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
+        in23[i]  = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
+        in24[i]  = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
+        in25[i]  = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
+        in26[i]  = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
+        in27[i]  = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
+        in28[i]  = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
+        in29[i]  = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
+        in30[i]  = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
+        in31[i]  = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
     }
 
     for (int pass = 0; pass < 2; pass++)
diff --git a/source/common/vec/dct-sse41.cpp b/source/common/vec/dct-sse41.cpp
index aa52709..81a7889 100644
--- a/source/common/vec/dct-sse41.cpp
+++ b/source/common/vec/dct-sse41.cpp
@@ -36,7 +36,7 @@
 using namespace x265;
 
 namespace {
-void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
 {
     X265_CHECK(num <= 32 * 32, "dequant num too large\n");
 
@@ -66,11 +66,7 @@ void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32
             quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
 
             quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
-            sign = _mm_srai_epi16(quantCoef12, 15);
-            quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
-            _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
-            quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
-            _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
+            _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
         }
     }
     else
@@ -100,11 +96,7 @@ void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32
             quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
 
             quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
-            sign = _mm_srai_epi16(quantCoef12, 15);
-            quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
-            _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
-            quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
-            _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
+            _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
         }
     }
 }
diff --git a/source/common/vec/dct-ssse3.cpp b/source/common/vec/dct-ssse3.cpp
index bbb7858..251d500 100644
--- a/source/common/vec/dct-ssse3.cpp
+++ b/source/common/vec/dct-ssse3.cpp
@@ -100,7 +100,7 @@ ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) =
 #undef MAKE_COEF
 };
 
-void dct16(int16_t *src, int32_t *dst, intptr_t stride)
+void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
 {
     // Const
     __m128i c_4     = _mm_set1_epi32(4);
@@ -344,8 +344,10 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
         T41  = _mm_hsub_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
         T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
-        _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
+        T40  = _mm_packs_epi32(T40, T40);
+        T41  = _mm_packs_epi32(T41, T41);
+        _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
+        _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
@@ -366,7 +368,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);
+        T40  = _mm_packs_epi32(T40, T40);
+        _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
@@ -387,7 +390,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);
+        T40  = _mm_packs_epi32(T40, T40);
+        _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
@@ -408,7 +412,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);
+        T40  = _mm_packs_epi32(T40, T40);
+        _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
@@ -429,7 +434,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);
+        T40  = _mm_packs_epi32(T40, T40);
+        _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
@@ -450,7 +456,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);
+        T40  = _mm_packs_epi32(T40, T40);
+        _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
@@ -471,7 +478,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);
+        T40  = _mm_packs_epi32(T40, T40);
+        _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
 
 #define MAKE_ODD(tab, dstPos) \
     T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
@@ -493,7 +501,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
         \
     T40  = _mm_hadd_epi32(T30, T31); \
     T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
-    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);
+    T40  = _mm_packs_epi32(T40, T40); \
+    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
 
         MAKE_ODD(14,  1);
         MAKE_ODD(16,  3);
@@ -657,7 +666,7 @@ ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) =
 #undef MAKE_COEF16
 };
 
-void dct32(int16_t *src, int32_t *dst, intptr_t stride)
+void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
 {
     // Const
     __m128i c_8     = _mm_set1_epi32(8);
@@ -1050,7 +1059,8 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
     T60  = _mm_hadd_epi32(T60, T61); \
         \
     T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
-    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
+    T60  = _mm_packs_epi32(T60, T60); \
+    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
 
         MAKE_ODD(44, 44, 44, 44,  0);
         MAKE_ODD(45, 45, 45, 45, 16);
diff --git a/source/common/wavefront.cpp b/source/common/wavefront.cpp
index 17c44aa..533e768 100644
--- a/source/common/wavefront.cpp
+++ b/source/common/wavefront.cpp
@@ -33,14 +33,14 @@ bool WaveFront::init(int numRows)
 {
     m_numRows = numRows;
 
-    m_numWords = (numRows + 63) >> 6;
-    m_internalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+    m_numWords = (numRows + 31) >> 5;
+    m_internalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
     if (m_internalDependencyBitmap)
-        memset((void*)m_internalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+        memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
 
-    m_externalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+    m_externalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
     if (m_externalDependencyBitmap)
-        memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+        memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
 
     return m_internalDependencyBitmap && m_externalDependencyBitmap;
 }
@@ -53,58 +53,31 @@ WaveFront::~WaveFront()
 
 void WaveFront::clearEnabledRowMask()
 {
-    memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+    memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
 }
 
 void WaveFront::enqueueRow(int row)
 {
-    // thread safe
-    uint64_t bit = 1LL << (row & 63);
-
-    X265_CHECK(row < m_numRows, "invalid row\n");
-    ATOMIC_OR(&m_internalDependencyBitmap[row >> 6], bit);
+    uint32_t bit = 1 << (row & 31);
+    ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
     if (m_pool) m_pool->pokeIdleThread();
 }
 
 void WaveFront::enableRow(int row)
 {
-    // thread safe
-    uint64_t bit = 1LL << (row & 63);
-
-    X265_CHECK(row < m_numRows, "invalid row\n");
-    ATOMIC_OR(&m_externalDependencyBitmap[row >> 6], bit);
+    uint32_t bit = 1 << (row & 31);
+    ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
 }
 
 void WaveFront::enableAllRows()
 {
-    memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint64_t) * m_numWords);
-}
-
-bool WaveFront::checkHigherPriorityRow(int curRow)
-{
-    int fullwords = curRow >> 6;
-    uint64_t mask = (1LL << (curRow & 63)) - 1;
-
-    // Check full bitmap words before curRow
-    for (int i = 0; i < fullwords; i++)
-    {
-        if (m_internalDependencyBitmap[i] & m_externalDependencyBitmap[i])
-            return true;
-    }
-
-    // check the partially masked bitmap word of curRow
-    if (m_internalDependencyBitmap[fullwords] & m_externalDependencyBitmap[fullwords] & mask)
-        return true;
-    return false;
+    memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint32_t) * m_numWords);
 }
 
 bool WaveFront::dequeueRow(int row)
 {
-    uint64_t oldval, newval;
-
-    oldval = m_internalDependencyBitmap[row >> 6];
-    newval = oldval & ~(1LL << (row & 63));
-    return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval;
+    uint32_t bit = 1 << (row & 31);
+    return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
 }
 
 bool WaveFront::findJob(int threadId)
@@ -114,22 +87,21 @@ bool WaveFront::findJob(int threadId)
     // thread safe
     for (int w = 0; w < m_numWords; w++)
     {
-        uint64_t oldval = m_internalDependencyBitmap[w];
-        while (oldval & m_externalDependencyBitmap[w])
+        uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
+        while (oldval)
         {
-            uint64_t mask = oldval & m_externalDependencyBitmap[w];
-
-            CTZ64(id, mask);
+            CTZ(id, oldval);
 
-            uint64_t newval = oldval & ~(1LL << id);
-            if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval)
+            uint32_t bit = 1 << id;
+            if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
             {
-                // we cleared the bit, process row
-                processRow(w * 64 + id, threadId);
+                /* we cleared the bit, we get to process the row */
+                processRow(w * 32 + id, threadId);
                 return true;
             }
+
             // some other thread cleared the bit, try another bit
-            oldval = m_internalDependencyBitmap[w];
+            oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
         }
     }
 
diff --git a/source/common/wavefront.h b/source/common/wavefront.h
index a34b9a4..4692a41 100644
--- a/source/common/wavefront.h
+++ b/source/common/wavefront.h
@@ -43,8 +43,8 @@ private:
     // Dependencies are categorized as internal and external. Internal dependencies
     // are caused by neighbor block availability.  External dependencies are generally
     // reference frame reconstructed pixels being available.
-    uint64_t volatile *m_internalDependencyBitmap;
-    uint64_t volatile *m_externalDependencyBitmap;
+    uint32_t volatile *m_internalDependencyBitmap;
+    uint32_t volatile *m_externalDependencyBitmap;
 
     // number of words in the bitmap
     int m_numWords;
@@ -92,10 +92,6 @@ public:
     // Start or resume encode processing of this row, must be implemented by
     // derived classes.
     virtual void processRow(int row, int threadId) = 0;
-
-    // Returns true if a row above curRow is available for processing.  The processRow()
-    // method may call this function periodically and voluntarily exit
-    bool checkHigherPriorityRow(int curRow);
 };
 } // end namespace x265
 
diff --git a/source/common/winxp.h b/source/common/winxp.h
index b105804..0446265 100644
--- a/source/common/winxp.h
+++ b/source/common/winxp.h
@@ -56,30 +56,6 @@ void cond_destroy(ConditionVariable *cond);
 #define WakeAllConditionVariable    x265::cond_broadcast
 #define XP_CONDITION_VAR_FREE       x265::cond_destroy
 
-#if defined(_MSC_VER)
-
-/* Windows XP did not define atomic OR 64, but gcc has a good version, so
- * only use this workaround when targeting XP with MSVC */
-FORCEINLINE LONGLONG interlocked_OR64(__inout LONGLONG volatile *Destination,
-                                      __in    LONGLONG           Value)
-{
-    LONGLONG Old;
-
-    do
-    {
-        Old = *Destination;
-    }
-    while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old);
-
-    return Old;
-}
-
-#define ATOMIC_OR(ptr, mask) x265::interlocked_OR64((volatile LONG64*)ptr, mask)
-
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#pragma intrinsic(_InterlockedCompareExchange64)
-#endif
-#endif // defined(_MSC_VER)
 } // namespace x265
 
 #else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600)
diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp
index ec1607d..b81115b 100644
--- a/source/common/x86/asm-primitives.cpp
+++ b/source/common/x86/asm-primitives.cpp
@@ -1336,11 +1336,22 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
 
-        p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
-        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
-        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
-        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
-        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
 
         CHROMA_PIXELSUB_PS(_sse2);
         CHROMA_PIXELSUB_PS_422(_sse2);
@@ -1354,9 +1365,9 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         CHROMA_VERT_FILTERS_422(_sse2);
         CHROMA_VERT_FILTERS_444(_sse2);
         p.luma_p2s = x265_luma_p2s_sse2;
-        p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
-        p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
-        p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
+        p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
+        p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
+        p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
 
         p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
         p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
@@ -1376,6 +1387,9 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
 
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
+#if X86_64
+        p.idct[IDCT_8x8] = x265_idct8_sse2;
+#endif
         p.idct[IDST_4x4] = x265_idst4_sse2;
 
         LUMA_SS_FILTERS(_sse2);
@@ -1407,11 +1421,6 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.quant = x265_quant_sse4;
         p.nquant = x265_nquant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
-        p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
-        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
-        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
-        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
-        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
         p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
         p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
         p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
@@ -1428,7 +1437,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
     }
     if (cpuMask & X265_CPU_XOP)
     {
-        p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
+        p.frameInitLowres = x265_frame_init_lowres_core_xop;
         SA8D_INTER_FROM_BLOCK(xop);
         INIT7(satd, _xop);
         HEVC_SATD(xop);
@@ -1440,6 +1449,14 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
 #if X86_64
         p.dct[DCT_8x8] = x265_dct8_avx2;
         p.dct[DCT_16x16] = x265_dct16_avx2;
@@ -1448,7 +1465,6 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.idct[IDCT_8x8] = x265_idct8_avx2;
         p.idct[IDCT_16x16] = x265_idct16_avx2;
         p.idct[IDCT_32x32] = x265_idct32_avx2;
-
         p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
         p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
         p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
@@ -1500,7 +1516,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         INIT8(sad_x4, _mmx2);
         p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
         p.sa8d_inter[LUMA_4x4]  = x265_pixel_satd_4x4_mmx2;
-        p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
+        p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
 
         PIXEL_AVG(sse2);
         PIXEL_AVG_W4(mmx2);
@@ -1548,14 +1564,26 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
         p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
 
-        p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
+        p.frameInitLowres = x265_frame_init_lowres_core_sse2;
         SA8D_INTER_FROM_BLOCK(sse2);
 
-        p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
-        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
-        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
-        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
-        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
+
         p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
         p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
@@ -1565,18 +1593,19 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
         p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
+
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
+#if X86_64
+        p.idct[IDCT_8x8] = x265_idct8_sse2;
+#endif
         p.idct[IDST_4x4] = x265_idst4_sse2;
+
         p.planecopy_sp = x265_downShift_16_sse2;
-        p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
-        p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
-        p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
-        p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
-        p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3;
+        p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
         SA8D_INTER_FROM_BLOCK(ssse3);
         p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
         ASSGN_SSE(ssse3);
@@ -1601,9 +1630,9 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
 
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
         p.luma_p2s = x265_luma_p2s_ssse3;
-        p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
-        p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
-        p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
+        p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
+        p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
+        p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_ssse3; // for i444, chroma_p2s can use luma_p2s
 
         p.dct[DST_4x4] = x265_dst4_ssse3;
         p.idct[IDCT_8x8] = x265_idct8_ssse3;
@@ -1616,11 +1645,6 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
         CHROMA_ADDAVG_422(_sse4);
-        p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
-        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
-        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
-        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
-        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
 
         // TODO: check POPCNT flag!
         p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
@@ -1690,12 +1714,11 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         INTRA_ANG_SSE4(sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
-        p.copy_shr = x265_copy_shr_sse4;
-        p.denoiseDct = x265_denoise_dct_sse4;
+//        p.denoiseDct = x265_denoise_dct_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
-        p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
+        p.frameInitLowres = x265_frame_init_lowres_core_avx;
         HEVC_SATD(avx);
         SA8D_INTER_FROM_BLOCK(avx);
         ASSGN_SSE(avx);
@@ -1736,7 +1759,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
     }
     if (cpuMask & X265_CPU_XOP)
     {
-        p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
+        p.frameInitLowres = x265_frame_init_lowres_core_xop;
         SA8D_INTER_FROM_BLOCK(xop);
         INIT7(satd, _xop);
         INIT5_NAME(sse_pp, ssd, _xop);
@@ -1761,15 +1784,21 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
         p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
 
-        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
-        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
-        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
-        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
-        p.denoiseDct = x265_denoise_dct_avx2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
+
+//        p.denoiseDct = x265_denoise_dct_avx2;
         p.dct[DCT_4x4] = x265_dct4_avx2;
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
+
         p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
         p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
         p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
@@ -1785,6 +1814,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.weight_pp = x265_weight_pp_avx2;
 
 #if X86_64
+
         p.dct[DCT_8x8] = x265_dct8_avx2;
         p.dct[DCT_16x16] = x265_dct16_avx2;
         p.dct[DCT_32x32] = x265_dct32_avx2;
@@ -1797,8 +1827,83 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
         p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
         p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
         p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
+
+        p.luma_vpp[LUMA_12x16] = x265_interp_8tap_vert_pp_12x16_avx2;
+
+        p.luma_vpp[LUMA_16x4] = x265_interp_8tap_vert_pp_16x4_avx2;
+        p.luma_vpp[LUMA_16x8] = x265_interp_8tap_vert_pp_16x8_avx2;
+        p.luma_vpp[LUMA_16x12] = x265_interp_8tap_vert_pp_16x12_avx2;
+        p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
+        p.luma_vpp[LUMA_16x32] = x265_interp_8tap_vert_pp_16x32_avx2;
+        p.luma_vpp[LUMA_16x64] = x265_interp_8tap_vert_pp_16x64_avx2;
+
+        p.luma_vpp[LUMA_24x32] = x265_interp_8tap_vert_pp_24x32_avx2;
+
+        p.luma_vpp[LUMA_32x8] = x265_interp_8tap_vert_pp_32x8_avx2;
+        p.luma_vpp[LUMA_32x16] = x265_interp_8tap_vert_pp_32x16_avx2;
+        p.luma_vpp[LUMA_32x24] = x265_interp_8tap_vert_pp_32x24_avx2;
+        p.luma_vpp[LUMA_32x32] = x265_interp_8tap_vert_pp_32x32_avx2;
+        p.luma_vpp[LUMA_32x64] = x265_interp_8tap_vert_pp_32x64_avx2;
+
+        p.luma_vpp[LUMA_48x64] = x265_interp_8tap_vert_pp_48x64_avx2;
+
+        p.luma_vpp[LUMA_64x16] = x265_interp_8tap_vert_pp_64x16_avx2;
+        p.luma_vpp[LUMA_64x32] = x265_interp_8tap_vert_pp_64x32_avx2;
+        p.luma_vpp[LUMA_64x48] = x265_interp_8tap_vert_pp_64x48_avx2;
+        p.luma_vpp[LUMA_64x64] = x265_interp_8tap_vert_pp_64x64_avx2;
 #endif
         p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
+
+        p.luma_hpp[LUMA_8x4] = x265_interp_8tap_horiz_pp_8x4_avx2;
+        p.luma_hpp[LUMA_8x8] = x265_interp_8tap_horiz_pp_8x8_avx2;
+        p.luma_hpp[LUMA_8x16] = x265_interp_8tap_horiz_pp_8x16_avx2;
+        p.luma_hpp[LUMA_8x32] = x265_interp_8tap_horiz_pp_8x32_avx2;
+
+        p.luma_hpp[LUMA_16x4] = x265_interp_8tap_horiz_pp_16x4_avx2;
+        p.luma_hpp[LUMA_16x8] = x265_interp_8tap_horiz_pp_16x8_avx2;
+        p.luma_hpp[LUMA_16x12] = x265_interp_8tap_horiz_pp_16x12_avx2;
+        p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
+        p.luma_hpp[LUMA_16x32] = x265_interp_8tap_horiz_pp_16x32_avx2;
+        p.luma_hpp[LUMA_16x64] = x265_interp_8tap_horiz_pp_16x64_avx2;
+
+        p.luma_hpp[LUMA_32x8] = x265_interp_8tap_horiz_pp_32x8_avx2;
+        p.luma_hpp[LUMA_32x16] = x265_interp_8tap_horiz_pp_32x16_avx2;
+        p.luma_hpp[LUMA_32x24] = x265_interp_8tap_horiz_pp_32x24_avx2;
+        p.luma_hpp[LUMA_32x32] = x265_interp_8tap_horiz_pp_32x32_avx2;
+        p.luma_hpp[LUMA_32x64] = x265_interp_8tap_horiz_pp_32x64_avx2;
+
+        p.luma_hpp[LUMA_64x64] = x265_interp_8tap_horiz_pp_64x64_avx2;
+        p.luma_hpp[LUMA_64x48] = x265_interp_8tap_horiz_pp_64x48_avx2;
+        p.luma_hpp[LUMA_64x32] = x265_interp_8tap_horiz_pp_64x32_avx2;
+        p.luma_hpp[LUMA_64x16] = x265_interp_8tap_horiz_pp_64x16_avx2;
+
+        p.luma_hpp[LUMA_48x64] = x265_interp_8tap_horiz_pp_48x64_avx2;
+
+        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_8x8] = x265_interp_4tap_horiz_pp_8x8_avx2;
+        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_4x4] = x265_interp_4tap_horiz_pp_4x4_avx2;
+        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_32x32] = x265_interp_4tap_horiz_pp_32x32_avx2;
+        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_16x16] = x265_interp_4tap_horiz_pp_16x16_avx2;
+
+        p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
+
+        p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2;
+        p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
+        p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
+        p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
+
+        // color space i420
+        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
+        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_8x8] = x265_interp_4tap_vert_pp_8x8_avx2;
+
+        // color space i422
+        p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
+
+        p.luma_vps[LUMA_4x4] = x265_interp_8tap_vert_ps_4x4_avx2;
+
+#if X86_64
+        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;
+        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;
+#endif
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff --git a/source/common/x86/blockcopy8.asm b/source/common/x86/blockcopy8.asm
index e892157..f82ff79 100644
--- a/source/common/x86/blockcopy8.asm
+++ b/source/common/x86/blockcopy8.asm
@@ -41,7 +41,7 @@ cextern pb_128
 SECTION .text
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x4, 4, 7, 0
@@ -59,7 +59,7 @@ cglobal blockcopy_pp_2x4, 4, 7, 0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x8, 4, 7, 0
@@ -97,7 +97,7 @@ cglobal blockcopy_pp_2x8, 4, 7, 0
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x16, 4, 7, 0
@@ -115,7 +115,7 @@ cglobal blockcopy_pp_2x16, 4, 7, 0
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_4x2, 4, 6, 0
@@ -127,7 +127,7 @@ cglobal blockcopy_pp_4x2, 4, 6, 0
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_4x4, 4, 4, 4
@@ -145,7 +145,7 @@ cglobal blockcopy_pp_4x4, 4, 4, 4
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W4_H8 2
 INIT_XMM sse2
@@ -192,7 +192,7 @@ BLOCKCOPY_PP_W4_H8 4, 16
 BLOCKCOPY_PP_W4_H8 4, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_6x8, 4, 7, 8
@@ -257,7 +257,7 @@ cglobal blockcopy_pp_6x8, 4, 7, 8
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_6x16, 4, 7, 2
@@ -279,7 +279,7 @@ cglobal blockcopy_pp_6x16, 4, 7, 2
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x2, 4, 4, 2
@@ -291,7 +291,7 @@ cglobal blockcopy_pp_8x2, 4, 4, 2
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x4, 4, 4, 4
@@ -309,7 +309,7 @@ cglobal blockcopy_pp_8x4, 4, 4, 4
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x6, 4, 7, 6
@@ -333,7 +333,7 @@ cglobal blockcopy_pp_8x6, 4, 7, 6
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x12, 4, 5, 2
@@ -350,7 +350,7 @@ cglobal blockcopy_pp_8x12, 4, 5, 2
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W8_H8 2
 INIT_XMM sse2
@@ -397,7 +397,7 @@ BLOCKCOPY_PP_W8_H8 8, 32
 BLOCKCOPY_PP_W8_H8 8, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W12_H4 2
 INIT_XMM sse2
@@ -439,7 +439,7 @@ BLOCKCOPY_PP_W12_H4 12, 16
 BLOCKCOPY_PP_W12_H4 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W16_H4 2
 INIT_XMM sse2
@@ -471,7 +471,7 @@ BLOCKCOPY_PP_W16_H4 16, 4
 BLOCKCOPY_PP_W16_H4 16, 12
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W16_H8 2
 INIT_XMM sse2
@@ -519,7 +519,7 @@ BLOCKCOPY_PP_W16_H8 16, 64
 BLOCKCOPY_PP_W16_H8 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W24_H4 2
 INIT_XMM sse2
@@ -560,7 +560,7 @@ BLOCKCOPY_PP_W24_H4 24, 32
 BLOCKCOPY_PP_W24_H4 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W32_H4 2
 INIT_XMM sse2
@@ -684,7 +684,7 @@ cglobal blockcopy_pp_32x16, 4, 6, 6
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx
 cglobal blockcopy_pp_32x24, 4, 7, 6
@@ -722,7 +722,7 @@ mov    r6d, 24/8
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W32_H16_avx 2
 INIT_YMM avx
@@ -788,7 +788,7 @@ BLOCKCOPY_PP_W32_H16_avx 32, 48
 BLOCKCOPY_PP_W32_H16_avx 32, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W48_H2 2
 INIT_XMM sse2
@@ -836,7 +836,7 @@ cglobal blockcopy_pp_%1x%2, 4, 5, 6
 BLOCKCOPY_PP_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W64_H4 2
 INIT_XMM sse2
@@ -897,7 +897,7 @@ BLOCKCOPY_PP_W64_H4 64, 48
 BLOCKCOPY_PP_W64_H4 64, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal blockcopy_sp_2x4, 4, 5, 2
@@ -926,7 +926,7 @@ RET
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal blockcopy_sp_2x8, 4, 5, 2
@@ -974,11 +974,11 @@ pextrw     [r0 + r1], m0, 4
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W2_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
     add         r3,     r3
     mov         r6d,    %2/2
 .loop:
@@ -1003,10 +1003,10 @@ BLOCKCOPY_SP_W2_H2 2,  8
 BLOCKCOPY_SP_W2_H2 2, 16
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
 
 add        r3,        r3
 
@@ -1022,10 +1022,10 @@ movd       [r0 + r1], m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
 
 add        r3,     r3
 
@@ -1049,10 +1049,10 @@ movd       [r0 + r1],     m2
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
 
 add        r3,      r3
 
@@ -1092,11 +1092,11 @@ movd       [r0 + r1],     m6
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W4_H8 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov         r4d,    %2/8
 
@@ -1150,7 +1150,7 @@ BLOCKCOPY_SP_W4_H8 4, 16
 BLOCKCOPY_SP_W4_H8 4, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal blockcopy_sp_6x8, 4, 4, 2
@@ -1213,11 +1213,11 @@ cglobal blockcopy_sp_6x8, 4, 4, 2
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W6_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
     add         r3,     r3
     mov         r6d,    %2/2
 .loop:
@@ -1247,10 +1247,10 @@ BLOCKCOPY_SP_W6_H2 6,  8
 BLOCKCOPY_SP_W6_H2 6, 16
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
 
 add        r3,         r3
 
@@ -1265,10 +1265,10 @@ movhps     [r0 + r1],  m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
 
 add        r3,     r3
 
@@ -1290,10 +1290,10 @@ movhps     [r0 + r1],     m2
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
 
 add        r3,      r3
 
@@ -1322,10 +1322,10 @@ movhps     [r0 + r1],     m4
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
 
 add        r3,      r3
 
@@ -1361,11 +1361,11 @@ movhps     [r0 + r1],     m6
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W8_H4 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
     add         r3,     r3
     mov         r4d,    %2/4
 .loop:
@@ -1391,11 +1391,11 @@ cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
 BLOCKCOPY_SP_W8_H4 8, 12
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W8_H8 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov         r4d,    %2/8
 
@@ -1446,11 +1446,11 @@ BLOCKCOPY_SP_W8_H8 8, 32
 BLOCKCOPY_SP_W8_H8 8, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W12_H4 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/4
 
@@ -1503,11 +1503,11 @@ BLOCKCOPY_SP_W12_H4 12, 16
 BLOCKCOPY_SP_W12_H4 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W16_H4 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/4
 
@@ -1554,11 +1554,11 @@ BLOCKCOPY_SP_W16_H4 16, 64
 BLOCKCOPY_SP_W16_H4 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W24_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/2
 
@@ -1595,11 +1595,11 @@ BLOCKCOPY_SP_W24_H2 24, 32
 BLOCKCOPY_SP_W24_H2 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W32_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/2
 
@@ -1643,11 +1643,11 @@ BLOCKCOPY_SP_W32_H2 32, 64
 BLOCKCOPY_SP_W32_H2 32, 48
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W48_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
 
 mov             r4d,     %2
 
@@ -1681,11 +1681,11 @@ RET
 BLOCKCOPY_SP_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W64_H1 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,       %2
 
@@ -1726,10 +1726,10 @@ BLOCKCOPY_SP_W64_H1 64, 48
 BLOCKCOPY_SP_W64_H1 64, 64
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
 
 add        r1,            r1
 
@@ -1745,10 +1745,10 @@ movh       [r0 + r1],     m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
 
 add        r1,            r1
 
@@ -1774,11 +1774,11 @@ movu       [r0 + r1],     m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------
 %macro BLOCKFILL_S_W16_H8 2
 INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
 
 mov        r3d,           %2/8
 
@@ -1855,11 +1855,11 @@ movu       [r0 + r3], m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------
 %macro BLOCKFILL_S_W32_H4 2
 INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
 
 mov        r3d,           %2/4
 
@@ -1983,10 +1983,10 @@ movu       [r0 + r3 + 32], m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2013,10 +2013,10 @@ RET
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2065,10 +2065,10 @@ RET
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
     add         r1,         r1
     mov         r4d,        16/2
 .loop:
@@ -2086,10 +2086,10 @@ cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,         r1
 
@@ -2105,10 +2105,10 @@ RET
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2135,11 +2135,11 @@ RET
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W4_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
 
 add     r1,      r1
 mov    r4d,      %2/4
@@ -2180,11 +2180,11 @@ BLOCKCOPY_PS_W4_H4 4, 32
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W6_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
 
 add     r1,      r1
 mov    r4d,      %2/4
@@ -2227,10 +2227,10 @@ BLOCKCOPY_PS_W6_H4 6, 8
 BLOCKCOPY_PS_W6_H4 6, 16
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,         r1
 
@@ -2245,10 +2245,10 @@ movu       [r0 + r1],  m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2274,10 +2274,10 @@ movu       [r0 + r1],     m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2314,11 +2314,11 @@ movu       [r0 + r1],     m0
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W8_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
 
 add     r1,      r1
 mov    r4d,      %2/4
@@ -2361,11 +2361,11 @@ BLOCKCOPY_PS_W8_H4  8, 64
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W12_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2398,10 +2398,10 @@ BLOCKCOPY_PS_W12_H2 12, 16
 BLOCKCOPY_PS_W12_H2 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 pxor       m0,      m0
@@ -2436,11 +2436,11 @@ movu       [r0 + r1 + 16],     m1
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W16_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/4
@@ -2492,11 +2492,11 @@ BLOCKCOPY_PS_W16_H4 16, 64
 BLOCKCOPY_PS_W16_H4 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W24_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2537,11 +2537,11 @@ BLOCKCOPY_PS_W24_H2 24, 32
 BLOCKCOPY_PS_W24_H2 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W32_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2590,11 +2590,11 @@ BLOCKCOPY_PS_W32_H2 32, 64
 BLOCKCOPY_PS_W32_H2 32, 48
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W48_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2649,11 +2649,11 @@ RET
 BLOCKCOPY_PS_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W64_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2723,7 +2723,7 @@ BLOCKCOPY_PS_W64_H2 64, 48
 BLOCKCOPY_PS_W64_H2 64, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_2x4, 4, 6, 0
@@ -2746,7 +2746,7 @@ cglobal blockcopy_ss_2x4, 4, 6, 0
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_2x8, 4, 6, 0
@@ -2785,7 +2785,7 @@ cglobal blockcopy_ss_2x8, 4, 6, 0
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_2x16, 4, 7, 0
@@ -2805,7 +2805,7 @@ cglobal blockcopy_ss_2x16, 4, 7, 0
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_4x2, 4, 4, 2
@@ -2821,7 +2821,7 @@ cglobal blockcopy_ss_4x2, 4, 4, 2
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_4x4, 4, 4, 4
@@ -2841,7 +2841,7 @@ cglobal blockcopy_ss_4x4, 4, 4, 4
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W4_H8 2
 INIT_XMM sse2
@@ -2889,7 +2889,7 @@ BLOCKCOPY_SS_W4_H8 4, 16
 BLOCKCOPY_SS_W4_H8 4, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_6x8, 4, 4, 4
@@ -2944,7 +2944,7 @@ cglobal blockcopy_ss_6x8, 4, 4, 4
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_6x16, 4, 5, 4
@@ -2968,7 +2968,7 @@ cglobal blockcopy_ss_6x16, 4, 5, 4
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x2, 4, 4, 2
@@ -2984,7 +2984,7 @@ cglobal blockcopy_ss_8x2, 4, 4, 2
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x4, 4, 4, 4
@@ -3005,7 +3005,7 @@ cglobal blockcopy_ss_8x4, 4, 4, 4
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x6, 4, 4, 4
@@ -3034,7 +3034,7 @@ cglobal blockcopy_ss_8x6, 4, 4, 4
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x12, 4, 5, 2
@@ -3054,7 +3054,7 @@ cglobal blockcopy_ss_8x12, 4, 5, 2
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W8_H8 2
 INIT_XMM sse2
@@ -3105,7 +3105,7 @@ BLOCKCOPY_SS_W8_H8 8, 32
 BLOCKCOPY_SS_W8_H8 8, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W12_H4 2
 INIT_XMM sse2
@@ -3149,7 +3149,7 @@ BLOCKCOPY_SS_W12_H4 12, 16
 BLOCKCOPY_SS_W12_H4 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W16_H4 2
 INIT_XMM sse2
@@ -3192,7 +3192,7 @@ BLOCKCOPY_SS_W16_H4 16, 4
 BLOCKCOPY_SS_W16_H4 16, 12
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W16_H4_avx 2
 INIT_YMM avx
@@ -3229,7 +3229,7 @@ BLOCKCOPY_SS_W16_H4_avx 16, 32
 BLOCKCOPY_SS_W16_H4_avx 16, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W16_H8 2
 INIT_XMM sse2
@@ -3302,7 +3302,7 @@ BLOCKCOPY_SS_W16_H8 16, 64
 BLOCKCOPY_SS_W16_H8 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W24_H4 2
 INIT_XMM sse2
@@ -3354,7 +3354,7 @@ BLOCKCOPY_SS_W24_H4 24, 32
 BLOCKCOPY_SS_W24_H4 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W32_H4 2
 INIT_XMM sse2
@@ -3422,7 +3422,7 @@ BLOCKCOPY_SS_W32_H4 32, 64
 BLOCKCOPY_SS_W32_H4 32, 48
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W48_H2 2
 INIT_XMM sse2
@@ -3500,11 +3500,11 @@ RET
 BLOCKCOPY_SS_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W64_H4 2
 INIT_XMM sse2
-cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
     mov     r4d, %2/4
     add     r1, r1
     add     r3, r3
@@ -3606,11 +3606,11 @@ BLOCKCOPY_SS_W64_H4 64, 48
 BLOCKCOPY_SS_W64_H4 64, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W64_H4_avx 2
 INIT_YMM avx
-cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
     mov     r4d, %2/4
     add     r1, r1
     add     r3, r3
@@ -3669,229 +3669,83 @@ BLOCKCOPY_SS_W64_H4_avx 64, 32
 BLOCKCOPY_SS_W64_H4_avx 64, 48
 BLOCKCOPY_SS_W64_H4_avx 64, 64
 
-;-----------------------------------------------------------------------------
-; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride
-%define rnd     m2
-%define shift   m1
-
-    ; make shift
-    mov         r5d, r3m
-    movd        shift, r5d
-
-    ; make round
-    dec         r5
-    xor         r6, r6
-    bts         r6, r5
-    
-    movd        rnd, r6d
-    pshufd      rnd, rnd, 0
-
-    ; register alloc
-    ; r0 - dst
-    ; r1 - src
-    ; r2 - stride * 2 (short*)
-    ; r3 - lx
-    ; r4 - size
-    ; r5 - ly
-    ; r6 - diff
-    add         r2d, r2d
-
-    mov         r4d, r4m
-    mov         r5, r4
-    mov         r6, r2
-    sub         r6, r4
-    add         r6, r6
-
-    shr         r5, 1
-.loop_row:
-
-    mov         r3, r4
-    shr         r3, 2
-.loop_col:
-    ; row 0
-    movu        m0, [r1]
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0], m0
-
-    ; row 1
-    movu        m0, [r1 + r4 * 4]
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0 + r2], m0
-
-    ; move col pointer
-    add         r1, 16
-    add         r0, 8
-
-    dec         r3
-    jg          .loop_col
-
-    ; update pointer
-    lea         r1, [r1 + r4 * 4]
-    add         r0, r6
-
-    ; end of loop_row
-    dec         r5
-    jg         .loop_row
-    
-    RET
-
-
-;--------------------------------------------------------------------------------------
-; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
-;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
-%define shift       m1
-
-    ; make shift
-    mov             r5d,      r3m
-    movd            shift,    r5d
-
-    ; register alloc
-    ; r0 - dst
-    ; r1 - src
-    ; r2 - stride
-    ; r3 - shift
-    ; r4 - size
-
-    sub             r2d,      r4d
-    add             r2d,      r2d
-    mov             r5d,      r4d
-    shr             r4d,      2
-.loop_row:
-    mov             r6d,      r4d
-
-.loop_col:
-    pmovsxwd        m0,       [r1]
-    pslld           m0,       shift
-    movu            [r0],     m0
-
-    add             r1,       8
-    add             r0,       16
-
-    dec             r6d
-    jnz             .loop_col
-
-    add             r1,       r2
-    dec             r5d
-    jnz             .loop_row
-    RET
-
-
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_4, 3,3,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_4, 3, 4, 4
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw	    m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; m0 - shift
-    ; m1 - dword [offset]
-
-    ; Row 0
-    pmovsxwd        m2, [r1]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 0 * mmsize], m2
-
-    ; Row 1
-    pmovsxwd        m2, [r1 + r2]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 1 * mmsize], m2
+    ; m1 - word [-round]
 
-    ; Row 2
+    ; Row 0-3
+    movh            m2, [r1]
+    movhps          m2, [r1 + r2]
     lea             r1, [r1 + r2 * 2]
-    pmovsxwd        m2, [r1]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 2 * mmsize], m2
-
-    ; Row 3
-    pmovsxwd        m2, [r1 + r2]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 3 * mmsize], m2
+    movh            m3, [r1]
+    movhps          m3, [r1 + r2]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
     RET
 
 
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_8, 3,5,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_8, 3, 5, 4
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw	    m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
     mov             r3d, 8/4
     lea             r4, [r2 * 3]
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; r3 - loop counter
     ; r4 - stride * 3
     ; m0 - shift
-    ; m1 - dword [offset]
+    ; m1 - word [-round]
 
 .loop:
-    ; Row 0
-    pmovsxwd        m2, [r1]
-    pmovsxwd        m3, [r1 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 0 * mmsize], m2
-    movu            [r0 + 1 * mmsize], m3
+    ; Row 0-1
+    mova            m2, [r1]
+    mova            m3, [r1 + r2]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
 
-    ; Row 1
-    pmovsxwd        m2, [r1 + r2]
-    pmovsxwd        m3, [r1 + r2 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 2 * mmsize], m2
-    movu            [r0 + 3 * mmsize], m3
-
-    ; Row 2
-    pmovsxwd        m2, [r1 + r2 * 2]
-    pmovsxwd        m3, [r1 + r2 * 2 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 4 * mmsize], m2
-    movu            [r0 + 5 * mmsize], m3
-
-    ; Row 3
-    pmovsxwd        m2, [r1 + r4]
-    pmovsxwd        m3, [r1 + r4 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 6 * mmsize], m2
-    movu            [r0 + 7 * mmsize], m3
-
-    add             r0, 8 * mmsize
+    ; Row 2-3
+    mova            m2, [r1 + r2 * 2]
+    mova            m3, [r1 + r4]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
     lea             r1, [r1 + r2 * 4]
     dec             r3d
     jnz            .loop
@@ -3899,62 +3753,47 @@ cglobal cvt16to32_shr_8, 3,5,3
 
 
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_16, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_16, 3, 4, 4
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw	    m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
     mov             r3d, 16/2
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; r3 - loop counter
     ; m0 - shift
-    ; m1 - dword [offset]
+    ; m1 - word [-round]
 
 .loop:
     ; Row 0
-    pmovsxwd        m2, [r1 + 0 * mmsize/2]
-    pmovsxwd        m3, [r1 + 1 * mmsize/2]
-    pmovsxwd        m4, [r1 + 2 * mmsize/2]
-    pmovsxwd        m5, [r1 + 3 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 0 * mmsize], m2
-    movu            [r0 + 1 * mmsize], m3
-    movu            [r0 + 2 * mmsize], m4
-    movu            [r0 + 3 * mmsize], m5
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
 
     ; Row 1
-    pmovsxwd        m2, [r1 + r2 + 0 * mmsize/2]
-    pmovsxwd        m3, [r1 + r2 +1 * mmsize/2]
-    pmovsxwd        m4, [r1 + r2 +2 * mmsize/2]
-    pmovsxwd        m5, [r1 + r2 +3 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 4 * mmsize], m2
-    movu            [r0 + 5 * mmsize], m3
-    movu            [r0 + 6 * mmsize], m4
-    movu            [r0 + 7 * mmsize], m5
-
-    add             r0, 8 * mmsize
+    mova            m2, [r1 + r2 + 0 * mmsize]
+    mova            m3, [r1 + r2 + 1 * mmsize]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
     lea             r1, [r1 + r2 * 2]
     dec             r3d
     jnz            .loop
@@ -3962,61 +3801,45 @@ cglobal cvt16to32_shr_16, 3,4,6
 
 
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_32, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_32, 3, 4, 6
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw	    m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
     mov             r3d, 32/1
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; r3 - loop counter
     ; m0 - shift
-    ; m1 - dword [offset]
+    ; m1 - word [-round]
 
 .loop:
     ; Row 0
-    pmovsxwd        m2, [r1 + 0 * mmsize/2]
-    pmovsxwd        m3, [r1 + 1 * mmsize/2]
-    pmovsxwd        m4, [r1 + 2 * mmsize/2]
-    pmovsxwd        m5, [r1 + 3 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 0 * mmsize], m2
-    movu            [r0 + 1 * mmsize], m3
-    movu            [r0 + 2 * mmsize], m4
-    movu            [r0 + 3 * mmsize], m5
-
-    pmovsxwd        m2, [r1 + 4 * mmsize/2]
-    pmovsxwd        m3, [r1 + 5 * mmsize/2]
-    pmovsxwd        m4, [r1 + 6 * mmsize/2]
-    pmovsxwd        m5, [r1 + 7 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 4 * mmsize], m2
-    movu            [r0 + 5 * mmsize], m3
-    movu            [r0 + 6 * mmsize], m4
-    movu            [r0 + 7 * mmsize], m5
-
-    add             r0, 8 * mmsize
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    mova            m4, [r1 + 2 * mmsize]
+    mova            m5, [r1 + 3 * mmsize]
+    psubw           m2, m1
+    psubw           m3, m1
+    psubw           m4, m1
+    psubw           m5, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    psraw           m4, m0
+    psraw           m5, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+    mova            [r0 + 2 * mmsize], m4
+    mova            [r0 + 3 * mmsize], m5
+
+    add             r0, 4 * mmsize
     add             r1, r2
     dec             r3d
     jnz            .loop
@@ -4024,172 +3847,150 @@ cglobal cvt16to32_shr_32, 3,4,6
 
 
 ;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal cvt32to16_shl_4, 3,3,5
+cglobal cpy1Dto2D_shl_4, 3, 3, 3
     add         r2d, r2d
     movd        m0, r3m
 
     ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
     psllw       m1, m0
-    psllw       m3, m0
+    psllw       m2, m0
     movh        [r0], m1
     movhps      [r0 + r2], m1
-    movh        [r0 + r2 * 2], m3
+    movh        [r0 + r2 * 2], m2
     lea         r2, [r2 * 3]
-    movhps      [r0 + r2], m3
+    movhps      [r0 + r2], m2
     RET
 
 
 INIT_YMM avx2
-cglobal cvt32to16_shl_4, 3,3,3
+cglobal cpy1Dto2D_shl_4, 3, 3, 2
     add         r2d, r2d
     movd        xm0, r3m
 
     ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    packssdw    m1, m2
+    movu        m1, [r1]
     psllw       m1, xm0
     vextracti128 xm0, m1, 1
     movq        [r0], xm1
-    movq        [r0 + r2], xm0
+    movhps      [r0 + r2], xm1
     lea         r0, [r0 + r2 * 2]
-    movhps      [r0], xm1
+    movq        [r0], xm0
     movhps      [r0 + r2], xm0
     RET
 
 
 ;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal cvt32to16_shl_8, 3,5,5
+cglobal cpy1Dto2D_shl_8, 3, 4, 5
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 8/4
-    lea         r4, [r2 * 3]
+    lea         r3, [r2 * 3]
 
-.loop:
-    ; Row 0-1
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    ; Row 0-3
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    mova        m3, [r1 + 2 * mmsize]
+    mova        m4, [r1 + 3 * mmsize]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movu        [r0], m1
-    movu        [r0 + r2], m3
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + r2], m2
+    mova        [r0 + r2 * 2], m3
+    mova        [r0 + r3], m4
+    lea         r0, [r0 + r2 * 4]
 
-    ; Row 2-3
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    ; Row 4-7
+    mova        m1, [r1 + 4 * mmsize]
+    mova        m2, [r1 + 5 * mmsize]
+    mova        m3, [r1 + 6 * mmsize]
+    mova        m4, [r1 + 7 * mmsize]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movu        [r0 + r2 * 2], m1
-    movu        [r0 + r4], m3
-
-    add         r1, 8 * mmsize
-    lea         r0, [r0 + r2 * 4]
-    dec         r3d
-    jnz        .loop
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + r2], m2
+    mova        [r0 + r2 * 2], m3
+    mova        [r0 + r3], m4
     RET
 
 
 INIT_YMM avx2
-cglobal cvt32to16_shl_8, 3,4,3
+cglobal cpy1Dto2D_shl_8, 3, 4, 3
     add         r2d, r2d
     movd        xm0, r3m
     lea         r3, [r2 * 3]
 
-    ; Row 0-1
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0], xm1
-    vextracti128 [r0 + r2], m1, 1
-
-    ; Row 2-3
-    movu        xm1, [r1 + 2 * mmsize]
-    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
-    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0 + r2 * 2], xm1
-    vextracti128 [r0 + r3], m1, 1
-
-    add         r1, 4 * mmsize
-    lea         r0, [r0 + r2 * 4]
-
-    ; Row 4-5
+    ; Row 0-3
     movu        m1, [r1 + 0 * mmsize]
     movu        m2, [r1 + 1 * mmsize]
-    packssdw    m1, m2
-    vpermq      m1, m1, 11011000b
     psllw       m1, xm0
+    psllw       m2, xm0
     movu        [r0], xm1
     vextracti128 [r0 + r2], m1, 1
+    movu        [r0 + r2 * 2], xm2
+    vextracti128 [r0 + r3], m2, 1
 
-    ; Row 6-7
+    ; Row 4-7
     movu        m1, [r1 + 2 * mmsize]
     movu        m2, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    vpermq      m1, m1, 11011000b
+    lea         r0, [r0 + r2 * 4]
     psllw       m1, xm0
-    movu        [r0 + r2 * 2], xm1
-    vextracti128 [r0 + r3], m1, 1
+    psllw       m2, xm0
+    movu        [r0], xm1
+    vextracti128 [r0 + r2], m1, 1
+    movu        [r0 + r2 * 2], xm2
+    vextracti128 [r0 + r3], m2, 1
     RET
 
+
 ;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal cvt32to16_shl_16, 3,4,5
+cglobal cpy1Dto2D_shl_16, 3, 4, 5
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 16/2
+    mov         r3d, 16/4
 
 .loop:
-    ; Row 0
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    ; Row 0-1
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    mova        m3, [r1 + 2 * mmsize]
+    mova        m4, [r1 + 3 * mmsize]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movu        [r0], m1
-    movu        [r0 + mmsize], m3
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + 16], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 + 16], m4
 
-    ; Row 1
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    ; Row 2-3
+    mova        m1, [r1 + 4 * mmsize]
+    mova        m2, [r1 + 5 * mmsize]
+    mova        m3, [r1 + 6 * mmsize]
+    mova        m4, [r1 + 7 * mmsize]
+    lea         r0, [r0 + r2 * 2]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movu        [r0 + r2], m1
-    movu        [r0 + r2 + mmsize], m3
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + 16], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 + 16], m4
 
     add         r1, 8 * mmsize
     lea         r0, [r0 + r2 * 2]
@@ -4199,49 +4000,28 @@ cglobal cvt32to16_shl_16, 3,4,5
 
 
 INIT_YMM avx2
-cglobal cvt32to16_shl_16, 3,5,3
+cglobal cpy1Dto2D_shl_16, 3, 5, 3
     add         r2d, r2d
     movd        xm0, r3m
     mov         r3d, 16/4
     lea         r4, [r2 * 3]
 
 .loop:
-    ; Row 0
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
+    ; Row 0-1
+    movu        m1, [r1 + 0 * mmsize]
+    movu        m2, [r1 + 1 * mmsize]
     psllw       m1, xm0
+    psllw       m2, xm0
     movu        [r0], m1
+    movu        [r0 + r2], m2
 
-    ; Row 1
-    movu        xm1, [r1 + 2 * mmsize]
-    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
-    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0 + r2], m1
-
-    add         r1, 4 * mmsize
-
-    ; Row 2
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0 + r2 * 2], m1
-
-    ; Row 3
+    ; Row 2-3
     movu        m1, [r1 + 2 * mmsize]
     movu        m2, [r1 + 3 * mmsize]
-    packssdw    m1, m2
     psllw       m1, xm0
-    vpermq      m1, m1, 11011000b
-    movu        [r0 + r4], m1
+    psllw       m2, xm0
+    movu        [r0 + r2 * 2], m1
+    movu        [r0 + r4], m2
 
     add         r1, 4 * mmsize
     lea         r0, [r0 + r2 * 4]
@@ -4251,84 +4031,70 @@ cglobal cvt32to16_shl_16, 3,5,3
 
 
 ;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal cvt32to16_shl_32, 3,4,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 32/1
+    mov         r3d, 32/2
 
 .loop:
     ; Row 0
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    mova        m3, [r1 + 2 * mmsize]
+    mova        m4, [r1 + 3 * mmsize]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movu        [r0 + 0 * mmsize], m1
-    movu        [r0 + 1 * mmsize], m3
-
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    psllw       m4, m0
+    mova        [r0 + 0 * mmsize], m1
+    mova        [r0 + 1 * mmsize], m2
+    mova        [r0 + 2 * mmsize], m3
+    mova        [r0 + 3 * mmsize], m4
+
+    ; Row 1
+    mova        m1, [r1 + 4 * mmsize]
+    mova        m2, [r1 + 5 * mmsize]
+    mova        m3, [r1 + 6 * mmsize]
+    mova        m4, [r1 + 7 * mmsize]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movu        [r0 + 2 * mmsize], m1
-    movu        [r0 + 3 * mmsize], m3
+    psllw       m4, m0
+    mova        [r0 + r2 + 0 * mmsize], m1
+    mova        [r0 + r2 + 1 * mmsize], m2
+    mova        [r0 + r2 + 2 * mmsize], m3
+    mova        [r0 + r2 + 3 * mmsize], m4
 
     add         r1, 8 * mmsize
-    add         r0, r2
+    lea         r0, [r0 + r2 * 2]
     dec         r3d
     jnz        .loop
     RET
 
 
 INIT_YMM avx2
-cglobal cvt32to16_shl_32, 3,4,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
     add         r2d, r2d
     movd        xm0, r3m
     mov         r3d, 32/2
 
 .loop:
-    ; Row 0
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    movu        xm3, [r1 + 2 * mmsize]
-    vinserti128  m3, m3, [r1 + 3 * mmsize], 1
-    movu        xm4, [r1 + 2 * mmsize + mmsize/2]
-    vinserti128  m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, xm0
-    psllw       m3, xm0
-    movu        [r0], m1
-    movu        [r0 + mmsize], m3
-
-    add         r1, 4 * mmsize
-
-    ; Row 1
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+    ; Row 0-1
+    movu        m1, [r1 + 0 * mmsize]
+    movu        m2, [r1 + 1 * mmsize]
     movu        m3, [r1 + 2 * mmsize]
     movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
     psllw       m1, xm0
+    psllw       m2, xm0
     psllw       m3, xm0
-    vpermq      m3, m3, 11011000b
-    movu        [r0 + r2], m1
-    movu        [r0 + r2 + mmsize], m3
+    psllw       m4, xm0
+    movu        [r0], m1
+    movu        [r0 + mmsize], m2
+    movu        [r0 + r2], m3
+    movu        [r0 + r2 + mmsize], m4
 
     add         r1, 4 * mmsize
     lea         r0, [r0 + r2 * 2]
@@ -4338,7 +4104,7 @@ cglobal cvt32to16_shl_32, 3,4,5
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_4, 3,3,3
@@ -4377,7 +4143,7 @@ cglobal copy_cnt_4, 3,3,3
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_8, 3,3,6
@@ -4481,7 +4247,7 @@ cglobal copy_cnt_8, 3,4,5
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_16, 3,4,6
@@ -4592,7 +4358,7 @@ cglobal copy_cnt_16, 3, 5, 5
     RET
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_32, 3,4,6
@@ -4699,227 +4465,470 @@ cglobal copy_cnt_32, 3, 5, 5
     movd         eax, xm4
     RET
 
-;-----------------------------------------------------------------------------
-; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
 
-INIT_XMM sse4
-cglobal copy_shr, 4, 7, 4, dst, src, stride
-%define rnd     m2
-%define shift   m1
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_4, 4, 4, 4
+    add             r2d, r2d
+    movd            m0, r3d
+
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - srcStride
+    ; m0 - shift
 
-    ; make shift
-    mov         r5d, r3m
-    movd        shift, r5d
+    ; Row 0-3
+    movh            m2, [r1]
+    movhps          m2, [r1 + r2]
+    lea             r1, [r1 + r2 * 2]
+    movh            m3, [r1]
+    movhps          m3, [r1 + r2]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
 
-    ; make round
-    dec         r5
-    xor         r6, r6
-    bts         r6, r5
+    RET
 
-    movd        rnd, r6d
-    pshufd      rnd, rnd, 0
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_8, 4, 5, 4
+    add             r2d, r2d
+    movd            m0, r3d
+    mov             r3d, 8/4
+    lea             r4, [r2 * 3]
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride * 2 (short*)
-    ; r3 - lx
-    ; r4 - size
-    ; r5 - ly
-    ; r6 - diff
-    add         r2d, r2d
+    ; r2 - srcStride
+    ; r3 - loop counter
+    ; r4 - stride * 3
+    ; m0 - shift
+
+.loop:
+    ; Row 0, 1
+    mova            m2, [r1]
+    mova            m3, [r1 + r2]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+
+    ; Row 2, 3
+    mova            m2, [r1 + r2 * 2]
+    mova            m3, [r1 + r4]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
+    lea             r1, [r1 + r2 * 4]
+    dec             r3d
+    jnz            .loop
+    RET
 
-    mov         r4d, r4m
-    mov         r5, r4 ; size
-    mov         r6, r2 ; stride
-    sub         r6, r4
-    add         r6, r6
 
-    shr         r5, 1
-.loop_row:
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_16, 4, 4, 4
+    add             r2d, r2d
+    movd            m0, r3d
+    mov             r3d, 16/2
 
-    mov         r3, r4
-    shr         r3, 2
-.loop_col:
-    ; row 0
-    movh        m3, [r1]
-    pmovsxwd    m0, m3
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0], m0
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - srcStride
+    ; r3 - loop counter
+    ; m0 - shift
 
-    ; row 1
-    movh        m3, [r1 + r4 * 2]
-    pmovsxwd    m0, m3
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0 + r2], m0
+.loop:
+    ; Row 0
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
 
-    ; move col pointer
-    add         r1, 8
-    add         r0, 8
+    ; Row 1
+    mova            m2, [r1 + r2 + 0 * mmsize]
+    mova            m3, [r1 + r2 + 1 * mmsize]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
+    lea             r1, [r1 + r2 * 2]
+    dec             r3d
+    jnz            .loop
+    RET
 
-    dec         r3
-    jg          .loop_col
 
-    ; update pointer
-    lea         r1, [r1 + r4 * 2]
-    add         r0, r6
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_32, 4, 4, 6
+    add             r2d, r2d
+    movd            m0, r3d
+    mov             r3d, 32/1
 
-    ; end of loop_row
-    dec         r5
-    jg         .loop_row
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - srcStride
+    ; r3 - loop counter
+    ; m0 - shift
 
+.loop:
+    ; Row 0
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    mova            m4, [r1 + 2 * mmsize]
+    mova            m5, [r1 + 3 * mmsize]
+    psllw           m2, m0
+    psllw           m3, m0
+    psllw           m4, m0
+    psllw           m5, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+    mova            [r0 + 2 * mmsize], m4
+    mova            [r0 + 3 * mmsize], m5
+
+    add             r0, 4 * mmsize
+    add             r1, r2
+    dec             r3d
+    jnz            .loop
     RET
 
+
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_4, 3,3,3
+cglobal cpy1Dto2D_shr_4, 3, 3, 4
     add         r2d, r2d
     movd        m0, r3m
+    pcmpeqw	m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
 
     ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    movh        [r0], m1
-    movhps      [r0 + r2], m1
-    movh        [r0 + r2 * 2], m2
-    lea         r2, [r2 * 3]
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    movh        [r0], m2
     movhps      [r0 + r2], m2
+    movh        [r0 + r2 * 2], m3
+    lea         r2, [r2 * 3]
+    movhps      [r0 + r2], m3
     RET
 
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_4, 3, 3, 3
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw	m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+
+    ; Row 0-3
+    movu        m2, [r1]
+    psubw       m2, m1
+    psraw       m2, xm0
+    vextracti128 xm1, m2, 1
+    movq        [r0], xm2
+    movhps      [r0 + r2], xm2
+    lea         r0, [r0 + r2 * 2]
+    movq        [r0], xm1
+    movhps      [r0 + r2], xm1
+    RET
+
+
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_8, 3,4,5
+cglobal cpy1Dto2D_shr_8, 3, 4, 6
     add         r2d, r2d
     movd        m0, r3m
+    pcmpeqw	m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
+    lea         r3, [r2 * 3]
 
     ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0], m1
-    movu        [r0 + r2], m2
-    movu        [r0 + 2 * r2], m3
-    lea         r0, [r0 + 2 * r2]
-    movu        [r0 + r2], m4
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    mova        m4, [r1 + 2 * mmsize]
+    mova        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 * 2], m4
+    mova        [r0 + r3], m5
 
     ; Row 4-7
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0 + r2 * 2], m1
-    lea         r0, [r0 + 2 * r2]
-    movu        [r0 + r2], m2
-    movu        [r0 + 2 * r2], m3
-    lea         r0, [r0 + 2 * r2]
-    movu        [r0 + r2], m4
+    mova        m2, [r1 + 4 * mmsize]
+    mova        m3, [r1 + 5 * mmsize]
+    mova        m4, [r1 + 6 * mmsize]
+    mova        m5, [r1 + 7 * mmsize]
+    lea         r0, [r0 + r2 * 4]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 * 2], m4
+    mova        [r0 + r3], m5
+    RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_8, 3, 4, 4
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw	m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    lea         r3, [r2 * 3]
+
+    ; Row 0-3
+    movu        m2, [r1 + 0 * mmsize]
+    movu        m3, [r1 + 1 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0], xm2
+    vextracti128 [r0 + r2], m2, 1
+    movu        [r0 + r2 * 2], xm3
+    vextracti128 [r0 + r3], m3, 1
+
+    ; Row 4-7
+    movu        m2, [r1 + 2 * mmsize]
+    movu        m3, [r1 + 3 * mmsize]
+    lea         r0, [r0 + r2 * 4]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0], xm2
+    vextracti128 [r0 + r2], m2, 1
+    movu        [r0 + r2 * 2], xm3
+    vextracti128 [r0 + r3], m3, 1
     RET
 
+
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_16, 3,4,5
+cglobal cpy1Dto2D_shr_16, 3, 5, 6
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 256/64
+    pcmpeqw	m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
+    mov         r3d, 16/4
+    lea         r4, [r2 * 3]
 
 .loop:
-    ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0], m1
-    movu        [r0 + 16], m2
-    movu        [r0 + r2], m3
-    movu        [r0 + r2 + 16], m4
+    ; Row 0-1
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    mova        m4, [r1 + 2 * mmsize]
+    mova        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0], m2
+    mova        [r0 + mmsize], m3
+    mova        [r0 + r2], m4
+    mova        [r0 + r2 + mmsize], m5
 
-    ; Row 4-7
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0 + r2 * 2], m1
-    movu        [r0 + r2 * 2 + 16], m2
-    lea         r0, [r0 + r2 * 2]
-    movu        [r0 + r2], m3
-    movu        [r0 + r2 + 16], m4
+    ; Row 2-3
+    mova        m2, [r1 + 4 * mmsize]
+    mova        m3, [r1 + 5 * mmsize]
+    mova        m4, [r1 + 6 * mmsize]
+    mova        m5, [r1 + 7 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0 + r2 * 2], m2
+    mova        [r0 + r2 * 2 + mmsize], m3
+    mova        [r0 + r4], m4
+    mova        [r0 + r4 + mmsize], m5
 
     add         r1, 8 * mmsize
-    lea         r0, [r0 + r2 * 2]
+    lea         r0, [r0 + r2 * 4]
+    dec         r3d
+    jnz        .loop
+    RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw	m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    mov         r3d, 16/4
+    lea         r4, [r2 * 3]
+
+.loop:
+    ; Row 0-1
+    movu        m2, [r1 + 0 * mmsize]
+    movu        m3, [r1 + 1 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0], m2
+    movu        [r0 + r2], m3
+
+    ; Row 2-3
+    movu        m2, [r1 + 2 * mmsize]
+    movu        m3, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0 + r2 * 2], m2
+    movu        [r0 + r4], m3
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 4]
     dec         r3d
     jnz        .loop
     RET
 
+
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_32, 3,4,5
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 1024/64
+    pcmpeqw	m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
+    mov         r3d, 32/2
 
 .loop:
-    ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0], m1
-    movu        [r0 + 16], m2
-    movu        [r0 + 32], m3
-    movu        [r0 + 48], m4
+    ; Row 0
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    mova        m4, [r1 + 2 * mmsize]
+    mova        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0 + 0 * mmsize], m2
+    mova        [r0 + 1 * mmsize], m3
+    mova        [r0 + 2 * mmsize], m4
+    mova        [r0 + 3 * mmsize], m5
 
-    ; Row 4-7
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0 + r2], m1
-    movu        [r0 + r2 + 16], m2
-    movu        [r0 + r2 + 32], m3
-    movu        [r0 + r2 + 48], m4
+    ; Row 1
+    mova        m2, [r1 + 4 * mmsize]
+    mova        m3, [r1 + 5 * mmsize]
+    mova        m4, [r1 + 6 * mmsize]
+    mova        m5, [r1 + 7 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0 + r2 + 0 * mmsize], m2
+    mova        [r0 + r2 + 1 * mmsize], m3
+    mova        [r0 + r2 + 2 * mmsize], m4
+    mova        [r0 + r2 + 3 * mmsize], m5
 
     add         r1, 8 * mmsize
     lea         r0, [r0 + r2 * 2]
     dec         r3d
     jnz        .loop
     RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw	m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    mov         r3d, 32/2
+
+.loop:
+    ; Row 0-1
+    movu        m2, [r1 + 0 * mmsize]
+    movu        m3, [r1 + 1 * mmsize]
+    movu        m4, [r1 + 2 * mmsize]
+    movu        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    psraw       m4, xm0
+    psraw       m5, xm0
+    movu        [r0], m2
+    movu        [r0 + mmsize], m3
+    movu        [r0 + r2], m4
+    movu        [r0 + r2 + mmsize], m5
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 2]
+    dec         r3d
+    jnz        .loop
+    RET
diff --git a/source/common/x86/blockcopy8.h b/source/common/x86/blockcopy8.h
index 115e340..9fbbeea 100644
--- a/source/common/x86/blockcopy8.h
+++ b/source/common/x86/blockcopy8.h
@@ -24,48 +24,53 @@
 #ifndef X265_BLOCKCOPY8_H
 #define X265_BLOCKCOPY8_H
 
-void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
-void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_32_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_4_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int);
-void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_32_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t);
+void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
 
 #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
-    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
-    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); \
-    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
+    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \
+    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 
 #define SETUP_BLOCKCOPY_PS(W, H, cpu) \
-    void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride);
+    void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 
 #define SETUP_BLOCKCOPY_SP(W, H, cpu) \
-    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 
 #define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
-    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
-    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
+    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 
 #define BLOCKCOPY_COMMON(cpu) \
     SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
@@ -178,31 +183,31 @@ BLOCKCOPY_PS(_sse4);
 
 BLOCKCOPY_SP(_sse2);
 
-void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockcopy_ss_16x4_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x8_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x12_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x24_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x48_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-
-void x265_blockcopy_pp_32x8_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x16_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x24_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x32_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-
-void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+
+void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x24_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val);
 
 #undef BLOCKCOPY_COMMON
 #undef BLOCKCOPY_SS_PP
diff --git a/source/common/x86/dct8.asm b/source/common/x86/dct8.asm
index 5323a42..7e1ebbc 100644
--- a/source/common/x86/dct8.asm
+++ b/source/common/x86/dct8.asm
@@ -245,7 +245,7 @@ avx2_idct4_1:   dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64
 
 avx2_idct4_2:   dw 64, 64, 64, -64, 83, 36, 36, -83
 
-const idct4_shuf1,    times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+const idct4_shuf1,    times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
 
 idct4_shuf2:    times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
 
@@ -318,7 +318,7 @@ cextern pd_2048
 cextern pw_ppppmmmm
 
 ;------------------------------------------------------
-;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
+;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
 INIT_XMM sse2
 cglobal dct4, 3, 4, 8
@@ -384,28 +384,28 @@ cglobal dct4, 3, 4, 8
     paddd       m1, m3
     paddd       m1, m7
     psrad       m1, 8
-    movu        [r1 + 0 * 16], m1
 
-    pmaddwd     m1, m2, m5
+    pmaddwd     m4, m2, m5
     pmaddwd     m3, m0, m5
-    psubd       m1, m3
-    paddd       m1, m7
-    psrad       m1, 8
-    movu        [r1 + 1 * 16], m1
+    psubd       m4, m3
+    paddd       m4, m7
+    psrad       m4, 8
+    packssdw    m1, m4
+    movu        [r1 + 0 * 16], m1
 
     pmaddwd     m1, m2, m6
     pmaddwd     m3, m0, m6
     paddd       m1, m3
     paddd       m1, m7
     psrad       m1, 8
-    movu        [r1 + 2 * 16], m1
 
     pmaddwd     m2, [r3 + 3 * 16]
     pmaddwd     m0, [r3 + 3 * 16]
     psubd       m2, m0
     paddd       m2, m7
     psrad       m2, 8
-    movu        [r1 + 3 * 16], m2
+    packssdw    m1, m2
+    movu        [r1 + 1 * 16], m1
     RET
 
 ; DCT 4x4
@@ -470,14 +470,12 @@ cglobal dct4, 3, 4, 8, src, dst, srcStride
     paddd           m2, m7
     psrad           m2, 8
 
-    movu            [r1], xm3
-    movu            [r1 + mmsize/2], m2
-    vextracti128    [r1 + mmsize], m3, 1
-    vextracti128    [r1 + mmsize + mmsize/2], m2, 1
+    packssdw        m3, m2
+    movu            [r1], m3
     RET
 
 ;-------------------------------------------------------
-;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM sse2
 cglobal idct4, 3, 4, 7
@@ -497,11 +495,6 @@ cglobal idct4, 3, 4, 7
 
     movu        m0, [r0 + 0 * 16]
     movu        m1, [r0 + 1 * 16]
-    packssdw    m0, m1
-
-    movu        m1, [r0 + 2 * 16]
-    movu        m2, [r0 + 3 * 16]
-    packssdw    m1, m2
 
     punpcklwd   m2, m0, m1
     pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
@@ -572,7 +565,7 @@ cglobal idct4, 3, 4, 7
     RET
 
 ;------------------------------------------------------
-;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
 INIT_XMM ssse3
 %if ARCH_X86_64
@@ -638,33 +631,33 @@ cglobal dst4, 3, 4, 8
     phaddd      m0, m1
     paddd       m0, m5
     psrad       m0, 8
-    movu        [r1 + 0 * 16], m0
 
-    pmaddwd     m0, m2, coef1
+    pmaddwd     m4, m2, coef1
     pmaddwd     m1, m3, coef1
-    phaddd      m0, m1
-    paddd       m0, m5
-    psrad       m0, 8
-    movu        [r1 + 1 * 16], m0
+    phaddd      m4, m1
+    paddd       m4, m5
+    psrad       m4, 8
+    packssdw    m0, m4
+    movu        [r1 + 0 * 16], m0
 
     pmaddwd     m0, m2, coef2
     pmaddwd     m1, m3, coef2
     phaddd      m0, m1
     paddd       m0, m5
     psrad       m0, 8
-    movu        [r1 + 2 * 16], m0
 
     pmaddwd     m2, coef3
     pmaddwd     m3, coef3
     phaddd      m2, m3
     paddd       m2, m5
     psrad       m2, 8
-    movu        [r1 + 3 * 16], m2
+    packssdw    m0, m2
+    movu        [r1 + 1 * 16], m0
 
     RET
 
 ;-------------------------------------------------------
-;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
+;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM sse2
 cglobal idst4, 3, 4, 7
@@ -683,11 +676,6 @@ cglobal idst4, 3, 4, 7
 
     movu        m0, [r0 + 0 * 16]
     movu        m1, [r0 + 1 * 16]
-    packssdw    m0, m1
-
-    movu        m1, [r0 + 2 * 16]
-    movu        m2, [r0 + 3 * 16]
-    packssdw    m1, m2
 
     punpcklwd   m2, m0, m1                  ; m2 = m128iAC
     punpckhwd   m0, m1                      ; m0 = m128iBD
@@ -762,7 +750,7 @@ cglobal idst4, 3, 4, 7
 
 
 ;-------------------------------------------------------
-; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;-------------------------------------------------------
 INIT_XMM sse4
 cglobal dct8, 3,6,7,0-16*mmsize
@@ -935,10 +923,16 @@ cglobal dct8, 3,6,7,0-16*mmsize
     phsubd      m4, m2                  ; m4 = [Row6 Row4]
     paddd       m4, m6
     psrad       m4, 9
-    movh        [r1 + 0*2*mmsize], m3
-    movhps      [r1 + 2*2*mmsize], m3
-    movh        [r1 + 4*2*mmsize], m4
-    movhps      [r1 + 6*2*mmsize], m4
+
+    packssdw    m3, m3
+    movd        [r1 + 0*mmsize], m3
+    pshufd      m3, m3, 1
+    movd        [r1 + 2*mmsize], m3
+
+    packssdw    m4, m4
+    movd        [r1 + 4*mmsize], m4
+    pshufd      m4, m4, 1
+    movd        [r1 + 6*mmsize], m4
 
     ; odd
     pmulld      m2, m0, [r4 + 2*16]
@@ -950,8 +944,11 @@ cglobal dct8, 3,6,7,0-16*mmsize
     phaddd      m2, m4                  ; m2 = [Row3 Row1]
     paddd       m2, m6
     psrad       m2, 9
-    movh        [r1 + 1*2*mmsize], m2
-    movhps      [r1 + 3*2*mmsize], m2
+
+    packssdw    m2, m2
+    movd        [r1 + 1*mmsize], m2
+    pshufd      m2, m2, 1
+    movd        [r1 + 3*mmsize], m2
 
     pmulld      m2, m0, [r4 + 4*16]
     pmulld      m3, m1, [r4 + 4*16]
@@ -962,10 +959,13 @@ cglobal dct8, 3,6,7,0-16*mmsize
     phaddd      m2, m4                  ; m2 = [Row7 Row5]
     paddd       m2, m6
     psrad       m2, 9
-    movh        [r1 + 5*2*mmsize], m2
-    movhps      [r1 + 7*2*mmsize], m2
 
-    add         r1, mmsize/2
+    packssdw    m2, m2
+    movd        [r1 + 5*mmsize], m2
+    pshufd      m2, m2, 1
+    movd        [r1 + 7*mmsize], m2
+
+    add         r1, mmsize/4
     add         r0, 2*2*mmsize
 %endrep
 
@@ -974,17 +974,392 @@ cglobal dct8, 3,6,7,0-16*mmsize
     RET
 
 ;-------------------------------------------------------
-; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
+;-------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse2
+%if BIT_DEPTH == 10
+    %define     IDCT_SHIFT 10
+    %define     IDCT_ADD pd_512
+%elif BIT_DEPTH == 8
+    %define     IDCT_SHIFT 12
+    %define     IDCT_ADD pd_2048
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+cglobal idct8, 3, 6, 16, 0-5*mmsize
+    mova        m9, [r0 + 1 * mmsize]
+    mova        m1, [r0 + 3 * mmsize]
+    mova        m7, m9
+    punpcklwd   m7, m1
+    punpckhwd   m9, m1
+    mova        m14, [tab_idct8_3]
+    mova        m3, m14
+    pmaddwd     m14, m7
+    pmaddwd     m3, m9
+    mova        m0, [r0 + 5 * mmsize]
+    mova        m10, [r0 + 7 * mmsize]
+    mova        m2, m0
+    punpcklwd   m2, m10
+    punpckhwd   m0, m10
+    mova        m15, [tab_idct8_3 + 1 * mmsize]
+    mova        m11, [tab_idct8_3 + 1 * mmsize]
+    pmaddwd     m15, m2
+    mova        m4, [tab_idct8_3 + 2 * mmsize]
+    pmaddwd     m11, m0
+    mova        m1, [tab_idct8_3 + 2 * mmsize]
+    paddd       m15, m14
+    mova        m5, [tab_idct8_3 + 4 * mmsize]
+    mova        m12, [tab_idct8_3 + 4 * mmsize]
+    paddd       m11, m3
+    mova        [rsp + 0 * mmsize], m11
+    mova        [rsp + 1 * mmsize], m15
+    pmaddwd     m4, m7
+    pmaddwd     m1, m9
+    mova        m14, [tab_idct8_3 + 3 * mmsize]
+    mova        m3, [tab_idct8_3 + 3 * mmsize]
+    pmaddwd     m14, m2
+    pmaddwd     m3, m0
+    paddd       m14, m4
+    paddd       m3, m1
+    mova        [rsp + 2 * mmsize], m3
+    pmaddwd     m5, m9
+    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
+    mova        m6, [tab_idct8_3 + 5 * mmsize]
+    pmaddwd     m12, m7
+    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
+    mova        m4, [tab_idct8_3 + 5 * mmsize]
+    pmaddwd     m6, m2
+    paddd       m6, m12
+    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
+    paddd       m7, m2
+    mova        [rsp + 3 * mmsize], m6
+    pmaddwd     m4, m0
+    pmaddwd     m0, [tab_idct8_3 + 7 * mmsize]
+    paddd       m9, m0
+    paddd       m5, m4
+    mova        m6, [r0 + 0 * mmsize]
+    mova        m0, [r0 + 4 * mmsize]
+    mova        m4, m6
+    punpcklwd   m4, m0
+    punpckhwd   m6, m0
+    mova        m12, [r0 + 2 * mmsize]
+    mova        m0, [r0 + 6 * mmsize]
+    mova        m13, m12
+    mova        m8, [tab_dct4]
+    punpcklwd   m13, m0
+    mova        m10, [tab_dct4]
+    punpckhwd   m12, m0
+    pmaddwd     m8, m4
+    mova        m3, m8
+    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
+    pmaddwd     m10, m6
+    mova        m2, [tab_dct4 + 1 * mmsize]
+    mova        m1, m10
+    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
+    mova        m0, [tab_dct4 + 1 * mmsize]
+    pmaddwd     m2, m13
+    paddd       m3, m2
+    psubd       m8, m2
+    mova        m2, m6
+    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
+    pmaddwd     m0, m12
+    paddd       m1, m0
+    psubd       m10, m0
+    mova        m0, m4
+    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
+    paddd       m3, [pd_64]
+    paddd       m1, [pd_64]
+    paddd       m8, [pd_64]
+    paddd       m10, [pd_64]
+    paddd       m0, m13
+    paddd       m2, m12
+    paddd       m0, [pd_64]
+    paddd       m2, [pd_64]
+    psubd       m4, m13
+    psubd       m6, m12
+    paddd       m4, [pd_64]
+    paddd       m6, [pd_64]
+    mova        m12, m8
+    psubd       m8, m7
+    psrad       m8, 7
+    paddd       m15, m3
+    psubd       m3, [rsp + 1 * mmsize]
+    psrad       m15, 7
+    paddd       m12, m7
+    psrad       m12, 7
+    paddd       m11, m1
+    mova        m13, m14
+    psrad       m11, 7
+    packssdw    m15, m11
+    psubd       m1, [rsp + 0 * mmsize]
+    psrad       m1, 7
+    mova        m11, [rsp + 2 * mmsize]
+    paddd       m14, m0
+    psrad       m14, 7
+    psubd       m0, m13
+    psrad       m0, 7
+    paddd       m11, m2
+    mova        m13, [rsp + 3 * mmsize]
+    psrad       m11, 7
+    packssdw    m14, m11
+    mova        m11, m6
+    psubd       m6, m5
+    paddd       m13, m4
+    psrad       m13, 7
+    psrad       m6, 7
+    paddd       m11, m5
+    psrad       m11, 7
+    packssdw    m13, m11
+    mova        m11, m10
+    psubd       m4, [rsp + 3 * mmsize]
+    psubd       m10, m9
+    psrad       m4, 7
+    psrad       m10, 7
+    packssdw    m4, m6
+    packssdw    m8, m10
+    paddd       m11, m9
+    psrad       m11, 7
+    packssdw    m12, m11
+    psubd       m2, [rsp + 2 * mmsize]
+    mova        m5, m15
+    psrad       m2, 7
+    packssdw    m0, m2
+    mova        m2, m14
+    psrad       m3, 7
+    packssdw    m3, m1
+    mova        m6, m13
+    punpcklwd   m5, m8
+    punpcklwd   m2, m4
+    mova        m1, m12
+    punpcklwd   m6, m0
+    punpcklwd   m1, m3
+    mova        m9, m5
+    punpckhwd   m13, m0
+    mova        m0, m2
+    punpcklwd   m9, m6
+    punpckhwd   m5, m6
+    punpcklwd   m0, m1
+    punpckhwd   m2, m1
+    punpckhwd   m15, m8
+    mova        m1, m5
+    punpckhwd   m14, m4
+    punpckhwd   m12, m3
+    mova        m6, m9
+    punpckhwd   m9, m0
+    punpcklwd   m1, m2
+    mova        m4, [tab_idct8_3 + 0 * mmsize]
+    punpckhwd   m5, m2
+    punpcklwd   m6, m0
+    mova        m2, m15
+    mova        m0, m14
+    mova        m7, m9
+    punpcklwd   m2, m13
+    punpcklwd   m0, m12
+    punpcklwd   m7, m5
+    punpckhwd   m14, m12
+    mova        m10, m2
+    punpckhwd   m15, m13
+    punpckhwd   m9, m5
+    pmaddwd     m4, m7
+    mova        m13, m1
+    punpckhwd   m2, m0
+    punpcklwd   m10, m0
+    mova        m0, m15
+    punpckhwd   m15, m14
+    mova        m12, m1
+    mova        m3, [tab_idct8_3 + 0 * mmsize]
+    punpcklwd   m0, m14
+    pmaddwd     m3, m9
+    mova        m11, m2
+    punpckhwd   m2, m15
+    punpcklwd   m11, m15
+    mova        m8, [tab_idct8_3 + 1 * mmsize]
+    punpcklwd   m13, m0
+    punpckhwd   m12, m0
+    pmaddwd     m8, m11
+    paddd       m8, m4
+    mova        [rsp + 4 * mmsize], m8
+    mova        m4, [tab_idct8_3 + 2 * mmsize]
+    pmaddwd     m4, m7
+    mova        m15, [tab_idct8_3 + 2 * mmsize]
+    mova        m5, [tab_idct8_3 + 1 * mmsize]
+    pmaddwd     m15, m9
+    pmaddwd     m5, m2
+    paddd       m5, m3
+    mova        [rsp + 3 * mmsize], m5
+    mova        m14, [tab_idct8_3 + 3 * mmsize]
+    mova        m5, [tab_idct8_3 + 3 * mmsize]
+    pmaddwd     m14, m11
+    paddd       m14, m4
+    mova        [rsp + 2 * mmsize], m14
+    pmaddwd     m5, m2
+    paddd       m5, m15
+    mova        [rsp + 1 * mmsize], m5
+    mova        m15, [tab_idct8_3 + 4 * mmsize]
+    mova        m5, [tab_idct8_3 + 4 * mmsize]
+    pmaddwd     m15, m7
+    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
+    pmaddwd     m5, m9
+    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
+    mova        m4, [tab_idct8_3 + 5 * mmsize]
+    pmaddwd     m4, m2
+    paddd       m5, m4
+    mova        m4, m6
+    mova        m8, [tab_idct8_3 + 5 * mmsize]
+    punpckhwd   m6, m10
+    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
+    punpcklwd   m4, m10
+    paddd       m9, m2
+    pmaddwd     m8, m11
+    mova        m10, [tab_dct4]
+    paddd       m8, m15
+    pmaddwd     m11, [tab_idct8_3 + 7 * mmsize]
+    paddd       m7, m11
+    mova        [rsp + 0 * mmsize], m8
+    pmaddwd     m10, m6
+    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
+    mova        m1, m10
+    mova        m8, [tab_dct4]
+    mova        m3, [tab_dct4 + 1 * mmsize]
+    pmaddwd     m8, m4
+    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
+    mova        m0, m8
+    mova        m2, [tab_dct4 + 1 * mmsize]
+    pmaddwd     m3, m13
+    psubd       m8, m3
+    paddd       m0, m3
+    mova        m3, m6
+    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
+    pmaddwd     m2, m12
+    paddd       m1, m2
+    psubd       m10, m2
+    mova        m2, m4
+    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
+    paddd       m0, [IDCT_ADD]
+    paddd       m1, [IDCT_ADD]
+    paddd       m8, [IDCT_ADD]
+    paddd       m10, [IDCT_ADD]
+    paddd       m2, m13
+    paddd       m3, m12
+    paddd       m2, [IDCT_ADD]
+    paddd       m3, [IDCT_ADD]
+    psubd       m4, m13
+    psubd       m6, m12
+    paddd       m4, [IDCT_ADD]
+    paddd       m6, [IDCT_ADD]
+    mova        m15, [rsp + 4 * mmsize]
+    mova        m12, m8
+    psubd       m8, m7
+    psrad       m8, IDCT_SHIFT
+    mova        m11, [rsp + 3 * mmsize]
+    paddd       m15, m0
+    psrad       m15, IDCT_SHIFT
+    psubd       m0, [rsp + 4 * mmsize]
+    psrad       m0, IDCT_SHIFT
+    paddd       m12, m7
+    paddd       m11, m1
+    mova        m14, [rsp + 2 * mmsize]
+    psrad       m11, IDCT_SHIFT
+    packssdw    m15, m11
+    psubd       m1, [rsp + 3 * mmsize]
+    psrad       m1, IDCT_SHIFT
+    mova        m11, [rsp + 1 * mmsize]
+    paddd       m14, m2
+    psrad       m14, IDCT_SHIFT
+    packssdw    m0, m1
+    psrad       m12, IDCT_SHIFT
+    psubd       m2, [rsp + 2 * mmsize]
+    paddd       m11, m3
+    mova        m13, [rsp + 0 * mmsize]
+    psrad       m11, IDCT_SHIFT
+    packssdw    m14, m11
+    mova        m11, m6
+    psubd       m6, m5
+    paddd       m13, m4
+    psrad       m13, IDCT_SHIFT
+    mova        m1, m15
+    paddd       m11, m5
+    psrad       m11, IDCT_SHIFT
+    packssdw    m13, m11
+    mova        m11, m10
+    psubd       m10, m9
+    psrad       m10, IDCT_SHIFT
+    packssdw    m8, m10
+    psrad       m6, IDCT_SHIFT
+    psubd       m4, [rsp + 0 * mmsize]
+    paddd       m11, m9
+    psrad       m11, IDCT_SHIFT
+    packssdw    m12, m11
+    punpcklwd   m1, m14
+    mova        m5, m13
+    psrad       m4, IDCT_SHIFT
+    packssdw    m4, m6
+    psubd       m3, [rsp + 1 * mmsize]
+    psrad       m2, IDCT_SHIFT
+    mova        m6, m8
+    psrad       m3, IDCT_SHIFT
+    punpcklwd   m5, m12
+    packssdw    m2, m3
+    punpcklwd   m6, m4
+    punpckhwd   m8, m4
+    mova        m4, m1
+    mova        m3, m2
+    punpckhdq   m1, m5
+    punpckldq   m4, m5
+    punpcklwd   m3, m0
+    punpckhwd   m2, m0
+    mova        m0, m6
+    lea         r2, [r2 + r2]
+    lea         r4, [r2 + r2]
+    lea         r3, [r4 + r2]
+    lea         r4, [r4 + r3]
+    lea         r0, [r4 + r2 * 2]
+    movq        [r1], m4
+    punpckhwd   m15, m14
+    movhps      [r1 + r2], m4
+    punpckhdq   m0, m3
+    movq        [r1 + r2 * 2], m1
+    punpckhwd   m13, m12
+    movhps      [r1 + r3], m1
+    mova        m1, m6
+    punpckldq   m1, m3
+    movq        [r1 + 8], m1
+    movhps      [r1 + r2 + 8], m1
+    movq        [r1 + r2 * 2 + 8], m0
+    movhps      [r1 + r3 + 8], m0
+    mova        m0, m15
+    punpckhdq   m15, m13
+    punpckldq   m0, m13
+    movq        [r1 + r2 * 4], m0
+    movhps      [r1 + r4], m0
+    mova        m0, m8
+    punpckhdq   m8, m2
+    movq        [r1 + r3 * 2], m15
+    punpckldq   m0, m2
+    movhps      [r1 + r0], m15
+    movq        [r1 + r2 * 4 + 8], m0
+    movhps      [r1 + r4 + 8], m0
+    movq        [r1 + r3 * 2 + 8], m8
+    movhps      [r1 + r0 + 8], m8
+    RET
+
+%undef IDCT_SHIFT
+%undef IDCT_ADD
+%endif
+
+;-------------------------------------------------------
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM ssse3
 
 cglobal patial_butterfly_inverse_internal_pass1
-    movu        m0, [r0]
-    movu        m1, [r0 + 4 * 32]
-    movu        m2, [r0 + 2 * 32]
-    movu        m3, [r0 + 6 * 32]
-    packssdw    m0, m2
-    packssdw    m1, m3
+    movh        m0, [r0]
+    movhps      m0, [r0 + 2 * 16]
+    movh        m1, [r0 + 4 * 16]
+    movhps      m1, [r0 + 6 * 16]
+
     punpckhwd   m2, m0, m1                  ; [2 6]
     punpcklwd   m0, m1                      ; [0 4]
     pmaddwd     m1, m0, [r6]                ; EE[0]
@@ -1004,12 +1379,10 @@ cglobal patial_butterfly_inverse_internal_pass1
     paddd       m3, m5
     paddd       m4, m5
 
-    movu        m2, [r0 + 32]
-    movu        m5, [r0 + 5 * 32]
-    packssdw    m2, m5
-    movu        m5, [r0 + 3 * 32]
-    movu        m6, [r0 + 7 * 32]
-    packssdw    m5, m6
+    movh        m2, [r0 + 16]
+    movhps      m2, [r0 + 5 * 16]
+    movh        m5, [r0 + 3 * 16]
+    movhps      m5, [r0 + 7 * 16]
     punpcklwd   m6, m2, m5                  ;[1 3]
     punpckhwd   m2, m5                      ;[5 7]
 
@@ -1136,7 +1509,7 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize
 
     call        patial_butterfly_inverse_internal_pass1
 
-    add         r0, 16
+    add         r0, 8
     add         r5, 8
 
     call        patial_butterfly_inverse_internal_pass1
@@ -1167,27 +1540,35 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize
 
 
 ;-----------------------------------------------------------------------------
-; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
+; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal denoise_dct, 4, 4, 6
     pxor     m5,  m5
-    shr      r3d, 2
+    shr      r3d, 3
 .loop:
     mova     m0, [r0]
-    pabsd    m1, m0
+    pabsw    m1, m0
+
     mova     m2, [r1]
-    paddd    m2, m1
+    pmovsxwd m3, m1
+    paddd    m2, m3
     mova     [r1], m2
-    pmovzxwd m3, [r2]
-    psubd    m1, m3
-    pcmpgtd  m4, m1, m5
+    mova     m2, [r1 + 16]
+    psrldq   m3, m1, 8
+    pmovsxwd m4, m3
+    paddd    m2, m4
+    mova     [r1 + 16], m2
+
+    movu     m3, [r2]
+    psubsw   m1, m3
+    pcmpgtw  m4, m1, m5
     pand     m1, m4
-    psignd   m1, m0
+    psignw   m1, m0
     mova     [r0], m1
     add      r0, 16
-    add      r1, 16
-    add      r2, 8
+    add      r1, 32
+    add      r2, 16
     dec      r3d
     jnz .loop
     RET
@@ -1195,25 +1576,32 @@ cglobal denoise_dct, 4, 4, 6
 INIT_YMM avx2
 cglobal denoise_dct, 4, 4, 6
     pxor     m5,  m5
-    shr      r3d, 3
+    shr      r3d, 4
 .loop:
     movu     m0, [r0]
-    pabsd    m1, m0
+    pabsw    m1, m0
     movu     m2, [r1]
-    paddd    m2, m1
+    pmovsxwd m4, xm1
+    paddd    m2, m4
     movu     [r1], m2
-    pmovzxwd m3, [r2]
-    psubd    m1, m3
-    pcmpgtd  m4, m1, m5
+    vextracti128 xm4, m1, 1
+    movu     m2, [r1 + 32]
+    pmovsxwd m3, xm4
+    paddd    m2, m3
+    movu     [r1 + 32], m2
+    movu     m3, [r2]
+    psubw    m1, m3
+    pcmpgtw  m4, m1, m5
     pand     m1, m4
-    psignd   m1, m0
+    psignw   m1, m0
     movu     [r0], m1
     add      r0, 32
-    add      r1, 32
-    add      r2, 16
+    add      r1, 64
+    add      r2, 32
     dec      r3d
     jnz .loop
     RET
+
 %if ARCH_X86_64 == 1
 %macro DCT8_PASS_1 4
     vpbroadcastq    m0,                 [r6 + %1]
@@ -1227,7 +1615,7 @@ cglobal denoise_dct, 4, 4, 6
     mova            [r5 + %2],          xm2
 %endmacro
 
-%macro DCT8_PASS_2 1
+%macro DCT8_PASS_2 2
     vbroadcasti128  m4,                 [r6 + %1]
     pmaddwd         m6,                 m0, m4
     pmaddwd         m7,                 m1, m4
@@ -1238,10 +1626,25 @@ cglobal denoise_dct, 4, 4, 6
     phaddd          m6,                 m8
     paddd           m6,                 m5
     psrad           m6,                 DCT_SHIFT2
+
+    vbroadcasti128  m4,                 [r6 + %2]
+    pmaddwd         m10,                m0, m4
+    pmaddwd         m7,                 m1, m4
+    pmaddwd         m8,                 m2, m4
+    pmaddwd         m9,                 m3, m4
+    phaddd          m10,                m7
+    phaddd          m8,                 m9
+    phaddd          m10,                m8
+    paddd           m10,                m5
+    psrad           m10,                DCT_SHIFT2
+
+    packssdw        m6,                 m10
+    vpermq          m10,                m6, 0xD8
+
 %endmacro
 
 INIT_YMM avx2
-cglobal dct8, 3, 7, 10, 0-8*16
+cglobal dct8, 3, 7, 11, 0-8*16
 %if BIT_DEPTH == 10
     %define         DCT_SHIFT          4
     vbroadcasti128  m5,                [pd_8]
@@ -1294,9 +1697,6 @@ cglobal dct8, 3, 7, 10, 0-8*16
     DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
 
     ;pass2
-    mov             r2d,               32
-    lea             r3,                [r2 * 3]
-    lea             r4,                [r1 + r2 * 4]
     vbroadcasti128  m5,                [pd_256]
 
     mova            m0,                [r5]
@@ -1304,22 +1704,14 @@ cglobal dct8, 3, 7, 10, 0-8*16
     mova            m2,                [r5 + 64]
     mova            m3,                [r5 + 96]
 
-    DCT8_PASS_2     0 * 16
-    movu            [r1],              m6
-    DCT8_PASS_2     1 * 16
-    movu            [r1 + r2],         m6
-    DCT8_PASS_2     2 * 16
-    movu            [r1 + r2 * 2],     m6
-    DCT8_PASS_2     3 * 16
-    movu            [r1 + r3],         m6
-    DCT8_PASS_2     4 * 16
-    movu            [r4],              m6
-    DCT8_PASS_2     5 * 16
-    movu            [r4 + r2],         m6
-    DCT8_PASS_2     6 * 16
-    movu            [r4 + r2 * 2],     m6
-    DCT8_PASS_2     7 * 16
-    movu            [r4 + r3],         m6
+    DCT8_PASS_2     0 * 16, 1 * 16
+    movu            [r1],              m10
+    DCT8_PASS_2     2 * 16, 3 * 16
+    movu            [r1 + 32],         m10
+    DCT8_PASS_2     4 * 16, 5 * 16
+    movu            [r1 + 64],         m10
+    DCT8_PASS_2     6 * 16, 7 * 16
+    movu            [r1 + 96],         m10
     RET
 
 %macro DCT16_PASS_1_E 2
@@ -1360,7 +1752,7 @@ cglobal dct8, 3, 7, 10, 0-8*16
     mova            [r5 + %2],         xm10
 %endmacro
 
-%macro DCT16_PASS_2 1
+%macro DCT16_PASS_2 2
     vbroadcasti128  m8,                [r7 + %1]
     vbroadcasti128  m13,               [r8 + %1]
 
@@ -1385,9 +1777,40 @@ cglobal dct8, 3, 7, 10, 0-8*16
     phaddd          m10,               m11
     paddd           m10,               m9
     psrad           m10,               DCT_SHIFT2
+
+
+    vbroadcasti128  m8,                [r7 + %2]
+    vbroadcasti128  m13,               [r8 + %2]
+
+    pmaddwd         m14,               m0, m8
+    pmaddwd         m11,               m1, m13
+    paddd           m14,               m11
+
+    pmaddwd         m11,               m2, m8
+    pmaddwd         m12,               m3, m13
+    paddd           m11,               m12
+    phaddd          m14,               m11
+
+    pmaddwd         m11,               m4, m8
+    pmaddwd         m12,               m5, m13
+    paddd           m11,               m12
+
+    pmaddwd         m12,               m6, m8
+    pmaddwd         m13,               m7, m13
+    paddd           m12,               m13
+    phaddd          m11,               m12
+
+    phaddd          m14,               m11
+    paddd           m14,               m9
+    psrad           m14,               DCT_SHIFT2
+
+    packssdw        m10,               m14
+    vextracti128    xm14,              m10,       1
+    movlhps         xm15,              xm10,      xm14
+    movhlps         xm14,              xm10
 %endmacro
 INIT_YMM avx2
-cglobal dct16, 3, 9, 15, 0-16*mmsize
+cglobal dct16, 3, 9, 16, 0-16*mmsize
 %if BIT_DEPTH == 10
     %define         DCT_SHIFT          5
     vbroadcasti128  m9,                [pd_16]
@@ -1487,7 +1910,7 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize
 
     mov             r5,                rsp
     mov             r4d,               2
-    mov             r2d,               64
+    mov             r2d,               32
     lea             r3,                [r2 * 3]
     vbroadcasti128  m9,                [pd_512]
 
@@ -1504,46 +1927,42 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize
     mova            m6,                [r5 + 3 * 32]        ; [row3lo  row7lo]
     mova            m7,                [r5 + 11 * 32]       ; [row3hi  row7hi]
 
-    DCT16_PASS_2    -8 * 16
-    movu            [r1],              m10
-    DCT16_PASS_2    -7 * 16
-    movu            [r1 + r2],         m10
-    DCT16_PASS_2    -6 * 16
-    movu            [r1 + r2 * 2],     m10
-    DCT16_PASS_2    -5 * 16
-    movu            [r1 + r3],         m10
+    DCT16_PASS_2    -8 * 16, -7 * 16
+    movu            [r1],              xm15
+    movu            [r1 + r2],         xm14
+
+    DCT16_PASS_2    -6 * 16, -5 * 16
+    movu            [r1 + r2 * 2],     xm15
+    movu            [r1 + r3],         xm14
 
     lea             r6,                [r1 + r2 * 4]
-    DCT16_PASS_2    -4 * 16
-    movu            [r6],              m10
-    DCT16_PASS_2    -3 * 16
-    movu            [r6 + r2],         m10
-    DCT16_PASS_2    -2 * 16
-    movu            [r6 + r2 * 2],     m10
-    DCT16_PASS_2    -1 * 16
-    movu            [r6 + r3],         m10
+    DCT16_PASS_2    -4 * 16, -3 * 16
+    movu            [r6],              xm15
+    movu            [r6 + r2],         xm14
+
+    DCT16_PASS_2    -2 * 16, -1 * 16
+    movu            [r6 + r2 * 2],     xm15
+    movu            [r6 + r3],         xm14
 
     lea             r6,                [r6 + r2 * 4]
-    DCT16_PASS_2    0 * 16
-    movu            [r6],              m10
-    DCT16_PASS_2    1 * 16
-    movu            [r6 + r2],         m10
-    DCT16_PASS_2    2 * 16
-    movu            [r6 + r2 * 2],     m10
-    DCT16_PASS_2    3 * 16
-    movu            [r6 + r3],         m10
+    DCT16_PASS_2    0 * 16, 1 * 16
+    movu            [r6],              xm15
+    movu            [r6 + r2],         xm14
+
+    DCT16_PASS_2    2 * 16, 3 * 16
+    movu            [r6 + r2 * 2],     xm15
+    movu            [r6 + r3],         xm14
 
     lea             r6,                [r6 + r2 * 4]
-    DCT16_PASS_2    4 * 16
-    movu            [r6],              m10
-    DCT16_PASS_2    5 * 16
-    movu            [r6 + r2],         m10
-    DCT16_PASS_2    6 * 16
-    movu            [r6 + r2 * 2],     m10
-    DCT16_PASS_2    7 * 16
-    movu            [r6 + r3],         m10
-
-    add             r1,                32
+    DCT16_PASS_2    4 * 16, 5 * 16
+    movu            [r6],              xm15
+    movu            [r6 + r2],         xm14
+
+    DCT16_PASS_2    6 * 16, 7 * 16
+    movu            [r6 + r2 * 2],     xm15
+    movu            [r6 + r3],         xm14
+
+    add             r1,                16
     add             r5,                128
 
     dec             r4d
@@ -1609,6 +2028,7 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize
 
     paddd           xm11,               xm9
     psrad           xm11,               DCT_SHIFT2
+    packssdw        xm11,               xm11
 
 %endmacro
 
@@ -1704,7 +2124,7 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize
     dec             r4d
     jnz             .pass1
 
-    mov             r2d,               128
+    mov             r2d,               64
     lea             r3,                [r2 * 3]
     mov             r5,                rsp
     mov             r4d,               8
@@ -1724,86 +2144,86 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize
     mova            m7,                [r5 + 3 * 64 + 32]
 
     DCT32_PASS_2    0 * 32
-    movu            [r1],              xm11
+    movq            [r1],              xm11
     DCT32_PASS_2    1 * 32
-    movu            [r1 + r2],         xm11
+    movq            [r1 + r2],         xm11
     DCT32_PASS_2    2 * 32
-    movu            [r1 + r2 * 2],     xm11
+    movq            [r1 + r2 * 2],     xm11
     DCT32_PASS_2    3 * 32
-    movu            [r1 + r3],         xm11
+    movq            [r1 + r3],         xm11
 
     lea             r6,                [r1 + r2 * 4]
     DCT32_PASS_2    4 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    5 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    6 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    7 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     lea             r6,                [r6 + r2 * 4]
     DCT32_PASS_2    8 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    9 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    10 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    11 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     lea             r6,                [r6 + r2 * 4]
     DCT32_PASS_2    12 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    13 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    14 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    15 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     lea             r6,                [r6 + r2 * 4]
     DCT32_PASS_2    16 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    17 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    18 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    19 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     lea             r6,                [r6 + r2 * 4]
     DCT32_PASS_2    20 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    21 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    22 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    23 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     lea             r6,                [r6 + r2 * 4]
     DCT32_PASS_2    24 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    25 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    26 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    27 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     lea             r6,                [r6 + r2 * 4]
     DCT32_PASS_2    28 * 32
-    movu            [r6],              xm11
+    movq            [r6],              xm11
     DCT32_PASS_2    29 * 32
-    movu            [r6 + r2],         xm11
+    movq            [r6 + r2],         xm11
     DCT32_PASS_2    30 * 32
-    movu            [r6 + r2 * 2],     xm11
+    movq            [r6 + r2 * 2],     xm11
     DCT32_PASS_2    31 * 32
-    movu            [r6 + r3],         xm11
+    movq            [r6 + r3],         xm11
 
     add             r5,                256
-    add             r1,                16
+    add             r1,                8
 
     dec             r4d
     jnz             .pass2
@@ -1926,28 +2346,25 @@ cglobal idct8, 3, 7, 13, 0-8*16
     lea             r6,                [avx2_idct8_2]
 
     ;pass1
-    mova            m0,                [r0 + 0 * 32]
-    mova            m1,                [r0 + 4 * 32]
-    packssdw        m0,                m1               ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
-    mova            m1,                [r0 + 2 * 32]
-    mova            m2,                [r0 + 6 * 32]
-    packssdw        m1,                m2               ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
-    mova            m2,                [r0 + 1 * 32]
-    mova            m3,                [r0 + 5 * 32]
-    packssdw        m2,                m3               ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
-    mova            m3,                [r0 + 3 * 32]
-    mova            m4,                [r0 + 7 * 32]
-    packssdw        m3,                m4               ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
+    mova            m1,                [r0 + 0 * 32]     ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
+    mova            m0,                [r0 + 1 * 32]     ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
+    vpunpcklwd      m5,      m1,       m0                ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+    vpunpckhwd      m1,      m0                          ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+    vinserti128     m4,      m5,       xm1,       1      ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
+    vextracti128    xm2,     m5,       1                 ; [1 3 1 3 1 3 1 3]
+    vinserti128     m1,      m1,       xm2,       0      ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
+
+    mova            m2,                [r0 + 2 * 32]     ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
+    mova            m0,                [r0 + 3 * 32]     ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
+    vpunpcklwd      m5,      m2,       m0                ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+    vpunpckhwd      m2,      m0                          ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+    vinserti128     m0,      m5,       xm2,       1     ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
+    vextracti128    xm5,     m5,       1                ; [5 7 5 7 5 7 5 7]
+    vinserti128     m2,      m2,       xm5,       0     ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
 
     mova            m5,                [idct8_shuf1]
-
-    punpcklwd       m4,                m0, m1           ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
-    punpckhwd       m0,                m1               ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
     vpermd          m4,                m5, m4
     vpermd          m0,                m5, m0
-
-    punpcklwd       m1,                m2, m3           ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
-    punpckhwd       m2,                m3               ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
     vpermd          m1,                m5, m1
     vpermd          m2,                m5, m2
 
@@ -2065,7 +2482,7 @@ cglobal idct8, 3, 7, 13, 0-8*16
 %endmacro
 
 ;-------------------------------------------------------
-; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_YMM avx2
 cglobal idct16, 3, 7, 16, 0-16*mmsize
@@ -2087,37 +2504,53 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize
     mov             r4d, 2
 
 .pass1:
-    movu            m0, [r0 +  0 * 64]
-    movu            m1, [r0 +  8 * 64]
-    packssdw        m0, m1                    ;[0L 8L 0H 8H]
-
-    movu            m1, [r0 +  1 * 64]
-    movu            m2, [r0 +  9 * 64]
-    packssdw        m1, m2                    ;[1L 9L 1H 9H]
-
-    movu            m2, [r0 +  2 * 64]
-    movu            m3, [r0 + 10 * 64]
-    packssdw        m2, m3                    ;[2L 10L 2H 10H]
-
-    movu            m3, [r0 +  3 * 64]
-    movu            m4, [r0 + 11 * 64]
-    packssdw        m3, m4                    ;[3L 11L 3H 11H]
-
-    movu            m4, [r0 +  4 * 64]
-    movu            m5, [r0 + 12 * 64]
-    packssdw        m4, m5                    ;[4L 12L 4H 12H]
-
-    movu            m5, [r0 +  5 * 64]
-    movu            m6, [r0 + 13 * 64]
-    packssdw        m5, m6                    ;[5L 13L 5H 13H]
-
-    movu            m6, [r0 +  6 * 64]
-    movu            m7, [r0 + 14 * 64]
-    packssdw        m6, m7                    ;[6L 14L 6H 14H]
-
-    movu            m7, [r0 +  7 * 64]
-    movu            m8, [r0 + 15 * 64]
-    packssdw        m7, m8                    ;[7L 15L 7H 15H]
+     movu            xm0, [r0 +  0 * 32]
+     movu            xm1, [r0 +  8 * 32]
+     punpckhqdq      xm2, xm0, xm1
+     punpcklqdq      xm0, xm1
+     vinserti128     m0, m0, xm2, 1
+
+     movu            xm1, [r0 +  1 * 32]
+     movu            xm2, [r0 +  9 * 32]
+     punpckhqdq      xm3, xm1, xm2
+     punpcklqdq      xm1, xm2
+     vinserti128     m1, m1, xm3, 1
+
+     movu            xm2, [r0 + 2  * 32]
+     movu            xm3, [r0 + 10 * 32]
+     punpckhqdq      xm4, xm2, xm3
+     punpcklqdq      xm2, xm3
+     vinserti128     m2, m2, xm4, 1
+
+     movu            xm3, [r0 + 3  * 32]
+     movu            xm4, [r0 + 11 * 32]
+     punpckhqdq      xm5, xm3, xm4
+     punpcklqdq      xm3, xm4
+     vinserti128     m3, m3, xm5, 1
+
+     movu            xm4, [r0 + 4  * 32]
+     movu            xm5, [r0 + 12 * 32]
+     punpckhqdq      xm6, xm4, xm5
+     punpcklqdq      xm4, xm5
+     vinserti128     m4, m4, xm6, 1
+
+     movu            xm5, [r0 + 5  * 32]
+     movu            xm6, [r0 + 13 * 32]
+     punpckhqdq      xm7, xm5, xm6
+     punpcklqdq      xm5, xm6
+     vinserti128     m5, m5, xm7, 1
+
+     movu            xm6, [r0 + 6  * 32]
+     movu            xm7, [r0 + 14 * 32]
+     punpckhqdq      xm8, xm6, xm7
+     punpcklqdq      xm6, xm7
+     vinserti128     m6, m6, xm8, 1
+
+     movu            xm7, [r0 + 7  * 32]
+     movu            xm8, [r0 + 15 * 32]
+     punpckhqdq      xm9, xm7, xm8
+     punpcklqdq      xm7, xm8
+     vinserti128     m7, m7, xm9, 1
 
     punpckhwd       m8, m0, m2                ;[8 10]
     punpcklwd       m0, m2                    ;[0 2]
@@ -2160,7 +2593,7 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize
     IDCT_PASS1      4, 10
     IDCT_PASS1      6, 8
 
-    add             r0, 32
+    add             r0, 16
     add             r3, 16
     dec             r4d
     jnz             .pass1
@@ -2328,7 +2761,7 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize
 %endmacro
 
 ;-------------------------------------------------------
-; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 
 ; TODO: Reduce PHADDD instruction by PADDD
@@ -2345,54 +2778,69 @@ cglobal idct32, 3, 6, 16, 0-32*64
     mov             r5d, 8
 
 .pass1:
-    movu            xm0,    [r0 +  2 * 128]
-    movu            xm1,    [r0 + 18 * 128]
-    vinserti128     m0, m0, [r0 +  0 * 128], 1
-    vinserti128     m1, m1, [r0 + 16 * 128], 1
-
-    packssdw        m0, m1                      ;[2 18 0 16]
-
-    movu            xm1,    [r0 +  1 * 128]
-    movu            xm2,    [r0 +  9 * 128]
-    vinserti128     m1, m1, [r0 + 17 * 128], 1
-    vinserti128     m2, m2, [r0 + 25 * 128], 1
-    packssdw        m1, m2                      ;[1 9 17 25]
-
-    movu            xm2,    [r0 +  6 * 128]
-    movu            xm3,    [r0 + 22 * 128]
-    vinserti128     m2, m2, [r0 +  4 * 128], 1
-    vinserti128     m3, m3, [r0 + 20 * 128], 1
-    packssdw        m2, m3                      ;[6 22 4 20]
-
-    movu            xm3,    [r0 +  3 * 128]
-    movu            xm4,    [r0 + 11 * 128]
-    vinserti128     m3, m3, [r0 + 19 * 128], 1
-    vinserti128     m4, m4, [r0 + 27 * 128], 1
-    packssdw        m3, m4                      ;[3 11 19 27]
-
-    movu            xm4,    [r0 + 10 * 128]
-    movu            xm5,    [r0 + 26 * 128]
-    vinserti128     m4, m4, [r0 +  8 * 128], 1
-    vinserti128     m5, m5, [r0 + 24 * 128], 1
-    packssdw        m4, m5                      ;[10 26 8 24]
-
-    movu            xm5,    [r0 +  5 * 128]
-    movu            xm6,    [r0 + 13 * 128]
-    vinserti128     m5, m5, [r0 + 21 * 128], 1
-    vinserti128     m6, m6, [r0 + 29 * 128], 1
-    packssdw        m5, m6                      ;[5 13 21 29]
-
-    movu            xm6,    [r0 + 14 * 128]
-    movu            xm7,    [r0 + 30 * 128]
-    vinserti128     m6, m6, [r0 + 12 * 128], 1
-    vinserti128     m7, m7, [r0 + 28 * 128], 1
-    packssdw        m6, m7                      ;[14 30 12 28]
-
-    movu            xm7,    [r0 +  7 * 128]
-    movu            xm8,    [r0 + 15 * 128]
-    vinserti128     m7, m7, [r0 + 23 * 128], 1
-    vinserti128     m8, m8, [r0 + 31 * 128], 1
-    packssdw        m7, m8                      ;[7 15 23 31]
+    movq            xm0,    [r0 +  2 * 64]
+    movq            xm1,    [r0 + 18 * 64]
+    punpcklqdq      xm0, xm0, xm1
+    movq            xm1,    [r0 +  0 * 64]
+    movq            xm2,    [r0 + 16 * 64]
+    punpcklqdq      xm1, xm1, xm2
+    vinserti128     m0,  m0,  xm1, 1             ;[2 18 0 16]
+
+    movq            xm1,    [r0 + 1 * 64]
+    movq            xm2,    [r0 + 9 * 64]
+    punpcklqdq      xm1, xm1, xm2
+    movq            xm2,    [r0 + 17 * 64]
+    movq            xm3,    [r0 + 25 * 64]
+    punpcklqdq      xm2, xm2, xm3
+    vinserti128     m1,  m1,  xm2, 1             ;[1 9 17 25]
+
+    movq            xm2,    [r0 + 6 * 64]
+    movq            xm3,    [r0 + 22 * 64]
+    punpcklqdq      xm2, xm2, xm3
+    movq            xm3,    [r0 + 4 * 64]
+    movq            xm4,    [r0 + 20 * 64]
+    punpcklqdq      xm3, xm3, xm4
+    vinserti128     m2,  m2,  xm3, 1             ;[6 22 4 20]
+
+    movq            xm3,    [r0 + 3 * 64]
+    movq            xm4,    [r0 + 11 * 64]
+    punpcklqdq      xm3, xm3, xm4
+    movq            xm4,    [r0 + 19 * 64]
+    movq            xm5,    [r0 + 27 * 64]
+    punpcklqdq      xm4, xm4, xm5
+    vinserti128     m3,  m3,  xm4, 1             ;[3 11 17 25]
+
+    movq            xm4,    [r0 + 10 * 64]
+    movq            xm5,    [r0 + 26 * 64]
+    punpcklqdq      xm4, xm4, xm5
+    movq            xm5,    [r0 + 8 * 64]
+    movq            xm6,    [r0 + 24 * 64]
+    punpcklqdq      xm5, xm5, xm6
+    vinserti128     m4,  m4,  xm5, 1             ;[10 26 8 24]
+
+    movq            xm5,    [r0 + 5 * 64]
+    movq            xm6,    [r0 + 13 * 64]
+    punpcklqdq      xm5, xm5, xm6
+    movq            xm6,    [r0 + 21 * 64]
+    movq            xm7,    [r0 + 29 * 64]
+    punpcklqdq      xm6, xm6, xm7
+    vinserti128     m5,  m5,  xm6, 1             ;[5 13 21 9]
+
+    movq            xm6,    [r0 + 14 * 64]
+    movq            xm7,    [r0 + 30 * 64]
+    punpcklqdq      xm6, xm6, xm7
+    movq            xm7,    [r0 + 12 * 64]
+    movq            xm8,    [r0 + 28 * 64]
+    punpcklqdq      xm7, xm7, xm8
+    vinserti128     m6,  m6,  xm7, 1             ;[14 30 12 28]
+
+    movq            xm7,    [r0 + 7 * 64]
+    movq            xm8,    [r0 + 15 * 64]
+    punpcklqdq      xm7, xm7, xm8
+    movq            xm8,    [r0 + 23 * 64]
+    movq            xm9,    [r0 + 31 * 64]
+    punpcklqdq      xm8, xm8, xm9
+    vinserti128     m7,  m7,  xm8, 1             ;[7 15 23 31]
 
     punpckhwd       m8, m0, m2                  ;[18 22 16 20]
     punpcklwd       m0, m2                      ;[2 6 0 4]
@@ -2451,7 +2899,7 @@ cglobal idct32, 3, 6, 16, 0-32*64
     IDCT32_PASS1 6
     IDCT32_PASS1 7
 
-    add             r0, 16
+    add             r0, 8
     add             r3, 4
     add             r4, 4
     dec             r5d
@@ -2612,7 +3060,7 @@ cglobal idct32, 3, 6, 16, 0-32*64
     RET
 
 ;-------------------------------------------------------
-; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_YMM avx2
 cglobal idct4, 3, 4, 6
@@ -2632,13 +3080,14 @@ cglobal idct4, 3, 4, 6
     add             r2d, r2d
     lea             r3, [r2 * 3]
 
-    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13]
-    movu            m1, [r0 + 32]                 ;[20 21 22 23 30 31 32 33]
+    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
 
-    packssdw        m0, m1                        ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
-    pshufb          m0, [idct4_shuf1]             ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
-    vpermq          m2, m0, 0x44                  ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
-    vpermq          m0, m0, 0xEE                  ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
+    pshufb          m0, [idct4_shuf1]             ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
+    vextracti128    xm1, m0, 1                    ;[20 22 21 23 30 32 31 33]
+    punpcklwd       xm2, xm0, xm1                 ;[00 20 02 22 01 21 03 23]
+    punpckhwd       xm0, xm1                      ;[10 30 12 32 11 31 13 33]
+    vinserti128     m2, m2, xm2, 1                ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
+    vinserti128     m0, m0, xm0, 1                ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
 
     mova            m1, [avx2_idct4_1]
     mova            m3, [avx2_idct4_1 + 32]
diff --git a/source/common/x86/dct8.h b/source/common/x86/dct8.h
index 3b74f2a..f9516d6 100644
--- a/source/common/x86/dct8.h
+++ b/source/common/x86/dct8.h
@@ -23,23 +23,24 @@
 
 #ifndef X265_DCT8_H
 #define X265_DCT8_H
-void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 
-void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
 
-void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
-void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
+void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
 
 #endif // ifndef X265_DCT8_H
diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm
index 52fc42c..c62eef6 100644
--- a/source/common/x86/ipfilter8.asm
+++ b/source/common/x86/ipfilter8.asm
@@ -31,6 +31,13 @@ tab_Tm:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
            db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
 
+ALIGN 32
+const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
+
+ALIGN 32
+const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
+                         dd 2, 3, 3, 4, 4, 5, 5, 6
+
 ALIGN 32
 tab_Lm:    db 0, 1, 2, 3, 4,  5,  6,  7,  1, 2, 3, 4,  5,  6,  7,  8
            db 2, 3, 4, 5, 6,  7,  8,  9,  3, 4, 5, 6,  7,  8,  9,  10
@@ -42,7 +49,6 @@ tab_Vm:    db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
 
 tab_Cm:    db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
 
-tab_c_512:      times 8 dw 512
 tab_c_526336:   times 4 dd 8192*64+2048
 
 tab_ChromaCoeff: db  0, 64,  0,  0
@@ -123,13 +129,63 @@ tab_LumaCoeffVer: times 8 db 0, 0
                   times 8 db 58, -10
                   times 8 db 4, -1
 
-tab_c_128:      times 16 db 0x80
+ALIGN 32
+tab_LumaCoeffVer_32: times 16 db 0, 0
+                     times 16 db 0, 64
+                     times 16 db 0, 0
+                     times 16 db 0, 0
+
+                     times 16 db -1, 4
+                     times 16 db -10, 58
+                     times 16 db 17, -5
+                     times 16 db 1, 0
+
+                     times 16 db -1, 4
+                     times 16 db -11, 40
+                     times 16 db 40, -11
+                     times 16 db 4, -1
+
+                     times 16 db 0, 1
+                     times 16 db -5, 17
+                     times 16 db 58, -10
+                     times 16 db 4, -1
+
+ALIGN 32
+tab_ChromaCoeffVer_32: times 16 db 0, 64
+                       times 16 db 0, 0
+
+                       times 16 db -2, 58
+                       times 16 db 10, -2
+
+                       times 16 db -4, 54
+                       times 16 db 16, -2
+
+                       times 16 db -6, 46
+                       times 16 db 28, -4
+
+                       times 16 db -4, 36
+                       times 16 db 36, -4
+
+                       times 16 db -4, 28
+                       times 16 db 46, -6
+
+                       times 16 db -2, 16
+                       times 16 db 54, -4
+
+                       times 16 db -2, 10
+                       times 16 db 58, -2
+
 tab_c_64_n64:   times 8 db 64, -64
 
+const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+ALIGN 32
+interp4_horiz_shuf1:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+                        db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 
 SECTION .text
 
-cextern idct4_shuf1
+cextern pb_128
 cextern pw_1
 cextern pw_512
 cextern pw_2000
@@ -171,7 +227,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 %rep 2
@@ -203,7 +259,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 %rep 4
@@ -235,7 +291,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 mov         r5d,        16/2
@@ -285,7 +341,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 FILTER_H4_w4_2   t0, t1, t2
@@ -313,7 +369,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 %rep 2
@@ -345,7 +401,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 %rep 4
@@ -377,7 +433,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 %rep 8
@@ -409,7 +465,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 
 mov         r5d,        32/2
@@ -423,6 +479,9 @@ jnz         .loop
 
 RET
 
+ALIGN 32
+const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
+
 
 %macro FILTER_H4_w6 3
     movu        %1, [srcq - 1]
@@ -606,7 +665,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 mov           r5d,       %2
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 mova        Tm1,         [tab_Tm + 16]
 
@@ -662,7 +721,7 @@ movd        coef2,       [tab_ChromaCoeff + r4 * 4]
 mov         r5d,          %2
 
 pshufd      coef2,       coef2,      0
-mova        t2,          [tab_c_512]
+mova        t2,          [pw_512]
 mova        Tm0,         [tab_Tm]
 mova        Tm1,         [tab_Tm + 16]
 
@@ -749,7 +808,7 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
     punpcklqdq  m3, m3
 
 %ifidn %3, pp 
-    mova      m2, [tab_c_512]
+    mova      m2, [pw_512]
 %else
     mova      m2, [pw_2000]
 %endif
@@ -845,7 +904,7 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6
     pmulhrsw        m3, [pw_512]
     vextracti128    xm4, m3, 1
     packuswb        xm3, xm4                        ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
-    pshufb          xm3, [idct4_shuf1]              ; [row3 row1 row2 row0]
+    pshufb          xm3, [interp4_shuf]             ; [row3 row1 row2 row0]
 
     lea             r0, [r3 * 3]
     movd            [r2], xm3
@@ -854,7 +913,664 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6
     pextrd          [r2+r0], xm3, 3
     RET
 
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
+    mov             r4d, r4m
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]
+%else
+    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+    mova            m1, [tab_Lm]
+    mova            m2, [tab_Lm + 32]
+
+    ; register map
+    ; m0     - interpolate coeff
+    ; m1, m2 - shuffle order table
+
+    sub             r0, 3
+    lea             r5, [r1 * 3]
+    lea             r4, [r3 * 3]
+
+    ; Row 0
+    vbroadcasti128  m3, [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m4, m3, m2
+    pshufb          m3, m1
+    pmaddubsw       m3, m0
+    pmaddubsw       m4, m0
+    phaddw          m3, m4
+    ; Row 1
+    vbroadcasti128  m4, [r0 + r1]                   ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m5, m4, m2
+    pshufb          m4, m1
+    pmaddubsw       m4, m0
+    pmaddubsw       m5, m0
+    phaddw          m4, m5
+
+    phaddw          m3, m4                          ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
+    pmulhrsw        m3, [pw_512]
+
+    ; Row 2
+    vbroadcasti128  m4, [r0 + r1 * 2]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m5, m4, m2
+    pshufb          m4, m1
+    pmaddubsw       m4, m0
+    pmaddubsw       m5, m0
+    phaddw          m4, m5
+    ; Row 3
+    vbroadcasti128  m5, [r0 + r5]                   ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m6, m5, m2
+    pshufb          m5, m1
+    pmaddubsw       m5, m0
+    pmaddubsw       m6, m0
+    phaddw          m5, m6
+
+    phaddw          m4, m5                          ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+    pmulhrsw        m4, [pw_512]
+
+    packuswb        m3, m4
+    vextracti128    xm4, m3, 1
+    punpcklwd       xm5, xm3, xm4
+
+    movq            [r2], xm5
+    movhps          [r2 + r3], xm5
+
+    punpckhwd       xm5, xm3, xm4
+    movq            [r2 + r3 * 2], xm5
+    movhps          [r2 + r4], xm5
+    RET
+
+%macro IPFILTER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
+    mov             r4d, r4m
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]
+%else
+    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+    mova            m1, [tab_Lm]
+    mova            m2, [tab_Lm + 32]
+
+    ; register map
+    ; m0     - interpolate coeff
+    ; m1, m2 - shuffle order table
+
+    sub             r0, 3
+    lea             r5, [r1 * 3]
+    lea             r6, [r3 * 3]
+    mov             r4d, %2 / 4
+.loop:
+    ; Row 0
+    vbroadcasti128  m3, [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m4, m3, m2
+    pshufb          m3, m1
+    pmaddubsw       m3, m0
+    pmaddubsw       m4, m0
+    phaddw          m3, m4
+    ; Row 1
+    vbroadcasti128  m4, [r0 + r1]                   ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m5, m4, m2
+    pshufb          m4, m1
+    pmaddubsw       m4, m0
+    pmaddubsw       m5, m0
+    phaddw          m4, m5
+
+    phaddw          m3, m4                          ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
+    pmulhrsw        m3, [pw_512]
+
+    ; Row 2
+    vbroadcasti128  m4, [r0 + r1 * 2]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m5, m4, m2
+    pshufb          m4, m1
+    pmaddubsw       m4, m0
+    pmaddubsw       m5, m0
+    phaddw          m4, m5
+    ; Row 3
+    vbroadcasti128  m5, [r0 + r5]                   ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb          m6, m5, m2
+    pshufb          m5, m1
+    pmaddubsw       m5, m0
+    pmaddubsw       m6, m0
+    phaddw          m5, m6
+
+    phaddw          m4, m5                          ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+    pmulhrsw        m4, [pw_512]
+
+    packuswb        m3, m4
+    vextracti128    xm4, m3, 1
+    punpcklwd       xm5, xm3, xm4
+
+    movq            [r2], xm5
+    movhps          [r2 + r3], xm5
+
+    punpckhwd       xm5, xm3, xm4
+    movq            [r2 + r3 * 2], xm5
+    movhps          [r2 + r6], xm5
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    dec             r4d
+    jnz             .loop
+    RET
+%endmacro
+
+IPFILTER_LUMA_AVX2_8xN 8, 8
+IPFILTER_LUMA_AVX2_8xN 8, 16
+IPFILTER_LUMA_AVX2_8xN 8, 32
+
+%macro IPFILTER_LUMA_AVX2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+    sub               r0,        3
+    mov               r4d,       r4m
+%ifdef PIC
+    lea               r5,        [tab_LumaCoeff]
+    vpbroadcastd      m0,        [r5 + r4 * 8]
+    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd      m0,         [tab_LumaCoeff + r4 * 8]
+    vpbroadcastd      m1,         [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    movu              m3,         [tab_Tm + 16]
+    vpbroadcastd      m7,         [pw_1]
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+    ; m2 , m2  shuffle order table
+    ; m7 - pw_1
 
+    mov               r4d,        %2/2
+.loop:
+    ; Row 0
+    vbroadcasti128    m4,         [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,         m4,     m3
+    pshufb            m4,         [tab_Tm]
+    pmaddubsw         m4,         m0
+    pmaddubsw         m5,         m1
+    paddw             m4,         m5
+    pmaddwd           m4,         m7
+    vbroadcasti128    m5,         [r0 + 8]                    ; second 8 elements in Row0 
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m4,         m5                          ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+    pmulhrsw          m4,         [pw_512]
+    vbroadcasti128    m2,         [r0 + r1]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,         m2,     m3
+    pshufb            m2,         [tab_Tm]
+    pmaddubsw         m2,         m0
+    pmaddubsw         m5,         m1
+    paddw             m2,         m5
+    pmaddwd           m2,         m7
+    vbroadcasti128    m5,         [r0 + r1 + 8]                    ; second 8 elements in Row0 
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m2,         m5                          ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+    pmulhrsw          m2,         [pw_512]
+    packuswb          m4,         m2
+    vpermq            m4,         m4,     11011000b
+    vextracti128      xm5,        m4,     1
+    pshufd            xm4,        xm4,    11011000b
+    pshufd            xm5,        xm5,    11011000b
+    movu              [r2],       xm4
+    movu              [r2+r3],    xm5
+    lea               r0,         [r0 + r1 * 2]
+    lea               r2,         [r2 + r3 * 2]
+    dec               r4d
+    jnz              .loop
+    RET
+%endmacro
+
+%macro IPFILTER_LUMA_32x_avx2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+    sub               r0,         3
+    mov               r4d,        r4m
+%ifdef PIC
+    lea               r5,         [tab_LumaCoeff]
+    vpbroadcastd      m0,         [r5 + r4 * 8]
+    vpbroadcastd      m1,         [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd      m0,         [tab_LumaCoeff + r4 * 8]
+    vpbroadcastd      m1,         [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    movu              m3,         [tab_Tm + 16]
+    vpbroadcastd      m7,         [pw_1]
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+    ; m2 , m2  shuffle order table
+    ; m7 - pw_1
+
+    mov               r4d,        %2
+.loop:
+    ; Row 0
+    vbroadcasti128    m4,         [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,         m4,     m3
+    pshufb            m4,         [tab_Tm]
+    pmaddubsw         m4,         m0
+    pmaddubsw         m5,         m1
+    paddw             m4,         m5
+    pmaddwd           m4,         m7
+    vbroadcasti128    m5,         [r0 + 8]
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m4,         m5                          ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+    pmulhrsw          m4,         [pw_512]
+    vbroadcasti128    m2,         [r0 + 16]
+    pshufb            m5,         m2,     m3
+    pshufb            m2,         [tab_Tm]
+    pmaddubsw         m2,         m0
+    pmaddubsw         m5,         m1
+    paddw             m2,         m5
+    pmaddwd           m2,         m7
+    vbroadcasti128    m5,         [r0 + 24]
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m2,         m5
+    pmulhrsw          m2,         [pw_512]
+    packuswb          m4,         m2
+    vpermq            m4,         m4,     11011000b
+    vextracti128      xm5,        m4,     1
+    pshufd            xm4,        xm4,    11011000b
+    pshufd            xm5,        xm5,    11011000b
+    movu              [r2],       xm4
+    movu              [r2 + 16],  xm5
+    lea               r0,         [r0 + r1]
+    lea               r2,         [r2 + r3]
+    dec               r4d
+    jnz               .loop
+    RET
+%endmacro
+
+%macro IPFILTER_LUMA_64x_avx2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+    sub               r0,    3
+    mov               r4d,   r4m
+%ifdef PIC
+    lea               r5,        [tab_LumaCoeff]
+    vpbroadcastd      m0,        [r5 + r4 * 8]
+    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
+    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    movu              m3,        [tab_Tm + 16]
+    vpbroadcastd      m7,        [pw_1]
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+    ; m2 , m2  shuffle order table
+    ; m7 - pw_1
+
+    mov               r4d,   %2
+.loop:
+    ; Row 0
+    vbroadcasti128    m4,        [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,        m4,    m3
+    pshufb            m4,        [tab_Tm]
+    pmaddubsw         m4,        m0
+    pmaddubsw         m5,        m1
+    paddw             m4,        m5
+    pmaddwd           m4,        m7
+    vbroadcasti128    m5,        [r0 + 8]
+    pshufb            m6,        m5,    m3
+    pshufb            m5,        [tab_Tm]
+    pmaddubsw         m5,        m0
+    pmaddubsw         m6,        m1
+    paddw             m5,        m6
+    pmaddwd           m5,        m7
+    packssdw          m4,        m5                          ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+    pmulhrsw          m4,        [pw_512]
+    vbroadcasti128    m2,        [r0 + 16]
+    pshufb            m5,        m2,    m3
+    pshufb            m2,        [tab_Tm]
+    pmaddubsw         m2,        m0
+    pmaddubsw         m5,        m1
+    paddw             m2,        m5
+    pmaddwd           m2,        m7
+    vbroadcasti128    m5,        [r0 + 24]
+    pshufb            m6,        m5,    m3
+    pshufb            m5,        [tab_Tm]
+    pmaddubsw         m5,        m0
+    pmaddubsw         m6,        m1
+    paddw             m5,        m6
+    pmaddwd           m5,        m7
+    packssdw          m2,        m5
+    pmulhrsw          m2,        [pw_512]
+    packuswb          m4,        m2
+    vpermq            m4,        m4,    11011000b
+    vextracti128      xm5,       m4,    1
+    pshufd            xm4,       xm4,   11011000b
+    pshufd            xm5,       xm5,   11011000b
+    movu              [r2],      xm4
+    movu              [r2 + 16], xm5
+
+    vbroadcasti128    m4,        [r0 + 32]
+    pshufb            m5,        m4,    m3
+    pshufb            m4,        [tab_Tm]
+    pmaddubsw         m4,        m0
+    pmaddubsw         m5,        m1
+    paddw             m4,        m5
+    pmaddwd           m4,        m7
+    vbroadcasti128    m5,        [r0 + 40]
+    pshufb            m6,        m5,    m3
+    pshufb            m5,        [tab_Tm]
+    pmaddubsw         m5,        m0
+    pmaddubsw         m6,        m1
+    paddw             m5,        m6
+    pmaddwd           m5,        m7
+    packssdw          m4,        m5
+    pmulhrsw          m4,        [pw_512]
+    vbroadcasti128    m2,        [r0 + 48]
+    pshufb            m5,        m2,    m3
+    pshufb            m2,        [tab_Tm]
+    pmaddubsw         m2,        m0
+    pmaddubsw         m5,        m1
+    paddw             m2,        m5
+    pmaddwd           m2,        m7
+    vbroadcasti128    m5,        [r0 + 56]
+    pshufb            m6,        m5,    m3
+    pshufb            m5,        [tab_Tm]
+    pmaddubsw         m5,        m0
+    pmaddubsw         m6,        m1
+    paddw             m5,        m6
+    pmaddwd           m5,        m7
+    packssdw          m2,        m5
+    pmulhrsw          m2,        [pw_512]
+    packuswb          m4,        m2
+    vpermq            m4,        m4,    11011000b
+    vextracti128      xm5,       m4,    1
+    pshufd            xm4,       xm4,   11011000b
+    pshufd            xm5,       xm5,   11011000b
+    movu              [r2 +32],  xm4
+    movu              [r2 + 48], xm5
+
+    lea               r0,        [r0 + r1]
+    lea               r2,        [r2 + r3]
+    dec               r4d
+    jnz               .loop
+    RET
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_48x64, 4,6,8
+    sub               r0,         3
+    mov               r4d,        r4m
+%ifdef PIC
+    lea               r5,         [tab_LumaCoeff]
+    vpbroadcastd      m0,         [r5 + r4 * 8]
+    vpbroadcastd      m1,         [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd      m0,         [tab_LumaCoeff + r4 * 8]
+    vpbroadcastd      m1,         [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    movu              m3,         [tab_Tm + 16]
+    vpbroadcastd      m7,         [pw_1]
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+    ; m2 , m2  shuffle order table
+    ; m7 - pw_1
+
+    mov               r4d,        64
+.loop:
+    ; Row 0
+    vbroadcasti128    m4,         [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,         m4,     m3
+    pshufb            m4,         [tab_Tm]
+    pmaddubsw         m4,         m0
+    pmaddubsw         m5,         m1
+    paddw             m4,         m5
+    pmaddwd           m4,         m7
+    vbroadcasti128    m5,         [r0 + 8]
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m4,         m5                          ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+    pmulhrsw          m4,         [pw_512]
+
+    vbroadcasti128    m2,         [r0 + 16]
+    pshufb            m5,         m2,     m3
+    pshufb            m2,         [tab_Tm]
+    pmaddubsw         m2,         m0
+    pmaddubsw         m5,         m1
+    paddw             m2,         m5
+    pmaddwd           m2,         m7
+    vbroadcasti128    m5,         [r0 + 24]
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m2,         m5
+    pmulhrsw          m2,         [pw_512]
+    packuswb          m4,         m2
+    vpermq            m4,         m4,     11011000b
+    vextracti128      xm5,        m4,     1
+    pshufd            xm4,        xm4,    11011000b
+    pshufd            xm5,        xm5,    11011000b
+    movu              [r2],       xm4
+    movu              [r2 + 16],  xm5
+
+    vbroadcasti128    m4,         [r0 + 32]
+    pshufb            m5,         m4,     m3
+    pshufb            m4,         [tab_Tm]
+    pmaddubsw         m4,         m0
+    pmaddubsw         m5,         m1
+    paddw             m4,         m5
+    pmaddwd           m4,         m7
+    vbroadcasti128    m5,         [r0 + 40]
+    pshufb            m6,         m5,     m3
+    pshufb            m5,         [tab_Tm]
+    pmaddubsw         m5,         m0
+    pmaddubsw         m6,         m1
+    paddw             m5,         m6
+    pmaddwd           m5,         m7
+    packssdw          m4,         m5
+    pmulhrsw          m4,         [pw_512]
+    packuswb          m4,         m4
+    vpermq            m4,         m4,     11011000b
+    pshufd            xm4,        xm4,    11011000b
+    movu              [r2 + 32],  xm4
+
+    lea               r0,         [r0 + r1]
+    lea               r2,         [r2 + r3]
+    dec               r4d
+    jnz               .loop
+    RET
+
+INIT_YMM avx2 
+cglobal interp_4tap_horiz_pp_4x4, 4,6,6
+    mov             r4d, r4m
+
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]
+    vpbroadcastd      m0,           [r5 + r4 * 4]
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vpbroadcastd      m2,           [pw_1]
+    vbroadcasti128    m1,           [tab_Tm]
+
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1 - shuffle order table
+    ; m2 - constant word 1
+
+    dec                r0
+
+    ; Row 0-1
+    vbroadcasti128    m3,           [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    vinserti128       m3,           m3,      [r0 + r1],     1
+    pshufb            m3,           m1
+    pmaddubsw         m3,           m0
+    pmaddwd           m3,           m2
+
+    ; Row 2-3
+    lea               r0,           [r0 + r1 * 2]
+    vbroadcasti128    m4,           [r0]                      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    vinserti128       m4,           m4,      [r0 + r1],     1
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+
+    packssdw          m3,           m4
+    pmulhrsw          m3,           [pw_512]
+    vextracti128      xm4,          m3,     1
+    packuswb          xm3,          xm4
+
+    lea               r0,           [r3 * 3]
+    movd              [r2],         xm3
+    pextrd            [r2+r3],      xm3,     2
+    pextrd            [r2+r3*2],    xm3,     1
+    pextrd            [r2+r0],      xm3,     3
+    RET
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_32x32, 4,6,7
+    mov             r4d, r4m
+
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]
+    vpbroadcastd      m0,           [r5 + r4 * 4]
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    mova              m1,           [interp4_horiz_shuf1]
+    vpbroadcastd      m2,           [pw_1]
+    mova              m6,           [pw_512]
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1 - shuffle order table
+    ; m2 - constant word 1
+
+    dec               r0
+    mov               r4d,          32
+
+.loop:
+    ; Row 0
+    vbroadcasti128    m3,           [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m3,           m1
+    pmaddubsw         m3,           m0
+    pmaddwd           m3,           m2
+    vbroadcasti128    m4,           [r0 + 4]
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+    packssdw          m3,           m4
+    pmulhrsw          m3,           m6
+
+    vbroadcasti128    m4,           [r0 + 16]
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+    vbroadcasti128    m5,           [r0 + 20]
+    pshufb            m5,           m1
+    pmaddubsw         m5,           m0
+    pmaddwd           m5,           m2
+    packssdw          m4,           m5
+    pmulhrsw          m4,           m6
+
+    packuswb          m3,           m4
+    vpermq            m3,           m3,      11011000b
+
+    movu              [r2],         m3
+    lea               r2,           [r2 + r3]
+    lea               r0,           [r0 + r1]
+    dec               r4d
+    jnz               .loop
+    RET
+
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
+    mov               r4d,          r4m
+
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]
+    vpbroadcastd      m0,           [r5 + r4 * 4]
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    mova              m6,           [pw_512]
+    mova              m1,           [interp4_horiz_shuf1]
+    vpbroadcastd      m2,           [pw_1]
+
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1 - shuffle order table
+    ; m2 - constant word 1
+
+    dec               r0
+    mov               r4d,          8
+
+.loop:
+    ; Row 0
+    vbroadcasti128    m3,           [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m3,           m1
+    pmaddubsw         m3,           m0
+    pmaddwd           m3,           m2
+    vbroadcasti128    m4,           [r0 + 4]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+    packssdw          m3,           m4
+    pmulhrsw          m3,           m6
+
+    ; Row 1
+    vbroadcasti128    m4,           [r0 + r1]                   ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+    vbroadcasti128    m5,           [r0 + r1 + 4]               ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,           m1
+    pmaddubsw         m5,           m0
+    pmaddwd           m5,           m2
+    packssdw          m4,           m5
+    pmulhrsw          m4,           m6
+
+    packuswb          m3,           m4
+    vpermq            m3,           m3,      11011000b
+
+    vextracti128      xm4,          m3,       1
+    movu              [r2],         xm3
+    movu              [r2 + r3],    xm4
+    lea               r2,           [r2 + r3 * 2]
+    lea               r0,           [r0 + r1 * 2]
+    dec               r4d
+    jnz               .loop
+    RET
 ;--------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
@@ -863,6 +1579,91 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6
     IPFILTER_LUMA 12, 16, pp
     IPFILTER_LUMA 4, 16, pp
 
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_8x8, 4,6,6
+    mov               r4d,    r4m
+
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]
+    vpbroadcastd      m0,           [r5 + r4 * 4]
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    movu              m1,           [tab_Tm]
+    vpbroadcastd      m2,           [pw_1]
+
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1 - shuffle order table
+    ; m2 - constant word 1
+
+    sub               r0,           1
+    mov               r4d,          2
+
+.loop:
+    ; Row 0
+    vbroadcasti128    m3,           [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m3,           m1
+    pmaddubsw         m3,           m0
+    pmaddwd           m3,           m2
+
+    ; Row 1
+    vbroadcasti128    m4,           [r0 + r1]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+    packssdw          m3,           m4
+    pmulhrsw          m3,           [pw_512]
+    lea               r0,           [r0 + r1 * 2]
+
+    ; Row 2
+    vbroadcasti128    m4,           [r0 ]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m4,           m1
+    pmaddubsw         m4,           m0
+    pmaddwd           m4,           m2
+
+    ; Row 3
+    vbroadcasti128    m5,           [r0 + r1]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb            m5,           m1
+    pmaddubsw         m5,           m0
+    pmaddwd           m5,           m2
+    packssdw          m4,           m5
+    pmulhrsw          m4,           [pw_512]
+
+    packuswb          m3,           m4
+    mova              m5,           [interp_4tap_8x8_horiz_shuf]
+    vpermd            m3,           m5,     m3
+    vextracti128      xm4,          m3,     1
+    movq              [r2],         xm3
+    movhps            [r2 + r3],    xm3
+    lea               r2,           [r2 + r3 * 2]
+    movq              [r2],         xm4
+    movhps            [r2 + r3],    xm4
+    lea               r2,           [r2 + r3 * 2]
+    lea               r0,           [r0 + r1*2]
+    dec               r4d
+    jnz               .loop
+    RET
+
+    IPFILTER_LUMA_AVX2 16, 4
+    IPFILTER_LUMA_AVX2 16, 8
+    IPFILTER_LUMA_AVX2 16, 12 
+    IPFILTER_LUMA_AVX2 16, 16
+    IPFILTER_LUMA_AVX2 16, 32
+    IPFILTER_LUMA_AVX2 16, 64
+
+    IPFILTER_LUMA_32x_avx2 32 , 8
+    IPFILTER_LUMA_32x_avx2 32 , 16
+    IPFILTER_LUMA_32x_avx2 32 , 24
+    IPFILTER_LUMA_32x_avx2 32 , 32
+    IPFILTER_LUMA_32x_avx2 32 , 64
+
+    IPFILTER_LUMA_64x_avx2 64 , 64
+    IPFILTER_LUMA_64x_avx2 64 , 48
+    IPFILTER_LUMA_64x_avx2 64 , 32
+    IPFILTER_LUMA_64x_avx2 64 , 16
+
 ;--------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
@@ -1040,7 +1841,7 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
     mov         r4,         rsp
 
 .loopH:
-    FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
+    FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
     psubw       m1,         [pw_2000]
     mova        [r4],       m1
 
@@ -1108,7 +1909,7 @@ movd        m0,        [tab_ChromaCoeff + r4 * 4]
 lea         r4,        [r1 * 3]
 lea         r5,        [r0 + 4 * r1]
 pshufb      m0,        [tab_Cm]
-mova        m1,        [tab_c_512]
+mova        m1,        [pw_512]
 
 movd        m2,        [r0]
 movd        m3,        [r0 + r1]
@@ -1181,7 +1982,7 @@ movd        m0,        [tab_ChromaCoeff + r4 * 4]
 
 pshufb      m0,        [tab_Cm]
 
-mova        m1,        [tab_c_512]
+mova        m1,        [pw_512]
 
 mov         r4d,       %2
 lea         r5,        [3 * r1]
@@ -1289,7 +2090,7 @@ pmaddubsw   m3,        m0
 
 phaddw      m2,        m3
 
-pmulhrsw    m2,        [tab_c_512]
+pmulhrsw    m2,        [pw_512]
 packuswb    m2,        m2
 movd        [r2],      m2
 pextrd      [r2 + r3], m2,  1
@@ -1313,7 +2114,7 @@ movd        m0,        [tab_ChromaCoeff + r4 * 4]
 %endif
 
 pshufb      m0,        [tab_Cm]
-mova        m1,        [tab_c_512]
+mova        m1,        [pw_512]
 lea         r5,        [r0 + 4 * r1]
 lea         r4,        [r1 * 3]
 
@@ -1369,6 +2170,51 @@ pextrd      [r2 + r3], m2, 3
 
 RET
 
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
+    mov             r4d, r4m
+    shl             r4d, 6
+    sub             r0, r1
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+
+    movd            xm1, [r0]
+    pinsrd          xm1, [r0 + r1], 1
+    pinsrd          xm1, [r0 + r1 * 2], 2
+    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm2, [r0]
+    pinsrd          xm2, [r0 + r1], 1
+    pinsrd          xm2, [r0 + r1 * 2], 2                   ; m2 = row[x 6 5 4]
+    vinserti128     m1, m1, xm2, 1                          ; m1 = row[x 6 5 4 3 2 1 0]
+    mova            m2, [interp4_vpp_shuf1]
+    vpermd          m0, m2, m1                              ; m0 = row[4 3 3 2 2 1 1 0]
+    mova            m2, [interp4_vpp_shuf1 + mmsize]
+    vpermd          m1, m2, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
+
+    mova            m2, [interp4_vpp_shuf]
+    pshufb          m0, m0, m2
+    pshufb          m1, m1, m2
+    pmaddubsw       m0, [r5]
+    pmaddubsw       m1, [r5 + mmsize]
+    paddw           m0, m1                                  ; m0 = WORD ROW[3 2 1 0]
+    pmulhrsw        m0, [pw_512]
+    vextracti128    xm1, m0, 1
+    packuswb        xm0, xm1
+    lea             r5, [r3 * 3]
+    movd            [r2], xm0
+    pextrd          [r2 + r3], xm0, 1
+    pextrd          [r2 + r3 * 2], xm0, 2
+    pextrd          [r2 + r5], xm0, 3
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -1388,7 +2234,7 @@ movd        m0,        [tab_ChromaCoeff + r4 * 4]
 
 pshufb      m0,        [tab_Cm]
 
-mova        m1,        [tab_c_512]
+mova        m1,        [pw_512]
 
 mov         r4d,       %2
 
@@ -1590,7 +2436,7 @@ pmaddubsw   m4,        m5
 
 paddw       m0,        m4
 
-mova        m4,        [tab_c_512]
+mova        m4,        [pw_512]
 
 pmulhrsw    m0,        m4
 packuswb    m0,        m0
@@ -2495,7 +3341,7 @@ movd        m5,        [tab_ChromaCoeff + r4 * 4]
 
 pshufb      m6,        m5,       [tab_Vm]
 pshufb      m5,        [tab_Vm + 16]
-mova        m4,        [tab_c_512]
+mova        m4,        [pw_512]
 lea         r5,        [r1 * 3]
 
 mov         r4d,       %2
@@ -2573,6 +3419,84 @@ FILTER_V4_W8_H8_H16_H32 8, 32
 FILTER_V4_W8_H8_H16_H32 8, 12
 FILTER_V4_W8_H8_H16_H32 8, 64
 
+%macro PROCESS_CHROMA_AVX2_W8_8R 0
+    movq            xm1, [r0]                       ; m1 = row 0
+    movq            xm2, [r0 + r1]                  ; m2 = row 1
+    punpcklbw       xm1, xm2                        ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
+    punpcklbw       xm2, xm3                        ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128     m5, m1, xm2, 1                  ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw       m5, [r5]
+    movq            xm4, [r0 + r4]                  ; m4 = row 3
+    punpcklbw       xm3, xm4                        ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm1, [r0]                       ; m1 = row 4
+    punpcklbw       xm4, xm1                        ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128     m2, m3, xm4, 1                  ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
+    paddw           m5, m0
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 5
+    punpcklbw       xm1, xm3                        ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
+    punpcklbw       xm3, xm4                        ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
+    paddw           m2, m0
+    pmaddubsw       m1, [r5]
+    movq            xm3, [r0 + r4]                  ; m3 = row 7
+    punpcklbw       xm4, xm3                        ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm0, [r0]                       ; m0 = row 8
+    punpcklbw       xm3, xm0                        ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m4, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 9
+    punpcklbw       xm0, xm3                        ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    movq            xm6, [r0 + r1 * 2]              ; m6 = row 10
+    punpcklbw       xm3, xm6                        ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+    vinserti128     m0, m0, xm3, 1                  ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    pmaddubsw       m0, [r5 + 1 * mmsize]
+    paddw           m4, m0
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
+    mov             r4d, r4m
+    shl             r4d, 6
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r1
+    PROCESS_CHROMA_AVX2_W8_8R
+    lea             r4, [r3 * 3]
+    mova            m3, [pw_512]
+    pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m3                          ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m3                          ; m4 = word: row 6, row 7
+    packuswb        m5, m2
+    packuswb        m1, m4
+    vextracti128    xm2, m5, 1
+    vextracti128    xm4, m1, 1
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm5
+    movhps          [r2 + r4], xm2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm1
+    movq            [r2 + r3], xm4
+    movhps          [r2 + r3 * 2], xm1
+    movhps          [r2 + r4], xm4
+    RET
 
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -2593,7 +3517,7 @@ movd        m5,        [tab_ChromaCoeff + r4 * 4]
 
 pshufb      m6,        m5,       [tab_Vm]
 pshufb      m5,        [tab_Vm + 16]
-mova        m4,        [tab_c_512]
+mova        m4,        [pw_512]
 
 mov         r4d,       %2
 lea         r5,        [3 * r1]
@@ -2716,7 +3640,7 @@ punpckhbw   m6,        m5,        m7
 pmaddubsw   m6,        m0
 paddw       m2,        m6
 
-mova        m6,        [tab_c_512]
+mova        m6,        [pw_512]
 
 pmulhrsw    m4,        m6
 pmulhrsw    m2,        m6
@@ -2806,7 +3730,7 @@ punpcklbw   m7,        m5,        m6
 pmaddubsw   m7,        m0
 paddw       m4,        m7
 
-mova        m7,        [tab_c_512]
+mova        m7,        [pw_512]
 
 pmulhrsw    m4,        m7
 pmulhrsw    m2,        m7
@@ -2855,6 +3779,217 @@ FILTER_V4_W16_H2 16, 32
 FILTER_V4_W16_H2 16, 24
 FILTER_V4_W16_H2 16, 64
 
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
+    mov             r4d, r4m
+    shl             r4d, 6
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    mova            m12, [r5]
+    mova            m13, [r5 + mmsize]
+    lea             r4, [r1 * 3]
+    sub             r0, r1
+    lea             r5, [r3 * 3]
+    mova            m14, [pw_512]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, m12
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, m12
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, m13
+    paddw           m0, m4
+    pmaddubsw       m2, m12
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, m13
+    paddw           m1, m5
+    pmaddubsw       m3, m12
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, m13
+    paddw           m2, m6
+    pmaddubsw       m4, m12
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, m13
+    paddw           m3, m7
+    pmaddubsw       m5, m12
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, m13
+    paddw           m4, m8
+    pmaddubsw       m6, m12
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, m13
+    paddw           m5, m9
+    pmaddubsw       m7, m12
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, m13
+    paddw           m6, m10
+    pmaddubsw       m8, m12
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, m13
+    paddw           m7, m11
+    pmaddubsw       m9, m12
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    packuswb        m6, m7
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    vextracti128    xm7, m6, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r5], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r5], xm7
+    lea             r2, [r2 + r3 * 4]
+
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm6, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm6, 1
+    pmaddubsw       m6, m10, m13
+    paddw           m8, m6
+    pmaddubsw       m10, m12
+    lea             r0, [r0 + r1 * 4]
+    movu            xm6, [r0]                       ; m6 = row 12
+    punpckhbw       xm7, xm11, xm6
+    punpcklbw       xm11, xm6
+    vinserti128     m11, m11, xm7, 1
+    pmaddubsw       m7, m11, m13
+    paddw           m9, m7
+    pmaddubsw       m11, m12
+
+    movu            xm7, [r0 + r1]                  ; m7 = row 13
+    punpckhbw       xm0, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm0, 1
+    pmaddubsw       m0, m6, m13
+    paddw           m10, m0
+    pmaddubsw       m6, m12
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm7, xm0
+    punpcklbw       xm7, xm0
+    vinserti128     m7, m7, xm1, 1
+    pmaddubsw       m1, m7, m13
+    paddw           m11, m1
+    pmaddubsw       m7, m12
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, m13
+    paddw           m6, m2
+    pmaddubsw       m0, m12
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, m13
+    paddw           m7, m3
+    pmaddubsw       m1, m12
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m2, m13
+    paddw           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m3, m13
+    paddw           m1, m3
+
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    pmulhrsw        m6, m14                         ; m6 = word: row 12
+    pmulhrsw        m7, m14                         ; m7 = word: row 13
+    pmulhrsw        m0, m14                         ; m0 = word: row 14
+    pmulhrsw        m1, m14                         ; m1 = word: row 15
+    packuswb        m8, m9
+    packuswb        m10, m11
+    packuswb        m6, m7
+    packuswb        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m6, m6, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm7, m6, 1
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r5], xm11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm6
+    movu            [r2 + r3], xm7
+    movu            [r2 + r3 * 2], xm0
+    movu            [r2 + r5], xm1
+    RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -2899,7 +4034,7 @@ punpckhbw   m6,        m5,        m7
 pmaddubsw   m6,        m0
 paddw       m2,        m6
 
-mova        m6,        [tab_c_512]
+mova        m6,        [pw_512]
 
 pmulhrsw    m4,        m6
 pmulhrsw    m2,        m6
@@ -2998,7 +4133,7 @@ movd        m0,        [tab_ChromaCoeff + r4 * 4]
 pshufb      m1,        m0,       [tab_Vm]
 pshufb      m0,        [tab_Vm + 16]
 
-mova        m7,        [tab_c_512]
+mova        m7,        [pw_512]
 
 mov         r4d,       %2
 
@@ -3076,6 +4211,96 @@ FILTER_V4_W32 32, 32
 FILTER_V4_W32 32, 48
 FILTER_V4_W32 32, 64
 
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
+    mov             r4d, r4m
+    shl             r4d, 6
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    mova            m10, [r5]
+    mova            m11, [r5 + mmsize]
+    lea             r4, [r1 * 3]
+    sub             r0, r1
+    lea             r5, [r3 * 3]
+    mova            m12, [pw_512]
+    mov             r6d, 8
+.loopW:
+    movu            m0, [r0]                        ; m0 = row 0
+    movu            m1, [r0 + r1]                   ; m1 = row 1
+    punpcklbw       m2, m0, m1
+    punpckhbw       m3, m0, m1
+    pmaddubsw       m2, m10
+    pmaddubsw       m3, m10
+    movu            m0, [r0 + r1 * 2]               ; m0 = row 2
+    punpcklbw       m4, m1, m0
+    punpckhbw       m5, m1, m0
+    pmaddubsw       m4, m10
+    pmaddubsw       m5, m10
+    movu            m1, [r0 + r4]                   ; m1 = row 3
+    punpcklbw       m6, m0, m1
+    punpckhbw       m7, m0, m1
+    pmaddubsw       m8, m6, m11
+    pmaddubsw       m9, m7, m11
+    pmaddubsw       m6, m10
+    pmaddubsw       m7, m10
+    paddw           m2, m8
+    paddw           m3, m9
+    pmulhrsw        m2, m12
+    pmulhrsw        m3, m12
+    packuswb        m2, m3
+    movu            [r2], m2
+
+    lea             r0, [r0 + r1 * 4]
+    movu            m0, [r0]                        ; m0 = row 4
+    punpcklbw       m2, m1, m0
+    punpckhbw       m3, m1, m0
+    pmaddubsw       m8, m2, m11
+    pmaddubsw       m9, m3, m11
+    pmaddubsw       m2, m10
+    pmaddubsw       m3, m10
+    paddw           m4, m8
+    paddw           m5, m9
+    pmulhrsw        m4, m12
+    pmulhrsw        m5, m12
+    packuswb        m4, m5
+    movu            [r2 + r3], m4
+
+    movu            m1, [r0 + r1]                   ; m1 = row 5
+    punpcklbw       m4, m0, m1
+    punpckhbw       m5, m0, m1
+    pmaddubsw       m4, m11
+    pmaddubsw       m5, m11
+    paddw           m6, m4
+    paddw           m7, m5
+    pmulhrsw        m6, m12
+    pmulhrsw        m7, m12
+    packuswb        m6, m7
+    movu            [r2 + r3 * 2], m6
+
+    movu            m0, [r0 + r1 * 2]               ; m0 = row 6
+    punpcklbw       m6, m1, m0
+    punpckhbw       m7, m1, m0
+    pmaddubsw       m6, m11
+    pmaddubsw       m7, m11
+    paddw           m2, m6
+    paddw           m3, m7
+    pmulhrsw        m2, m12
+    pmulhrsw        m3, m12
+    packuswb        m2, m3
+    movu            [r2 + r5], m2
+
+    lea             r2, [r2 + r3 * 4]
+    dec             r6d
+    jnz             .loopW
+    RET
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3126,7 +4351,7 @@ punpcklbw   m7,        m5,        m6
 pmaddubsw   m7,        m0
 paddw       m4,        m7
 
-mova        m7,        [tab_c_512]
+mova        m7,        [pw_512]
 
 pmulhrsw    m4,        m7
 pmulhrsw    m2,        m7
@@ -3190,7 +4415,7 @@ cglobal luma_p2s, 3, 7, 6
     mov         r4d, r4m
 
     ; load constant
-    mova        m4, [tab_c_128]
+    mova        m4, [pb_128]
     mova        m5, [tab_c_64_n64]
 
 .loopH:
@@ -3379,7 +4604,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
 %endif
 
 %ifidn %3,pp
-    mova      m3, [tab_c_512]
+    mova      m3, [pw_512]
 %else
     mova      m3, [pw_2000]
 %endif
@@ -3421,6 +4646,149 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
     RET
 %endmacro
 
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_4x4, 4,6,8
+    mov             r4d, r4m
+    lea             r5, [r1 * 3]
+    sub             r0, r5
+
+    ; TODO: VPGATHERDD
+    movd            xm1, [r0]                       ; m1 = row0
+    movd            xm2, [r0 + r1]                  ; m2 = row1
+    punpcklbw       xm1, xm2                        ; m1 = [13 03 12 02 11 01 10 00]
+
+    movd            xm3, [r0 + r1 * 2]              ; m3 = row2
+    punpcklbw       xm2, xm3                        ; m2 = [23 13 22 12 21 11 20 10]
+    movd            xm4, [r0 + r5]
+    punpcklbw       xm3, xm4                        ; m3 = [33 23 32 22 31 21 30 20]
+    punpcklwd       xm1, xm3                        ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+
+    lea             r0, [r0 + r1 * 4]
+    movd            xm5, [r0]                       ; m5 = row4
+    punpcklbw       xm4, xm5                        ; m4 = [43 33 42 32 41 31 40 30]
+    punpcklwd       xm2, xm4                        ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10]
+    vinserti128     m1, m1, xm2, 1                  ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+    movd            xm2, [r0 + r1]                  ; m2 = row5
+    punpcklbw       xm5, xm2                        ; m5 = [53 43 52 42 51 41 50 40]
+    punpcklwd       xm3, xm5                        ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20]
+    movd            xm6, [r0 + r1 * 2]              ; m6 = row6
+    punpcklbw       xm2, xm6                        ; m2 = [63 53 62 52 61 51 60 50]
+    punpcklwd       xm4, xm2                        ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
+    vinserti128     m3, m3, xm4, 1                  ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20]
+    movd            xm4, [r0 + r5]                  ; m4 = row7
+    punpcklbw       xm6, xm4                        ; m6 = [73 63 72 62 71 61 70 60]
+    punpcklwd       xm5, xm6                        ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+
+    lea             r0, [r0 + r1 * 4]
+    movd            xm7, [r0]                       ; m7 = row8
+    punpcklbw       xm4, xm7                        ; m4 = [83 73 82 72 81 71 80 70]
+    punpcklwd       xm2, xm4                        ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
+    vinserti128     m5, m5, xm2, 1                  ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+    movd            xm2, [r0 + r1]                  ; m2 = row9
+    punpcklbw       xm7, xm2                        ; m7 = [93 83 92 82 91 81 90 80]
+    punpcklwd       xm6, xm7                        ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+    movd            xm7, [r0 + r1 * 2]              ; m7 = rowA
+    punpcklbw       xm2, xm7                        ; m2 = [A3 93 A2 92 A1 91 A0 90]
+    punpcklwd       xm4, xm2                        ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
+    vinserti128     m6, m6, xm4, 1                  ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+
+    ; load filter coeff
+%ifdef PIC
+    lea             r5, [tab_LumaCoeff]
+    vpbroadcastd    m0, [r5 + r4 * 8 + 0]
+    vpbroadcastd    m2, [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8 + 0]
+    vpbroadcastd    m2, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+
+    pmaddubsw       m1, m0
+    pmaddubsw       m3, m0
+    pmaddubsw       m5, m2
+    pmaddubsw       m6, m2
+    vbroadcasti128  m0, [pw_1]
+    pmaddwd         m1, m0
+    pmaddwd         m3, m0
+    pmaddwd         m5, m0
+    pmaddwd         m6, m0
+    paddd           m1, m5                          ; m1 = DQWORD ROW[1 0]
+    paddd           m3, m6                          ; m3 = DQWORD ROW[3 2]
+    packssdw        m1, m3                          ; m1 =  QWORD ROW[3 1 2 0]
+
+    ; TODO: does it overflow?
+    pmulhrsw        m1, [pw_512]
+    vextracti128    xm2, m1, 1
+    packuswb        xm1, xm2                        ; m1 =  DWORD ROW[3 1 2 0]
+    movd            [r2], xm1
+    pextrd          [r2 + r3], xm1, 2
+    pextrd          [r2 + r3 * 2], xm1, 1
+    lea             r4, [r3 * 3]
+    pextrd          [r2 + r4], xm1, 3
+    RET
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+    add             r3d, r3d
+
+    movd            xm1, [r0]
+    pinsrd          xm1, [r0 + r1], 1
+    pinsrd          xm1, [r0 + r1 * 2], 2
+    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm2, [r0]
+    pinsrd          xm2, [r0 + r1], 1
+    pinsrd          xm2, [r0 + r1 * 2], 2
+    pinsrd          xm2, [r0 + r4], 3                       ; m2 = row[7 6 5 4]
+    vinserti128     m1, m1, xm2, 1                          ; m1 = row[7 6 5 4 3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm3, [r0]
+    pinsrd          xm3, [r0 + r1], 1
+    pinsrd          xm3, [r0 + r1 * 2], 2                   ; m3 = row[x 10 9 8]
+    vinserti128     m2, m2, xm3, 1                          ; m2 = row[x 10 9 8 7 6 5 4]
+    mova            m3, [interp4_vpp_shuf1]
+    vpermd          m0, m3, m1                              ; m0 = row[4 3 3 2 2 1 1 0]
+    vpermd          m4, m3, m2                              ; m4 = row[8 7 7 6 6 5 5 4]
+    mova            m3, [interp4_vpp_shuf1 + mmsize]
+    vpermd          m1, m3, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
+    vpermd          m2, m3, m2                              ; m2 = row[10 9 9 8 8 7 7 6]
+
+    mova            m3, [interp4_vpp_shuf]
+    pshufb          m0, m0, m3
+    pshufb          m1, m1, m3
+    pshufb          m4, m4, m3
+    pshufb          m2, m2, m3
+    pmaddubsw       m0, [r5]
+    pmaddubsw       m1, [r5 + mmsize]
+    pmaddubsw       m4, [r5 + 2 * mmsize]
+    pmaddubsw       m2, [r5 + 3 * mmsize]
+    paddw           m0, m1
+    paddw           m0, m4
+    paddw           m0, m2                                  ; m0 = WORD ROW[3 2 1 0]
+
+    vbroadcasti128  m3, [pw_2000]
+    psubw           m0, m3
+    vextracti128    xm2, m0, 1
+    lea             r5, [r3 * 3]
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r5], xm2
+    RET
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
@@ -3451,6 +4819,122 @@ FILTER_VER_LUMA_4xN 4, 8, ps
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_4xN 4, 16, ps
 
+%macro PROCESS_LUMA_AVX2_W8_8R 0
+    movq            xm1, [r0]                       ; m1 = row 0
+    movq            xm2, [r0 + r1]                  ; m2 = row 1
+    punpcklbw       xm1, xm2                        ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
+    punpcklbw       xm2, xm3                        ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128     m5, m1, xm2, 1                  ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw       m5, [r5]
+    movq            xm4, [r0 + r4]                  ; m4 = row 3
+    punpcklbw       xm3, xm4                        ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm1, [r0]                       ; m1 = row 4
+    punpcklbw       xm4, xm1                        ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128     m2, m3, xm4, 1                  ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
+    paddw           m5, m0
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 5
+    punpcklbw       xm1, xm3                        ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
+    punpcklbw       xm3, xm4                        ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
+    paddw           m2, m0
+    pmaddubsw       m1, [r5]
+    movq            xm3, [r0 + r4]                  ; m3 = row 7
+    punpcklbw       xm4, xm3                        ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm0, [r0]                       ; m0 = row 8
+    punpcklbw       xm3, xm0                        ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
+    paddw           m2, m3
+    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m4, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 9
+    punpcklbw       xm0, xm3                        ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    movq            xm6, [r0 + r1 * 2]              ; m6 = row 10
+    punpcklbw       xm3, xm6                        ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+    vinserti128     m0, m0, xm3, 1                  ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
+    paddw           m2, m3
+    pmaddubsw       m3, m0, [r5 + 2 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m0, [r5 + 1 * mmsize]
+    paddw           m4, m0
+
+    movq            xm3, [r0 + r4]                  ; m3 = row 11
+    punpcklbw       xm6, xm3                        ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm0, [r0]                       ; m0 = row 12
+    punpcklbw       xm3, xm0                        ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
+    vinserti128     m6, m6, xm3, 1                  ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+    pmaddubsw       m3, m6, [r5 + 3 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m6, [r5 + 2 * mmsize]
+    paddw           m4, m6
+    movq            xm3, [r0 + r1]                  ; m3 = row 13
+    punpcklbw       xm0, xm3                        ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+    movq            xm6, [r0 + r1 * 2]              ; m6 = row 14
+    punpcklbw       xm3, xm6                        ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
+    vinserti128     m0, m0, xm3, 1                  ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+    pmaddubsw       m0, [r5 + 3 * mmsize]
+    paddw           m4, m0
+%endmacro
+
+%macro PROCESS_LUMA_AVX2_W8_4R 0
+    movq            xm1, [r0]                       ; m1 = row 0
+    movq            xm2, [r0 + r1]                  ; m2 = row 1
+    punpcklbw       xm1, xm2                        ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
+    punpcklbw       xm2, xm3                        ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128     m5, m1, xm2, 1                  ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw       m5, [r5]
+    movq            xm4, [r0 + r4]                  ; m4 = row 3
+    punpcklbw       xm3, xm4                        ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm1, [r0]                       ; m1 = row 4
+    punpcklbw       xm4, xm1                        ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128     m2, m3, xm4, 1                  ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
+    paddw           m5, m0
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 5
+    punpcklbw       xm1, xm3                        ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
+    punpcklbw       xm3, xm4                        ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
+    paddw           m2, m0
+    movq            xm3, [r0 + r4]                  ; m3 = row 7
+    punpcklbw       xm4, xm3                        ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm0, [r0]                       ; m0 = row 8
+    punpcklbw       xm3, xm0                        ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
+    paddw           m2, m3
+    movq            xm3, [r0 + r1]                  ; m3 = row 9
+    punpcklbw       xm0, xm3                        ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    movq            xm6, [r0 + r1 * 2]              ; m6 = row 10
+    punpcklbw       xm3, xm6                        ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+    vinserti128     m0, m0, xm3, 1                  ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
+    paddw           m2, m3
+%endmacro
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
@@ -3473,7 +4957,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
 %endif
 
  %ifidn %3,pp
-    mova      m3, [tab_c_512]
+    mova      m3, [pw_512]
 %else
     mova      m3, [pw_2000]
 %endif
@@ -3520,6 +5004,115 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
     RET
 %endmacro
 
+%macro FILTER_VER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r1 * 4]
+    mov             word [rsp], %2 / 8
+    mova            m7, [pw_512]
+
+.loop:
+    PROCESS_LUMA_AVX2_W8_8R
+    pmulhrsw        m5, m7                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m7                          ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m7                          ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m7                          ; m4 = word: row 6, row 7
+    packuswb        m5, m2
+    packuswb        m1, m4
+    vextracti128    xm2, m5, 1
+    vextracti128    xm4, m1, 1
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    lea             r2, [r2 + r3 * 2]
+    movhps          [r2], xm5
+    movhps          [r2 + r3], xm2
+    lea             r2, [r2 + r3 * 2]
+    movq            [r2], xm1
+    movq            [r2 + r3], xm4
+    lea             r2, [r2 + r3 * 2]
+    movhps          [r2], xm1
+    movhps          [r2 + r3], xm4
+    lea             r2, [r2 + r3 * 2]
+    sub             r0, r6
+    dec             word [rsp]
+    jnz             .loop
+    RET
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    PROCESS_LUMA_AVX2_W8_8R
+    lea             r4, [r3 * 3]
+    mova            m3, [pw_512]
+    pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m3                          ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m3                          ; m4 = word: row 6, row 7
+    packuswb        m5, m2
+    packuswb        m1, m4
+    vextracti128    xm2, m5, 1
+    vextracti128    xm4, m1, 1
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm5
+    movhps          [r2 + r4], xm2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm1
+    movq            [r2 + r3], xm4
+    movhps          [r2 + r3 * 2], xm1
+    movhps          [r2 + r4], xm4
+    RET
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    PROCESS_LUMA_AVX2_W8_4R
+    lea             r4, [r3 * 3]
+    mova            m3, [pw_512]
+    pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
+    packuswb        m5, m2
+    vextracti128    xm2, m5, 1
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm5
+    movhps          [r2 + r4], xm2
+    RET
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
@@ -3534,11 +5127,13 @@ FILTER_VER_LUMA_8xN 8, 8, pp
 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 16, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 16
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 32, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 32
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3581,7 +5176,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
 %endif
 
  %ifidn %3,pp
-    mova      m3, [tab_c_512]
+    mova      m3, [pw_512]
 %else
     mova      m3, [pw_2000]
 %endif
@@ -3674,6 +5269,2260 @@ FILTER_VER_LUMA_12xN 12, 16, pp
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_12xN 12, 16, ps
 
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    pmaddubsw       m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    pmaddubsw       m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
+    paddw           m8, m12
+    pmaddubsw       m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
+    paddw           m9, m13
+    pmaddubsw       m11, [r5]
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movq            [r2], xm0
+    pextrd          [r2 + 8], xm0, 2
+    movq            [r2 + r3], xm1
+    pextrd          [r2 + r3 + 8], xm1, 2
+    movq            [r2 + r3 * 2], xm2
+    pextrd          [r2 + r3 * 2 + 8], xm2, 2
+    movq            [r2 + r6], xm3
+    pextrd          [r2 + r6 + 8], xm3, 2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm4
+    pextrd          [r2 + 8], xm4, 2
+    movq            [r2 + r3], xm5
+    pextrd          [r2 + r3 + 8], xm5, 2
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
+    paddw           m8, m0
+    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
+    paddw           m10, m0
+    pmaddubsw       m12, [r5]
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
+    paddw           m9, m1
+    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
+    paddw           m11, m1
+    pmaddubsw       m13, [r5]
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movq            [r2 + r3 * 2], xm6
+    pextrd          [r2 + r3 * 2 + 8], xm6, 2
+    movq            [r2 + r6], xm7
+    pextrd          [r2 + r6 + 8], xm7, 2
+    lea             r2, [r2 + r3 * 4]
+
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
+    paddw           m8, m2
+    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
+    paddw           m10, m2
+    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
+    paddw           m12, m2
+    pmaddubsw       m0, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
+    paddw           m9, m3
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m11, m3
+    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
+    paddw           m13, m3
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
+    paddw           m10, m4
+    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
+    paddw           m12, m4
+    pmaddubsw       m2, [r5 + 1 * mmsize]
+    paddw           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
+    paddw           m11, m5
+    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
+    paddw           m13, m5
+    pmaddubsw       m3, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    movu            xm5, [r0 + r4]                  ; m5 = row 19
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
+    paddw           m12, m6
+    pmaddubsw       m4, [r5 + 2 * mmsize]
+    paddw           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm6, [r0]                       ; m6 = row 20
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
+    paddw           m13, m7
+    pmaddubsw       m5, [r5 + 2 * mmsize]
+    paddw           m1, m5
+    movu            xm7, [r0 + r1]                  ; m7 = row 21
+    punpckhbw       xm2, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddubsw       m6, [r5 + 3 * mmsize]
+    paddw           m0, m6
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
+    punpckhbw       xm3, xm7, xm2
+    punpcklbw       xm7, xm2
+    vinserti128     m7, m7, xm3, 1
+    pmaddubsw       m7, [r5 + 3 * mmsize]
+    paddw           m1, m7
+
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    pmulhrsw        m12, m14                        ; m12 = word: row 12
+    pmulhrsw        m13, m14                        ; m13 = word: row 13
+    pmulhrsw        m0, m14                         ; m0 = word: row 14
+    pmulhrsw        m1, m14                         ; m1 = word: row 15
+    packuswb        m8, m9
+    packuswb        m10, m11
+    packuswb        m12, m13
+    packuswb        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    movq            [r2], xm8
+    pextrd          [r2 + 8], xm8, 2
+    movq            [r2 + r3], xm9
+    pextrd          [r2 + r3 + 8], xm9, 2
+    movq            [r2 + r3 * 2], xm10
+    pextrd          [r2 + r3 * 2 + 8], xm10, 2
+    movq            [r2 + r6], xm11
+    pextrd          [r2 + r6 + 8], xm11, 2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm12
+    pextrd          [r2 + 8], xm12, 2
+    movq            [r2 + r3], xm13
+    pextrd          [r2 + r3 + 8], xm13, 2
+    movq            [r2 + r3 * 2], xm0
+    pextrd          [r2 + r3 * 2 + 8], xm0, 2
+    movq            [r2 + r6], xm1
+    pextrd          [r2 + r6 + 8], xm1, 2
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    pmaddubsw       m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    pmaddubsw       m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
+    paddw           m8, m12
+    pmaddubsw       m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
+    paddw           m9, m13
+    pmaddubsw       m11, [r5]
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
+    paddw           m8, m0
+    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
+    paddw           m10, m0
+    pmaddubsw       m12, [r5]
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
+    paddw           m9, m1
+    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
+    paddw           m11, m1
+    pmaddubsw       m13, [r5]
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+    lea             r2, [r2 + r3 * 4]
+
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
+    paddw           m8, m2
+    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
+    paddw           m10, m2
+    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
+    paddw           m12, m2
+    pmaddubsw       m0, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
+    paddw           m9, m3
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m11, m3
+    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
+    paddw           m13, m3
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
+    paddw           m10, m4
+    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
+    paddw           m12, m4
+    pmaddubsw       m2, [r5 + 1 * mmsize]
+    paddw           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
+    paddw           m11, m5
+    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
+    paddw           m13, m5
+    pmaddubsw       m3, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    movu            xm5, [r0 + r4]                  ; m5 = row 19
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
+    paddw           m12, m6
+    pmaddubsw       m4, [r5 + 2 * mmsize]
+    paddw           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm6, [r0]                       ; m6 = row 20
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
+    paddw           m13, m7
+    pmaddubsw       m5, [r5 + 2 * mmsize]
+    paddw           m1, m5
+    movu            xm7, [r0 + r1]                  ; m7 = row 21
+    punpckhbw       xm2, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddubsw       m6, [r5 + 3 * mmsize]
+    paddw           m0, m6
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
+    punpckhbw       xm3, xm7, xm2
+    punpcklbw       xm7, xm2
+    vinserti128     m7, m7, xm3, 1
+    pmaddubsw       m7, [r5 + 3 * mmsize]
+    paddw           m1, m7
+
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    pmulhrsw        m12, m14                        ; m12 = word: row 12
+    pmulhrsw        m13, m14                        ; m13 = word: row 13
+    pmulhrsw        m0, m14                         ; m0 = word: row 14
+    pmulhrsw        m1, m14                         ; m1 = word: row 15
+    packuswb        m8, m9
+    packuswb        m10, m11
+    packuswb        m12, m13
+    packuswb        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r6], xm11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm12
+    movu            [r2 + r3], xm13
+    movu            [r2 + r3 * 2], xm0
+    movu            [r2 + r6], xm1
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    pmaddubsw       m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    pmaddubsw       m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
+    paddw           m8, m12
+    pmaddubsw       m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
+    paddw           m9, m13
+    pmaddubsw       m11, [r5]
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
+    paddw           m8, m0
+    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
+    paddw           m10, m0
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
+    paddw           m9, m1
+    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
+    paddw           m11, m1
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+    lea             r2, [r2 + r3 * 4]
+
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
+    paddw           m8, m2
+    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
+    paddw           m10, m2
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
+    paddw           m9, m3
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m11, m3
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
+    paddw           m10, m4
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
+    paddw           m11, m5
+
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    packuswb        m8, m9
+    packuswb        m10, m11
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r6], xm11
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m12, [pw_512]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+
+    pmulhrsw        m0, m12                         ; m0 = word: row 0
+    pmulhrsw        m1, m12                         ; m1 = word: row 1
+    pmulhrsw        m2, m12                         ; m2 = word: row 2
+    pmulhrsw        m3, m12                         ; m3 = word: row 3
+    packuswb        m0, m1
+    packuswb        m2, m3
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    RET
+%endif
+
+%macro FILTER_VER_LUMA_AVX2_16xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    lea             r7, [r1 * 4]
+    mova            m14, [pw_512]
+    mov             r8d, %2 / 16
+
+.loop:
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    pmaddubsw       m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    pmaddubsw       m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
+    paddw           m8, m12
+    pmaddubsw       m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
+    paddw           m9, m13
+    pmaddubsw       m11, [r5]
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
+    paddw           m8, m0
+    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
+    paddw           m10, m0
+    pmaddubsw       m12, [r5]
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
+    paddw           m9, m1
+    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
+    paddw           m11, m1
+    pmaddubsw       m13, [r5]
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+    lea             r2, [r2 + r3 * 4]
+
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
+    paddw           m8, m2
+    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
+    paddw           m10, m2
+    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
+    paddw           m12, m2
+    pmaddubsw       m0, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
+    paddw           m9, m3
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m11, m3
+    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
+    paddw           m13, m3
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
+    paddw           m10, m4
+    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
+    paddw           m12, m4
+    pmaddubsw       m2, [r5 + 1 * mmsize]
+    paddw           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
+    paddw           m11, m5
+    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
+    paddw           m13, m5
+    pmaddubsw       m3, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    movu            xm5, [r0 + r4]                  ; m5 = row 19
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
+    paddw           m12, m6
+    pmaddubsw       m4, [r5 + 2 * mmsize]
+    paddw           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm6, [r0]                       ; m6 = row 20
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
+    paddw           m13, m7
+    pmaddubsw       m5, [r5 + 2 * mmsize]
+    paddw           m1, m5
+    movu            xm7, [r0 + r1]                  ; m7 = row 21
+    punpckhbw       xm2, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddubsw       m6, [r5 + 3 * mmsize]
+    paddw           m0, m6
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
+    punpckhbw       xm3, xm7, xm2
+    punpcklbw       xm7, xm2
+    vinserti128     m7, m7, xm3, 1
+    pmaddubsw       m7, [r5 + 3 * mmsize]
+    paddw           m1, m7
+
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    pmulhrsw        m12, m14                        ; m12 = word: row 12
+    pmulhrsw        m13, m14                        ; m13 = word: row 13
+    pmulhrsw        m0, m14                         ; m0 = word: row 14
+    pmulhrsw        m1, m14                         ; m1 = word: row 15
+    packuswb        m8, m9
+    packuswb        m10, m11
+    packuswb        m12, m13
+    packuswb        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r6], xm11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm12
+    movu            [r2 + r3], xm13
+    movu            [r2 + r3 * 2], xm0
+    movu            [r2 + r6], xm1
+    lea             r2, [r2 + r3 * 4]
+    sub             r0, r7
+    dec             r8d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16xN 16, 32
+FILTER_VER_LUMA_AVX2_16xN 16, 64
+
+%macro PROCESS_LUMA_AVX2_W16_16R 0
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r7, [r0 + r1 * 4]
+    movu            xm4, [r7]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r7 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r7 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm8, [r7]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r7 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    pmaddubsw       m8, [r5]
+    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    pmaddubsw       m9, [r5]
+    movu            xm11, [r7 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
+    paddw           m8, m12
+    pmaddubsw       m10, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm12, [r7]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
+    paddw           m9, m13
+    pmaddubsw       m11, [r5]
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r8, [r2 + r3 * 4]
+    movu            [r8], xm4
+    movu            [r8 + r3], xm5
+
+    movu            xm13, [r7 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
+    paddw           m8, m0
+    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
+    paddw           m10, m0
+    pmaddubsw       m12, [r5]
+    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
+    paddw           m9, m1
+    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
+    paddw           m11, m1
+    pmaddubsw       m13, [r5]
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r8 + r3 * 2], xm6
+    movu            [r8 + r6], xm7
+    lea             r8, [r8 + r3 * 4]
+
+    movu            xm1, [r7 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
+    paddw           m8, m2
+    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
+    paddw           m10, m2
+    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
+    paddw           m12, m2
+    pmaddubsw       m0, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm2, [r7]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
+    paddw           m9, m3
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m11, m3
+    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
+    paddw           m13, m3
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r7 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
+    paddw           m10, m4
+    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
+    paddw           m12, m4
+    pmaddubsw       m2, [r5 + 1 * mmsize]
+    paddw           m0, m2
+    movu            xm4, [r7 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
+    paddw           m11, m5
+    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
+    paddw           m13, m5
+    pmaddubsw       m3, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    movu            xm5, [r7 + r4]                  ; m5 = row 19
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
+    paddw           m12, m6
+    pmaddubsw       m4, [r5 + 2 * mmsize]
+    paddw           m0, m4
+    lea             r7, [r7 + r1 * 4]
+    movu            xm6, [r7]                       ; m6 = row 20
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
+    paddw           m13, m7
+    pmaddubsw       m5, [r5 + 2 * mmsize]
+    paddw           m1, m5
+    movu            xm7, [r7 + r1]                  ; m7 = row 21
+    punpckhbw       xm2, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddubsw       m6, [r5 + 3 * mmsize]
+    paddw           m0, m6
+    movu            xm2, [r7 + r1 * 2]              ; m2 = row 22
+    punpckhbw       xm3, xm7, xm2
+    punpcklbw       xm7, xm2
+    vinserti128     m7, m7, xm3, 1
+    pmaddubsw       m7, [r5 + 3 * mmsize]
+    paddw           m1, m7
+
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    pmulhrsw        m12, m14                        ; m12 = word: row 12
+    pmulhrsw        m13, m14                        ; m13 = word: row 13
+    pmulhrsw        m0, m14                         ; m0 = word: row 14
+    pmulhrsw        m1, m14                         ; m1 = word: row 15
+    packuswb        m8, m9
+    packuswb        m10, m11
+    packuswb        m12, m13
+    packuswb        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    movu            [r8], xm8
+    movu            [r8 + r3], xm9
+    movu            [r8 + r3 * 2], xm10
+    movu            [r8 + r6], xm11
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], xm12
+    movu            [r8 + r3], xm13
+    movu            [r8 + r3 * 2], xm0
+    movu            [r8 + r6], xm1
+%endmacro
+
+%macro PROCESS_LUMA_AVX2_W16_8R 0
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r7, [r0 + r1 * 4]
+    movu            xm4, [r7]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r7 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r7 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm8, [r7]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r7 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    movu            xm11, [r7 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    lea             r7, [r7 + r1 * 4]
+    movu            xm12, [r7]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r8, [r2 + r3 * 4]
+    movu            [r8], xm4
+    movu            [r8 + r3], xm5
+
+    movu            xm13, [r7 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r8 + r3 * 2], xm6
+    movu            [r8 + r6], xm7
+%endmacro
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    lea             r10, [r1 * 4]
+    mova            m14, [pw_512]
+    mov             r9d, 2
+.loopH:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+
+    movq            xm1, [r0]                       ; m1 = row 0
+    movq            xm2, [r0 + r1]                  ; m2 = row 1
+    punpcklbw       xm1, xm2
+    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
+    punpcklbw       xm2, xm3
+    vinserti128     m5, m1, xm2, 1
+    pmaddubsw       m5, [r5]
+    movq            xm4, [r0 + r4]                  ; m4 = row 3
+    punpcklbw       xm3, xm4
+    lea             r7, [r0 + r1 * 4]
+    movq            xm1, [r7]                       ; m1 = row 4
+    punpcklbw       xm4, xm1
+    vinserti128     m2, m3, xm4, 1
+    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
+    paddw           m5, m0
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r7 + r1]                  ; m3 = row 5
+    punpcklbw       xm1, xm3
+    movq            xm4, [r7 + r1 * 2]              ; m4 = row 6
+    punpcklbw       xm3, xm4
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
+    paddw           m2, m0
+    pmaddubsw       m1, [r5]
+    movq            xm3, [r7 + r4]                  ; m3 = row 7
+    punpcklbw       xm4, xm3
+    lea             r7, [r7 + r1 * 4]
+    movq            xm0, [r7]                       ; m0 = row 8
+    punpcklbw       xm3, xm0
+    vinserti128     m4, m4, xm3, 1
+    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
+    paddw           m2, m3
+    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m4, [r5]
+    movq            xm3, [r7 + r1]                  ; m3 = row 9
+    punpcklbw       xm0, xm3
+    movq            xm6, [r7 + r1 * 2]              ; m6 = row 10
+    punpcklbw       xm3, xm6
+    vinserti128     m0, m0, xm3, 1
+    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
+    paddw           m2, m3
+    pmaddubsw       m3, m0, [r5 + 2 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m3, m0, [r5 + 1 * mmsize]
+    paddw           m4, m3
+    pmaddubsw       m0, [r5]
+
+    movq            xm3, [r7 + r4]                  ; m3 = row 11
+    punpcklbw       xm6, xm3
+    lea             r7, [r7 + r1 * 4]
+    movq            xm7, [r7]                       ; m7 = row 12
+    punpcklbw       xm3, xm7
+    vinserti128     m6, m6, xm3, 1
+    pmaddubsw       m3, m6, [r5 + 3 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m3, m6, [r5 + 2 * mmsize]
+    paddw           m4, m3
+    pmaddubsw       m3, m6, [r5 + 1 * mmsize]
+    paddw           m0, m3
+    pmaddubsw       m6, [r5]
+    movq            xm3, [r7 + r1]                  ; m3 = row 13
+    punpcklbw       xm7, xm3
+    movq            xm8, [r7 + r1 * 2]              ; m8 = row 14
+    punpcklbw       xm3, xm8
+    vinserti128     m7, m7, xm3, 1
+    pmaddubsw       m3, m7, [r5 + 3 * mmsize]
+    paddw           m4, m3
+    pmaddubsw       m3, m7, [r5 + 2 * mmsize]
+    paddw           m0, m3
+    pmaddubsw       m3, m7, [r5 + 1 * mmsize]
+    paddw           m6, m3
+    pmaddubsw       m7, [r5]
+    movq            xm3, [r7 + r4]                  ; m3 = row 15
+    punpcklbw       xm8, xm3
+    lea             r7, [r7 + r1 * 4]
+    movq            xm9, [r7]                       ; m9 = row 16
+    punpcklbw       xm3, xm9
+    vinserti128     m8, m8, xm3, 1
+    pmaddubsw       m3, m8, [r5 + 3 * mmsize]
+    paddw           m0, m3
+    pmaddubsw       m3, m8, [r5 + 2 * mmsize]
+    paddw           m6, m3
+    pmaddubsw       m3, m8, [r5 + 1 * mmsize]
+    paddw           m7, m3
+    pmaddubsw       m8, [r5]
+    movq            xm3, [r7 + r1]                  ; m3 = row 17
+    punpcklbw       xm9, xm3
+    movq            xm10, [r7 + r1 * 2]             ; m10 = row 18
+    punpcklbw       xm3, xm10
+    vinserti128     m9, m9, xm3, 1
+    pmaddubsw       m3, m9, [r5 + 3 * mmsize]
+    paddw           m6, m3
+    pmaddubsw       m3, m9, [r5 + 2 * mmsize]
+    paddw           m7, m3
+    pmaddubsw       m3, m9, [r5 + 1 * mmsize]
+    paddw           m8, m3
+    movq            xm3, [r7 + r4]                  ; m3 = row 19
+    punpcklbw       xm10, xm3
+    lea             r7, [r7 + r1 * 4]
+    movq            xm9, [r7]                       ; m9 = row 20
+    punpcklbw       xm3, xm9
+    vinserti128     m10, m10, xm3, 1
+    pmaddubsw       m3, m10, [r5 + 3 * mmsize]
+    paddw           m7, m3
+    pmaddubsw       m3, m10, [r5 + 2 * mmsize]
+    paddw           m8, m3
+    movq            xm3, [r7 + r1]                  ; m3 = row 21
+    punpcklbw       xm9, xm3
+    movq            xm10, [r7 + r1 * 2]             ; m10 = row 22
+    punpcklbw       xm3, xm10
+    vinserti128     m9, m9, xm3, 1
+    pmaddubsw       m3, m9, [r5 + 3 * mmsize]
+    paddw           m8, m3
+
+    pmulhrsw        m5, m14                         ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m14                         ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m14                         ; m4 = word: row 6, row 7
+    pmulhrsw        m0, m14                         ; m0 = word: row 8, row 9
+    pmulhrsw        m6, m14                         ; m6 = word: row 10, row 11
+    pmulhrsw        m7, m14                         ; m7 = word: row 12, row 13
+    pmulhrsw        m8, m14                         ; m8 = word: row 14, row 15
+    packuswb        m5, m2
+    packuswb        m1, m4
+    packuswb        m0, m6
+    packuswb        m7, m8
+    vextracti128    xm2, m5, 1
+    vextracti128    xm4, m1, 1
+    vextracti128    xm6, m0, 1
+    vextracti128    xm8, m7, 1
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm5
+    movhps          [r2 + r6], xm2
+    lea             r8, [r2 + r3 * 4]
+    movq            [r8], xm1
+    movq            [r8 + r3], xm4
+    movhps          [r8 + r3 * 2], xm1
+    movhps          [r8 + r6], xm4
+    lea             r8, [r8 + r3 * 4]
+    movq            [r8], xm0
+    movq            [r8 + r3], xm6
+    movhps          [r8 + r3 * 2], xm0
+    movhps          [r8 + r6], xm6
+    lea             r8, [r8 + r3 * 4]
+    movq            [r8], xm7
+    movq            [r8 + r3], xm8
+    movhps          [r8 + r3 * 2], xm7
+    movhps          [r8 + r6], xm8
+
+    sub             r7, r10
+    lea             r0, [r7 - 16]
+    lea             r2, [r8 + r3 * 4 - 16]
+    dec             r9d
+    jnz             .loopH
+    RET
+%endif
+
+%macro FILTER_VER_LUMA_AVX2_32xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    lea             r11, [r1 * 4]
+    mova            m14, [pw_512]
+    mov             r9d, %2 / 16
+.loopH:
+    mov             r10d, %1 / 16
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+    dec             r10d
+    jnz             .loopW
+    sub             r7, r11
+    lea             r0, [r7 - 16]
+    lea             r2, [r8 + r3 * 4 - 16]
+    dec             r9d
+    jnz             .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32xN 32, 32
+FILTER_VER_LUMA_AVX2_32xN 32, 64
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+    mov             r9d, 2
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+    mov             r9d, 2
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r9, [r1 * 4]
+    sub             r7, r9
+    lea             r0, [r7 - 16]
+    lea             r2, [r8 + r3 * 4 - 16]
+    mov             r9d, 2
+.loop:
+    PROCESS_LUMA_AVX2_W16_8R
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loop
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+    mov             r9d, 2
+.loopW:
+    PROCESS_LUMA_AVX2_W16_8R
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    lea             r11, [r1 * 4]
+    mova            m14, [pw_512]
+    mov             r9d, 4
+.loopH:
+    mov             r10d, 3
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+    dec             r10d
+    jnz             .loopW
+    sub             r7, r11
+    lea             r0, [r7 - 32]
+    lea             r2, [r8 + r3 * 4 - 32]
+    dec             r9d
+    jnz             .loopH
+    RET
+%endif
+
+%macro FILTER_VER_LUMA_AVX2_64xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    lea             r11, [r1 * 4]
+    mova            m14, [pw_512]
+    mov             r9d, %2 / 16
+.loopH:
+    mov             r10d, %1 / 16
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+    dec             r10d
+    jnz             .loopW
+    sub             r7, r11
+    lea             r0, [r7 - 48]
+    lea             r2, [r8 + r3 * 4 - 48]
+    dec             r9d
+    jnz             .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_64xN 64, 32
+FILTER_VER_LUMA_AVX2_64xN 64, 48
+FILTER_VER_LUMA_AVX2_64xN 64, 64
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r3 * 3]
+    mova            m14, [pw_512]
+    mov             r9d, 4
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
@@ -3695,7 +7544,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
 %endif
 
 %ifidn %3,pp
-    mova      m3, [tab_c_512]
+    mova      m3, [pw_512]
 %else
     mova      m3, [pw_2000]
 %endif
@@ -3959,7 +7808,7 @@ cglobal chroma_p2s, 3, 7, 4
     mov         r4d, r4m
 
     ; load constant
-    mova        m2, [tab_c_128]
+    mova        m2, [pb_128]
     mova        m3, [tab_c_64_n64]
 
 .loopH:
diff --git a/source/common/x86/ipfilter8.h b/source/common/x86/ipfilter8.h
index 3949409..6c8683f 100644
--- a/source/common/x86/ipfilter8.h
+++ b/source/common/x86/ipfilter8.h
@@ -25,10 +25,10 @@
 #define X265_IPFILTER8_H
 
 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
-    void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
-    void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 
 #define LUMA_FILTERS(cpu) \
     SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
@@ -58,7 +58,7 @@
     SETUP_LUMA_FUNC_DEF(16, 64, cpu)
 
 #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
-    void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
 
 #define LUMA_SP_FILTERS(cpu) \
     SETUP_LUMA_SP_FUNC_DEF(4,   4, cpu); \
@@ -88,7 +88,7 @@
     SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
 
 #define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
-    void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 
 #define LUMA_SS_FILTERS(cpu) \
     SETUP_LUMA_SS_FUNC_DEF(4,   4, cpu); \
@@ -120,10 +120,10 @@
 #if HIGH_BIT_DEPTH
 
 #define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 
 #define CHROMA_VERT_FILTERS(cpu) \
     SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
@@ -208,8 +208,8 @@
     SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu)
 
 #define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+    void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
 
 #define CHROMA_HORIZ_FILTERS(cpu) \
     SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
@@ -289,8 +289,8 @@
     SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \
     SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu)
 
-void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
+void x265_luma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
 
 CHROMA_VERT_FILTERS(_sse2);
 CHROMA_HORIZ_FILTERS(_sse4);
@@ -319,10 +319,10 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
 #else // if HIGH_BIT_DEPTH
 
 #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
-    void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+    void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+    void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 
 #define CHROMA_FILTERS(cpu) \
     SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
@@ -403,7 +403,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
     SETUP_CHROMA_FUNC_DEF(16, 64, cpu);
 
 #define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
 
 #define CHROMA_SP_FILTERS(cpu) \
     SETUP_CHROMA_SP_FUNC_DEF(8, 2, cpu); \
@@ -488,7 +488,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
     SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu);
 
 #define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+    void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 
 #define CHROMA_SS_FILTERS(cpu) \
     SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \
@@ -573,12 +573,14 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
     SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu);
 
 CHROMA_FILTERS(_sse4);
+CHROMA_FILTERS(_avx2);
 CHROMA_SP_FILTERS(_sse2);
 CHROMA_SP_FILTERS_SSE4(_sse4);
 CHROMA_SS_FILTERS(_sse2);
 CHROMA_SS_FILTERS_SSE4(_sse4);
 
 CHROMA_FILTERS_422(_sse4);
+CHROMA_FILTERS_422(_avx2);
 CHROMA_SP_FILTERS_422(_sse2);
 CHROMA_SP_FILTERS_422_SSE4(_sse4);
 CHROMA_SS_FILTERS_422(_sse2);
@@ -588,7 +590,7 @@ CHROMA_FILTERS_444(_sse4);
 CHROMA_SP_FILTERS_444(_sse4);
 CHROMA_SS_FILTERS_444(_sse2);
 
-void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
 
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_CHROMA_SP_FUNC_DEF
@@ -616,8 +618,8 @@ LUMA_SP_FILTERS(_sse4);
 LUMA_SS_FILTERS(_sse2);
 LUMA_FILTERS(_avx2);
 
-void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
-void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_interp_8tap_hv_pp_8x8_ssse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
+void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
 
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h
index 95cb609..9bf4611 100644
--- a/source/common/x86/mc.h
+++ b/source/common/x86/mc.h
@@ -25,7 +25,7 @@
 #define X265_MC_H
 
 #define LOWRES(cpu) \
-    void x265_frame_init_lowres_core_ ## cpu(pixel * src0, pixel * dst0, pixel * dsth, pixel * dstv, pixel * dstc, \
+    void x265_frame_init_lowres_core_ ## cpu(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
                                              intptr_t src_stride, intptr_t dst_stride, int width, int height);
 LOWRES(mmx2)
 LOWRES(sse2)
@@ -37,31 +37,31 @@ LOWRES(xop)
     void func ## _mmx2 args; \
     void func ## _sse2 args; \
     void func ## _ssse3 args;
-DECL_SUF(x265_pixel_avg_64x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x48, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_48x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x24, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x8,  (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_24x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x12, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x8,  (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x4,  (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_12x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x32,  (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x16,  (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x8,   (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x4,   (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x16,  (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x8,   (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x4,   (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x48, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_48x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x24, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x8,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_24x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x12, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x8,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x4,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_12x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x32,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x16,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x8,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x4,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x16,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x8,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x4,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
 
 #undef LOWRES
 #undef DECL_SUF
diff --git a/source/common/x86/pixel-util.h b/source/common/x86/pixel-util.h
index 90bb4fc..3c8fe3e 100644
--- a/source/common/x86/pixel-util.h
+++ b/source/common/x86/pixel-util.h
@@ -24,59 +24,52 @@
 #ifndef X265_PIXEL_UTIL_H
 #define X265_PIXEL_UTIL_H
 
-void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-
-void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual16_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual32_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-
-void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
-
-void x265_transpose8_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride);
-
-uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
-
-void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-
-void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
-                                     const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_sse2(const pixel * pix1, intptr_t stride1,
-                                     const pixel * pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_avx(const pixel * pix1, intptr_t stride1,
-                                    const pixel * pix2, intptr_t stride2, int sums[2][4]);
+void x265_getResidual4_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual8_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+
+void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose16_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose32_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose64_sse2(pixel* dest, const pixel* src, intptr_t stride);
+
+void x265_transpose8_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose16_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose32_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose64_avx2(pixel* dest, const pixel* src, intptr_t stride);
+
+uint32_t x265_quant_sse4(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_quant_avx2(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_sse4(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+int x265_count_nonzero_ssse3(const int16_t* quantCoeff, int numCoeff);
+
+void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_sp_sse4(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+
+void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t* pix1, intptr_t stride1,
+                                     const uint8_t* pix2, intptr_t stride2, int sums[2][4]);
+void x265_pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1,
+                                     const pixel* pix2, intptr_t stride2, int sums[2][4]);
+void x265_pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1,
+                                    const pixel* pix2, intptr_t stride2, int sums[2][4]);
 float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width);
 float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width);
 
-void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
-void x265_scale1D_128to64_avx2(pixel *, pixel *, intptr_t);
-void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
+void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
+void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
-    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
+    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
+    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  scr1, intptr_t srcStride0, intptr_t srcStride1);
 
 #define CHROMA_PIXELSUB_DEF(cpu) \
     SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
@@ -91,8 +84,8 @@ void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
     SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
 
 #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
-    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
+    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
+    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  scr1, intptr_t srcStride0, intptr_t srcStride1);
 
 #define LUMA_PIXELSUB_DEF(cpu) \
     SETUP_LUMA_PIXELSUB_PS_FUNC(8,   8, cpu); \
@@ -109,7 +102,7 @@ CHROMA_PIXELSUB_DEF_422(_sse4);
 CHROMA_PIXELSUB_DEF_422(_sse2);
 
 #define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
-    uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
+    uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(const pixel* pix, intptr_t pixstride);
 
 #define LUMA_PIXELVAR_DEF(cpu) \
     SETUP_LUMA_PIXELVAR_FUNC(8,   8, cpu); \
diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm
index 38fb52e..8adeb84 100644
--- a/source/common/x86/pixel-util8.asm
+++ b/source/common/x86/pixel-util8.asm
@@ -61,447 +61,6 @@ cextern pd_1
 cextern pd_32767
 cextern pd_n32768
 
-;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4
-    mova        m5, [pw_pixel_max]
-    mov         t7b, 4/2
-.loop:
-    movh        m0, [r0]
-    movh        m1, [r0 + r4]
-    punpcklqdq  m0, m1
-    movh        m2, [r1]
-    movh        m3, [r1 + r4]
-    punpcklqdq  m2, m3
-    paddw       m0, m2
-    CLIPW       m0, m4, m5
-
-    ; store recipred[]
-    movh        [r3], m0
-    movhps      [r3 + r6], m0
-
-    ; store recqt[]
-    movh        [r2], m0
-    movhps      [r2 + r5], m0
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%else          ;HIGH_BIT_DEPTH
-
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r5d, r5d
-
-    pxor        m0, m0
-    mov         t7b, 4/2
-.loop:
-    movd        m1, [r0]
-    movd        m2, [r0 + r4]
-    punpckldq   m1, m2
-    punpcklbw   m1, m0
-    movh        m2, [r1]
-    movh        m3, [r1 + r4 * 2]
-    punpcklqdq  m2, m3
-    paddw       m1, m2
-    packuswb    m1, m1
-
-    ; store recon[] and recipred[]
-    movd        [r3], m1
-    pshufd      m2, m1, 1
-    movd        [r3 + r6], m2
-
-    ; store recqt[]
-    punpcklbw   m1, m0
-    movh        [r2], m1
-    movhps      [r2 + r5], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 4]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%endif          ;HIGH_BIT_DEPTH
-
-
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons8, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons8, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-
-%if HIGH_BIT_DEPTH
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4
-    mova        m5, [pw_pixel_max]
-    mov         t7b, 8/2
-.loop:
-    movu        m0, [r0]
-    movu        m1, [r0 + r4]
-    movu        m2, [r1]
-    movu        m3, [r1 + r4]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3], m0
-    movu        [r3 + r6], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + r5], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%else          ;HIGH_BIT_DEPTH
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r5d, r5d
-
-    pxor        m0, m0
-    mov         t7b, 8/2
-.loop:
-    movh        m1, [r0]
-    movh        m2, [r0 + r4]
-    punpcklbw   m1, m0
-    punpcklbw   m2, m0
-    movu        m3, [r1]
-    movu        m4, [r1 + r4 * 2]
-    paddw       m1, m3
-    paddw       m2, m4
-    packuswb    m1, m2
-
-    ; store recon[] and recipred[]
-    movh        [r3], m1
-    movhps      [r3 + r6], m1
-
-    ; store recqt[]
-    punpcklbw   m2, m1, m0
-    punpckhbw   m1, m0
-    movu        [r2], m2
-    movu        [r2 + r5], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 4]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%endif          ;HIGH_BIT_DEPTH
-
-
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4
-    mova        m5, [pw_pixel_max]
-    mov         t7b, 16/2
-.loop:
-    movu        m0, [r0]
-    movu        m1, [r0 + 16]
-    movu        m2, [r1]
-    movu        m3, [r1 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3], m0
-    movu        [r3 + 16], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + 16], m1
-
-    movu        m0, [r0 + r4]
-    movu        m1, [r0 + r4 + 16]
-    movu        m2, [r1 + r4]
-    movu        m3, [r1 + r4 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recon[] and recipred[]
-    movu        [r3 + r6], m0
-    movu        [r3 + r6 + 16], m1
-
-    ; store recqt[]
-    movu        [r2 + r5], m0
-    movu        [r2 + r5 + 16], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%else          ;HIGH_BIT_DEPTH
-
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r5d, r5d
-
-    pxor        m0, m0
-    mov         t7b, 16
-.loop:
-    movu        m2, [r0]
-    pmovzxbw    m1, m2
-    punpckhbw   m2, m0
-    paddw       m1, [r1]
-    paddw       m2, [r1 + 16]
-    packuswb    m1, m2
-
-    ; store recon[] and recipred[]
-    movu        [r3], m1
-
-    ; store recqt[]
-    pmovzxbw    m2, m1
-    punpckhbw   m1, m0
-    movu        [r2], m2
-    movu        [r2 + 16], m1
-
-    add         r2, r5
-    add         r3, r6
-    add         r0, r4
-    lea         r1, [r1 + r4 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%endif          ;HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4
-    mova        m5, [pw_pixel_max]
-    mov         t7b, 32/2
-.loop:
-
-    movu        m0, [r0]
-    movu        m1, [r0 + 16]
-    movu        m2, [r1]
-    movu        m3, [r1 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3], m0
-    movu        [r3 + 16], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + 16], m1
-
-    movu        m0, [r0 + 32]
-    movu        m1, [r0 + 48]
-    movu        m2, [r1 + 32]
-    movu        m3, [r1 + 48]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recon[] and recipred[]
-    movu        [r3 + 32], m0
-    movu        [r3 + 48], m1
-
-    ; store recqt[]
-    movu        [r2 + 32], m0
-    movu        [r2 + 48], m1
-    add         r2, r5
-
-    movu        m0, [r0 + r4]
-    movu        m1, [r0 + r4 + 16]
-    movu        m2, [r1 + r4]
-    movu        m3, [r1 + r4 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recon[] and recipred[]
-    movu        [r3 + r6], m0
-    movu        [r3 + r6 + 16], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + 16], m1
-
-    movu        m0, [r0 + r4 + 32]
-    movu        m1, [r0 + r4 + 48]
-    movu        m2, [r1 + r4 + 32]
-    movu        m3, [r1 + r4 + 48]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recon[] and recipred[]
-    movu        [r3 + r6 + 32], m0
-    movu        [r3 + r6 + 48], m1
-    lea         r3, [r3 + r6 * 2]
-
-    ; store recqt[]
-    movu        [r2 + 32], m0
-    movu        [r2 + 48], m1
-    add         r2, r5
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%else          ;HIGH_BIT_DEPTH
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
-    %define t7b     r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
-    %define t7b     byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r5d, r5d
-
-    pxor        m0, m0
-    mov         t7b, 32
-.loop:
-    movu        m2, [r0]
-    movu        m4, [r0 + 16]
-    pmovzxbw    m1, m2
-    punpckhbw   m2, m0
-    pmovzxbw    m3, m4
-    punpckhbw   m4, m0
-
-    paddw       m1, [r1 + 0 * 16]
-    paddw       m2, [r1 + 1 * 16]
-    packuswb    m1, m2
-
-    paddw       m3, [r1 + 2 * 16]
-    paddw       m4, [r1 + 3 * 16]
-    packuswb    m3, m4
-
-    ; store recon[] and recipred[]
-    movu        [r3], m1
-    movu        [r3 + 16], m3
-
-    ; store recqt[]
-    pmovzxbw    m2, m1
-    punpckhbw   m1, m0
-    movu        [r2 + 0 * 16], m2
-    movu        [r2 + 1 * 16], m1
-    pmovzxbw    m4, m3
-    punpckhbw   m3, m0
-    movu        [r2 + 2 * 16], m4
-    movu        [r2 + 3 * 16], m3
-
-    add         r2, r5
-    add         r3, r6
-    add         r0, r4
-    lea         r1, [r1 + r4 * 2]
-
-    dec         t7b
-    jnz        .loop
-    RET
-%endif          ;HIGH_BIT_DEPTH
-
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -861,7 +420,7 @@ cglobal getResidual32, 4,5,7
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal quant, 5,6,8
@@ -883,7 +442,7 @@ cglobal quant, 5,6,8
     pxor        m7, m7          ; m7 = numZero
 .loop:
     ; 4 coeff
-    movu        m0, [r0]        ; m0 = level
+    pmovsxwd    m0, [r0]        ; m0 = level
     pabsd       m1, m0
     pmulld      m1, [r1]        ; m0 = tmpLevel1
     paddd       m2, m1, m5
@@ -901,7 +460,7 @@ cglobal quant, 5,6,8
     movh        [r3], m3
 
     ; 4 coeff
-    movu        m0, [r0 + 16]   ; m0 = level
+    pmovsxwd    m0, [r0 + 8]    ; m0 = level
     pabsd       m1, m0
     pmulld      m1, [r1 + 16]   ; m0 = tmpLevel1
     paddd       m2, m1, m5
@@ -916,7 +475,7 @@ cglobal quant, 5,6,8
     packssdw    m3, m3
     movh        [r3 + 8], m3
 
-    add         r0, 32
+    add         r0, 16
     add         r1, 32
     add         r2, 32
     add         r3, 16
@@ -953,7 +512,7 @@ cglobal quant, 5,5,10
     pxor            m7, m7              ; m7 = numZero
 .loop:
     ; 8 coeff
-    movu            m0, [r0]            ; m0 = level
+    pmovsxwd        m0, [r0]            ; m0 = level
     pabsd           m1, m0
     pmulld          m1, [r1]            ; m0 = tmpLevel1
     paddd           m2, m1, m5
@@ -966,7 +525,7 @@ cglobal quant, 5,5,10
     psignd          m2, m0
 
     ; 8 coeff
-    movu            m0, [r0 + mmsize]   ; m0 = level
+    pmovsxwd        m0, [r0 + mmsize/2] ; m0 = level
     pabsd           m1, m0
     pmulld          m1, [r1 + mmsize]   ; m0 = tmpLevel1
     paddd           m3, m1, m5
@@ -987,7 +546,7 @@ cglobal quant, 5,5,10
     pminuw          m2, m9
     paddw           m7, m2
 
-    add             r0, mmsize*2
+    add             r0, mmsize
     add             r1, mmsize*2
     add             r2, mmsize*2
     add             r3, mmsize
@@ -1025,7 +584,7 @@ cglobal quant, 5,6,8
     pxor            m7, m7          ; m7 = numZero
 .loop:
     ; 8 coeff
-    movu            m0, [r0]        ; m0 = level
+    pmovsxwd        m0, [r0]        ; m0 = level
     pabsd           m1, m0
     pmulld          m1, [r1]        ; m0 = tmpLevel1
     paddd           m2, m1, m5
@@ -1044,7 +603,7 @@ cglobal quant, 5,6,8
     movu            [r3], xm3
 
     ; 8 coeff
-    movu            m0, [r0 + mmsize]        ; m0 = level
+    pmovsxwd        m0, [r0 + mmsize/2]        ; m0 = level
     pabsd           m1, m0
     pmulld          m1, [r1 + mmsize]        ; m0 = tmpLevel1
     paddd           m2, m1, m5
@@ -1062,7 +621,7 @@ cglobal quant, 5,6,8
     vpermq          m3, m3, q0020
     movu            [r3 + mmsize/2], xm3
 
-    add             r0, mmsize*2
+    add             r0, mmsize
     add             r1, mmsize*2
     add             r2, mmsize*2
     add             r3, mmsize
@@ -1083,7 +642,7 @@ IACA_END
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal nquant, 3,5,8
@@ -1096,8 +655,8 @@ cglobal nquant, 3,5,8
     shr         r4d, 3
 
 .loop:
-    movu        m0, [r0]        ; m0 = level
-    movu        m1, [r0 + 16]   ; m1 = level
+    pmovsxwd    m0, [r0]        ; m0 = level
+    pmovsxwd    m1, [r0 + 8]    ; m1 = level
 
     pabsd       m2, m0
     pmulld      m2, [r1]        ; m0 = tmpLevel1 * qcoeff
@@ -1114,7 +673,7 @@ cglobal nquant, 3,5,8
     packssdw    m2, m3
 
     movu        [r2], m2
-    add         r0, 32
+    add         r0, 16
     add         r1, 32
     add         r2, 16
 
@@ -1144,14 +703,14 @@ cglobal nquant, 3,5,7
     shr         r4d, 4
 
 .loop:
-    movu        m0, [r0]            ; m0 = level
+    pmovsxwd    m0, [r0]            ; m0 = level
     pabsd       m1, m0
     pmulld      m1, [r1]            ; m0 = tmpLevel1 * qcoeff
     paddd       m1, m4
     psrad       m1, xm3             ; m0 = level1
     psignd      m1, m0
 
-    movu        m0, [r0 + mmsize]   ; m0 = level
+    pmovsxwd    m0, [r0 + mmsize/2] ; m0 = level
     pabsd       m2, m0
     pmulld      m2, [r1 + mmsize]   ; m0 = tmpLevel1 * qcoeff
     paddd       m2, m4
@@ -1162,7 +721,7 @@ cglobal nquant, 3,5,7
     vpermq      m2, m1, q3120
 
     movu        [r2], m2
-    add         r0, mmsize * 2
+    add         r0, mmsize
     add         r1, mmsize * 2
     add         r2, mmsize
 
@@ -1211,15 +770,11 @@ cglobal dequant_normal, 5,5,5
     pmaddwd     m4, m1
     psrad       m3, m0
     psrad       m4, m0
-    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
-    pmovsxwd    m3, m3
-    packssdw    m4, m4
-    pmovsxwd    m4, m4
+    packssdw    m3, m4
     mova        [r1], m3
-    mova        [r1 + 16], m4
 
     add         r0, 16
-    add         r1, 32
+    add         r1, 16
 
     sub         r2d, 8
     jnz        .loop
@@ -1259,13 +814,12 @@ cglobal dequant_normal, 5,5,7
     pmaxsd          m3, m6
     pminsd          m4, m5
     pmaxsd          m4, m6
+    packssdw        m3, m4
     mova            [r1 + 0 * mmsize/2], xm3
-    mova            [r1 + 1 * mmsize/2], xm4
-    vextracti128    [r1 + 2 * mmsize/2], m3, 1
-    vextracti128    [r1 + 3 * mmsize/2], m4, 1
+    vextracti128    [r1 + 1 * mmsize/2], m3, 1
 
     add             r0, mmsize
-    add             r1, mmsize * 2
+    add             r1, mmsize
 
     dec             r2d
     jnz            .loop
diff --git a/source/common/x86/pixel.h b/source/common/x86/pixel.h
index e99b1ee..cb85dbd 100644
--- a/source/common/x86/pixel.h
+++ b/source/common/x86/pixel.h
@@ -57,17 +57,17 @@
     ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
 
 #define DECL_X1(name, suffix) \
-    DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
+    DECL_PIXELS(int, name, suffix, (const pixel*, intptr_t, const pixel*, intptr_t))
 
 #define DECL_X1_SS(name, suffix) \
-    DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
+    DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const int16_t*, intptr_t))
 
 #define DECL_X1_SP(name, suffix) \
-    DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, pixel *, intptr_t))
+    DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const pixel*, intptr_t))
 
 #define DECL_X4(name, suffix) \
-    DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
-    DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
+    DECL_PIXELS(void, name ## _x3, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) \
+    DECL_PIXELS(void, name ## _x4, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*))
 
 /* sad-a.asm */
 DECL_X1(sad, mmx2)
@@ -103,11 +103,11 @@ DECL_X1(satd, sse4)
 DECL_X1(satd, avx)
 DECL_X1(satd, xop)
 DECL_X1(satd, avx2)
-int x265_pixel_satd_8x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x4_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x12_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x64_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
 
 DECL_X1(sa8d, mmx2)
 DECL_X1(sa8d, sse2)
@@ -138,42 +138,42 @@ DECL_X1_SS(ssd_ss, xop)
 DECL_X1_SS(ssd_ss, avx2)
 DECL_X1_SP(ssd_sp, sse4)
 #define DECL_HEVC_SSD(suffix) \
-    int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_16x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_32x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_32x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_16x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_32x24_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_24x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_32x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_8x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_16x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_16x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_8x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_16x12_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_16x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_8x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
-    int x265_pixel_ssd_8x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t);
+    int x265_pixel_ssd_32x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_16x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_32x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_32x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_16x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_32x24_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_24x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_32x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_8x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_16x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_16x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_8x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_16x12_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_16x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_8x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+    int x265_pixel_ssd_8x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t);
 DECL_HEVC_SSD(sse2)
 DECL_HEVC_SSD(ssse3)
 DECL_HEVC_SSD(avx)
 
-int x265_pixel_ssd_12x16_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_24x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_48x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x16_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_ssd_12x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_24x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_48x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x48_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
 
-int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_4_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
 
 #define ADDAVG(func)  \
-    void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
 ADDAVG(addAvg_2x4)
 ADDAVG(addAvg_2x8)
 ADDAVG(addAvg_4x2);
@@ -216,8 +216,8 @@ ADDAVG(addAvg_16x24)
 ADDAVG(addAvg_24x64)
 ADDAVG(addAvg_32x48)
 
-void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
-void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
+void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
diff --git a/source/common/yuv.cpp b/source/common/yuv.cpp
index fffc215..eedb5b2 100644
--- a/source/common/yuv.cpp
+++ b/source/common/yuv.cpp
@@ -43,21 +43,31 @@ bool Yuv::create(uint32_t size, int csp)
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
 
-    // set width and height
     m_size  = size;
-    m_csize = size >> m_hChromaShift;
     m_part = partitionFromSizes(size, size);
 
-    size_t sizeL = size * size;
-    size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift);
+    if (csp == X265_CSP_I400)
+    {
+        CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
+        m_buf[1] = m_buf[2] = 0;
+        m_csize = MAX_INT;
+        return true;
+    }
+    else
+    {
+        m_csize = size >> m_hChromaShift;
 
-    X265_CHECK((sizeC & 15) == 0, "invalid size");
+        size_t sizeL = size * size;
+        size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift);
 
-    // memory allocation (padded for SIMD reads)
-    CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8);
-    m_buf[1] = m_buf[0] + sizeL;
-    m_buf[2] = m_buf[0] + sizeL + sizeC;
-    return true;
+        X265_CHECK((sizeC & 15) == 0, "invalid size");
+
+        // memory allocation (padded for SIMD reads)
+        CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8);
+        m_buf[1] = m_buf[0] + sizeL;
+        m_buf[2] = m_buf[0] + sizeL + sizeC;
+        return true;
+    }
 
 fail:
     return false;
@@ -71,7 +81,6 @@ void Yuv::destroy()
 void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) const
 {
     pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
-
     primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size);
 
     pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
@@ -82,29 +91,41 @@ void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) con
 
 void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
 {
-    /* We cheat with const_cast internally because the get methods are not capable of
-     * returning const buffers and the primitives are not const aware, but we know
-     * this function does not modify srcPic */
-    PicYuv& srcPicSafe = const_cast<PicYuv&>(srcPic);
-    pixel* srcY = srcPicSafe.getLumaAddr(cuAddr, absPartIdx);
-
+    const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
     primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride);
 
-    pixel* srcU = srcPicSafe.getCbAddr(cuAddr, absPartIdx);
-    pixel* srcV = srcPicSafe.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPicSafe.m_strideC);
-    primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPicSafe.m_strideC);
+    const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
+    const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
+    primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPic.m_strideC);
+    primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPic.m_strideC);
 }
 
 void Yuv::copyFromYuv(const Yuv& srcYuv)
 {
-    X265_CHECK(m_size <= srcYuv.m_size, "invalid size\n");
+    X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
 
     primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
     primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
     primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
 }
 
+/* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
+void Yuv::copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma)
+{
+    X265_CHECK(m_size == FENC_STRIDE && m_size >= srcYuv.m_size, "PU buffer size mismatch\n");
+
+    const pixel* srcY = srcYuv.m_buf[0] + getAddrOffset(absPartIdx, srcYuv.m_size);
+    primitives.luma_copy_pp[partEnum](m_buf[0], m_size, srcY, srcYuv.m_size);
+
+    if (bChroma)
+    {
+        const pixel* srcU = srcYuv.m_buf[1] + srcYuv.getChromaAddrOffset(absPartIdx);
+        const pixel* srcV = srcYuv.m_buf[2] + srcYuv.getChromaAddrOffset(absPartIdx);
+        primitives.chroma[m_csp].copy_pp[partEnum](m_buf[1], m_csize, srcU, srcYuv.m_csize);
+        primitives.chroma[m_csp].copy_pp[partEnum](m_buf[2], m_csize, srcV, srcYuv.m_csize);
+    }
+}
+
 void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const
 {
     pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
@@ -120,7 +141,6 @@ void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
 {
     pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
     pixel* dstY = dstYuv.m_buf[0];
-
     primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size);
 
     pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
@@ -144,21 +164,19 @@ void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absP
 
     if (bLuma)
     {
-        int16_t* srcY0 = const_cast<ShortYuv&>(srcYuv0).getLumaAddr(absPartIdx);
-        int16_t* srcY1 = const_cast<ShortYuv&>(srcYuv1).getLumaAddr(absPartIdx);
+        const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
+        const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
         pixel* dstY = getLumaAddr(absPartIdx);
-
         primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
     }
     if (bChroma)
     {
-        int16_t* srcU0 = const_cast<ShortYuv&>(srcYuv0).getCbAddr(absPartIdx);
-        int16_t* srcV0 = const_cast<ShortYuv&>(srcYuv0).getCrAddr(absPartIdx);
-        int16_t* srcU1 = const_cast<ShortYuv&>(srcYuv1).getCbAddr(absPartIdx);
-        int16_t* srcV1 = const_cast<ShortYuv&>(srcYuv1).getCrAddr(absPartIdx);
+        const int16_t* srcU0 = srcYuv0.getCbAddr(absPartIdx);
+        const int16_t* srcV0 = srcYuv0.getCrAddr(absPartIdx);
+        const int16_t* srcU1 = srcYuv1.getCbAddr(absPartIdx);
+        const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
         pixel* dstU = getCbAddr(absPartIdx);
         pixel* dstV = getCrAddr(absPartIdx);
-
         primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
         primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
     }
@@ -168,7 +186,7 @@ void Yuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size
 {
     const pixel* src = getLumaAddr(absPartIdx);
     pixel* dst = dstYuv.getLumaAddr(absPartIdx);
-    primitives.square_copy_pp[log2Size - 2](dst, dstYuv.m_size, const_cast<pixel*>(src), m_size);
+    primitives.luma_copy_pp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
 }
 
 void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -178,7 +196,6 @@ void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Si
     const pixel* srcV = getCrAddr(absPartIdx);
     pixel* dstU = dstYuv.getCbAddr(absPartIdx);
     pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-
-    primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, const_cast<pixel*>(srcU), m_csize);
-    primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, const_cast<pixel*>(srcV), m_csize);
+    primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, srcV, m_csize);
 }
diff --git a/source/common/yuv.h b/source/common/yuv.h
index a02987c..5911c30 100644
--- a/source/common/yuv.h
+++ b/source/common/yuv.h
@@ -63,6 +63,9 @@ public:
     // Copy from same size YUV buffer
     void   copyFromYuv(const Yuv& srcYuv);
 
+    // Copy portion of srcYuv into ME prediction buffer
+    void   copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma);
+
     // Copy Small YUV buffer to the part of other Big YUV buffer
     void   copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const;
 
diff --git a/source/encoder/CMakeLists.txt b/source/encoder/CMakeLists.txt
index 020364f..0e995ed 100644
--- a/source/encoder/CMakeLists.txt
+++ b/source/encoder/CMakeLists.txt
@@ -3,6 +3,9 @@
 if(GCC)
    add_definitions(-Wno-uninitialized)
 endif()
+if(MSVC)
+   add_definitions(/wd4701) # potentially uninitialized local variable 'foo' used
+endif()
 
 add_library(encoder OBJECT ../x265.h
     analysis.cpp analysis.h
diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp
index c62f5f0..8b8d103 100644
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -33,8 +33,6 @@
 #include "rdcost.h"
 #include "encoder.h"
 
-#include "PPA/ppa.h"
-
 using namespace x265;
 
 /* An explanation of rate distortion levels (--rd-level)
@@ -61,9 +59,12 @@ using namespace x265;
  *
  *   RDO selection between merge and skip
  *   sa8d selection of best inter mode
+ *   sa8d decisions include chroma residual cost
  *   RDO selection between (merge/skip) / best inter mode / intra / split
  *
  * rd-level 4 enables RDOQuant
+ *   chroma residual cost included in satd decisions, including subpel refine
+ *    (as a result of --subme 3 being used by preset slow)
  *
  * rd-level 5,6 does RDO for each inter mode
  */
@@ -71,12 +72,15 @@ using namespace x265;
 Analysis::Analysis()
 {
     m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
+    m_reuseIntraDataCTU = NULL;
+    m_reuseInterDataCTU = NULL;
 }
 
 bool Analysis::create(ThreadLocalData *tld)
 {
     m_tld = tld;
     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
+    m_bChromaSa8d = m_param->rdLevel >= 3;
 
     int csp = m_param->internalCsp;
     uint32_t cuSize = g_maxCUSize;
@@ -116,7 +120,7 @@ void Analysis::destroy()
     }
 }
 
-Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
+Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
 {
     m_slice = ctu.m_slice;
     m_frame = &frame;
@@ -124,27 +128,26 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
     invalidateContexts(0);
     m_quant.setQPforQuant(ctu);
     m_rqt[0].cur.load(initialContext);
-    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0);
+    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
 
     uint32_t numPartition = ctu.m_numPartitions;
+    if (m_param->analysisMode)
+    {
+        m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
+    }
+
     if (m_slice->m_sliceType == I_SLICE)
     {
         uint32_t zOrder = 0;
-        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
-            compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder);
-        else
+        compressIntraCU(ctu, cuGeom, zOrder);
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
         {
-            compressIntraCU(ctu, cuGeom, NULL, zOrder);
-
-            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData)
-            {
-                CUData *bestCU = &m_modeDepth[0].bestMode->cu;
-                memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-                memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
-                memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
-                m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr;
-                m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc;
-            }
+            CUData *bestCU = &m_modeDepth[0].bestMode->cu;
+            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
         }
     }
     else
@@ -152,10 +155,10 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
         if (!m_param->rdLevel)
         {
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
-             * they are available for intra predictions */
-            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0);
-            
-            compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1
+            * they are available for intra predictions */
+            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
+
+            compressInterCU_rd0_4(ctu, cuGeom);
 
             /* generate residual for entire CTU at once and copy to reconPic */
             encodeResidue(ctu, cuGeom);
@@ -178,7 +181,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom)
     if (!md.bestMode->distortion)
         /* already lossless */
         return;
-    else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA)
+    else if (md.bestMode->cu.isIntra(0))
     {
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
@@ -195,7 +198,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom)
     }
 }
 
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -204,20 +207,20 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
 
-    if (shared)
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
     {
-        uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
 
-        if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
+        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
         {
             m_quant.setQPforQuant(parentCTU);
 
-            PartSize size = (PartSize)sharedPartSizes[zOrder];
+            PartSize size = (PartSize)reusePartSizes[zOrder];
             Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
             mode.cu.initSubCU(parentCTU, cuGeom);
-            checkIntra(mode, cuGeom, size, sharedModes);
+            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
             checkBestMode(mode, depth);
 
             if (m_bTryLossless)
@@ -227,7 +230,7 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
 
             // increment zOrder offset to point to next best depth in sharedDepth buffer
-            zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];
+            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
             mightSplit = false;
         }
     }
@@ -267,23 +270,23 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
 
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressIntraCU(parentCTU, childCuData, shared, zOrder);
+                compressIntraCU(parentCTU, childGeom, zOrder);
 
                 // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 nextContext = &nd.bestMode->contexts;
             }
             else
             {
                 /* record the depth of this non-present sub-CU */
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
                 zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
             }
         }
@@ -300,38 +303,45 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
 }
 
 bool Analysis::findJob(int threadId)
 {
     /* try to acquire a CU mode to analyze */
+    m_pmodeLock.acquire();
     if (m_totalNumJobs > m_numAcquiredJobs)
     {
-        /* ATOMIC_INC returns the incremented value */
-        int id = ATOMIC_INC(&m_numAcquiredJobs);
-        if (m_totalNumJobs >= id)
-        {
-            parallelModeAnalysis(threadId, id - 1);
+        int id = m_numAcquiredJobs++;
+        m_pmodeLock.release();
 
-            if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs)
-                m_modeCompletionEvent.trigger();
-            return true;
-        }
+        parallelModeAnalysis(threadId, id);
+
+        m_pmodeLock.acquire();
+        if (++m_numCompletedJobs == m_totalNumJobs)
+            m_modeCompletionEvent.trigger();
+        m_pmodeLock.release();
+        return true;
     }
+    else
+        m_pmodeLock.release();
 
+    m_meLock.acquire();
     if (m_totalNumME > m_numAcquiredME)
     {
-        int id = ATOMIC_INC(&m_numAcquiredME);
-        if (m_totalNumME >= id)
-        {
-            parallelME(threadId, id - 1);
+        int id = m_numAcquiredME++;
+        m_meLock.release();
 
-            if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
-                m_meCompletionEvent.trigger();
-            return true;
-        }
+        parallelME(threadId, id);
+
+        m_meLock.acquire();
+        if (++m_numCompletedME == m_totalNumME)
+            m_meCompletionEvent.trigger();
+        m_meLock.release();
+        return true;
     }
+    else
+        m_meLock.release();
 
     return false;
 }
@@ -349,18 +359,14 @@ void Analysis::parallelME(int threadId, int meId)
         slave->m_slice = m_slice;
         slave->m_frame = m_frame;
 
-        PicYuv* fencPic = m_frame->m_origPicYuv;
-        pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
-        slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
-        slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
-
-        slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart);
+        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
     }
 
     if (meId < m_slice->m_numRefIdx[0])
-        slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId);
+        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
     else
-        slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
+        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
 }
 
 void Analysis::parallelModeAnalysis(int threadId, int jobId)
@@ -376,8 +382,6 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
         slave->m_frame = m_frame;
         slave->setQP(*m_slice, m_rdCost.m_qp);
         slave->invalidateContexts(0);
-        if (jobId)
-            slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride);
     }
 
     ModeDepth& md = m_modeDepth[m_curGeom->depth];
@@ -389,13 +393,15 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
         case 0:
             if (slave != this)
                 slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
-            slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom);
+            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
             if (m_param->rdLevel > 2)
                 slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
             break;
 
         case 1:
             slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
+            if (m_slice->m_sliceType == B_SLICE)
+                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
             break;
 
         case 2:
@@ -446,6 +452,13 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
 
         case 1:
             slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
+            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
+            if (m_slice->m_sliceType == B_SLICE)
+            {
+                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
+                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
+            }
             break;
 
         case 2:
@@ -499,6 +512,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
 
         /* Initialize all prediction CUs based on parentCTU */
         md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
         if (m_param->bEnableRectInter)
@@ -520,12 +534,14 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
                 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
         }
 
+        m_pmodeLock.acquire();
         m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
         m_numAcquiredJobs = !bTryIntra;
         m_numCompletedJobs = m_numAcquiredJobs;
         m_curGeom = &cuGeom;
         m_bJobsQueued = true;
         JobProvider::enqueue();
+        m_pmodeLock.release();
 
         for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
             m_pool->pokeIdleThread();
@@ -572,17 +588,26 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
 
             if (m_param->rdLevel > 2)
             {
-                /* encode best inter */
-                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                /* RD selection between merge, inter, bidir and intra */
+                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                 {
-                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                    motionCompensation(bestInter->predYuv, false, true);
+                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    {
+                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->predYuv, false, true);
+                    }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
-
-                /* RD selection between merge, inter and intra */
                 checkBestMode(*bestInter, depth);
 
+                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                {
+                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                    checkBestMode(md.pred[PRED_BIDIR], depth);
+                }
+
                 if (bTryIntra)
                     checkBestMode(md.pred[PRED_INTRA], depth);
             }
@@ -591,6 +616,9 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
                 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                     md.bestMode = bestInter;
 
+                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+                    md.bestMode = &md.pred[PRED_BIDIR];
+
                 if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                 {
                     md.bestMode = &md.pred[PRED_INTRA];
@@ -614,6 +642,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
             m_modeCompletionEvent.wait();
 
             checkBestMode(md.pred[PRED_2Nx2N], depth);
+            checkBestMode(md.pred[PRED_BIDIR], depth);
 
             if (m_param->bEnableRectInter)
             {
@@ -640,7 +669,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
         if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
         {
             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-            checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
             encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
             checkBestMode(md.pred[PRED_INTRA], depth);
         }
@@ -655,7 +684,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
     bool bNoSplit = false;
     if (md.bestMode)
     {
-        bNoSplit = !!md.bestMode->cu.isSkipped(0);
+        bNoSplit = md.bestMode->cu.isSkipped(0);
         if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
             bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
     }
@@ -674,22 +703,22 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
 
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_dist(parentCTU, childCuData);
+                compressInterCU_dist(parentCTU, childGeom);
 
                 // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
 
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 nextContext = &nd.bestMode->contexts;
             }
             else
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
         }
         nextContext->store(splitPred->contexts);
 
@@ -701,10 +730,10 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
         checkBestMode(*splitPred, depth);
     }
 
-    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+    if (mightNotSplit)
     {
         /* early-out statistics */
-        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+        FrameData& curEncData = *m_frame->m_encData;
         FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
         uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
         cuStat.count[depth] += 1;
@@ -716,7 +745,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
 }
 
 void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -734,24 +763,9 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
     {
         bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
 
-        /* Initialize all prediction CUs based on parentCTU */
-        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+        /* Compute Merge Cost */
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
-        if (m_param->bEnableRectInter)
-        {
-            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
-        }
-        if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
-        {
-            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
-        }
-
-        /* Compute Merge Cost */
         checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
         bool earlyskip = false;
@@ -760,14 +774,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
 
         if (!earlyskip)
         {
+            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
             checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);
-            Mode *bestInter = &md.pred[PRED_2Nx2N];
 
+            if (m_slice->m_sliceType == B_SLICE)
+            {
+                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+            }
+
+            Mode *bestInter = &md.pred[PRED_2Nx2N];
             if (m_param->bEnableRectInter)
             {
+                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                 checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                     bestInter = &md.pred[PRED_Nx2N];
+
+                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                 checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                     bestInter = &md.pred[PRED_2NxN];
@@ -789,18 +813,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
 
                 if (bHor)
                 {
+                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                     checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                     if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                         bestInter = &md.pred[PRED_2NxnU];
+
+                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                     checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                     if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                         bestInter = &md.pred[PRED_2NxnD];
                 }
                 if (bVer)
                 {
+                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                     checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                     if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                         bestInter = &md.pred[PRED_nLx2N];
+
+                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                     checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                     if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                         bestInter = &md.pred[PRED_nRx2N];
@@ -810,37 +840,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
             if (m_param->rdLevel >= 3)
             {
                 /* Calculate RD cost of best inter option */
-                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                 {
-                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                    motionCompensation(bestInter->predYuv, false, true);
+                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    {
+                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->predYuv, false, true);
+                    }
                 }
-
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
+                checkBestMode(*bestInter, depth);
 
-                if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost)
-                    md.bestMode = bestInter;
+                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                {
+                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                    checkBestMode(md.pred[PRED_BIDIR], depth);
+                }
 
                 if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                     md.bestMode->sa8dCost == MAX_INT64)
                 {
                     md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-                    checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                     encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
-                    if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost)
-                        md.bestMode = &md.pred[PRED_INTRA];
+                    checkBestMode(md.pred[PRED_INTRA], depth);
                 }
             }
             else
             {
-                /* SA8D choice between merge/skip, inter, and intra */
+                /* SA8D choice between merge/skip, inter, bidir, and intra */
                 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                     md.bestMode = bestInter;
 
+                if (m_slice->m_sliceType == B_SLICE &&
+                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+                    md.bestMode = &md.pred[PRED_BIDIR];
+
                 if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                 {
                     md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-                    checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                     if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                         md.bestMode = &md.pred[PRED_INTRA];
                 }
@@ -854,7 +895,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
                     /* prediction already generated for this CU, and if rd level
                      * is not 0, it is already fully encoded */
                 }
-                else if (md.bestMode->cu.m_predMode[0] == MODE_INTER)
+                else if (md.bestMode->cu.isInter(0))
                 {
                     for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                     {
@@ -865,8 +906,23 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
                         encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                     else if (m_param->rdLevel == 1)
                     {
-                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
-                        generateCoeffRecon(*md.bestMode, cuGeom);
+                        /* generate recon pixels with no rate distortion considerations */
+                        CUData& cu = md.bestMode->cu;
+                        m_quant.setQPforQuant(cu);
+
+                        uint32_t tuDepthRange[2];
+                        cu.getInterTUQtDepthRange(tuDepthRange, 0);
+
+                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
+                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+                        if (cu.getQtRootCbf(0))
+                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
+                        else
+                        {
+                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
+                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
+                                cu.setPredModeSubParts(MODE_SKIP);
+                        }
                     }
                 }
                 else
@@ -874,7 +930,20 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
                     if (m_param->rdLevel == 2)
                         encodeIntraInInter(*md.bestMode, cuGeom);
                     else if (m_param->rdLevel == 1)
-                        generateCoeffRecon(*md.bestMode, cuGeom);
+                    {
+                        /* generate recon pixels with no rate distortion considerations */
+                        CUData& cu = md.bestMode->cu;
+                        m_quant.setQPforQuant(cu);
+
+                        uint32_t tuDepthRange[2];
+                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+                        getBestIntraModeChroma(*md.bestMode, cuGeom);
+                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
+                    }
                 }
             }
         } // !earlyskip
@@ -889,7 +958,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
     bool bNoSplit = false;
     if (md.bestMode)
     {
-        bNoSplit = !!md.bestMode->cu.isSkipped(0);
+        bNoSplit = md.bestMode->cu.isSkipped(0);
         if (mightSplit && depth && depth >= minDepth && !bNoSplit)
             bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
     }
@@ -908,54 +977,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
 
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_rd0_4(parentCTU, childCuData);
+                compressInterCU_rd0_4(parentCTU, childGeom);
 
                 // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
 
                 if (m_param->rdLevel)
-                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 else
-                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx);
+                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                 if (m_param->rdLevel > 1)
                     nextContext = &nd.bestMode->contexts;
             }
             else
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
         }
         nextContext->store(splitPred->contexts);
 
         if (mightNotSplit)
             addSplitFlagCost(*splitPred, cuGeom.depth);
-        else if (m_param->rdLevel <= 1)
-            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
-        else
+        else if (m_param->rdLevel > 1)
             updateModeCost(*splitPred);
+        else
+            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
 
         if (!md.bestMode)
             md.bestMode = splitPred;
-        else if (m_param->rdLevel >= 1)
-        {
-            if (splitPred->rdCost < md.bestMode->rdCost)
-                md.bestMode = splitPred;
-        }
-        else
-        {
-            if (splitPred->sa8dCost < md.bestMode->sa8dCost)
-                md.bestMode = splitPred;
-        }
+        else if (m_param->rdLevel > 1)
+            checkBestMode(*splitPred, cuGeom.depth);
+        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
+            md.bestMode = splitPred;
     }
 
-    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+    if (mightNotSplit)
     {
         /* early-out statistics */
-        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+        FrameData& curEncData = *m_frame->m_encData;
         FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
         uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
         cuStat.count[depth] += 1;
@@ -967,7 +1030,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
 }
 
 void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -981,27 +1044,39 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
 
     if (mightNotSplit)
     {
-        for (int i = 0; i < MAX_PRED_TYPES; i++)
-            md.pred[i].cu.initSubCU(parentCTU, cuGeom);
-
+        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
+        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
         checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
         bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
 
         if (!earlySkip)
         {
+            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
             checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
             checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
+            if (m_slice->m_sliceType == B_SLICE)
+            {
+                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+                {
+                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
+                }
+            }
+
             if (m_param->bEnableRectInter)
             {
-                // Nx2N rect
                 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                 {
+                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                     checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                     checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                 }
                 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                 {
+                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                     checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                     checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                 }
@@ -1027,11 +1102,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
                 {
                     if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                     {
+                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                         checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                         checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                     }
                     if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                     {
+                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                         checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                         checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                     }
@@ -1040,11 +1117,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
                 {
                     if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                     {
+                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                         checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                         checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                     }
                     if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                     {
+                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                         checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                         checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                     }
@@ -1054,11 +1133,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
             if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                 (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
             {
+                md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                 checkBestMode(md.pred[PRED_INTRA], depth);
 
                 if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                 {
+                    md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                     checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                     checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                 }
@@ -1087,21 +1168,21 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
 
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_rd5_6(parentCTU, childCuData);
+                compressInterCU_rd5_6(parentCTU, childGeom);
 
                 // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 nextContext = &nd.bestMode->contexts;
             }
             else
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
         }
         nextContext->store(splitPred->contexts);
         if (mightNotSplit)
@@ -1117,7 +1198,7 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
 }
 
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1148,7 +1229,12 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
 
     bestPred->sa8dCost = MAX_INT64;
     int bestSadCand = -1;
-    int sizeIdx = cuGeom.log2CUSize - 2;
+    int cpart, sizeIdx = cuGeom.log2CUSize - 2;
+    if (m_bChromaSa8d)
+    {
+        int cuSize = 1 << cuGeom.log2CUSize;
+        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+    }
     for (uint32_t i = 0; i < maxNumMergeCand; ++i)
     {
         if (m_bFrameParallel &&
@@ -1159,16 +1245,20 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
         tempPred->cu.m_interDir[0] = interDirNeighbours[i];
         tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-        tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
         tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-        tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
+        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
 
-        // do MC only for Luma part
         prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, false);
+        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
 
         tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
         tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
+        if (m_bChromaSa8d)
+        {
+            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
+            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
+        }
         tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
 
         if (tempPred->sa8dCost < bestPred->sa8dCost)
@@ -1183,8 +1273,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
         return;
 
     /* calculate the motion compensation for chroma for the best mode selected */
-    prepMotionCompensation(bestPred->cu, cuGeom, 0);
-    motionCompensation(bestPred->predYuv, false, true);
+    if (!m_bChromaSa8d) /* Chroma MC was done above */
+    {
+        prepMotionCompensation(bestPred->cu, cuGeom, 0);
+        motionCompensation(bestPred->predYuv, false, true);
+    }
 
     if (m_param->rdLevel)
     {
@@ -1197,9 +1290,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
         tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
         tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
-        tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+        tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
         tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
-        tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+        tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
         tempPred->sa8dCost = bestPred->sa8dCost;
         tempPred->predYuv.copyFromYuv(bestPred->predYuv);
 
@@ -1213,9 +1306,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
     /* broadcast sets of MV field data */
     bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
     bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
-    bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+    bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
     bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
-    bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+    bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
 }
 
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1269,10 +1362,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
         tempPred->cu.m_interDir[0] = interDirNeighbours[i];
         tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-        tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
         tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-        tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
-        tempPred->cu.setSkipFlagSubParts(false);     /* must be cleared between encode iterations */
+        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
 
         prepMotionCompensation(tempPred->cu, cuGeom, 0);
         motionCompensation(tempPred->predYuv, true, true);
@@ -1302,10 +1395,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
                 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
                 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-                tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+                tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
                 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-                tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
-                tempPred->cu.setSkipFlagSubParts(false);
+                tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+                tempPred->cu.setPredModeSubParts(MODE_INTER);
                 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
             }
             
@@ -1324,9 +1417,9 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
         uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
         bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
         bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
-        bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
+        bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
         bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
-        bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
+        bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
     }
 }
 
@@ -1335,14 +1428,48 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
     interMode.initCosts();
     interMode.cu.setPartSizeSubParts(partSize);
     interMode.cu.setPredModeSubParts(MODE_INTER);
+    int numPredDir = m_slice->isInterP() ? 1 : 2;
 
-    if (predInterSearch(interMode, cuGeom, false, false))
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+    {
+        for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
+        {
+            MotionData* bestME = interMode.bestME[part];
+            for (int32_t i = 0; i < numPredDir; i++)
+            {
+                bestME[i].ref = m_reuseInterDataCTU->ref;
+                m_reuseInterDataCTU++;
+            }
+        }
+    }
+    if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
     {
         /* predInterSearch sets interMode.sa8dBits */
         const Yuv& fencYuv = *interMode.fencYuv;
         Yuv& predYuv = interMode.predYuv;
-        interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+        int part = partitionFromLog2Size(cuGeom.log2CUSize);
+        interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+        if (m_bChromaSa8d)
+        {
+            uint32_t cuSize = 1 << cuGeom.log2CUSize;
+            int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
+            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
+        }
         interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
+
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+        {
+            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+            {
+                MotionData* bestME = interMode.bestME[puIdx];
+                for (int32_t i = 0; i < numPredDir; i++)
+                {
+                    m_reuseInterDataCTU->ref = bestME[i].ref;
+                    m_reuseInterDataCTU++;
+                }
+            }
+        }
     }
     else
     {
@@ -1356,11 +1483,37 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
     interMode.initCosts();
     interMode.cu.setPartSizeSubParts(partSize);
     interMode.cu.setPredModeSubParts(MODE_INTER);
+    int numPredDir = m_slice->isInterP() ? 1 : 2;
 
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+    {
+        for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+        {
+            MotionData* bestME = interMode.bestME[puIdx];
+            for (int32_t i = 0; i < numPredDir; i++)
+            {
+                bestME[i].ref = m_reuseInterDataCTU->ref;
+                m_reuseInterDataCTU++;
+            }
+        }
+    }
     if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
     {
         /* predInterSearch sets interMode.sa8dBits, but this is ignored */
         encodeResAndCalcRdInterCU(interMode, cuGeom);
+
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+        {
+            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+            {
+                MotionData* bestME = interMode.bestME[puIdx];
+                for (int32_t i = 0; i < numPredDir; i++)
+                {
+                    m_reuseInterDataCTU->ref = bestME[i].ref;
+                    m_reuseInterDataCTU++;
+                }
+            }
+        }
     }
     else
     {
@@ -1369,221 +1522,151 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
     }
 }
 
-/* Note that this function does not save the best intra prediction, it must
- * be generated later. It records the best mode in the cu */
-void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom)
+void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
 {
-    CUData& cu = intraMode.cu;
-    uint32_t depth = cu.m_cuDepth[0];
+    CUData& cu = bidir2Nx2N.cu;
 
-    cu.setPartSizeSubParts(SIZE_2Nx2N);
-    cu.setPredModeSubParts(MODE_INTRA);
-
-    uint32_t initTrDepth = 0;
-    uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTrDepth;
-    uint32_t tuSize      = 1 << log2TrSize;
-    const uint32_t absPartIdx  = 0;
-
-    // Reference sample smoothing
-    initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
-
-    pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0];
-    uint32_t stride = m_modeDepth[depth].fencYuv.m_size;
-
-    pixel *above         = m_refAbove    + tuSize - 1;
-    pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
-    pixel *left          = m_refLeft     + tuSize - 1;
-    pixel *leftFiltered  = m_refLeftFlt  + tuSize - 1;
-    int sad, bsad;
-    uint32_t bits, bbits, mode, bmode;
-    uint64_t cost, bcost;
-
-    // 33 Angle modes once
-    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
-    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
-    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
-    int scaleTuSize = tuSize;
-    int scaleStride = stride;
-    int costShift = 0;
-    int sizeIdx = log2TrSize - 2;
-
-    if (tuSize > 32)
+    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
     {
-        // origin is 64x64, we scale to 32x32 and setup required parameters
-        primitives.scale2D_64to32(bufScale, fenc, stride);
-        fenc = bufScale;
-
-        // reserve space in case primitives need to store data in above
-        // or left buffers
-        pixel _above[4 * 32 + 1];
-        pixel _left[4 * 32 + 1];
-        pixel *aboveScale  = _above + 2 * 32;
-        pixel *leftScale   = _left + 2 * 32;
-        aboveScale[0] = leftScale[0] = above[0];
-        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
-        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
-
-        scaleTuSize = 32;
-        scaleStride = 32;
-        costShift = 2;
-        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
-        // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
-        above         = aboveScale;
-        left          = leftScale;
-        aboveFiltered = aboveScale;
-        leftFiltered  = leftScale;
+        bidir2Nx2N.sa8dCost = MAX_INT64;
+        bidir2Nx2N.rdCost = MAX_INT64;
+        return;
     }
 
-    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
-    int predsize = scaleTuSize * scaleTuSize;
-
-    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
+    MV   mvzero(0, 0);
+    int  cpart, partEnum = cuGeom.log2CUSize - 2;
 
-    /* there are three cost tiers for intra modes:
-     *  pred[0]          - mode probable, least cost
-     *  pred[1], pred[2] - less probable, slightly more cost
-     *  non-mpm modes    - all cost the same (rbits) */
-    uint64_t mpms;
-    uint32_t preds[3];
-    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
-
-    // DC
-    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
-    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
-    bmode = mode = DC_IDX;
-    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
-    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
-
-    pixel *abovePlanar = above;
-    pixel *leftPlanar  = left;
-
-    if (tuSize & (8 | 16 | 32))
+    if (m_bChromaSa8d)
     {
-        abovePlanar = aboveFiltered;
-        leftPlanar  = leftFiltered;
+        int cuSize = 1 << cuGeom.log2CUSize;
+        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
     }
 
-    // PLANAR
-    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
-    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
-    mode = PLANAR_IDX;
-    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
-    cost = m_rdCost.calcRdSADCost(sad, bits);
-    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
-
-    // Transpose NxN
-    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
-
-    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
-    bool modeHor;
-    pixel *cmp;
-    intptr_t srcStride;
-
-#define TRY_ANGLE(angle) \
-    modeHor = angle < 18; \
-    cmp = modeHor ? bufTrans : fenc; \
-    srcStride = modeHor ? scaleTuSize : scaleStride; \
-    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
-    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
-    cost = m_rdCost.calcRdSADCost(sad, bits)
+    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
+    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
+    MotionData* bestME = bidir2Nx2N.bestME[0];
+    int ref0    = bestME[0].ref;
+    MV  mvp0    = bestME[0].mvp;
+    int mvpIdx0 = bestME[0].mvpIdx;
+    int ref1    = bestME[1].ref;
+    MV  mvp1    = bestME[1].mvp;
+    int mvpIdx1 = bestME[1].mvpIdx;
+
+    bidir2Nx2N.initCosts();
+    cu.setPartSizeSubParts(SIZE_2Nx2N);
+    cu.setPredModeSubParts(MODE_INTER);
+    cu.setPUInterDir(3, 0, 0);
+    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
+    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
+    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
+    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
+    cu.m_mergeFlag[0] = 0;
+
+    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
+    cu.setPUMv(0, bestME[0].mv, 0, 0);
+    cu.m_mvd[0][0] = bestME[0].mv - mvp0;
+
+    cu.setPUMv(1, bestME[1].mv, 0, 0);
+    cu.m_mvd[1][0] = bestME[1].mv - mvp1;
+
+    prepMotionCompensation(cu, cuGeom, 0);
+    motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+
+    int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
+    if (m_bChromaSa8d)
+    {
+        /* Add in chroma distortion */
+        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
+        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
+    }
+    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
 
-    if (m_param->bEnableFastIntra)
+    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
+    if (bTryZero)
+    {
+        /* Do not try zero MV if unidir motion predictors are beyond
+         * valid search area */
+        MV mvmin, mvmax;
+        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
+        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
+        mvmax.y += 2; // there is some pad for subpel refine
+        mvmin <<= 2;
+        mvmax <<= 2;
+
+        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
+    }
+    if (bTryZero)
     {
-        int asad = 0;
-        uint32_t lowmode, highmode, amode = 5, abits = 0;
-        uint64_t acost = MAX_INT64;
+        /* Estimate cost of BIDIR using coincident blocks */
+        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
 
-        /* pick the best angle, sampling at distance of 5 */
-        for (mode = 5; mode < 35; mode += 5)
-        {
-            TRY_ANGLE(mode);
-            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
-        }
+        int zsa8d;
 
-        /* refine best angle at distance 2, then distance 1 */
-        for (uint32_t dist = 2; dist >= 1; dist--)
+        if (m_bChromaSa8d)
         {
-            lowmode = amode - dist;
-            highmode = amode + dist;
+            cu.m_mv[0][0] = mvzero;
+            cu.m_mv[1][0] = mvzero;
 
-            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
-            TRY_ANGLE(lowmode);
-            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+            prepMotionCompensation(cu, cuGeom, 0);
+            motionCompensation(tmpPredYuv, true, true);
 
-            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
-            TRY_ANGLE(highmode);
-            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+            zsa8d  = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
+            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
         }
-
-        if (amode == 33)
+        else
         {
-            TRY_ANGLE(34);
-            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
-        }
+            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
 
-        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
-    }
-    else // calculate and search all intra prediction angles for lowest cost
-    {
-        for (mode = 2; mode < 35; mode++)
-        {
-            TRY_ANGLE(mode);
-            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
+            zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
         }
-    }
 
-    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth);
-    intraMode.initCosts();
-    intraMode.totalBits = bbits;
-    intraMode.distortion = bsad;
-    intraMode.sa8dCost = bcost;
-    intraMode.sa8dBits = bbits;
-}
+        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
 
-void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
-{
-    CUData& cu = intraMode.cu;
-    Yuv* reconYuv = &intraMode.reconYuv;
-    Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv;
+        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
+        checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
+        checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);
 
-    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
-    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+        zcost = zsa8d + m_rdCost.getCost(zbits);
 
-    m_quant.setQPforQuant(cu);
-
-    uint32_t tuDepthRange[2];
-    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
-    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
-
-    Cost icosts;
-    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
-    extractIntraResultQT(cu, *reconYuv, 0, 0);
-
-    intraMode.distortion  = icosts.distortion;
-    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
-
-    m_entropyCoder.resetBits();
-    if (m_slice->m_pps->bTransquantBypassEnabled)
-        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
-    m_entropyCoder.codeSkipFlag(cu, 0);
-    m_entropyCoder.codePredMode(cu.m_predMode[0]);
-    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
-    m_entropyCoder.codePredInfo(cu, 0);
-    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+        if (zcost < bidir2Nx2N.sa8dCost)
+        {
+            bidir2Nx2N.sa8dBits = zbits;
+            bidir2Nx2N.sa8dCost = zcost;
 
-    bool bCodeDQP = m_slice->m_pps->bUseDQP;
-    m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange);
+            cu.setPUMv(0, mvzero, 0, 0);
+            cu.m_mvd[0][0] = mvzero - mvp0;
+            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
 
-    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
-    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
-    if (m_rdCost.m_psyRd)
-        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+            cu.setPUMv(1, mvzero, 0, 0);
+            cu.m_mvd[1][0] = mvzero - mvp1;
+            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
 
-    m_entropyCoder.store(intraMode.contexts);
-    updateModeCost(intraMode);
+            if (m_bChromaSa8d)
+                /* real MC was already performed */
+                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
+            else
+            {
+                prepMotionCompensation(cu, cuGeom, 0);
+                motionCompensation(bidir2Nx2N.predYuv, true, true);
+            }
+        }
+        else if (m_bChromaSa8d)
+        {
+            /* recover overwritten motion vectors */
+            cu.m_mv[0][0] = bestME[0].mv;
+            cu.m_mv[1][0] = bestME[1].mv;
+        }
+    }
 }
 
 void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
@@ -1592,9 +1675,9 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
     {
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
-                encodeResidue(ctu, childCuData);
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                encodeResidue(ctu, childGeom);
         }
         return;
     }
@@ -1602,29 +1685,31 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
     uint32_t absPartIdx = cuGeom.encodeIdx;
     int sizeIdx = cuGeom.log2CUSize - 2;
 
-    Yuv& fencYuv = m_modeDepth[0].fencYuv;
-
     /* reuse the bestMode data structures at the current depth */
     Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
-    Yuv& reconYuv = bestMode->reconYuv;
     CUData& cu = bestMode->cu;
 
     cu.copyFromPic(ctu, cuGeom);
     m_quant.setQPforQuant(cu);
 
-    if (cu.m_predMode[0] == MODE_INTRA)
+    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
+    if (cuGeom.depth)
+        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
+    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
+
+    if (cu.isIntra(0))
     {
         uint32_t tuDepthRange[2];
         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
-        uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
-        residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange);
+        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
         getBestIntraModeChroma(*bestMode, cuGeom);
         residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
     }
-    else if (cu.m_predMode[0] == MODE_INTER)
+    else // if (cu.isInter(0))
     {
-        X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n");
+        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
 
         /* Calculate residual for current CU part into depth sized resiYuv */
 
@@ -1637,16 +1722,16 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
         pixel* predV = predYuv.getCrAddr(absPartIdx);
 
         primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
-                                        fencYuv.getLumaAddr(absPartIdx), predY,
+                                        fencYuv.m_buf[0], predY,
                                         fencYuv.m_size, predYuv.m_size);
 
         primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
-                                        fencYuv.getCbAddr(absPartIdx), predU,
-                                        fencYuv.m_csize, predYuv.m_csize);
+                                                 fencYuv.m_buf[1], predU,
+                                                 fencYuv.m_csize, predYuv.m_csize);
 
         primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
-                                        fencYuv.getCrAddr(absPartIdx), predV,
-                                        fencYuv.m_csize, predYuv.m_csize);
+                                                 fencYuv.m_buf[2], predV,
+                                                 fencYuv.m_csize, predYuv.m_csize);
 
         uint32_t tuDepthRange[2];
         cu.getInterTUQtDepthRange(tuDepthRange, 0);
@@ -1654,57 +1739,38 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
         residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
 
         if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
-            cu.setSkipFlagSubParts(true);
+            cu.setPredModeSubParts(MODE_SKIP);
 
-        PicYuv& reconPicYuv = *m_frame->m_reconPicYuv;
-        if (cu.getQtRootCbf(0)) // TODO: split to each component
-        {
-            /* residualTransformQuantInter() wrote transformed residual back into
-             * resiYuv. Generate the recon pixels by adding it to the prediction */
+        /* residualTransformQuantInter() wrote transformed residual back into
+         * resiYuv. Generate the recon pixels by adding it to the prediction */
 
-            primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size,
+        PicYuv& reconPic = *m_frame->m_reconPic;
+        if (cu.m_cbf[0][0])
+            primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                             predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
-            primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize,
-                                            predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
-            primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize,
-                                            predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
-
-            /* copy the reconstructed part to the recon pic for later intra
-             * predictions */
-            reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx);
-        }
         else
-        {
-            /* copy the prediction pixels to the recon pic for later intra
-             * predictions */
-
-            primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride,
+            primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                              predY, predYuv.m_size);
-            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
+
+        if (cu.m_cbf[1][0])
+            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+        else
+            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                       predU, predYuv.m_csize);
-            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
+
+        if (cu.m_cbf[2][0])
+            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+        else
+            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                       predV, predYuv.m_csize);
-        }
     }
-    /* else if (cu.m_predMode[0] == MODE_NONE) {} */
 
     checkDQP(cu, cuGeom);
     cu.updatePic(cuGeom.depth);
 }
 
-/* check whether current try is the best with identifying the depth of current try */
-void Analysis::checkBestMode(Mode& mode, uint32_t depth)
-{
-    ModeDepth& md = m_modeDepth[depth];
-    if (md.bestMode)
-    {
-        if (mode.rdCost < md.bestMode->rdCost)
-            md.bestMode = &mode;
-    }
-    else
-        md.bestMode = &mode;
-}
-
 void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
 {
     if (m_param->rdLevel >= 3)
@@ -1817,7 +1883,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom
      * each quantity */
 
     uint32_t depth = cuGeom.depth;
-    FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+    FrameData& curEncData = *m_frame->m_encData;
     FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
     uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
     uint64_t cuCount = cuStat.count[depth];
@@ -1855,7 +1921,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom
     }
 
     // give 60% weight to all CU's and 40% weight to neighbour CU's
-    if (neighCost + cuCount)
+    if (neighCount + cuCount)
     {
         uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
         uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
diff --git a/source/encoder/analysis.h b/source/encoder/analysis.h
index 404cc90..aff7d66 100644
--- a/source/encoder/analysis.h
+++ b/source/encoder/analysis.h
@@ -49,6 +49,7 @@ public:
         PRED_SKIP,
         PRED_INTRA,
         PRED_2Nx2N,
+        PRED_BIDIR,
         PRED_Nx2N,
         PRED_2NxN,
         PRED_SPLIT,
@@ -71,11 +72,15 @@ public:
 
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
     bool      m_bTryLossless;
+    bool      m_bChromaSa8d;
 
+    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
+    analysis_intra_data* m_reuseIntraDataCTU;
+    analysis_inter_data* m_reuseInterDataCTU;
     Analysis();
     bool create(ThreadLocalData* tld);
     void destroy();
-    Search::Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
+    Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
 
 protected:
 
@@ -83,13 +88,14 @@ protected:
     int           m_totalNumJobs;
     volatile int  m_numAcquiredJobs;
     volatile int  m_numCompletedJobs;
+    Lock          m_pmodeLock;
     Event         m_modeCompletionEvent;
     bool findJob(int threadId);
     void parallelModeAnalysis(int threadId, int jobId);
     void parallelME(int threadId, int meId);
 
     /* full analysis for an I-slice CU */
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* sdata, uint32_t &zOrder);
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder);
 
     /* full analysis for a P or B slice CU */
     void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom);
@@ -104,20 +110,36 @@ protected:
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
     void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly);
 
-    /* measure intra options */
-    void checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom);
-    void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+    void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
 
     /* encode current bestMode losslessly, pick best RD cost */
     void tryLossless(const CUGeom& cuGeom);
 
-    void checkDQP(CUData& cu, const CUGeom& cuGeom);
+    /* add the RD cost of coding a split flag (0 or 1) to the given mode */
     void addSplitFlagCost(Mode& mode, uint32_t depth);
-    void checkBestMode(Mode& mode, uint32_t depth);
+
+    /* update CBF flags and QP values to be internally consistent */
+    void checkDQP(CUData& cu, const CUGeom& cuGeom);
+
+    /* work-avoidance heuristics for RD levels < 5 */
     uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
     bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
 
+    /* generate residual and recon pixels for an entire CTU recursively (RD0) */
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
+
+    /* check whether current mode is the new best */
+    inline void checkBestMode(Mode& mode, uint32_t depth)
+    {
+        ModeDepth& md = m_modeDepth[depth];
+        if (md.bestMode)
+        {
+            if (mode.rdCost < md.bestMode->rdCost)
+                md.bestMode = &mode;
+        }
+        else
+            md.bestMode = &mode;
+    }
 };
 
 struct ThreadLocalData
diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp
index 66f8e28..74cee73 100644
--- a/source/encoder/api.cpp
+++ b/source/encoder/api.cpp
@@ -73,7 +73,11 @@ x265_encoder *x265_encoder_open(x265_param *p)
     determineLevel(*param, encoder->m_vps);
 
     encoder->create();
-    encoder->init();
+    if (encoder->m_aborted)
+    {
+        delete encoder;
+        return NULL;
+    }
 
     x265_print_params(param);
 
@@ -178,7 +182,6 @@ void x265_encoder_close(x265_encoder *enc)
 extern "C"
 void x265_cleanup(void)
 {
-    destroyROM();
     BitCost::destroy();
 }
 
@@ -198,13 +201,12 @@ void x265_picture_init(x265_param *param, x265_picture *pic)
     pic->forceqp = X265_QP_AUTO;
     if (param->analysisMode)
     {
-        uint32_t numPartitions   = 1 << (g_maxFullDepth * 2);
         uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
         uint32_t heightInCU      = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
 
         uint32_t numCUsInFrame   = widthInCU * heightInCU;
         pic->analysisData.numCUsInFrame = numCUsInFrame;
-        pic->analysisData.numPartitions = numPartitions;
+        pic->analysisData.numPartitions = NUM_CU_PARTITIONS;
     }
 }
 
@@ -213,37 +215,3 @@ void x265_picture_free(x265_picture *p)
 {
     return x265_free(p);
 }
-
-int x265_alloc_analysis_data(x265_picture* pic)
-{
-    CHECKED_MALLOC(pic->analysisData.interData, x265_inter_data, pic->analysisData.numCUsInFrame * 85);
-    CHECKED_MALLOC(pic->analysisData.intraData, x265_intra_data, 1);
-    pic->analysisData.intraData->cuAddr     = NULL;
-    pic->analysisData.intraData->depth      = NULL;
-    pic->analysisData.intraData->modes      = NULL;
-    pic->analysisData.intraData->partSizes  = NULL;
-    pic->analysisData.intraData->poc        = NULL;
-    CHECKED_MALLOC(pic->analysisData.intraData->depth, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
-    CHECKED_MALLOC(pic->analysisData.intraData->modes, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
-    CHECKED_MALLOC(pic->analysisData.intraData->partSizes, char, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
-    CHECKED_MALLOC(pic->analysisData.intraData->cuAddr, uint32_t, pic->analysisData.numCUsInFrame);
-    CHECKED_MALLOC(pic->analysisData.intraData->poc, int, pic->analysisData.numCUsInFrame);
-    return 0;
-
-fail:
-    x265_free_analysis_data(pic);
-    return -1;
-}
-
-void x265_free_analysis_data(x265_picture* pic)
-{
-    X265_FREE(pic->analysisData.interData);
-    pic->analysisData.interData = NULL;
-    X265_FREE(pic->analysisData.intraData->depth);
-    X265_FREE(pic->analysisData.intraData->modes);
-    X265_FREE(pic->analysisData.intraData->partSizes);
-    X265_FREE(pic->analysisData.intraData->cuAddr);
-    X265_FREE(pic->analysisData.intraData->poc);
-    X265_FREE(pic->analysisData.intraData);
-    pic->analysisData.intraData = NULL;
-}
diff --git a/source/encoder/bitcost.h b/source/encoder/bitcost.h
index d28486b..674dffa 100644
--- a/source/encoder/bitcost.h
+++ b/source/encoder/bitcost.h
@@ -35,7 +35,7 @@ class BitCost
 {
 public:
 
-    BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0) {}
+    BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0), m_mvp(0) {}
 
     void setQP(unsigned int qp);
 
diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp
index 1c82a76..9ca1d04 100644
--- a/source/encoder/dpb.cpp
+++ b/source/encoder/dpb.cpp
@@ -52,8 +52,8 @@ DPB::~DPB()
         FrameData* next = m_picSymFreeList->m_freeListNext;
         m_picSymFreeList->destroy();
 
-        m_picSymFreeList->m_reconPicYuv->destroy();
-        delete m_picSymFreeList->m_reconPicYuv;
+        m_picSymFreeList->m_reconPic->destroy();
+        delete m_picSymFreeList->m_reconPic;
 
         delete m_picSymFreeList;
         m_picSymFreeList = next;
@@ -82,7 +82,7 @@ void DPB::recycleUnreferenced()
             curFrame->m_encData->m_freeListNext = m_picSymFreeList;
             m_picSymFreeList = curFrame->m_encData;
             curFrame->m_encData = NULL;
-            curFrame->m_reconPicYuv = NULL;
+            curFrame->m_reconPic = NULL;
         }
     }
 }
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
index 44e82af..14a2200 100644
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -51,6 +51,8 @@ static const char *summaryCSVHeader =
     "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
     "Version\n";
 
+const char* defaultAnalysisFileName = "x265_analysis.dat";
+
 using namespace x265;
 
 Encoder::Encoder()
@@ -78,6 +80,7 @@ Encoder::Encoder()
     m_buOffsetC = NULL;
     m_threadPool = 0;
     m_numThreadLocalData = 0;
+    m_analysisFile = NULL;
 }
 
 void Encoder::create()
@@ -131,7 +134,7 @@ void Encoder::create()
         int cpuCount = getCpuCount();
         if (!p->bEnableWavefront)
             p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2);
-        else if (cpuCount > 32)
+        else if (cpuCount >= 32)
             p->frameNumThreads = 6; // dual-socket 10-core IvyBridge or higher
         else if (cpuCount >= 16)
             p->frameNumThreads = 5; // 8 HT cores, or dual socket
@@ -194,13 +197,13 @@ void Encoder::create()
         m_csvfpt = fopen(m_param->csvfn, "r");
         if (m_csvfpt)
         {
-            // file already exists, re-open for append
+            /* file already exists, re-open for append */
             fclose(m_csvfpt);
             m_csvfpt = fopen(m_param->csvfn, "ab");
         }
         else
         {
-            // new CSV file, write header
+            /* new CSV file, write header */
             m_csvfpt = fopen(m_param->csvfn, "wb");
             if (m_csvfpt)
             {
@@ -218,7 +221,44 @@ void Encoder::create()
         }
     }
 
+    if (m_frameEncoder)
+    {
+        int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
+        int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
+        for (int i = 0; i < m_param->frameNumThreads; i++)
+        {
+            if (!m_frameEncoder[i].init(this, numRows, numCols, i))
+            {
+                x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
+                m_aborted = true;
+            }
+        }
+    }
+
+    if (m_param->bEmitHRDSEI)
+        m_rateControl->initHRD(&m_sps);
+    if (!m_rateControl->init(&m_sps))
+        m_aborted = true;
+
+    m_lookahead->init();
+
+    if (m_param->analysisMode)
+    {
+        const char* name = m_param->analysisFileName;
+        if (!name)
+            name = defaultAnalysisFileName;
+        const char* mode = m_param->analysisMode == X265_ANALYSIS_LOAD ? "rb" : "wb";
+        m_analysisFile = fopen(name, mode);
+        if (!m_analysisFile)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Analysis load/save: failed to open file %s\n", name);
+            m_aborted = true;
+        }
+    }
+
     m_aborted |= parseLambdaFile(m_param);
+
+    m_encodeStartTime = x265_mdate();
 }
 
 void Encoder::destroy()
@@ -270,6 +310,10 @@ void Encoder::destroy()
     X265_FREE(m_buOffsetY);
     X265_FREE(m_buOffsetC);
 
+    if (m_analysisFile)
+        fclose(m_analysisFile);
+    free(m_param->analysisFileName);
+    free(m_param->csvfn);
     if (m_csvfpt)
         fclose(m_csvfpt);
     free(m_param->rc.statFileName); // alloc'd by strdup
@@ -277,29 +321,6 @@ void Encoder::destroy()
     X265_FREE(m_param);
 }
 
-void Encoder::init()
-{
-    if (m_frameEncoder)
-    {
-        int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
-        int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
-        for (int i = 0; i < m_param->frameNumThreads; i++)
-        {
-            if (!m_frameEncoder[i].init(this, numRows, numCols, i))
-            {
-                x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
-                m_aborted = true;
-            }
-        }
-    }
-    if (m_param->bEmitHRDSEI)
-        m_rateControl->initHRD(&m_sps);
-    if (!m_rateControl->init(&m_sps))
-        m_aborted = true;
-    m_lookahead->init();
-    m_encodeStartTime = x265_mdate();
-}
-
 void Encoder::updateVbvPlan(RateControl* rc)
 {
     for (int i = 0; i < m_param->frameNumThreads; i++)
@@ -367,14 +388,14 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
                  * allocated by this top level encoder */
                 if (m_cuOffsetY)
                 {
-                    inFrame->m_origPicYuv->m_cuOffsetC = m_cuOffsetC;
-                    inFrame->m_origPicYuv->m_cuOffsetY = m_cuOffsetY;
-                    inFrame->m_origPicYuv->m_buOffsetC = m_buOffsetC;
-                    inFrame->m_origPicYuv->m_buOffsetY = m_buOffsetY;
+                    inFrame->m_fencPic->m_cuOffsetC = m_cuOffsetC;
+                    inFrame->m_fencPic->m_cuOffsetY = m_cuOffsetY;
+                    inFrame->m_fencPic->m_buOffsetC = m_buOffsetC;
+                    inFrame->m_fencPic->m_buOffsetY = m_buOffsetY;
                 }
                 else
                 {
-                    if (!inFrame->m_origPicYuv->createOffsets(m_sps))
+                    if (!inFrame->m_fencPic->createOffsets(m_sps))
                     {
                         m_aborted = true;
                         x265_log(m_param, X265_LOG_ERROR, "memory allocation failure, aborting encode\n");
@@ -384,10 +405,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
                     }
                     else
                     {
-                        m_cuOffsetC = inFrame->m_origPicYuv->m_cuOffsetC;
-                        m_cuOffsetY = inFrame->m_origPicYuv->m_cuOffsetY;
-                        m_buOffsetC = inFrame->m_origPicYuv->m_buOffsetC;
-                        m_buOffsetY = inFrame->m_origPicYuv->m_buOffsetY;
+                        m_cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
+                        m_cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
+                        m_buOffsetC = inFrame->m_fencPic->m_buOffsetC;
+                        m_buOffsetY = inFrame->m_fencPic->m_buOffsetY;
                     }
                 }
             }
@@ -405,9 +426,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
 
         /* Copy input picture into a Frame and PicYuv, send to lookahead */
         inFrame->m_poc = ++m_pocLast;
-        inFrame->m_origPicYuv->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
-        inFrame->m_intraData = pic_in->analysisData.intraData;
-        inFrame->m_interData = pic_in->analysisData.interData;
+        inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
+
         inFrame->m_userData  = pic_in->userData;
         inFrame->m_pts       = pic_in->pts;
         inFrame->m_forceqp   = pic_in->forceqp;
@@ -436,6 +456,23 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
 
         /* Use the frame types from the first pass, if available */
         int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : pic_in->sliceType;
+
+        /* In analysisSave mode, x265_analysis_data is allocated in pic_in and inFrame points to this */
+        /* Load analysis data before lookahead->addPicture, since sliceType has been decided */
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+        {
+            x265_picture* inputPic = const_cast<x265_picture*>(pic_in);
+            /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */
+            readAnalysisFile(&inputPic->analysisData, inFrame->m_poc);
+            inFrame->m_analysisData.poc = inFrame->m_poc;
+            inFrame->m_analysisData.sliceType = inputPic->analysisData.sliceType;
+            inFrame->m_analysisData.numCUsInFrame = inputPic->analysisData.numCUsInFrame;
+            inFrame->m_analysisData.numPartitions = inputPic->analysisData.numPartitions;
+            inFrame->m_analysisData.interData = inputPic->analysisData.interData;
+            inFrame->m_analysisData.intraData = inputPic->analysisData.intraData;
+            sliceType = inputPic->analysisData.sliceType;
+        }
+
         m_lookahead->addPicture(inFrame, sliceType);
         m_numDelayedPic++;
     }
@@ -454,9 +491,14 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
     if (outFrame)
     {
         Slice *slice = outFrame->m_encData->m_slice;
+
+        /* Free up pic_in->analysisData since it has already been used */
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+            freeAnalysis(&outFrame->m_analysisData);
+
         if (pic_out)
         {
-            PicYuv *recpic = outFrame->m_reconPicYuv;
+            PicYuv *recpic = outFrame->m_reconPic;
             pic_out->poc = slice->m_poc;
             pic_out->bitDepth = X265_DEPTH;
             pic_out->userData = outFrame->m_userData;
@@ -484,16 +526,20 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
             pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel));
             pic_out->planes[2] = recpic->m_picOrg[2];
             pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel));
-        }
 
-        if (m_param->analysisMode)
-        {
-            pic_out->analysisData.interData = outFrame->m_interData;
-            pic_out->analysisData.intraData = outFrame->m_intraData;
-            pic_out->analysisData.numCUsInFrame = slice->m_sps->numCUsInFrame;
-            pic_out->analysisData.numPartitions = slice->m_sps->numPartitions;
+            /* Dump analysis data from pic_out to file in save mode and free */
+            if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+            {
+                pic_out->analysisData.poc = pic_out->poc;
+                pic_out->analysisData.sliceType = pic_out->sliceType;
+                pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
+                pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
+                pic_out->analysisData.interData = outFrame->m_analysisData.interData;
+                pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
+                writeAnalysisFile(&pic_out->analysisData);
+                freeAnalysis(&pic_out->analysisData);
+            }
         }
-
         if (slice->m_sliceType == P_SLICE)
         {
             if (slice->m_weightPredTable[0][0][0].bPresentFlag)
@@ -521,8 +567,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
         }
         if (m_aborted)
             return -1;
-
         finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits);
+
         // Allow this frame to be recycled if no frame encoders are using it for reference
         if (!pic_out)
         {
@@ -557,10 +603,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
             slice->m_pps = &m_pps;
             slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
             slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS);
-            frameEnc->m_reconPicYuv->m_cuOffsetC = m_cuOffsetC;
-            frameEnc->m_reconPicYuv->m_cuOffsetY = m_cuOffsetY;
-            frameEnc->m_reconPicYuv->m_buOffsetC = m_buOffsetC;
-            frameEnc->m_reconPicYuv->m_buOffsetY = m_buOffsetY;
+            frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC;
+            frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY;
+            frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC;
+            frameEnc->m_reconPic->m_buOffsetY = m_buOffsetY;
         }
         curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
         if (m_bframeDelay)
@@ -573,6 +619,20 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
         }
         else
             frameEnc->m_dts = frameEnc->m_reorderedPts;
+        /* Allocate analysis data before encode in save mode. This is allocated in frameEnc*/
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+        {
+            x265_analysis_data* analysis = &frameEnc->m_analysisData;
+            analysis->poc = frameEnc->m_poc;
+            analysis->sliceType = frameEnc->m_lowres.sliceType;
+            uint32_t widthInCU       = (m_param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
+            uint32_t heightInCU      = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
+
+            uint32_t numCUsInFrame   = widthInCU * heightInCU;
+            analysis->numCUsInFrame  = numCUsInFrame;
+            analysis->numPartitions  = NUM_CU_PARTITIONS;
+            allocAnalysis(analysis);
+        }
 
         // determine references, setup RPS, etc
         m_dpb->prepareEncode(frameEnc);
@@ -965,7 +1025,7 @@ static const char*digestToString(const unsigned char digest[3][16], int numChar)
 
 void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64_t bits)
 {
-    PicYuv* reconPic = curFrame->m_reconPicYuv;
+    PicYuv* reconPic = curFrame->m_reconPic;
 
     //===== calculate PSNR =====
     int width  = reconPic->m_picWidth - m_sps.conformanceWindow.rightOffset;
@@ -1280,8 +1340,8 @@ void Encoder::initPPS(PPS *pps)
         pps->maxCuDQPDepth = 0;
     }
 
-    pps->chromaCbQpOffset = m_param->cbQpOffset;
-    pps->chromaCrQpOffset = m_param->crQpOffset;
+    pps->chromaQpOffset[0] = m_param->cbQpOffset;
+    pps->chromaQpOffset[1] = m_param->crQpOffset;
 
     pps->bConstrainedIntraPred = m_param->bEnableConstrainedIntra;
     pps->bUseWeightPred = m_param->bEnableWeightedPred;
@@ -1290,13 +1350,10 @@ void Encoder::initPPS(PPS *pps)
     pps->bTransformSkipEnabled = m_param->bEnableTransformSkip;
     pps->bSignHideEnabled = m_param->bEnableSignHiding;
 
-    /* If offsets are ever configured, enable bDeblockingFilterControlPresent and set
-     * deblockingFilterBetaOffsetDiv2 / deblockingFilterTcOffsetDiv2 */
-    bool bDeblockOffsetInPPS = 0;
-    pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || bDeblockOffsetInPPS;
+    pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || m_param->deblockingFilterBetaOffset || m_param->deblockingFilterTCOffset;
     pps->bPicDisableDeblockingFilter = !m_param->bEnableLoopFilter;
-    pps->deblockingFilterBetaOffsetDiv2 = 0;
-    pps->deblockingFilterTcOffsetDiv2 = 0;
+    pps->deblockingFilterBetaOffsetDiv2 = m_param->deblockingFilterBetaOffset;
+    pps->deblockingFilterTcOffsetDiv2 = m_param->deblockingFilterTCOffset;
 
     pps->bEntropyCodingSyncEnabled = m_param->bEnableWavefront;
 }
@@ -1330,6 +1387,12 @@ void Encoder::configure(x265_param *p)
         p->bBPyramid = 0;
 
     /* Disable features which are not supported by the current RD level */
+    if (p->rdLevel < 5)
+    {
+        if (p->bEnableCbfFastMode)      /* impossible */
+            x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n");
+        p->bEnableCbfFastMode = 0;
+    }
     if (p->rdLevel < 4)
     {
         if (p->psyRdoq > 0)             /* impossible */
@@ -1458,35 +1521,189 @@ void Encoder::configure(x265_param *p)
             x265_log(p, X265_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s);
     }
 
-    //========= set default display window ==================================
+    /* initialize the conformance window */
     m_conformanceWindow.bEnabled = false;
     m_conformanceWindow.rightOffset = 0;
     m_conformanceWindow.topOffset = 0;
     m_conformanceWindow.bottomOffset = 0;
     m_conformanceWindow.leftOffset = 0;
 
-    //======== set pad size if width is not multiple of the minimum CU size =========
-    const uint32_t minCUSize = MIN_CU_SIZE;
-    if (p->sourceWidth & (minCUSize - 1))
+    /* set pad size if width is not multiple of the minimum CU size */
+    if (p->sourceWidth & (MIN_CU_SIZE - 1))
     {
-        uint32_t rem = p->sourceWidth & (minCUSize - 1);
-        uint32_t padsize = minCUSize - rem;
+        uint32_t rem = p->sourceWidth & (MIN_CU_SIZE - 1);
+        uint32_t padsize = MIN_CU_SIZE - rem;
         p->sourceWidth += padsize;
 
-        /* set the confirmation window offsets  */
         m_conformanceWindow.bEnabled = true;
         m_conformanceWindow.rightOffset = padsize;
     }
 
-    //======== set pad size if height is not multiple of the minimum CU size =========
-    if (p->sourceHeight & (minCUSize - 1))
+    /* set pad size if height is not multiple of the minimum CU size */
+    if (p->sourceHeight & (MIN_CU_SIZE - 1))
     {
-        uint32_t rem = p->sourceHeight & (minCUSize - 1);
-        uint32_t padsize = minCUSize - rem;
+        uint32_t rem = p->sourceHeight & (MIN_CU_SIZE - 1);
+        uint32_t padsize = MIN_CU_SIZE - rem;
         p->sourceHeight += padsize;
 
-        /* set the confirmation window offsets  */
         m_conformanceWindow.bEnabled = true;
         m_conformanceWindow.bottomOffset = padsize;
     }
+    if (p->bDistributeModeAnalysis && p->analysisMode)
+    {
+        p->analysisMode = X265_ANALYSIS_OFF;
+        x265_log(p, X265_LOG_WARNING, "Analysis save and load mode not supported for distributed mode analysis\n");
+    }
+}
+
+void Encoder::allocAnalysis(x265_analysis_data* analysis)
+{
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+    {
+        analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
+        CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
+        CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+        CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+        CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
+        analysis->intraData = intraData;
+    }
+    else
+    {
+        analysis_inter_data *interData = (analysis_inter_data*)analysis->interData;
+        CHECKED_MALLOC(interData, analysis_inter_data, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2);
+        analysis->interData = interData;
+    }
+    return;
+
+fail:
+    freeAnalysis(analysis);
+    m_aborted = true;
+}
+
+void Encoder::freeAnalysis(x265_analysis_data* analysis)
+{
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+    {
+        X265_FREE(((analysis_intra_data*)analysis->intraData)->depth);
+        X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
+        X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes);
+        X265_FREE(analysis->intraData);
+    }
+    else
+        X265_FREE(analysis->interData);
+}
+
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc)
+{
+
+#define X265_FREAD(val, size, readSize, fileOffset)\
+    if (fread(val, size, readSize, fileOffset) != readSize)\
+    {\
+        x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
+        freeAnalysis(analysis);\
+        m_aborted = true;\
+        return;\
+    }\
+
+    static uint64_t consumedBytes = 0;
+    static uint64_t totalConsumedBytes = 0;
+    fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET);
+
+    int poc; uint32_t frameRecordSize;
+    X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+    X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
+
+    uint64_t currentOffset = totalConsumedBytes;
+
+    /* Seeking to the right frame Record */
+    while (poc != curPoc && !feof(m_analysisFile))
+    {
+        currentOffset += frameRecordSize;
+        fseeko(m_analysisFile, currentOffset, SEEK_SET);
+        X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+        X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
+    }
+
+    if (poc != curPoc || feof(m_analysisFile))
+    {
+        x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
+        freeAnalysis(analysis);
+        return;
+    }
+
+    /* Now arrived at the right frame, read the record */
+    analysis->poc = poc;
+    analysis->frameRecordSize = frameRecordSize;
+    X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
+    X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
+    X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
+
+    /* Memory is allocated for inter and intra analysis data based on the slicetype */
+    allocAnalysis(analysis);
+
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+    {
+        X265_FREAD(((analysis_intra_data *)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+        X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+        X265_FREAD(((analysis_intra_data *)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+        analysis->sliceType = X265_TYPE_I;
+        consumedBytes += frameRecordSize;
+    }
+    else if (analysis->sliceType == X265_TYPE_P)
+    {
+        X265_FREAD(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
+        consumedBytes += frameRecordSize;
+        totalConsumedBytes = consumedBytes;
+    }
+    else
+    {
+        X265_FREAD(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
+        consumedBytes += frameRecordSize;
+    }
+#undef X265_FREAD
+}
+
+void Encoder::writeAnalysisFile(x265_analysis_data* analysis)
+{
+
+#define X265_FWRITE(val, size, writeSize, fileOffset)\
+    if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
+    {\
+        x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n");\
+        freeAnalysis(analysis);\
+        m_aborted = true;\
+        return;\
+    }\
+
+    /* calculate frameRecordSize */
+    analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
+                      sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions);
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+        analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 3;
+    else if (analysis->sliceType == X265_TYPE_P)
+        analysis->frameRecordSize += sizeof(analysis_inter_data) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
+    else
+        analysis->frameRecordSize += sizeof(analysis_inter_data) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
+
+    X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+    X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile);
+    X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
+    X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
+    X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
+
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+    {
+        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+    }
+    else if (analysis->sliceType == X265_TYPE_P)
+    {
+        X265_FWRITE(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
+    }
+    else
+    {
+        X265_FWRITE(analysis->interData, sizeof(analysis_inter_data), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
+    }
+#undef X265_FWRITE
 }
diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h
index 8a387c2..27f0736 100644
--- a/source/encoder/encoder.h
+++ b/source/encoder/encoder.h
@@ -74,7 +74,7 @@ struct ThreadLocalData;
 
 class Encoder : public x265_encoder
 {
-private:
+public:
 
     int                m_pocLast;         // time index (POC)
     int                m_encodedFrameNum;
@@ -113,9 +113,7 @@ private:
     int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
     int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
     int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
-
-public:
-
+    FILE*              m_analysisFile;
     int                m_conformanceMode;
     VPS                m_vps;
     SPS                m_sps;
@@ -136,12 +134,10 @@ public:
     bool               m_aborted;          // fatal error detected
 
     Encoder();
-
     ~Encoder() {}
 
     void create();
     void destroy();
-    void init();
 
     int encode(const x265_picture* pic, x265_picture *pic_out);
 
@@ -163,12 +159,20 @@ public:
 
     void updateVbvPlan(RateControl* rc);
 
+    void allocAnalysis(x265_analysis_data* analysis);
+
+    void freeAnalysis(x265_analysis_data* analysis);
+
+    void readAnalysisFile(x265_analysis_data* analysis, int poc);
+
+    void writeAnalysisFile(x265_analysis_data* pic);
+
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
+
 protected:
 
     void initSPS(SPS *sps);
     void initPPS(PPS *pps);
-
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
 };
 }
 
diff --git a/source/encoder/entropy.cpp b/source/encoder/entropy.cpp
index 13eaf57..f7eb566 100644
--- a/source/encoder/entropy.cpp
+++ b/source/encoder/entropy.cpp
@@ -154,8 +154,8 @@ void Entropy::codePPS(const PPS& pps)
     if (pps.bUseDQP)
         WRITE_UVLC(pps.maxCuDQPDepth,      "diff_cu_qp_delta_depth");
 
-    WRITE_SVLC(pps.chromaCbQpOffset,       "pps_cb_qp_offset");
-    WRITE_SVLC(pps.chromaCrQpOffset,       "pps_cr_qp_offset");
+    WRITE_SVLC(pps.chromaQpOffset[0],      "pps_cb_qp_offset");
+    WRITE_SVLC(pps.chromaQpOffset[1],      "pps_cr_qp_offset");
     WRITE_FLAG(0,                          "pps_slice_chroma_qp_offsets_present_flag");
 
     WRITE_FLAG(pps.bUseWeightPred,            "weighted_pred_flag");
@@ -397,7 +397,9 @@ void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData)
         // Ideally this process should not be repeated for each slice in a picture
         if (slice.isIRAP())
             for (int picIdx = 0; picIdx < slice.m_rps.numberOfPictures; picIdx++)
+            {
                 X265_CHECK(!slice.m_rps.bUsed[picIdx], "pic unused failure\n");
+            }
 #endif
 
         WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
@@ -515,9 +517,9 @@ void Entropy::encodeCTU(const CUData& ctu, const CUGeom& cuGeom)
 }
 
 /* encode a CU block recursively */
-void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP)
+void Entropy::encodeCU(const CUData& ctu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP)
 {
-    const Slice* slice = cu.m_slice;
+    const Slice* slice = ctu.m_slice;
 
     if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
         bEncodeDQP = true;
@@ -527,78 +529,124 @@ void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartI
 
     if (!cuUnsplitFlag)
     {
-        uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2;
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+        uint32_t qNumParts = cuGeom.numPartitions >> 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
-                encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP);
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP);
         }
         return;
     }
 
     // We need to split, so don't try these modes.
     if (cuSplitFlag) 
-        codeSplitFlag(cu, absPartIdx, depth);
+        codeSplitFlag(ctu, absPartIdx, depth);
 
-    if (depth < cu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
+    if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
     {
-        uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2;
-
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+        uint32_t qNumParts = cuGeom.numPartitions >> 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
         {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP);
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
+            encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP);
         }
         return;
     }
 
     if (slice->m_pps->bTransquantBypassEnabled)
-        codeCUTransquantBypassFlag(cu.m_tqBypass[absPartIdx]);
+        codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]);
 
     if (!slice->isIntra())
-        codeSkipFlag(cu, absPartIdx);
-
-    if (cu.isSkipped(absPartIdx))
     {
-        codeMergeIndex(cu, absPartIdx);
-        finishCU(cu, absPartIdx, depth);
-        return;
+        codeSkipFlag(ctu, absPartIdx);
+        if (ctu.isSkipped(absPartIdx))
+        {
+            codeMergeIndex(ctu, absPartIdx);
+            finishCU(ctu, absPartIdx, depth);
+            return;
+        }
+        codePredMode(ctu.m_predMode[absPartIdx]);
     }
 
-    if (!slice->isIntra())
-        codePredMode(cu.m_predMode[absPartIdx]);
-
-    codePartSize(cu, absPartIdx, depth);
+    codePartSize(ctu, absPartIdx, depth);
 
     // prediction Info ( Intra : direction mode, Inter : Mv, reference idx )
-    codePredInfo(cu, absPartIdx);
+    codePredInfo(ctu, absPartIdx);
 
     uint32_t tuDepthRange[2];
-    if (cu.isIntra(absPartIdx))
-        cu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx);
+    if (ctu.isIntra(absPartIdx))
+        ctu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx);
     else
-        cu.getInterTUQtDepthRange(tuDepthRange, absPartIdx);
+        ctu.getInterTUQtDepthRange(tuDepthRange, absPartIdx);
 
     // Encode Coefficients, allow codeCoeff() to modify bEncodeDQP
-    codeCoeff(cu, absPartIdx, depth, bEncodeDQP, tuDepthRange);
+    codeCoeff(ctu, absPartIdx, bEncodeDQP, tuDepthRange);
 
     // --- write terminating bit ---
-    finishCU(cu, absPartIdx, depth);
+    finishCU(ctu, absPartIdx, depth);
+}
+
+/* Return bit count of signaling inter mode */
+uint32_t Entropy::bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const
+{
+    uint32_t bits;
+    bits = bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); /* not skip */
+    bits += bitsCodeBin(0, m_contextState[OFF_PRED_MODE_CTX]); /* inter */
+    PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
+    switch (partSize)
+    {
+    case SIZE_2Nx2N:
+        bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX]);
+        break;
+
+    case SIZE_2NxN:
+    case SIZE_2NxnU:
+    case SIZE_2NxnD:
+        bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]);
+        bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 1]);
+        if (cu.m_slice->m_sps->maxAMPDepth > depth)
+        {
+            bits += bitsCodeBin((partSize == SIZE_2NxN) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]);
+            if (partSize != SIZE_2NxN)
+                bits++; // encodeBinEP((partSize == SIZE_2NxnU ? 0 : 1));
+        }
+        break;
+
+    case SIZE_Nx2N:
+    case SIZE_nLx2N:
+    case SIZE_nRx2N:
+        bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]);
+        bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]);
+        if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3))
+            bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]);
+        if (cu.m_slice->m_sps->maxAMPDepth > depth)
+        {
+            bits += bitsCodeBin((partSize == SIZE_Nx2N) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]);
+            if (partSize != SIZE_Nx2N)
+                bits++; // encodeBinEP((partSize == SIZE_nLx2N ? 0 : 1));
+        }
+        break;
+    default:
+        X265_CHECK(0, "invalid CU partition\n");
+        break;
+    }
+
+    return bits;
 }
 
 /* finish encoding a cu and handle end-of-slice conditions */
-void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
+void Entropy::finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth)
 {
-    const Slice* slice = cu.m_slice;
-    X265_CHECK(cu.m_slice->m_endCUAddr == cu.m_slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n");
+    const Slice* slice = ctu.m_slice;
     uint32_t realEndAddress = slice->m_endCUAddr;
-    uint32_t cuAddr = cu.getSCUAddr() + absPartIdx;
+    uint32_t cuAddr = ctu.getSCUAddr() + absPartIdx;
+    X265_CHECK(realEndAddress == slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n");
 
     uint32_t granularityMask = g_maxCUSize - 1;
-    uint32_t cuSize = 1 << cu.m_log2CUSize[absPartIdx];
-    uint32_t rpelx = cu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize;
-    uint32_t bpely = cu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize;
+    uint32_t cuSize = 1 << ctu.m_log2CUSize[absPartIdx];
+    uint32_t rpelx = ctu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize;
+    uint32_t bpely = ctu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize;
     bool granularityBoundary = (((rpelx & granularityMask) == 0 || (rpelx == slice->m_sps->picWidthInLumaSamples )) &&
                                 ((bpely & granularityMask) == 0 || (bpely == slice->m_sps->picHeightInLumaSamples)));
 
@@ -618,41 +666,18 @@ void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
     }
 }
 
-void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx,
-                              uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, bool& bCodeDQP, uint32_t depthRange[2])
+void Entropy::encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+                              bool& bCodeDQP, const uint32_t depthRange[2])
 {
-    const bool subdiv = cu.m_tuDepth[absPartIdx] + cu.m_cuDepth[absPartIdx] > (uint8_t)depth;
-    uint32_t hChromaShift = cu.m_hChromaShift;
-    uint32_t vChromaShift = cu.m_vChromaShift;
-    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, trIdx);
-    uint32_t cbfU = cu.getCbf(absPartIdx, TEXT_CHROMA_U, trIdx);
-    uint32_t cbfV = cu.getCbf(absPartIdx, TEXT_CHROMA_V, trIdx);
-
-    if (!trIdx)
-        state.bakAbsPartIdxCU = absPartIdx;
-
-    if (log2TrSize == 2 && cu.m_chromaFormat != X265_CSP_I444)
-    {
-        uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-        if (!(absPartIdx & (partNum - 1)))
-        {
-            state.bakAbsPartIdx   = absPartIdx;
-            state.bakChromaOffset = offsetChroma;
-        }
-        else if ((absPartIdx & (partNum - 1)) == (partNum - 1))
-        {
-            cbfU = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_U, trIdx);
-            cbfV = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_V, trIdx);
-        }
-    }
+    const bool subdiv = cu.m_tuDepth[absPartIdx] > tuDepth;
 
     /* in each of these conditions, the subdiv flag is implied and not signaled,
      * so we have checks to make sure the implied value matches our intentions */
-    if (cu.m_predMode[absPartIdx] == MODE_INTRA && cu.m_partSize[absPartIdx] == SIZE_NxN && depth == cu.m_cuDepth[absPartIdx])
+    if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth)
     {
         X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
     }
-    else if (cu.m_predMode[absPartIdx] == MODE_INTER && (cu.m_partSize[absPartIdx] != SIZE_2Nx2N) && depth == cu.m_cuDepth[absPartIdx] &&
+    else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth &&
              cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
     {
         X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2TrSize %d, depthRange[0] %d\n", log2TrSize, depthRange[0]);
@@ -671,127 +696,111 @@ void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t
         codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
     }
 
-    const uint32_t trDepthCurr = depth - cu.m_cuDepth[absPartIdx];
-    const bool bFirstCbfOfCU = trDepthCurr == 0;
-
-    bool mCodeAll = true;
-    const uint32_t numPels = 1 << (log2TrSize * 2 - hChromaShift - vChromaShift);
-    if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
-        mCodeAll = false;
-
-    if (bFirstCbfOfCU || mCodeAll)
+    uint32_t hChromaShift = cu.m_hChromaShift;
+    uint32_t vChromaShift = cu.m_vChromaShift;
+    bool bSmallChroma = (log2TrSize - hChromaShift < 2);
+    if (!tuDepth || !bSmallChroma)
     {
-        uint32_t tuSize = 1 << log2TrSize;
-        if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1))
-            codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_U, trDepthCurr, (subdiv == 0));
-        if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1))
-            codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_V, trDepthCurr, (subdiv == 0));
+        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+            codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+            codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
     }
     else
     {
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1), "chroma xform size match failure\n");
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1), "chroma xform size match failure\n");
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma xform size match failure\n");
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma xform size match failure\n");
     }
 
     if (subdiv)
     {
-        log2TrSize--;
-        uint32_t numCoeff  = 1 << (log2TrSize * 2);
-        uint32_t numCoeffC = (numCoeff >> (hChromaShift + vChromaShift));
-        trIdx++;
-        ++depth;
-        absPartIdxStep >>= 2;
-        const uint32_t partNum = NUM_CU_PARTITIONS >> (depth << 1);
+        --log2TrSize;
+        ++tuDepth;
 
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+        uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
 
-        absPartIdx += partNum;
-        offsetLuma += numCoeff;
-        offsetChroma += numCoeffC;
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+        encodeTransform(cu, absPartIdx + 0 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+        encodeTransform(cu, absPartIdx + 1 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+        encodeTransform(cu, absPartIdx + 2 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+        encodeTransform(cu, absPartIdx + 3 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+        return;
+    }
 
-        absPartIdx += partNum;
-        offsetLuma += numCoeff;
-        offsetChroma += numCoeffC;
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+    uint32_t absPartIdxC = bSmallChroma ? absPartIdx & 0xFC : absPartIdx;
 
-        absPartIdx += partNum;
-        offsetLuma += numCoeff;
-        offsetChroma += numCoeffC;
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+    if (cu.isInter(absPartIdxC) && !tuDepth && !cu.getCbf(absPartIdxC, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdxC, TEXT_CHROMA_V, 0))
+    {
+        X265_CHECK(cu.getCbf(absPartIdxC, TEXT_LUMA, 0), "CBF should have been set\n");
     }
     else
+        codeQtCbfLuma(cu, absPartIdx, tuDepth);
+
+    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth);
+    uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, tuDepth);
+    uint32_t cbfV = cu.getCbf(absPartIdxC, TEXT_CHROMA_V, tuDepth);
+    if (!(cbfY || cbfU || cbfV))
+        return;
+
+    // dQP: only for CTU once
+    if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
     {
-        if (cu.m_predMode[absPartIdx] != MODE_INTRA && depth == cu.m_cuDepth[absPartIdx] && !cu.getCbf(absPartIdx, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdx, TEXT_CHROMA_V, 0))
-        {
-            X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
-        }
-        else
-            codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+        uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+        uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
+        codeDeltaQP(cu, absPartIdxLT);
+        bCodeDQP = false;
+    }
 
-        if (cbfY || cbfU || cbfV)
-        {
-            // dQP: only for CTU once
-            if (cu.m_slice->m_pps->bUseDQP)
-            {
-                if (bCodeDQP)
-                {
-                    codeDeltaQP(cu, state.bakAbsPartIdxCU);
-                    bCodeDQP = false;
-                }
-            }
-        }
-        if (cbfY)
-            codeCoeffNxN(cu, cu.m_trCoeff[0] + offsetLuma, absPartIdx, log2TrSize, TEXT_LUMA);
+    if (cbfY)
+    {
+        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
+        codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2TrSize, TEXT_LUMA);
+        if (!(cbfU || cbfV))
+            return;
+    }
 
-        int chFmt = cu.m_chromaFormat;
-        if (log2TrSize == 2 && chFmt != X265_CSP_I444)
+    if (bSmallChroma)
+    {
+        if ((absPartIdx & 3) != 3)
+            return;
+
+        const uint32_t log2TrSizeC = 2;
+        const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422);
+        const uint32_t curPartNum = 4;
+        uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-            if ((absPartIdx & (partNum - 1)) == (partNum - 1))
+            TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC);
+            const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
+            do
             {
-                const uint32_t log2TrSizeC = 2;
-                const bool splitIntoSubTUs = (chFmt == X265_CSP_I422);
-
-                uint32_t curPartNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-
-                for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+                if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs))
                 {
-                    TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, state.bakAbsPartIdx);
-                    const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
-                    do
-                    {
-                        uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
-                        if (cbf)
-                        {
-                            uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-                            codeCoeffNxN(cu, coeffChroma + state.bakChromaOffset + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
-                        }
-                    }
-                    while (tuIterator.isNextSection());
+                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+                    codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
                 }
             }
+            while (tuIterator.isNextSection());
         }
-        else
+    }
+    else
+    {
+        uint32_t log2TrSizeC = log2TrSize - hChromaShift;
+        const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422);
+        uint32_t curPartNum = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
+        uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            uint32_t log2TrSizeC = log2TrSize - hChromaShift;
-            const bool splitIntoSubTUs = (chFmt == X265_CSP_I422);
-            uint32_t curPartNum = NUM_CU_PARTITIONS >> (depth << 1);
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+            TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC);
+            const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
+            do
             {
-                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx);
-                const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
-                do
+                if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs))
                 {
-                    uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
-                    if (cbf)
-                    {
-                        uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-                        codeCoeffNxN(cu, coeffChroma + offsetChroma + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
-                    }
+                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+                    codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
                 }
-                while (tuIterator.isNextSection());
             }
+            while (tuIterator.isNextSection());
         }
     }
 }
@@ -808,14 +817,14 @@ void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
 
             codeIntraDirChroma(cu, absPartIdx, chromaDirMode);
 
-            if ((cu.m_chromaFormat == X265_CSP_I444) && (cu.m_partSize[absPartIdx] == SIZE_NxN))
+            if (cu.m_chromaFormat == X265_CSP_I444 && cu.m_partSize[absPartIdx] != SIZE_2Nx2N)
             {
-                uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2;
-                for (uint32_t i = 1; i <= 3; i++)
+                uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2;
+                for (uint32_t qIdx = 1; qIdx < 4; ++qIdx)
                 {
-                    uint32_t offset = absPartIdx + i * partOffset;
-                    cu.getAllowedChromaDir(offset, chromaDirMode);
-                    codeIntraDirChroma(cu, offset, chromaDirMode);
+                    absPartIdx += qNumParts;
+                    cu.getAllowedChromaDir(absPartIdx, chromaDirMode);
+                    codeIntraDirChroma(cu, absPartIdx, chromaDirMode);
                 }
             }
         }
@@ -867,7 +876,7 @@ void Entropy::codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list)
         codeRefFrmIdx(cu, absPartIdx, list);
 }
 
-void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2])
+void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2])
 {
     if (!cu.isIntra(absPartIdx))
     {
@@ -877,12 +886,8 @@ void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, b
             return;
     }
 
-    uint32_t log2CUSize   = cu.m_log2CUSize[absPartIdx];
-    uint32_t lumaOffset   = absPartIdx << (LOG2_UNIT_SIZE * 2);
-    uint32_t chromaOffset = lumaOffset >> (cu.m_hChromaShift + cu.m_vChromaShift);
-    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> (depth << 1);
-    CoeffCodeState state;
-    encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange);
+    uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+    encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
 }
 
 void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
@@ -1116,7 +1121,7 @@ void Entropy::writeCoefRemainExGolomb(uint32_t codeNumber, uint32_t absGoRice)
         if (codeNumber != 0)
         {
             unsigned long idx;
-            CLZ32(idx, codeNumber + 1);
+            CLZ(idx, codeNumber + 1);
             length = idx;
             codeNumber -= (1 << idx) - 1;
         }
@@ -1145,11 +1150,6 @@ void Entropy::copyFrom(const Entropy& src)
     markValid();
 }
 
-void Entropy::codeMVPIdx(uint32_t symbol)
-{
-    encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]);
-}
-
 void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
 {
     PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
@@ -1200,32 +1200,6 @@ void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth
     }
 }
 
-void Entropy::codePredMode(int predMode)
-{
-    encodeBin(predMode == MODE_INTER ? 0 : 1, m_contextState[OFF_PRED_MODE_CTX]);
-}
-
-void Entropy::codeCUTransquantBypassFlag(uint32_t symbol)
-{
-    encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]);
-}
-
-void Entropy::codeSkipFlag(const CUData& cu, uint32_t absPartIdx)
-{
-    // get context function is here
-    uint32_t symbol = cu.isSkipped(absPartIdx) ? 1 : 0;
-    uint32_t ctxSkip = cu.getCtxSkipFlag(absPartIdx);
-
-    encodeBin(symbol, m_contextState[OFF_SKIP_FLAG_CTX + ctxSkip]);
-}
-
-void Entropy::codeMergeFlag(const CUData& cu, uint32_t absPartIdx)
-{
-    const uint32_t symbol = cu.m_mergeFlag[absPartIdx] ? 1 : 0;
-
-    encodeBin(symbol, m_contextState[OFF_MERGE_FLAG_EXT_CTX]);
-}
-
 void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx)
 {
     uint32_t numCand = cu.m_slice->m_maxNumMergeCand;
@@ -1246,50 +1220,18 @@ void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx)
     }
 }
 
-void Entropy::codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
-{
-    X265_CHECK(depth < g_maxCUDepth, "invalid depth\n");
-
-    uint32_t ctx           = cu.getCtxSplitFlag(absPartIdx, depth);
-    uint32_t currSplitFlag = (cu.m_cuDepth[absPartIdx] > depth) ? 1 : 0;
-
-    X265_CHECK(ctx < 3, "ctx out of range\n");
-    encodeBin(currSplitFlag, m_contextState[OFF_SPLIT_FLAG_CTX + ctx]);
-}
-
-void Entropy::codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx)
-{
-    encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]);
-}
-
-uint32_t Entropy::bitsIntraModeNonMPM() const
-{
-    uint32_t mstate = m_contextState[OFF_ADI_CTX];
-    uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 0)) >> 15;
-    return bits + 5; /* fixed cost for encodeBinsEP() */
-}
-
-uint32_t Entropy::bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const
-{
-    X265_CHECK(dir == preds[0] || dir == preds[1] || dir == preds[2], "dir must be a most probable mode\n");
-    uint32_t mstate = m_contextState[OFF_ADI_CTX];
-    uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 1)) >> 15;
-    return bits + (dir == preds[0] ? 1 : 2);
-}
-
 void Entropy::codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple)
 {
     uint32_t dir[4], j;
     uint32_t preds[4][3];
     int predIdx[4];
-    PartSize mode = (PartSize)cu.m_partSize[absPartIdx];
-    uint32_t partNum = isMultiple ? (mode == SIZE_NxN ? 4 : 1) : 1;
-    uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2;
+    uint32_t partNum = isMultiple && cu.m_partSize[absPartIdx] != SIZE_2Nx2N ? 4 : 1;
+    uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2;
 
-    for (j = 0; j < partNum; j++)
+    for (j = 0; j < partNum; j++, absPartIdx += qNumParts)
     {
-        dir[j] = cu.m_lumaIntraDir[absPartIdx + partOffset * j];
-        cu.getIntraDirLumaPredictor(absPartIdx + partOffset * j, preds[j]);
+        dir[j] = cu.m_lumaIntraDir[absPartIdx];
+        cu.getIntraDirLumaPredictor(absPartIdx, preds[j]);
         predIdx[j] = -1;
         for (uint32_t i = 0; i < 3; i++)
             if (dir[j] == preds[j][i])
@@ -1444,46 +1386,25 @@ void Entropy::codeDeltaQP(const CUData& cu, uint32_t absPartIdx)
     }
 }
 
-void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel)
+void Entropy::codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel)
 {
-    uint32_t ctx = ctxCbf[ttype][trDepth];
+    uint32_t ctx = tuDepth + 2;
 
-    bool canQuadSplit       = (width >= (MIN_TU_SIZE * 2)) && (height >= (MIN_TU_SIZE * 2));
-    uint32_t lowestTUDepth  = trDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF
+    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+    bool canQuadSplit       = (log2TrSize - cu.m_hChromaShift > 2);
+    uint32_t lowestTUDepth  = tuDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF
 
-    if ((width != height) && (lowestLevel || !canQuadSplit)) // if sub-TUs are present
+    if (cu.m_chromaFormat == X265_CSP_I422 && (lowestLevel || !canQuadSplit)) // if sub-TUs are present
     {
         uint32_t subTUDepth        = lowestTUDepth + 1;   // if this is the lowest level of the TU-tree, the sub-TUs are directly below.
                                                           // Otherwise, this must be the level above the lowest level (as specified above)
-        uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
-
-        for (uint32_t subTU = 0; subTU < 2; subTU++)
-        {
-            uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
-            uint32_t cbf = cu.getCbf(subTUAbsPartIdx, ttype, subTUDepth);
+        uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
 
-            encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
-        }
+        encodeBin(cu.getCbf(absPartIdx             , ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
+        encodeBin(cu.getCbf(absPartIdx + tuNumParts, ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
     }
     else
-    {
-        uint32_t cbf = cu.getCbf(absPartIdx, ttype, lowestTUDepth);
-
-        encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
-    }
-}
-
-void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth)
-{
-    uint32_t ctx = ctxCbf[ttype][trDepth];
-    uint32_t cbf = cu.getCbf(absPartIdx, ttype, trDepth);
-    encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth)
-{
-    uint32_t ctx = ctxCbf[ttype][trDepth];
-    encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
+        encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
 }
 
 void Entropy::codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype)
@@ -1497,26 +1418,6 @@ void Entropy::codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint
     encodeBin(useTransformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]);
 }
 
-void Entropy::codeQtRootCbf(uint32_t cbf)
-{
-    encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]);
-}
-
-void Entropy::codeQtCbfZero(TextType ttype, uint32_t trDepth)
-{
-    // this function is only used to estimate the bits when cbf is 0
-    // and will never be called when writing the bitsream.
-    uint32_t ctx = ctxCbf[ttype][trDepth];
-    encodeBin(0, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeQtRootCbfZero()
-{
-    // this function is only used to estimate the bits when cbf is 0
-    // and will never be called when writing the bistream.
-    encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]);
-}
-
 /** Encode (X,Y) position of the last significant coefficient
  * \param posx X component of last coefficient
  * \param posy Y component of last coefficient
@@ -2006,9 +1907,9 @@ void Entropy::encodeBin(uint32_t binValue, uint8_t &ctxModel)
     if ((binValue ^ mstate) & 1)
     {
         // NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256
-        //numBits   = g_renormTable[lps >> 3];
+        //numBits = g_renormTable[lps >> 3];
         unsigned long idx;
-        CLZ32(idx, lps);
+        CLZ(idx, lps);
         X265_CHECK(state != 63 || idx == 1, "state failure\n");
 
         numBits = 8 - idx;
diff --git a/source/encoder/entropy.h b/source/encoder/entropy.h
index bed06cf..9cd927f 100644
--- a/source/encoder/entropy.h
+++ b/source/encoder/entropy.h
@@ -27,6 +27,7 @@
 #include "common.h"
 #include "bitstream.h"
 #include "frame.h"
+#include "cudata.h"
 #include "contexts.h"
 #include "slice.h"
 
@@ -35,8 +36,6 @@ namespace x265 {
 
 struct SaoCtuParam;
 struct EstBitsSbac;
-class CUData;
-struct CUGeom;
 class ScalingList;
 
 enum SplitType
@@ -154,41 +153,55 @@ public:
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
 
     void encodeCTU(const CUData& cu, const CUGeom& cuGeom);
-    void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
-    void codeSaoMerge(uint32_t code)   { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
 
-    void codeCUTransquantBypassFlag(uint32_t symbol);
-    void codeSkipFlag(const CUData& cu, uint32_t absPartIdx);
-    void codeMergeFlag(const CUData& cu, uint32_t absPartIdx);
+    void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
+    void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+
     void codeMergeIndex(const CUData& cu, uint32_t absPartIdx);
-    void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
-    void codeMVPIdx(uint32_t symbol);
     void codeMvd(const CUData& cu, uint32_t absPartIdx, int list);
 
     void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
-    void codePredMode(int predMode);
     void codePredInfo(const CUData& cu, uint32_t absPartIdx);
-    void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx);
-    void codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel);
-    void codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth);
-    void codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth);
-    void codeQtCbfZero(TextType ttype, uint32_t trDepth);
-    void codeQtRootCbfZero();
-    void codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]);
+    inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
+
+    void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
+    void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
     void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
 
-    uint32_t bitsIntraModeNonMPM() const;
-    uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const;
-    void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
-    void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+    inline void codeSaoMerge(uint32_t code)                          { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
+    inline void codeMVPIdx(uint32_t symbol)                          { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
+    inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
+    inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx)  { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
+    inline void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) { encodeBin(cu.m_cuDepth[absPartIdx] > depth, m_contextState[OFF_SPLIT_FLAG_CTX + cu.getCtxSplitFlag(absPartIdx, depth)]); }
+    inline void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx)    { encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); }
+    inline void codePredMode(int predMode)                                { encodeBin(predMode == MODE_INTRA ? 1 : 0, m_contextState[OFF_PRED_MODE_CTX]); }
+    inline void codeCUTransquantBypassFlag(uint32_t symbol)               { encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); }
+    inline void codeQtCbfLuma(uint32_t cbf, uint32_t tuDepth)             { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + !tuDepth]); }
+    inline void codeQtCbfChroma(uint32_t cbf, uint32_t tuDepth)           { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + 2 + tuDepth]); }
+    inline void codeQtRootCbf(uint32_t cbf)                               { encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+
+    void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
 
-    // RDO functions
+    /* RDO functions */
     void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
     void estCBFBit(EstBitsSbac& estBitsSbac) const;
     void estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
     void estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
     void estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
 
+    inline uint32_t bitsIntraModeNonMPM() const { return bitsCodeBin(0, m_contextState[OFF_ADI_CTX]) + 5; }
+    inline uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const { return bitsCodeBin(1, m_contextState[OFF_ADI_CTX]) + (dir == preds[0] ? 1 : 2); }
+    inline uint32_t estimateCbfBits(uint32_t cbf, TextType ttype, uint32_t tuDepth) const { return bitsCodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctxCbf[ttype][tuDepth]]); }
+    uint32_t bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const;
+    uint32_t bitsIntraMode(const CUData& cu, uint32_t absPartIdx) const
+    {
+        return bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]) + /* not skip */
+               bitsCodeBin(1, m_contextState[OFF_PRED_MODE_CTX]); /* intra */
+    }
+
+    /* these functions are only used to estimate the bits when cbf is 0 and will never be called when writing the bistream. */
+    inline void codeQtRootCbfZero() { encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+
 private:
 
     /* CABAC private methods */
@@ -200,8 +213,15 @@ private:
     void encodeBinsEP(uint32_t binValues, int numBins);
     void encodeBinTrm(uint32_t binValue);
 
-    void encodeCU(const CUData& cu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
-    void finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
+    /* return the bits of encoding the context bin without updating */
+    inline uint32_t bitsCodeBin(uint32_t binValue, uint32_t ctxModel) const
+    {
+        uint64_t fracBits = (m_fracBits & 32767) + sbacGetEntropyBits(ctxModel, binValue);
+        return (uint32_t)(fracBits >> 15);
+    }
+
+    void encodeCU(const CUData& ctu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
+    void finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth);
 
     void writeOut();
 
@@ -217,7 +237,6 @@ private:
     void codePredWeightTable(const Slice& slice);
     void codeInterDir(const CUData& cu, uint32_t absPartIdx);
     void codePUWise(const CUData& cu, uint32_t absPartIdx);
-    void codeQtRootCbf(uint32_t cbf);
     void codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list);
     void codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list);
 
@@ -227,16 +246,8 @@ private:
     void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx);
     void codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype);
 
-    struct CoeffCodeState
-    {
-        uint32_t bakAbsPartIdx;
-        uint32_t bakChromaOffset;
-        uint32_t bakAbsPartIdxCU;
-    };
-
-    void encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma,
-                         uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx,
-                         bool& bCodeDQP, uint32_t depthRange[2]);
+    void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+                         bool& bCodeDQP, const uint32_t depthRange[2]);
 
     void copyFrom(const Entropy& src);
     void copyContextsFrom(const Entropy& src);
diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
index c6e6915..5f4d2f7 100644
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -29,8 +29,6 @@
 #include "wavefront.h"
 #include "param.h"
 
-#include "PPA/ppa.h"
-
 #include "encoder.h"
 #include "frameencoder.h"
 #include "common.h"
@@ -126,23 +124,24 @@ bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id)
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
     }
 
-    if (m_param->noiseReduction)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
         m_nr = X265_MALLOC(NoiseReduction, 1);
     if (m_nr)
         memset(m_nr, 0, sizeof(NoiseReduction));
     else
-        m_param->noiseReduction = 0;
+        m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
 
     start();
     return ok;
 }
 
 /* Generate a complete list of unique geom sets for the current picture dimensions */
-bool FrameEncoder::initializeGeoms(const FrameData& encData)
+bool FrameEncoder::initializeGeoms()
 {
     /* Geoms only vary between CTUs in the presence of picture edges */
-    int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1);
-    int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1);
+    int maxCUSize = m_param->maxCUSize;
+    int heightRem = m_param->sourceHeight & (maxCUSize - 1);
+    int widthRem = m_param->sourceWidth & (maxCUSize - 1);
     int allocGeoms = 1; // body
     if (heightRem && widthRem)
         allocGeoms = 4; // body, right, bottom, corner
@@ -154,33 +153,45 @@ bool FrameEncoder::initializeGeoms(const FrameData& encData)
     if (!m_cuGeoms || !m_ctuGeomMap)
         return false;
 
-    CUGeom cuLocalData[CUGeom::MAX_GEOMS];
-    memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp
+    // body
+    CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms);
+    memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
+    if (allocGeoms == 1)
+        return true;
 
-    int countGeoms = 0;
-    for (uint32_t ctuAddr = 0; ctuAddr < m_numRows * m_numCols; ctuAddr++)
+    int countGeoms = 1;
+    if (widthRem)
     {
-        /* TODO: detach this logic from TComDataCU */
-        encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0);
-        encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData);
-
-        m_ctuGeomMap[ctuAddr] = MAX_INT;
-        for (int i = 0; i < countGeoms; i++)
+        // right
+        CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+        for (int i = 0; i < m_numRows; i++)
         {
-            if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS))
-            {
-                m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS;
-                break;
-            }
+            uint32_t ctuAddr = m_numCols * (i + 1) - 1;
+            m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
         }
+        countGeoms++;
+    }
+    if (heightRem)
+    {
+        // bottom
+        CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+        for (uint32_t i = 0; i < m_numCols; i++)
+        {
+            uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
+            m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
+        }
+        countGeoms++;
 
-        if (m_ctuGeomMap[ctuAddr] == MAX_INT)
+        if (widthRem)
         {
-            X265_CHECK(countGeoms < allocGeoms, "geometry match check failure\n");
+            // corner
+            CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+
+            uint32_t ctuAddr = m_numCols * m_numRows - 1;
             m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
-            memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS);
             countGeoms++;
         }
+        X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
     }
 
     return true;
@@ -191,11 +202,13 @@ bool FrameEncoder::startCompressFrame(Frame* curFrame)
     m_frame = curFrame;
     curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
     curFrame->m_encData->m_slice->m_mref = m_mref;
+
     if (!m_cuGeoms)
     {
-        if (!initializeGeoms(*curFrame->m_encData))
+        if (!initializeGeoms())
             return false;
     }
+
     m_enable.trigger();
     return true;
 }
@@ -217,7 +230,7 @@ void FrameEncoder::threadMain()
 
 void FrameEncoder::compressFrame()
 {
-    PPAScopeEvent(FrameEncoder_compressFrame);
+    //ProfileScopeEvent(frameThread);
     int64_t startCompressTime = x265_mdate();
     Slice* slice = m_frame->m_encData->m_slice;
 
@@ -252,7 +265,7 @@ void FrameEncoder::compressFrame()
             WeightParam *w = NULL;
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
                 w = slice->m_weightPredTable[l][ref];
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w);
+            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
         }
     }
 
@@ -481,7 +494,7 @@ void FrameEncoder::compressFrame()
         for (int i = 0; i < m_top->m_numThreadLocalData; i++)
         {
             NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
-            memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
+            memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
             memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
             memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
         }
@@ -569,7 +582,6 @@ void FrameEncoder::encodeSlice()
 
 void FrameEncoder::compressCTURows()
 {
-    PPAScopeEvent(FrameEncoder_compressRows);
     Slice* slice = m_frame->m_encData->m_slice;
 
     m_bAllRowsStop = false;
@@ -643,12 +655,12 @@ void FrameEncoder::compressCTURows()
                     }
                 }
 
-                processRow(i * 2 + 0, -1);
+                processRowEncoder(i, *m_tld);
             }
 
             // Filter
             if (i >= m_filterRowDelay)
-                processRow((i - m_filterRowDelay) * 2 + 1, -1);
+                m_frameFilter.processRow(i - m_filterRowDelay);
         }
     }
     m_frameTime = (double)m_totalTime / 1000000;
@@ -666,7 +678,7 @@ void FrameEncoder::processRow(int row, int threadId)
         processRowEncoder(realRow, tld);
     else
     {
-        processRowFilter(realRow);
+        m_frameFilter.processRow(realRow);
 
         // NOTE: Active next row
         if (realRow != m_numRows - 1)
@@ -679,8 +691,6 @@ void FrameEncoder::processRow(int row, int threadId)
 // Called by worker threads
 void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
 {
-    PPAScopeEvent(Thread_ProcessRow);
-
     CTURow& curRow = m_rows[row];
 
     {
@@ -707,9 +717,6 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
     Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
     FrameData& curEncData = *m_frame->m_encData;
     Slice *slice = curEncData.m_slice;
-    PicYuv* fencPic = m_frame->m_origPicYuv;
-
-    tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
 
     int64_t startTime = x265_mdate();
     const uint32_t numCols = m_numCols;
@@ -718,6 +725,8 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
 
     while (curRow.completed < numCols)
     {
+        ProfileScopeEvent(encodeCTU);
+
         int col = curRow.completed;
         const uint32_t cuAddr = lineStartCUAddr + col;
         CUData* ctu = curEncData.getPicCTU(cuAddr);
@@ -744,7 +753,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
             int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp);
             tld.analysis.setQP(*slice, qp);
             qp = Clip3(QP_MIN, QP_MAX_SPEC, qp);
-            ctu->setQPSubParts((char)qp, 0, 0);
+            ctu->setQPSubParts((int8_t)qp, 0, 0);
             curEncData.m_rowStat[row].sumQpAq += qp;
         }
         else
@@ -758,7 +767,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
         }
 
         // Does all the CU analysis, returns best top level mode decision
-        Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
+        Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
 
         /* advance top-level row coder to include the context of this CTU.
          * if SAO is disabled, rowCoder writes the final CTU bitstream */
@@ -839,9 +848,13 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
                                 if (dequeueRow(r * 2))
                                     stopRow.active = false;
                                 else
+                                {
+                                    /* we must release the row lock to allow the thread to exit */
+                                    stopRow.lock.release();
                                     GIVE_UP_TIME();
+                                    stopRow.lock.acquire();
+                                }
                             }
-
                             stopRow.lock.release();
 
                             bool bRowBusy = true;
@@ -937,19 +950,22 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
         m_top->m_rateControl->rateControlUpdateStats(&m_rce);
     }
 
-    // trigger row-wise loop filters
-    if (row >= m_filterRowDelay)
+    if (m_param->bEnableWavefront)
     {
-        enableRowFilter(row - m_filterRowDelay);
+        /* trigger row-wise loop filters */
+        if (row >= m_filterRowDelay)
+        {
+            enableRowFilter(row - m_filterRowDelay);
 
-        // NOTE: Active Filter to first row (row 0)
-        if (row == m_filterRowDelay)
-            enqueueRowFilter(0);
-    }
-    if (row == m_numRows - 1)
-    {
-        for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
-            enableRowFilter(i);
+            /* NOTE: Activate filter if first row (row 0) */
+            if (row == m_filterRowDelay)
+                enqueueRowFilter(0);
+        }
+        if (row == m_numRows - 1)
+        {
+            for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
+                enableRowFilter(i);
+        }
     }
 
     m_totalTime += x265_mdate() - startTime;
@@ -971,13 +987,13 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
             log->cntIntra[depth]++;
             log->qTreeIntraCnt[depth]++;
 
-            if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
+            if (ctu.m_predMode[absPartIdx] == MODE_NONE)
             {
                 log->totalCu--;
                 log->cntIntra[depth]--;
                 log->qTreeIntraCnt[depth]--;
             }
-            else if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
+            else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
             {
                 /* TODO: log intra modes at absPartIdx +0 to +3 */
                 X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
@@ -1000,7 +1016,7 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
             log->totalCu++;
             log->cntTotalCu[depth]++;
 
-            if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
+            if (ctu.m_predMode[absPartIdx] == MODE_NONE)
             {
                 log->totalCu--;
                 log->cntTotalCu[depth]--;
@@ -1011,7 +1027,7 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
                 log->cntSkipCu[depth]++;
                 log->qTreeSkipCnt[depth]++;
             }
-            else if (ctu.m_predMode[absPartIdx] == MODE_INTER)
+            else if (ctu.isInter(absPartIdx))
             {
                 log->cntInter[depth]++;
                 log->qTreeInterCnt[depth]++;
@@ -1021,12 +1037,12 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
                 else
                     log->cuInterDistribution[depth][AMP_ID]++;
             }
-            else if (ctu.m_predMode[absPartIdx] == MODE_INTRA)
+            else if (ctu.isIntra(absPartIdx))
             {
                 log->cntIntra[depth]++;
                 log->qTreeIntraCnt[depth]++;
 
-                if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
+                if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
                 {
                     X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
                     log->cntIntraNxN++;
@@ -1061,7 +1077,8 @@ void FrameEncoder::noiseReductionUpdate()
             m_nr->count[cat] >>= 1;
         }
 
-        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
+        int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
+        uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
 
         for (int i = 0; i < coefCount; i++)
         {
@@ -1091,8 +1108,8 @@ int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp)
 
     /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
     double qp_offset = 0;
-    uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16;
-    uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16;
+    uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
+    uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
     uint32_t noOfBlocks = g_maxCUSize / 16;
     uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks;
     uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth;
diff --git a/source/encoder/frameencoder.h b/source/encoder/frameencoder.h
index 625c025..ec5cf5b 100644
--- a/source/encoder/frameencoder.h
+++ b/source/encoder/frameencoder.h
@@ -185,7 +185,7 @@ public:
 
 protected:
 
-    bool initializeGeoms(const FrameData& encData);
+    bool initializeGeoms();
 
     /* analyze / compress frame, can be run in parallel within reference constraints */
     void compressFrame();
@@ -204,7 +204,6 @@ protected:
     /* Called by WaveFront::findJob() */
     void processRow(int row, int threadId);
     void processRowEncoder(int row, ThreadLocalData& tld);
-    void processRowFilter(int row) { m_frameFilter.processRow(row); }
 
     void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
     void enqueueRowFilter(int row)  { WaveFront::enqueueRow(row * 2 + 1); }
diff --git a/source/encoder/framefilter.cpp b/source/encoder/framefilter.cpp
index aee75c6..f6c233d 100644
--- a/source/encoder/framefilter.cpp
+++ b/source/encoder/framefilter.cpp
@@ -29,7 +29,6 @@
 #include "framefilter.h"
 #include "frameencoder.h"
 #include "wavefront.h"
-#include "PPA/ppa.h"
 
 using namespace x265;
 
@@ -84,7 +83,7 @@ void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
 
 void FrameFilter::processRow(int row)
 {
-    PPAScopeEvent(Thread_filterCU);
+    ProfileScopeEvent(filterCTURow);
 
     if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
     {
@@ -100,19 +99,19 @@ void FrameFilter::processRow(int row)
         for (uint32_t col = 0; col < numCols; col++)
         {
             uint32_t cuAddr = lineStartCUAddr + col;
-            CUData* cu = encData.getPicCTU(cuAddr);
+            const CUData* ctu = encData.getPicCTU(cuAddr);
 
-            m_deblock.deblockCTU(cu, Deblock::EDGE_VER);
+            m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);
 
             if (col > 0)
             {
-                CUData* cuPrev = encData.getPicCTU(cuAddr - 1);
-                m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
+                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
+                m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
             }
         }
 
-        CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
-        m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
+        const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
+        m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
     }
 
     // SAO
@@ -156,7 +155,7 @@ uint32_t FrameFilter::getCUHeight(int rowNum) const
 
 void FrameFilter::processRowPost(int row)
 {
-    PicYuv *reconPic = m_frame->m_reconPicYuv;
+    PicYuv *reconPic = m_frame->m_reconPic;
     const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
     const uint32_t lineStartCUAddr = row * numCols;
     const int realH = getCUHeight(row);
@@ -209,19 +208,19 @@ void FrameFilter::processRowPost(int row)
     uint32_t cuAddr = lineStartCUAddr;
     if (m_param->bEnablePsnr)
     {
-        PicYuv* origPic = m_frame->m_origPicYuv;
+        PicYuv* fencPic = m_frame->m_fencPic;
 
         intptr_t stride = reconPic->m_stride;
         uint32_t width  = reconPic->m_picWidth - m_pad[0];
         uint32_t height = getCUHeight(row);
 
-        uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
+        uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
         height >>= m_vChromaShift;
         width  >>= m_hChromaShift;
         stride = reconPic->m_strideC;
 
-        uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
-        uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
+        uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
+        uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
 
         m_frameEncoder->m_SSDY += ssdY;
         m_frameEncoder->m_SSDU += ssdU;
@@ -229,10 +228,10 @@ void FrameFilter::processRowPost(int row)
     }
     if (m_param->bEnableSsim && m_ssimBuf)
     {
-        pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0];
-        pixel *org = m_frame->m_origPicYuv->m_picOrg[0];
-        intptr_t stride1 = m_frame->m_origPicYuv->m_stride;
-        intptr_t stride2 = m_frame->m_reconPicYuv->m_stride;
+        pixel *rec = m_frame->m_reconPic->m_picOrg[0];
+        pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
+        intptr_t stride1 = m_frame->m_fencPic->m_stride;
+        intptr_t stride2 = m_frame->m_reconPic->m_stride;
         uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
         uint32_t bStart = (row == 0);
         uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
@@ -243,7 +242,7 @@ void FrameFilter::processRowPost(int row)
         /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
         * to avoid alignment of ssim blocks with DCT blocks. */
         minPixY += bStart ? 2 : -6;
-        m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2,
+        m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
                                                 m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
         m_frameEncoder->m_ssimCnt += ssim_cnt;
     }
@@ -422,8 +421,8 @@ static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absP
     uint32_t size = g_maxCUSize >> depth;
     int part = partitionFromSizes(size, size);
 
-    PicYuv* reconPic = frame.m_reconPicYuv;
-    PicYuv* fencPic  = frame.m_origPicYuv;
+    PicYuv* reconPic = frame.m_reconPic;
+    PicYuv* fencPic  = frame.m_fencPic;
 
     pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
     pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp
index f6129ff..61376ac 100644
--- a/source/encoder/motion.cpp
+++ b/source/encoder/motion.cpp
@@ -34,6 +34,7 @@
 using namespace x265;
 
 namespace {
+
 struct SubpelWorkload
 {
     int hpel_iters;
@@ -43,7 +44,7 @@ struct SubpelWorkload
     bool hpel_satd;
 };
 
-SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
+const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
 {
     { 1, 4, 0, 4, false }, // 4 SAD HPEL only
     { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
@@ -54,15 +55,14 @@ SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
     { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
     { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
 };
-}
 
-static int size_scale[NUM_LUMA_PARTITIONS];
-#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum])))
+int sizeScale[NUM_LUMA_PARTITIONS];
+#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
 
-static void init_scales(void)
+void initScales(void)
 {
 #define SETUP_SCALE(W, H) \
-    size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
+    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
     SETUP_SCALE(4, 4);
     SETUP_SCALE(8, 8);
     SETUP_SCALE(8, 4);
@@ -91,51 +91,18 @@ static void init_scales(void)
 #undef SETUP_SCALE
 }
 
-MotionEstimate::MotionEstimate()
-    : searchMethod(3)
-    , subpelRefine(5)
-{
-    if (size_scale[0] == 0)
-        init_scales();
-
-    fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-}
-
-MotionEstimate::~MotionEstimate()
-{
-    X265_FREE(fenc);
-}
-
-void MotionEstimate::setSourcePU(intptr_t offset, int width, int height)
-{
-    partEnum = partitionFromSizes(width, height);
-    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
-    sad = primitives.sad[partEnum];
-    satd = primitives.satd[partEnum];
-    sa8d = primitives.sa8d_inter[partEnum];
-    sad_x3 = primitives.sad_x3[partEnum];
-    sad_x4 = primitives.sad_x4[partEnum];
-
-    blockwidth = width;
-    blockheight = height;
-    blockOffset = offset;
-
-    /* copy PU block into cache */
-    primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
-}
-
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
-static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
-static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
-static const MV hex4[16] =
+const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
+const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
+const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
+const MV hex4[16] =
 {
-    MV(0, -4),  MV(0, 4),  MV(-2, -3), MV(2, -3),
+    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
     MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
-    MV(-4, 0),  MV(4, 0),  MV(-4, 1),  MV(4, 1),
+    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
     MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
 };
-static const MV offsets[] =
+const MV offsets[] =
 {
     MV(-1, 0), MV(0, -1),
     MV(-1, -1), MV(1, -1),
@@ -147,8 +114,8 @@ static const MV offsets[] =
     MV(1, 0), MV(0, 1),
 }; // offsets for Two Point Search
 
-/* sum of absolute differences between MV candidates */
-static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates)
+/* sum of absolute differences between MV candidates, used for adaptive ME range */
+inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
 {
     int sum = 0;
 
@@ -161,6 +128,77 @@ static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidate
     return sum;
 }
 
+}
+
+MotionEstimate::MotionEstimate()
+{
+    ctuAddr = -1;
+    absPartIdx = -1;
+    searchMethod = X265_HEX_SEARCH;
+    subpelRefine = 2;
+    bChromaSATD = false;
+    chromaSatd = NULL;
+}
+
+void MotionEstimate::init(int method, int refine, int csp)
+{
+    if (!sizeScale[0])
+        initScales();
+
+    searchMethod = method;
+    subpelRefine = refine;
+    fencPUYuv.create(FENC_STRIDE, csp);
+}
+
+MotionEstimate::~MotionEstimate()
+{
+    fencPUYuv.destroy();
+}
+
+/* Called by lookahead, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
+{
+    partEnum = partitionFromSizes(pwidth, pheight);
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+    sad = primitives.sad[partEnum];
+    satd = primitives.satd[partEnum];
+    sad_x3 = primitives.sad_x3[partEnum];
+    sad_x4 = primitives.sad_x4[partEnum];
+
+    blockwidth = pwidth;
+    blockOffset = offset;
+    absPartIdx = ctuAddr = -1;
+
+    /* copy PU block into cache */
+    primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
+}
+
+/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
+{
+    partEnum = partitionFromSizes(pwidth, pheight);
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+    sad = primitives.sad[partEnum];
+    satd = primitives.satd[partEnum];
+    sad_x3 = primitives.sad_x3[partEnum];
+    sad_x4 = primitives.sad_x4[partEnum];
+    chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum];
+
+    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
+     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
+    bChromaSATD = subpelRefine > 2 && chromaSatd;
+    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
+
+    ctuAddr = _ctuAddr;
+    absPartIdx = cuPartIdx + puPartIdx;
+    blockwidth = pwidth;
+    blockOffset = 0;
+
+    /* copy PU from CU Yuv */
+    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
+}
+
 #define COST_MV_PT_DIST(mx, my, point, dist) \
     do \
     { \
@@ -291,8 +329,9 @@ void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
                                        int              merange)
 {
     ALIGN_VAR_16(int, costs[16]);
-    pixel *fref = ref->fpelPlane + blockOffset;
-    size_t stride = ref->lumaStride;
+    pixel* fenc = fencPUYuv.m_buf[0];
+    pixel* fref = ref->fpelPlane[0] + blockOffset;
+    intptr_t stride = ref->lumaStride;
 
     MV omv = bmv;
     int saved = bcost;
@@ -532,8 +571,11 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
                                    MV &             outQMv)
 {
     ALIGN_VAR_16(int, costs[16]);
-    size_t stride = ref->lumaStride;
-    pixel *fref = ref->fpelPlane + blockOffset;
+    if (ctuAddr >= 0)
+        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
+    intptr_t stride = ref->lumaStride;
+    pixel* fenc = fencPUYuv.m_buf[0];
+    pixel* fref = ref->fpelPlane[0] + blockOffset;
 
     setMVP(qmvp);
 
@@ -561,9 +603,7 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
     MV bmv = pmv.roundToFPel();
     int bcost = bprecost;
     if (pmv.isSubpel())
-    {
         bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
-    }
 
     // measure SAD cost at MV(0) if MVP is not zero
     if (pmv.notZero())
@@ -577,21 +617,35 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
     }
 
     // measure SAD cost at each QPEL motion vector candidate
-    for (int i = 0; i < numCandidates; i++)
+    if (ref->isLowres)
     {
-        MV m = mvc[i].clipped(qmvmin, qmvmax);
-        if (m.notZero() && m != pmv && m != bestpre) // check already measured
+        for (int i = 0; i < numCandidates; i++)
         {
-            int cost;
-            if (ref->isLowres)
-                cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
-            else
-                cost = subpelCompare(ref, m, sad) + mvcost(m);
-
-            if (cost < bprecost)
+            MV m = mvc[i].clipped(qmvmin, qmvmax);
+            if (m.notZero() && m != pmv && m != bestpre) // check already measured
             {
-                bprecost = cost;
-                bestpre = m;
+                int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
+                if (cost < bprecost)
+                {
+                    bprecost = cost;
+                    bestpre = m;
+                }
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < numCandidates; i++)
+        {
+            MV m = mvc[i].clipped(qmvmin, qmvmax);
+            if (m.notZero() && m != pmv && m != bestpre) // check already measured
+            {
+                int cost = subpelCompare(ref, m, sad) + mvcost(m);
+                if (cost < bprecost)
+                {
+                    bprecost = cost;
+                    bestpre = m;
+                }
             }
         }
     }
@@ -780,7 +834,7 @@ me_hex2:
                     mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
                     denom++;
                 }
-                mvd += x265_predictor_difference(mvc, numCandidates);
+                mvd += predictorDifference(mvc, numCandidates);
             }
 
             sad_ctx = SAD_THRESH(1000) ? 0
@@ -1043,7 +1097,7 @@ me_hex2:
     else
         bmv = bmv.toQPel(); // promote search bmv to qpel
 
-    SubpelWorkload& wl = workload[this->subpelRefine];
+    const SubpelWorkload& wl = workload[this->subpelRefine];
 
     if (!bcost)
     {
@@ -1053,11 +1107,11 @@ me_hex2:
     }
     else if (ref->isLowres)
     {
-        int bdir = 0, cost;
+        int bdir = 0;
         for (int i = 1; i <= wl.hpel_dirs; i++)
         {
             MV qmv = bmv + square1[i] * 2;
-            cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
             COPY2_IF_LT(bcost, cost, bdir, i);
         }
 
@@ -1068,7 +1122,7 @@ me_hex2:
         for (int i = 1; i <= wl.qpel_dirs; i++)
         {
             MV qmv = bmv + square1[i];
-            cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
             COPY2_IF_LT(bcost, cost, bdir, i);
         }
 
@@ -1088,11 +1142,11 @@ me_hex2:
 
         for (int iter = 0; iter < wl.hpel_iters; iter++)
         {
-            int bdir = 0, cost;
+            int bdir = 0;
             for (int i = 1; i <= wl.hpel_dirs; i++)
             {
                 MV qmv = bmv + square1[i] * 2;
-                cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
+                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
                 COPY2_IF_LT(bcost, cost, bdir, i);
             }
 
@@ -1108,11 +1162,11 @@ me_hex2:
 
         for (int iter = 0; iter < wl.qpel_iters; iter++)
         {
-            int bdir = 0, cost;
+            int bdir = 0;
             for (int i = 1; i <= wl.qpel_dirs; i++)
             {
                 MV qmv = bmv + square1[i];
-                cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
+                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
                 COPY2_IF_LT(bcost, cost, bdir, i);
             }
 
@@ -1130,40 +1184,100 @@ me_hex2:
 
 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
 {
+    intptr_t refStride = ref->lumaStride;
+    pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
     int xFrac = qmv.x & 0x3;
     int yFrac = qmv.y & 0x3;
+    int cost;
+    intptr_t lclStride = fencPUYuv.m_size;
+    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
 
-    if ((yFrac | xFrac) == 0)
-    {
-        pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
-        return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride);
-    }
+    if (!(yFrac | xFrac))
+        cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
     else
     {
-        /* We are taking a short-cut here if the reference is weighted. To be
+        /* we are taking a short-cut here if the reference is weighted. To be
          * accurate we should be interpolating unweighted pixels and weighting
-         * the final 16bit values prior to rounding and downshifting. Instead we
+         * the final 16bit values prior to rounding and down shifting. Instead we
          * are simply interpolating the weighted full-pel pixels. Not 100%
          * accurate but good enough for fast qpel ME */
         ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
-        pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
-        if (yFrac == 0)
+        if (!yFrac)
+            primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac);
+        else if (!xFrac)
+            primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac);
+        else
         {
-            primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
+            ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]);
+
+            int filterSize = NTAPS_LUMA;
+            int halfFilterSize = filterSize >> 1;
+            primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1);
+            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac);
         }
-        else if (xFrac == 0)
+        cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
+    }
+
+    if (bChromaSATD)
+    {
+        int csp    = fencPUYuv.m_csp;
+        int hshift = fencPUYuv.m_hChromaShift;
+        int vshift = fencPUYuv.m_vChromaShift;
+        int shiftHor = (2 + hshift);
+        int shiftVer = (2 + vshift);
+        lclStride = fencPUYuv.m_csize;
+
+        intptr_t refStrideC = ref->reconPic->m_strideC;
+        intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
+
+        const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
+        const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
+
+        xFrac = qmv.x & ((1 << shiftHor) - 1);
+        yFrac = qmv.y & ((1 << shiftVer) - 1);
+
+        if (!(yFrac | xFrac))
         {
-            primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
+            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
+            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
         }
         else
         {
-            ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
+            ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+            if (!yFrac)
+            {
+                primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
 
-            int filterSize = NTAPS_LUMA;
-            int halfFilterSize = filterSize >> 1;
-            primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1);
-            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
+                primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+            }
+            else if (!xFrac)
+            {
+                primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+                primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+            }
+            else
+            {
+                ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
+
+                int extStride = blockwidth >> hshift;
+                int filterSize = NTAPS_CHROMA;
+                int halfFilterSize = (filterSize >> 1);
+
+                primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+                primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+                primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+                primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+            }
         }
-        return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
     }
+
+    return cost;
 }
diff --git a/source/encoder/motion.h b/source/encoder/motion.h
index 51687f5..a5fddd7 100644
--- a/source/encoder/motion.h
+++ b/source/encoder/motion.h
@@ -28,6 +28,7 @@
 #include "reference.h"
 #include "mv.h"
 #include "bitcost.h"
+#include "yuv.h"
 
 namespace x265 {
 // private x265 namespace
@@ -36,63 +37,59 @@ class MotionEstimate : public BitCost
 {
 protected:
 
-    /* Aligned copy of original pixels, extra room for manual alignment */
-    pixel *fencplane;
-    intptr_t fencLumaStride;
-
-    pixelcmp_t sad;
-    pixelcmp_t satd;
-    pixelcmp_t sa8d;
-    pixelcmp_x3_t sad_x3;
-    pixelcmp_x4_t sad_x4;
-
     intptr_t blockOffset;
-    int partEnum;
+    
+    int ctuAddr;
+    int absPartIdx;  // part index of PU, including CU offset within CTU
+
     int searchMethod;
     int subpelRefine;
 
-    /* subpel generation buffers */
     int blockwidth;
     int blockheight;
 
+    pixelcmp_t sad;
+    pixelcmp_x3_t sad_x3;
+    pixelcmp_x4_t sad_x4;
+    pixelcmp_t satd;
+    pixelcmp_t chromaSatd;
+
     MotionEstimate& operator =(const MotionEstimate&);
 
 public:
 
     static const int COST_MAX = 1 << 28;
 
-    pixel *fenc;
+    Yuv fencPUYuv;
+    int partEnum;
+    bool bChromaSATD;
 
     MotionEstimate();
-
     ~MotionEstimate();
 
-    void setSearchMethod(int i) { searchMethod = i; }
-
-    void setSubpelRefine(int i) { subpelRefine = i; }
+    void init(int method, int refine, int csp);
 
     /* Methods called at slice setup */
 
-    void setSourcePlane(pixel *Y, intptr_t luma)
-    {
-        fencplane = Y;
-        fencLumaStride = luma;
-    }
-
-    void setSourcePU(intptr_t offset, int pwidth, int pheight);
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
+    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
 
     /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
      * require setSourcePU() to be called prior. */
 
-    inline int bufSAD(pixel *fref, intptr_t stride)  { return sad(fenc, FENC_STRIDE, fref, stride); }
+    inline int bufSAD(const pixel* fref, intptr_t stride)  { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
 
-    inline int bufSA8D(pixel *fref, intptr_t stride) { return sa8d(fenc, FENC_STRIDE, fref, stride); }
+    inline int bufSATD(const pixel* fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
 
-    inline int bufSATD(pixel *fref, intptr_t stride) { return satd(fenc, FENC_STRIDE, fref, stride); }
+    inline int bufChromaSATD(const Yuv& refYuv, int puPartIdx)
+    {
+        return chromaSatd(refYuv.getCbAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[1], fencPUYuv.m_csize) +
+               chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize);
+    }
 
-    int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
+    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
 
-    int subpelCompare(ReferencePlanes * ref, const MV &qmv, pixelcmp_t);
+    int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
 
 protected:
 
diff --git a/source/encoder/nal.cpp b/source/encoder/nal.cpp
index c38c651..b0bc4c7 100644
--- a/source/encoder/nal.cpp
+++ b/source/encoder/nal.cpp
@@ -193,12 +193,10 @@ uint32_t NALList::serializeSubstreams(uint32_t* streamSizeBytes, uint32_t stream
         {
             for (uint32_t i = 0; i < inSize; i++)
             {
-                if (bytes > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03)
+                if (bytes >= 2 && !out[bytes - 2] && !out[bytes - 1] && inBytes[i] <= 0x03)
                 {
                     /* inject 0x03 to prevent emulating a start code */
-                    out[bytes] = out[bytes - 1];
-                    out[bytes - 1] = 0x03;
-                    bytes++;
+                    out[bytes++] = 3;
                 }
 
                 out[bytes++] = inBytes[i];
diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
index f54b101..4f31fe1 100644
--- a/source/encoder/ratecontrol.cpp
+++ b/source/encoder/ratecontrol.cpp
@@ -40,8 +40,7 @@
 using namespace x265;
 
 /* Amortize the partial cost of I frames over the next N frames */
-const double RateControl::s_amortizeFraction = 0.85;
-const int RateControl::s_amortizeFrames = 75;
+
 const int RateControl::s_slidingWindowFrames = 20;
 const char *RateControl::s_defaultStatFileName = "x265_2pass.log";
 
@@ -173,8 +172,8 @@ static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcSt
 /* Find the total AC energy of each block in all planes */
 uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y)
 {
-    intptr_t stride = curFrame->m_origPicYuv->m_stride;
-    intptr_t cStride = curFrame->m_origPicYuv->m_strideC;
+    intptr_t stride = curFrame->m_fencPic->m_stride;
+    intptr_t cStride = curFrame->m_fencPic->m_strideC;
     intptr_t blockOffsetLuma = block_x + (block_y * stride);
     int colorFormat = m_param->internalCsp;
     int hShift = CHROMA_H_SHIFT(colorFormat);
@@ -183,9 +182,9 @@ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t blo
 
     uint32_t var;
 
-    var  = acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
-    var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
-    var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
+    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
     x265_emms();
     return var;
 }
@@ -193,8 +192,8 @@ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t blo
 void RateControl::calcAdaptiveQuantFrame(Frame *curFrame)
 {
     /* Actual adaptive quantization */
-    int maxCol = curFrame->m_origPicYuv->m_picWidth;
-    int maxRow = curFrame->m_origPicYuv->m_picHeight;
+    int maxCol = curFrame->m_fencPic->m_picWidth;
+    int maxRow = curFrame->m_fencPic->m_picHeight;
 
     for (int y = 0; y < 3; y++)
     {
@@ -323,6 +322,12 @@ RateControl::RateControl(x265_param *p)
     m_bTerminated = false;
     m_finalFrameCount = 0;
     m_numEntries = 0;
+    m_amortizeFraction = 0.85;
+    m_amortizeFrames = 75;
+    if (m_param->totalFrames <= 2 * m_fps)
+    {
+        m_amortizeFraction = m_amortizeFrames = 0;
+    }
     if (m_param->rc.rateControlMode == X265_RC_CRF)
     {
         m_param->rc.qp = (int)m_param->rc.rfConstant;
@@ -494,12 +499,12 @@ bool RateControl::init(const SPS *sps)
     /* Frame Predictors and Row predictors used in vbv */
     for (int i = 0; i < 5; i++)
     {
-        m_pred[i].coeff = 2.0;
+        m_pred[i].coeff = 1.5;
         m_pred[i].count = 1.0;
         m_pred[i].decay = 0.5;
         m_pred[i].offset = 0.0;
     }
-    m_predBfromP = m_pred[0];
+    m_pred[0].coeff = 1.0;
     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
     {
         /* If the user hasn't defined the stat filename, use the default value */
@@ -1157,7 +1162,7 @@ int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encode
     rce->leadingNoBSatd = m_leadingNoBSatd;
     if (curFrame->m_forceqp)
     {
-        m_qp = int32_t(curFrame->m_forceqp + 0.5) - 1;
+        m_qp = (int32_t)(curFrame->m_forceqp + 0.5) - 1;
         m_qp = Clip3(QP_MIN, QP_MAX_MAX, m_qp);
         rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp;
     }
@@ -1343,6 +1348,25 @@ fail:
     return false;
 }
 
+double RateControl::tuneAbrQScaleFromFeedback(double qScale)
+{
+    double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate;
+    if (m_currentSatd)
+    {
+        /* use framesDone instead of POC as poc count is not serial with bframes enabled */
+        double overflow = 1.0;
+        double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
+        double wantedBits = timeDone * m_bitrate;
+        if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
+        {
+            abrBuffer *= X265_MAX(1, sqrt(timeDone));
+            overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
+            qScale *= overflow;
+        }
+    }
+    return qScale;
+}
+
 double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
 {
     double q;
@@ -1415,17 +1439,25 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
             q += m_pbOffset / 2;
         else
             q += m_pbOffset;
-        rce->qpNoVbv = q;
-        double qScale = x265_qp2qScale(q);
 
-        if (!m_2pass && m_isVbv)
+        double qScale = x265_qp2qScale(q);
+        if (m_isCbr)
         {
-            if (m_leadingBframes > 5)
+            qScale = tuneAbrQScaleFromFeedback(qScale);
+            if (!m_isAbrReset)
             {
-                qScale = clipQscale(curFrame, rce, qScale);
-                m_lastQScaleFor[m_sliceType] = qScale;
+                double lmin = m_lastQScaleFor[P_SLICE] / m_lstep;
+                double lmax = m_lastQScaleFor[P_SLICE] * m_lstep;
+                qScale = Clip3(lmin, lmax, qScale);
             }
-            rce->frameSizePlanned = predictSize(&m_predBfromP, qScale, (double)m_leadingNoBSatd);
+            q = x265_qScale2qp(qScale);
+        }
+        rce->qpNoVbv = q;
+        if (!m_2pass && m_isVbv)
+        {
+            qScale = clipQscale(curFrame, rce, qScale);
+            m_lastQScaleFor[m_sliceType] = qScale;
+            rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], qScale, (double)m_currentSatd);
         }
         else if (m_2pass && m_isVbv)
         {
@@ -1508,7 +1540,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
              * tradeoff between quality and bitrate precision. But at large
              * tolerances, the bit distribution approaches that of 2pass. */
 
-            double wantedBits, overflow = 1;
+            double overflow = 1;
 
             m_shortTermCplxSum *= 0.5;
             m_shortTermCplxCount *= 0.5;
@@ -1528,25 +1560,10 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
             {
                 if (!m_param->rc.bStatRead)
                     checkAndResetABR(rce, false);
-                q = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
-
-                /* ABR code can potentially be counterproductive in CBR, so just
-                 * don't bother.  Don't run it if the frame complexity is zero
-                 * either. */
-                if (!m_isCbr && m_currentSatd)
-                {
-                    /* use framesDone instead of POC as poc count is not serial with bframes enabled */
-                    double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
-                    wantedBits = timeDone * m_bitrate;
-                    if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
-                    {
-                        abrBuffer *= X265_MAX(1, sqrt(timeDone));
-                        overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
-                        q *= overflow;
-                    }
-                }
+                double initialQScale = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
+                q = tuneAbrQScaleFromFeedback(initialQScale);
+                overflow = q / initialQScale;
             }
-
             if (m_sliceType == I_SLICE && m_param->keyframeMax > 1
                 && m_lastNonBPictType != I_SLICE && !m_isAbrReset)
             {
@@ -1574,7 +1591,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
             {
                 q = x265_qp2qScale(CRF_INIT_QP) / fabs(m_param->rc.ipFactor);
             }
-            else if (m_framesDone == 0 && !m_isVbv)
+            else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR)
             {
                 /* for ABR alone, clip the first I frame qp */
                 double lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep;
@@ -1615,8 +1632,8 @@ void RateControl::rateControlUpdateStats(RateControlEntry* rce)
             if (m_partialResidualFrames)
                 rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames;
 
-            m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
-            m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames);
+            m_partialResidualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax);
+            m_partialResidualCost = (int)((rce->rowTotalBits * m_amortizeFraction) /m_partialResidualFrames);
             rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames;
         }
         else if (m_partialResidualFrames)
@@ -1725,13 +1742,11 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
             {
                 double frameQ[3];
                 double curBits;
-                if (m_sliceType == B_SLICE)
-                    curBits = predictSize(&m_predBfromP, q, (double)m_currentSatd);
-                else
-                    curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
+                curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
                 double bufferFillCur = m_bufferFill - curBits;
                 double targetFill;
-                double totalDuration = 0;
+                double totalDuration = m_frameDuration;
+                bool isIFramePresent = m_sliceType == I_SLICE ? true : false;
                 frameQ[P_SLICE] = m_sliceType == I_SLICE ? q * m_param->rc.ipFactor : (m_sliceType == B_SLICE ? q / m_param->rc.pbFactor : q);
                 frameQ[B_SLICE] = frameQ[P_SLICE] * m_param->rc.pbFactor;
                 frameQ[I_SLICE] = frameQ[P_SLICE] / m_param->rc.ipFactor;
@@ -1747,20 +1762,23 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
                         bufferFillCur += wantedFrameSize;
                     int64_t satd = curFrame->m_lowres.plannedSatd[j] >> (X265_DEPTH - 8);
                     type = IS_X265_TYPE_I(type) ? I_SLICE : IS_X265_TYPE_B(type) ? B_SLICE : P_SLICE;
+                    if (type == I_SLICE) 
+                        isIFramePresent = true;
                     curBits = predictSize(&m_pred[type], frameQ[type], (double)satd);
                     bufferFillCur -= curBits;
                 }
 
-                /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
-                targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * 0.5);
+                /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
+                double tol = isIFramePresent ? 1 / totalDuration : totalDuration < 0.5 ? 2 : 1;
+                targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5 , m_bufferSize * (1 - 0.8 * totalDuration * tol));
                 if (bufferFillCur < targetFill)
                 {
                     q *= 1.01;
                     loopTerminate |= 1;
                     continue;
                 }
-                /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
-                targetFill = Clip3(m_bufferSize * 0.8, m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
+                /* Try to get the buffer atleast 50% filled, but don't set an impossible goal. */
+                targetFill = Clip3(m_bufferSize - (m_bufferSize * totalDuration * 0.5), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
                 if (m_isCbr && bufferFillCur > targetFill)
                 {
                     q /= 1.01;
@@ -1810,25 +1828,6 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
         if (pbits > rce->frameSizeMaximum)
             q *= pbits / rce->frameSizeMaximum;
 
-        // Check B-frame complexity, and use up any bits that would
-        // overflow before the next P-frame.
-        if (m_leadingBframes <= 5 && m_sliceType == P_SLICE && !m_singleFrameVbv)
-        {
-            int nb = m_leadingBframes;
-            double bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
-            double bbits = predictSize(&m_predBfromP, q * m_param->rc.pbFactor, (double)m_currentSatd);
-            double space;
-            if (bbits > m_bufferRate)
-                nb = 0;
-            double pbbits = nb * bbits;
-
-            space = m_bufferFill + (1 + nb) * m_bufferRate - m_bufferSize;
-            if (pbbits < space)
-                q *= X265_MAX(pbbits / space, bits / (0.5 * m_bufferSize));
-
-            q = X265_MAX(q0 / 2, q);
-        }
-
         if (!m_isCbr || (m_isAbr && m_currentSatd >= rce->movingAvgSum && q <= q0 / 2))
             q = X265_MAX(q0, q);
 
@@ -1899,22 +1898,24 @@ double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, d
                     && refQScale > 0
                     && refRowSatdCost > 0)
                 {
-                    if (abs(int32_t(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2)
+                    if (abs((int32_t)(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2)
                     {
                         double predTotal = refRowBits * satdCostForPendingCus / refRowSatdCost * refQScale / qScale;
-                        totalSatdBits += int32_t((pred_s + predTotal) * 0.5);
+                        totalSatdBits += (int32_t)((pred_s + predTotal) * 0.5);
                         continue;
                     }
                 }
-                totalSatdBits += int32_t(pred_s);
+                totalSatdBits += (int32_t)pred_s;
             }
-            else
+            else if (picType == P_SLICE)
             {
                 /* Our QP is lower than the reference! */
                 double pred_intra = predictSize(rce->rowPred[1], qScale, intraCost);
                 /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
-                totalSatdBits += int32_t(pred_intra + pred_s);
+                totalSatdBits += (int32_t)(pred_intra + pred_s);
             }
+            else
+                totalSatdBits += (int32_t)pred_s;
         }
     }
 
@@ -1969,16 +1970,8 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo
 
     if (row < sps.numCuInHeight - 1)
     {
-        /* B-frames shouldn't use lower QP than their reference frames. */
-        if (rce->sliceType == B_SLICE)
-        {
-            Frame* refSlice1 = curEncData.m_slice->m_refPicList[0][0];
-            Frame* refSlice2 = curEncData.m_slice->m_refPicList[1][0];
-            qpMin = X265_MAX(qpMin, X265_MAX(refSlice1->m_encData->m_rowStat[row].diagQp, refSlice2->m_encData->m_rowStat[row].diagQp));
-            qpVbv = X265_MAX(qpVbv, qpMin);
-        }
         /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
-        double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_param->rc.rateTolerance;
+        double rcTol = (bufferLeftPlanned * 0.2) / m_param->frameNumThreads * m_param->rc.rateTolerance;
         int32_t encodedBitsSoFar = 0;
         double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
 
@@ -1996,7 +1989,7 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo
 
         while (qpVbv < qpMax
                && ((accFrameBits > rce->frameSizePlanned + rcTol) ||
-                   (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) ||
+                   (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.2) ||
                    (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv)))
         {
             qpVbv += stepSize;
@@ -2085,7 +2078,7 @@ void RateControl::updatePredictor(Predictor *p, double q, double var, double bit
 {
     if (var < 10)
         return;
-    const double range = 1.5;
+    const double range = 2;
     double old_coeff = p->coeff / p->count;
     double new_coeff = bits * q / var;
     double new_coeff_clipped = Clip3(old_coeff / range, old_coeff * range, new_coeff);
@@ -2220,8 +2213,8 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
                 /* previous I still had a residual; roll it into the new loan */
                 if (m_residualFrames)
                     bits += m_residualCost * m_residualFrames;
-                m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
-                m_residualCost = (int)((bits * s_amortizeFraction) / m_residualFrames);
+                m_residualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax);
+                m_residualCost = (int)((bits * m_amortizeFraction) / m_residualFrames);
                 bits -= m_residualCost * m_residualFrames;
             }
             else if (m_residualFrames)
@@ -2257,16 +2250,6 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
 
     if (m_isVbv)
     {
-        if (rce->sliceType == B_SLICE)
-        {
-            m_bframeBits += actualBits;
-            if (rce->bLastMiniGopBFrame)
-            {
-                if (rce->bframes != 0)
-                    updatePredictor(&m_predBfromP, x265_qp2qScale(rce->qpaRc), (double)rce->leadingNoBSatd, (double)m_bframeBits / rce->bframes);
-                m_bframeBits = 0;
-            }
-        }
         updateVbv(actualBits, rce);
 
         if (m_param->bEmitHRDSEI)
diff --git a/source/encoder/ratecontrol.h b/source/encoder/ratecontrol.h
index 5b86147..13e1701 100644
--- a/source/encoder/ratecontrol.h
+++ b/source/encoder/ratecontrol.h
@@ -233,11 +233,10 @@ public:
     void initHRD(SPS* sps);
     int rateControlSliceType(int frameNum);
     bool cuTreeReadFor2Pass(Frame* curFrame);
+    double tuneAbrQScaleFromFeedback(double qScale);
 
 protected:
 
-    static const double s_amortizeFraction;
-    static const int    s_amortizeFrames;
     static const int    s_slidingWindowFrames;
     static const char  *s_defaultStatFileName;
 
@@ -245,6 +244,8 @@ protected:
     int m_partialResidualFrames;
     int m_residualCost;
     int m_partialResidualCost;
+    int m_amortizeFrames;
+    double m_amortizeFraction;
 
     double getQScale(RateControlEntry *rce, double rateFactor);
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
diff --git a/source/encoder/rdcost.h b/source/encoder/rdcost.h
index 10bfff3..8bc7f18 100644
--- a/source/encoder/rdcost.h
+++ b/source/encoder/rdcost.h
@@ -37,30 +37,37 @@ public:
     /* all weights and factors stored as FIX8 */
     uint64_t  m_lambda2;
     uint64_t  m_lambda;
-    uint64_t  m_cbDistortionWeight;
-    uint64_t  m_crDistortionWeight;
+    uint32_t  m_chromaDistWeight[2];
+    uint32_t  m_psyRdBase;
     uint32_t  m_psyRd;
     int       m_qp;
 
-    void setPsyRdScale(double scale)                { m_psyRd = (uint32_t)floor(256.0 * scale * 0.33); }
-    void setCbDistortionWeight(uint16_t weightFix8) { m_cbDistortionWeight = weightFix8; }
-    void setCrDistortionWeight(uint16_t weightFix8) { m_crDistortionWeight = weightFix8; }
+    void setPsyRdScale(double scale)                { m_psyRdBase = (uint32_t)floor(256.0 * scale * 0.33); }
 
     void setQP(const Slice& slice, int qp)
     {
         m_qp = qp;
+        int qpCb, qpCr;
+        /* Scale PSY RD factor by a slice type factor */
+        static const uint32_t psyScaleFix8[3] = { 300, 256, 96 }; /* B, P, I */
+        m_psyRd = (m_psyRdBase * psyScaleFix8[slice.m_sliceType]) >> 8;
 
         setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
-
-        int qpCb = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCbQpOffset);
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+            qpCb = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[0]]);
+        else
+            qpCb = X265_MIN(qp + slice.m_pps->chromaQpOffset[0], QP_MAX_SPEC);
         int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
         uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-        setCbDistortionWeight(lambdaOffset);
+        m_chromaDistWeight[0] = lambdaOffset;
 
-        int qpCr = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCrQpOffset);
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+            qpCr = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[0]]);
+        else
+            qpCr = X265_MIN(qp + slice.m_pps->chromaQpOffset[0], QP_MAX_SPEC);
         chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
         lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-        setCrDistortionWeight(lambdaOffset);
+        m_chromaDistWeight[1] = lambdaOffset;
     }
 
     void setLambda(double lambda2, double lambda)
@@ -72,18 +79,18 @@ public:
     inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const
     {
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
-                   "calcRdCost wrap detected dist: %d, bits %d, lambda: %d\n", distortion, bits, (int)m_lambda2);
+                   "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
         return distortion + ((bits * m_lambda2 + 128) >> 8);
     }
 
     /* return the difference in energy between the source block and the recon block */
-    inline int psyCost(int size, pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) const
+    inline int psyCost(int size, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) const
     {
         return primitives.psy_cost_pp[size](source, sstride, recon, rstride);
     }
 
     /* return the difference in energy between the source block and the recon block */
-    inline int psyCost(int size, int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) const
+    inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
     {
         return primitives.psy_cost_ss[size](source, sstride, recon, rstride);
     }
@@ -97,22 +104,15 @@ public:
     inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
     {
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
-                   "calcRdSADCost wrap detected dist: %d, bits %d, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
+                   "calcRdSADCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
         return sadCost + ((bits * m_lambda + 128) >> 8);
     }
 
-    inline uint32_t scaleChromaDistCb(uint32_t dist) const
-    {
-        X265_CHECK(dist <= (UINT64_MAX - 128) / m_cbDistortionWeight,
-                   "scaleChromaDistCb wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_cbDistortionWeight);
-        return (uint32_t)(((dist * m_cbDistortionWeight) + 128) >> 8);
-    }
-
-    inline uint32_t scaleChromaDistCr(uint32_t dist) const
+    inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
     {
-        X265_CHECK(dist <= (UINT64_MAX - 128) / m_crDistortionWeight,
-                   "scaleChromaDistCr wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_crDistortionWeight);
-        return (uint32_t)(((dist * m_crDistortionWeight) + 128) >> 8);
+        X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
+                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
+        return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
     }
 
     inline uint32_t getCost(uint32_t bits) const
diff --git a/source/encoder/reference.cpp b/source/encoder/reference.cpp
index 958042b..425174c 100644
--- a/source/encoder/reference.cpp
+++ b/source/encoder/reference.cpp
@@ -33,86 +33,142 @@ using namespace x265;
 
 MotionReference::MotionReference()
 {
-    m_weightBuffer = NULL;
+    weightBuffer[0] = NULL;
+    weightBuffer[1] = NULL;
+    weightBuffer[2] = NULL;
 }
 
-int MotionReference::init(PicYuv* recPic, WeightParam *w)
+MotionReference::~MotionReference()
+{
+    X265_FREE(weightBuffer[0]);
+    X265_FREE(weightBuffer[1]);
+    X265_FREE(weightBuffer[2]);
+}
+
+int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p)
 {
-    m_reconPic = recPic;
+    reconPic = recPic;
+    numWeightedRows = 0;
     lumaStride = recPic->m_stride;
-    intptr_t startpad = recPic->m_lumaMarginY * lumaStride + recPic->m_lumaMarginX;
+    chromaStride = recPic->m_strideC;
+    numInterpPlanes = p.subpelRefine > 2 ? 3 : 1; /* is chroma satd possible? */
 
-    /* directly reference the pre-extended integer pel plane */
-    fpelPlane = recPic->m_picBuf[0] + startpad;
+    /* directly reference the extended integer pel planes */
+    fpelPlane[0] = recPic->m_picOrg[0];
+    fpelPlane[1] = recPic->m_picOrg[1];
+    fpelPlane[2] = recPic->m_picOrg[2];
     isWeighted = false;
 
-    if (w)
+    if (wp)
     {
-        if (!m_weightBuffer)
+        uint32_t numCUinHeight = (reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
+
+        int marginX = reconPic->m_lumaMarginX;
+        int marginY = reconPic->m_lumaMarginY;
+        intptr_t stride = reconPic->m_stride;
+        int cuHeight = g_maxCUSize;
+
+        for (int c = 0; c < numInterpPlanes; c++)
         {
-            uint32_t numCUinHeight = (recPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
-            size_t padheight = (numCUinHeight * g_maxCUSize) + recPic->m_lumaMarginY * 2;
-            m_weightBuffer = X265_MALLOC(pixel, lumaStride * padheight);
-            if (!m_weightBuffer)
-                return -1;
+            if (c == 1)
+            {
+                marginX = reconPic->m_chromaMarginX;
+                marginY = reconPic->m_chromaMarginY;
+                stride  = reconPic->m_strideC;
+                cuHeight >>= reconPic->m_vChromaShift;
+            }
+
+            if (wp[c].bPresentFlag)
+            {
+                if (!weightBuffer[c])
+                {
+                    size_t padheight = (numCUinHeight * cuHeight) + marginY * 2;
+                    weightBuffer[c] = X265_MALLOC(pixel, stride * padheight);
+                    if (!weightBuffer[c])
+                        return -1;
+                }
+
+                /* use our buffer which will have weighted pixels written to it */
+                fpelPlane[c] = weightBuffer[c] + marginY * stride + marginX;
+                X265_CHECK(recPic->m_picOrg[c] - recPic->m_picBuf[c] == marginY * stride + marginX, "PicYuv pad calculation mismatch\n");
+
+                w[c].weight = wp[c].inputWeight;
+                w[c].offset = wp[c].inputOffset * (1 << (X265_DEPTH - 8));
+                w[c].shift = wp[c].log2WeightDenom;
+                w[c].round = w[c].shift ? 1 << (w[c].shift - 1) : 0;
+            }
         }
 
         isWeighted = true;
-        weight = w->inputWeight;
-        offset = w->inputOffset * (1 << (X265_DEPTH - 8));
-        shift  = w->log2WeightDenom;
-        round  = shift ? 1 << (shift - 1) : 0;
-        m_numWeightedRows = 0;
-
-        /* use our buffer which will have weighted pixels written to it */
-        fpelPlane = m_weightBuffer + startpad;
     }
 
     return 0;
 }
 
-MotionReference::~MotionReference()
-{
-    X265_FREE(m_weightBuffer);
-}
-
-void MotionReference::applyWeight(int rows, int numRows)
+void MotionReference::applyWeight(int finishedRows, int maxNumRows)
 {
-    rows = X265_MIN(rows, numRows);
-    if (m_numWeightedRows >= rows)
+    finishedRows = X265_MIN(finishedRows, maxNumRows);
+    if (numWeightedRows >= finishedRows)
         return;
-    int marginX = m_reconPic->m_lumaMarginX;
-    int marginY = m_reconPic->m_lumaMarginY;
-    pixel* src = (pixel*)m_reconPic->m_picOrg[0] + (m_numWeightedRows * (int)g_maxCUSize * lumaStride);
-    pixel* dst = fpelPlane + ((m_numWeightedRows * (int)g_maxCUSize) * lumaStride);
-    int width = m_reconPic->m_picWidth;
-    int height = ((rows - m_numWeightedRows) * g_maxCUSize);
-    if (rows == numRows)
-        height = ((m_reconPic->m_picHeight % g_maxCUSize) ? (m_reconPic->m_picHeight % g_maxCUSize) : g_maxCUSize);
-
-    // Computing weighted CU rows
-    int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-    int padwidth = (width + 15) & ~15;  // weightp assembly needs even 16 byte widths
-    primitives.weight_pp(src, dst, lumaStride, padwidth, height,
-                         weight, round << correction, shift + correction, offset);
-
-    // Extending Left & Right
-    primitives.extendRowBorder(dst, lumaStride, width, height, marginX);
-
-    // Extending Above
-    if (m_numWeightedRows == 0)
+
+    int marginX = reconPic->m_lumaMarginX;
+    int marginY = reconPic->m_lumaMarginY;
+    intptr_t stride = reconPic->m_stride;
+    int width   = reconPic->m_picWidth;
+    int height  = (finishedRows - numWeightedRows) * g_maxCUSize;
+    if (finishedRows == maxNumRows && (reconPic->m_picHeight % g_maxCUSize))
     {
-        pixel *pixY = fpelPlane - marginX;
-        for (int y = 0; y < marginY; y++)
-            memcpy(pixY - (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel));
+        /* the last row may be partial height */
+        height -= g_maxCUSize;
+        height += reconPic->m_picHeight % g_maxCUSize;
     }
+    int cuHeight = g_maxCUSize;
 
-    // Extending Bottom
-    if (rows == numRows)
+    for (int c = 0; c < numInterpPlanes; c++)
     {
-        pixel *pixY = fpelPlane - marginX + (m_reconPic->m_picHeight - 1) * lumaStride;
-        for (int y = 0; y < marginY; y++)
-            memcpy(pixY + (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel));
+        if (c == 1)
+        {
+            marginX = reconPic->m_chromaMarginX;
+            marginY = reconPic->m_chromaMarginY;
+            stride  = reconPic->m_strideC;
+            width    >>= reconPic->m_hChromaShift;
+            height   >>= reconPic->m_vChromaShift;
+            cuHeight >>= reconPic->m_vChromaShift;
+        }
+
+        /* Do not generate weighted predictions if using original picture */
+        if (fpelPlane[c] == reconPic->m_picOrg[c])
+            continue;
+
+        const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
+        pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
+
+        // Computing weighted CU rows
+        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+        int padwidth = (width + 15) & ~15;              // weightp assembly needs even 16 byte widths
+        primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
+
+        // Extending Left & Right
+        primitives.extendRowBorder(dst, stride, width, height, marginX);
+
+        // Extending Above
+        if (numWeightedRows == 0)
+        {
+            pixel *pixY = fpelPlane[c] - marginX;
+            for (int y = 0; y < marginY; y++)
+                memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
+        }
+
+        // Extending Bottom
+        if (finishedRows == maxNumRows)
+        {
+            int picHeight = reconPic->m_picHeight;
+            if (c) picHeight >>= reconPic->m_vChromaShift;
+            pixel *pixY = fpelPlane[c] - marginX + (picHeight - 1) * stride;
+            for (int y = 0; y < marginY; y++)
+                memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
+        }
     }
-    m_numWeightedRows = rows;
+
+    numWeightedRows = finishedRows;
 }
diff --git a/source/encoder/reference.h b/source/encoder/reference.h
index 3fb9afd..6b33499 100644
--- a/source/encoder/reference.h
+++ b/source/encoder/reference.h
@@ -25,13 +25,13 @@
 #define X265_REFERENCE_H
 
 #include "primitives.h"
+#include "picyuv.h"
 #include "lowres.h"
 #include "mv.h"
 
 namespace x265 {
 // private x265 namespace
 
-class PicYuv;
 struct WeightParam;
 
 class MotionReference : public ReferencePlanes
@@ -40,12 +40,12 @@ public:
 
     MotionReference();
     ~MotionReference();
-    int  init(PicYuv*, WeightParam* w = NULL);
+    int  init(PicYuv*, WeightParam* wp, const x265_param& p);
     void applyWeight(int rows, int numRows);
 
-    PicYuv* m_reconPic;
-    pixel*  m_weightBuffer;
-    int     m_numWeightedRows;
+    pixel*  weightBuffer[3];
+    int     numInterpPlanes;
+    int     numWeightedRows;
 
 protected:
 
diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp
index 1179fe0..9836aa7 100644
--- a/source/encoder/sao.cpp
+++ b/source/encoder/sao.cpp
@@ -176,8 +176,11 @@ void SAO::allocSaoParam(SAOParam* saoParam) const
 void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
 {
     Slice* slice = frame->m_encData->m_slice;
-
-    int qpCb = Clip3(0, QP_MAX_MAX, qp + slice->m_pps->chromaCbQpOffset);
+    int qpCb = qp;
+    if (m_param->internalCsp == X265_CSP_I420)
+        qpCb = Clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
+    else
+        qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
     m_lumaLambda = x265_lambda2_tab[qp];
     m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
     m_frame = frame;
@@ -225,8 +228,8 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
 {
     int x, y;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
-    pixel* rec = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
-    intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+    pixel* rec = m_frame->m_reconPic->getPlaneAddr(plane, addr);
+    intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
     uint32_t picWidth  = m_param->sourceWidth;
     uint32_t picHeight = m_param->sourceHeight;
     int ctuWidth  = g_maxCUSize;
@@ -436,7 +439,7 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
 /* Process SAO all units */
 void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
 {
-    intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+    intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
     uint32_t picWidth  = m_param->sourceWidth;
     int ctuWidth  = g_maxCUSize;
     int ctuHeight = g_maxCUSize;
@@ -449,12 +452,12 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
 
     if (!idxY)
     {
-        pixel* rec = m_frame->m_reconPicYuv->m_picOrg[plane];
+        pixel* rec = m_frame->m_reconPic->m_picOrg[plane];
         memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
     }
 
     int addr = idxY * m_numCuInWidth;
-    pixel* rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr);
+    pixel* rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
 
     for (int i = 0; i < ctuHeight + 1; i++)
     {
@@ -506,7 +509,7 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
         }
         else if (idxX != (m_numCuInWidth - 1))
         {
-            rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr);
+            rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
 
             for (int i = 0; i < ctuHeight + 1; i++)
             {
@@ -543,12 +546,12 @@ void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
 void SAO::calcSaoStatsCu(int addr, int plane)
 {
     int x, y;
-    CUData* cu = m_frame->m_encData->getPicCTU(addr);
-    const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr);
-    const pixel* rec0  = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
+    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
+    const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
+    const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
     const pixel* fenc;
     const pixel* rec;
-    intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+    intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
     uint32_t picWidth  = m_param->sourceWidth;
     uint32_t picHeight = m_param->sourceHeight;
     int ctuWidth  = g_maxCUSize;
@@ -789,10 +792,10 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
     int addr = idxX + m_numCuInWidth * idxY;
 
     int x, y;
-    CUData* cu = frame->m_encData->getPicCTU(addr);
+    const CUData* cu = frame->m_encData->getPicCTU(addr);
     const pixel* fenc;
     const pixel* rec;
-    intptr_t stride = m_frame->m_reconPicYuv->m_stride;
+    intptr_t stride = m_frame->m_reconPic->m_stride;
     uint32_t picWidth  = m_param->sourceWidth;
     uint32_t picHeight = m_param->sourceHeight;
     int ctuWidth  = g_maxCUSize;
@@ -826,7 +829,7 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
     {
         if (plane == 1)
         {
-            stride = frame->m_reconPicYuv->m_strideC;
+            stride = frame->m_reconPic->m_strideC;
             picWidth  >>= m_hChromaShift;
             picHeight >>= m_vChromaShift;
             ctuWidth  >>= m_hChromaShift;
@@ -845,8 +848,8 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
         stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
         count = m_countPreDblk[addr][plane][SAO_BO];
 
-        const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr);
-        const pixel* rec0  = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
+        const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
+        const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
         fenc = fenc0;
         rec  = rec0;
 
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index cd86318..bc0dc94 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -37,6 +37,8 @@ using namespace x265;
 #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
 #endif
 
+#define MVP_IDX_BITS 1
+
 ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
 ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
 
@@ -66,11 +68,10 @@ bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
     m_numLayers = g_log2Size[param.maxCUSize] - 2;
 
     m_rdCost.setPsyRdScale(param.psyRd);
-    m_me.setSearchMethod(param.searchMethod);
-    m_me.setSubpelRefine(param.subpelRefine);
+    m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
 
     bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
-    if (m_param->noiseReduction)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
         ok &= m_quant.allocNoiseReduction(param);
 
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
@@ -163,70 +164,55 @@ void Search::invalidateContexts(int fromDepth)
 void Search::invalidateContexts(int) {}
 #endif
 
-void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
+void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
 {
-    uint32_t fullDepth  = cu.m_cuDepth[0] + trDepth;
-    uint32_t tuDepthL   = cu.m_tuDepth[absPartIdx];
-    uint32_t subdiv     = tuDepthL > trDepth;
+    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
+    uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
-    bool mCodeAll = true;
-    const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
-    if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
-        mCodeAll = false;
-
-    if (mCodeAll)
+    if (!(log2TrSize - m_hChromaShift < 2))
     {
-        if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
-            m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
-
-        if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
-            m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
+        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
     }
 
     if (subdiv)
     {
-        absPartIdxStep >>= 2;
-        width  >>= 1;
-        height >>= 1;
-
-        uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
-        for (uint32_t part = 0; part < 4; part++)
-            codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
     }
 }
 
-void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
+void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
 {
-    if (!cu.getCbf(absPartIdx, ttype, trDepth))
+    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
         return;
 
-    uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
-    uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
+    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
-    if (tuDepthL > trDepth)
+    if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
-        uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
-        for (uint32_t part = 0; part < 4; part++)
-            codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
 
         return;
     }
 
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
-    uint32_t trDepthC = trDepth;
+    uint32_t tuDepthC = tuDepth;
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-    
-    if (log2TrSizeC == 1)
-    {
-        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
-        trDepthC--;
-        log2TrSizeC++;
-        uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
-        bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
-        if (!bFirstQ)
+
+    if (log2TrSizeC < 2)
+    {
+        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        if (absPartIdx & 3)
             return;
+        log2TrSizeC = 2;
+        tuDepthC--;
     }
 
     uint32_t qtLayer = log2TrSize - 2;
@@ -243,17 +229,17 @@ void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absP
         uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
         coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
         uint32_t subTUSize = 1 << (log2TrSizeC * 2);
-        uint32_t partIdxesPerSubTU  = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
-        if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
+        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
             m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
-        if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
-            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
+        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
     }
 }
 
-void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
+void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
 {
-    uint32_t fullDepth  = mode.cu.m_cuDepth[0] + trDepth;
+    uint32_t fullDepth  = mode.cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t qtLayer    = log2TrSize - 2;
     uint32_t sizeIdx    = log2TrSize - 2;
@@ -280,20 +266,20 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
         if (mightSplit)
             m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 
-        pixel*   fenc     = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
         pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
         int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
         uint32_t stride   = mode.fencYuv->m_size;
 
         // init availability pattern
         uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-        initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
 
         // get prediction signal
         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 
         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
-        cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
 
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
         coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
@@ -312,9 +298,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
         }
         else
             // no coded residual, recon = pred
-            primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
+            primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
 
-        bCBF = !!numSig << trDepth;
+        bCBF = !!numSig << tuDepth;
         cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
         fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
 
@@ -338,21 +324,21 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
         }
         else
         {
-            uint32_t qtNumParts = cuGeom.numPartitions >> 2;
-            if (!trDepth)
+            uint32_t qNumParts = cuGeom.numPartitions >> 2;
+            if (!tuDepth)
             {
-                for (uint32_t part = 0; part < 4; part++)
-                    m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
             }
-            else if (!(absPartIdx & (qtNumParts - 1)))
+            else if (!(absPartIdx & (qNumParts - 1)))
                 m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
         }
         if (log2TrSize != depthRange[0])
             m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
 
-        m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
 
-        if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
             m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
 
         fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
@@ -380,26 +366,25 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
         }
 
         // code split block
-        uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
-        uint32_t absPartIdxSub = absPartIdx;
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
 
         int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
         if (m_param->bEnableTSkipFast)
-            checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
+            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
 
         Cost splitCost;
         uint32_t cbf = 0;
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
             if (checkTransformSkip)
-                codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
+                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
             else
-                codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
+                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
 
-            cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
+        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
 
         if (mightNotSplit && log2TrSize != depthRange[0])
         {
@@ -428,16 +413,16 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
             m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
 
             // recover transform index and Cbf values
-            cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
             cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
             cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
         }
     }
 
     // set reconstruction for next intra prediction blocks if full TU prediction won
-    pixel*   picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
-    primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost     += fullCost.rdcost;
     outCost.distortion += fullCost.distortion;
@@ -445,9 +430,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
     outCost.energy     += fullCost.energy;
 }
 
-void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
+void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
 {
-    uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t tuSize = 1 << log2TrSize;
 
@@ -462,7 +447,7 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
     int      bTSkip = 0;
     uint32_t bCBF = 0;
 
-    pixel*   fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
     pixel*   pred = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
     uint32_t stride = fencYuv->m_size;
@@ -470,12 +455,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
 
     // init availability pattern
     uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-    initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+    initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
 
     // get prediction signal
     predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 
-    cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
 
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
@@ -518,12 +503,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
         }
         else
             // no residual coded, recon = pred
-            primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
+            primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
 
         uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
 
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
-        cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
 
         if (useTSkip)
             m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
@@ -548,20 +533,20 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
         }
         else
         {
-            uint32_t qtNumParts = cuGeom.numPartitions >> 2;
-            if (!trDepth)
+            uint32_t qNumParts = cuGeom.numPartitions >> 2;
+            if (!tuDepth)
             {
-                for (uint32_t part = 0; part < 4; part++)
-                    m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
             }
-            else if (!(absPartIdx & (qtNumParts - 1)))
+            else if (!(absPartIdx & (qNumParts - 1)))
                 m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
         }
         m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
 
-        m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
 
-        if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
             m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
 
         uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
@@ -591,19 +576,19 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
     if (bTSkip)
     {
         memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
-        primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
+        primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
     }
     else if (checkTransformSkip)
     {
         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
-        cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
         m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
     }
 
     // set reconstruction for next intra prediction blocks
-    pixel*   picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
-    primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost += fullCost.rdcost;
     outCost.distortion += fullCost.distortion;
@@ -612,11 +597,11 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
 }
 
 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
 
-    uint32_t fullDepth   = cu.m_cuDepth[0] + trDepth;
+    uint32_t fullDepth   = cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
     bool     bCheckFull  = log2TrSize <= depthRange[1];
 
@@ -629,22 +614,22 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3
 
     if (bCheckFull)
     {
-        pixel*   fenc      = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+        const pixel* fenc  = mode.fencYuv->getLumaAddr(absPartIdx);
         pixel*   pred      = mode.predYuv.getLumaAddr(absPartIdx);
         int16_t* residual  = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
-        pixel*   picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
-        intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
+        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+        intptr_t picStride = m_frame->m_reconPic->m_stride;
         uint32_t stride    = mode.fencYuv->m_size;
         uint32_t sizeIdx   = log2TrSize - 2;
         uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
         coeff_t* coeff        = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
 
-        initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 
         X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
-        cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
 
         primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
@@ -652,11 +637,11 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3
         {
             m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
             primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
-            cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
         }
         else
         {
-            primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
+            primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
         }
     }
@@ -665,26 +650,25 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3
         X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
         
         /* code split block */
-        uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t cbf = 0;
-        for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
-            cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+            residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
+            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
-            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
+        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
     }
 }
 
-void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
+void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
 {
-    uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
-    uint32_t tuDepth   = cu.m_tuDepth[absPartIdx];
+    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
-    if (tuDepth == trDepth)
+    if (tuDepth == cu.m_tuDepth[absPartIdx])
     {
-        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
         uint32_t qtLayer    = log2TrSize - 2;
 
         // copy transform coefficients
@@ -698,88 +682,80 @@ void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, u
     }
     else
     {
-        uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
-            extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
     }
 }
 
+inline void offsetCBFs(uint8_t subTUCBF[2])
+{
+    uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
+    subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
+    subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
+}
+
 /* 4:2:2 post-TU split processing */
-void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
+void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
 {
     uint32_t depth = cu.m_cuDepth[0];
-    uint32_t fullDepth = depth + trDepth;
+    uint32_t fullDepth = depth + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
-    uint32_t trDepthC = trDepth;
     if (log2TrSize == 2)
     {
-        X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
-        trDepthC--;
+        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        ++log2TrSize;
     }
 
-    uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
+    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
 
     // move the CBFs down a level and set the parent CBF
     uint8_t subTUCBF[2];
-    uint8_t combinedSubTUCBF = 0;
-
-    for (uint32_t subTU = 0; subTU < 2; subTU++)
-    {
-        const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
+    subTUCBF[0] = cu.getCbf(absPartIdx            , ttype, tuDepth);
+    subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
+    offsetCBFs(subTUCBF);
 
-        subTUCBF[subTU]   = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
-        combinedSubTUCBF |= subTUCBF[subTU];
-    }
-
-    for (uint32_t subTU = 0; subTU < 2; subTU++)
-    {
-        const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
-        const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
-
-        cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
-    }
+    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx             , tuNumParts);
+    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
 }
 
 /* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
-    uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
+    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
-    if (tuDepthL > trDepth)
+    if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
-        uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
-        for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
-            splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
-            splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
+            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
         {
-            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
-            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
+            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
+            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
 
         return outDist;
     }
 
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
 
-    uint32_t trDepthC = trDepth;
-    if (log2TrSizeC == 1)
+    uint32_t tuDepthC = tuDepth;
+    if (log2TrSizeC < 2)
     {
-        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
-        trDepthC--;
-        log2TrSizeC++;
-        uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
-        bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
-        if (!bFirstQ)
+        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        if (absPartIdx & 3)
             return 0;
+        log2TrSizeC = 2;
+        tuDepthC--;
     }
 
     if (m_bEnableRDOQ)
@@ -788,13 +764,13 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
     if (checkTransformSkip)
-        return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
+        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
 
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t tuSize = 1 << log2TrSizeC;
     uint32_t outDist = 0;
 
-    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
     for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
@@ -806,7 +782,7 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr
         {
             uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
 
-            pixel*   fenc     = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
             pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
             int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t stride   = mode.fencYuv->m_csize;
@@ -817,11 +793,11 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr
             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
 
-            pixel*   picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
-            intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
+            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            intptr_t picStride = m_frame->m_reconPic->m_strideC;
 
             // init availability pattern
-            initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
             pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
@@ -837,44 +813,42 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr
 
             primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
-            uint32_t tmpDist;
             if (numSig)
             {
                 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                 primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
-                cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             }
             else
             {
                 // no coded residual, recon = pred
-                primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
+                primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
                 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             }
 
-            tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
-            outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+            outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride));
 
             if (m_rdCost.m_psyRd)
                 psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
 
-            primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
+            primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
         }
         while (tuIterator.isNextSection());
 
         if (splitType == VERTICAL_SPLIT)
-            offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
     }
 
     return outDist;
 }
 
 /* returns distortion */
-uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-    uint32_t log2TrSizeC = 2;
+    const uint32_t log2TrSizeC = 2;
     uint32_t tuSize = 4;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t outDist = 0;
@@ -887,7 +861,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
     ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
     ALIGN_VAR_32(pixel,   tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
 
-    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
     for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
@@ -899,11 +873,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
         {
             uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
 
-            pixel*   fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
             pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
             int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t stride = mode.fencYuv->m_csize;
-            uint32_t sizeIdxC = log2TrSizeC - 2;
+            const uint32_t sizeIdxC = log2TrSizeC - 2;
 
             uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
             coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
@@ -911,7 +885,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
 
             // init availability pattern
-            initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
             pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
@@ -943,7 +917,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
                 {
                     m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                     primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
-                    cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
                 else if (useTSkip)
                 {
@@ -952,11 +926,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
                 }
                 else
                 {
-                    primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
+                    primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
                 uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
-                tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
 
                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
@@ -991,15 +965,15 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
             if (bTSkip)
             {
                 memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
-                primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
+                primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
             }
 
-            cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
-            pixel*   reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
-            intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
-            primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
+            pixel*   reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            intptr_t picStride = m_frame->m_reconPic->m_strideC;
+            primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
 
             outDist += bDist;
             psyEnergy += bEnergy;
@@ -1007,34 +981,27 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
         while (tuIterator.isNextSection());
 
         if (splitType == VERTICAL_SPLIT)
-            offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
     }
 
     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
     return outDist;
 }
 
-void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
+void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
 {
-    uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
     uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
 
-    if (tuDepthL == trDepth)
+    if (tuDepthL == tuDepth || log2TrSizeC == 2)
     {
-        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
-        if (tuQuad)
-        {
-            log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
-            trDepth--;     /* also adjust the number of coeff read */
-        }
-
         // copy transform coefficients
         uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
         uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
 
-        uint32_t qtLayer   = log2TrSize - 2;
+        uint32_t qtLayer   = log2TrSize - 2 - (tuDepthL - tuDepth);
         coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
         coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
         coeff_t* coeffDstU = cu.m_trCoeff[1]           + coeffOffsetC;
@@ -1047,38 +1014,29 @@ void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absP
     }
     else
     {
-        if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
-            /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
-            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
-        else
-        {
-            uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
-            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
-                extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
-        }
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
     }
 }
 
-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
-    uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
+    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     
-    if (tuDepthL == trDepth)
+    if (tuDepth == cu.m_tuDepth[absPartIdx])
     {
-        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-        uint32_t trDepthC = trDepth;
-        if (log2TrSizeC == 1)
+        uint32_t tuDepthC = tuDepth;
+        if (log2TrSizeC < 2)
         {
-            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
-            trDepthC--;
-            log2TrSizeC++;
-            uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
-            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
-            if (!bFirstQ)
+            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+            if (absPartIdx & 3)
                 return;
+            log2TrSizeC = 2;
+            tuDepthC--;
         }
 
         ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
@@ -1086,7 +1044,7 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr
         uint32_t stride = mode.fencYuv->m_csize;
         const int sizeIdxC = log2TrSizeC - 2;
 
-        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
         const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
@@ -1098,20 +1056,20 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr
             {
                 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
 
-                pixel*   fenc         = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
+                const pixel*   fenc   = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
                 pixel*   pred         = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
                 int16_t* residual     = resiYuv.getChromaAddr(chromaId, absPartIdxC);
                 pixel*   recon        = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
                 uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
                 coeff_t* coeff        = cu.m_trCoeff[ttype] + coeffOffsetC;
-                pixel*   picReconC    = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
-                uint32_t picStride    = m_frame->m_reconPicYuv->m_strideC;
+                pixel*   picReconC    = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+                uint32_t picStride    = m_frame->m_reconPic->m_strideC;
 
                 uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
                 if (chromaPredMode == DM_CHROMA_IDX)
                     chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
                 chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
-                initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+                initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
                 pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
                 predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
@@ -1124,36 +1082,36 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr
                 {
                     m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
                     primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
-                    primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
-                    cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
+                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
                 else
                 {
-                    primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
-                    primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
+                    primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
+                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
             }
             while (tuIterator.isNextSection());
 
             if (splitType == VERTICAL_SPLIT)
-                offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
+                offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
         }
     }
     else
     {
-        uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t splitCbfU = 0, splitCbfV = 0;
-        for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
+        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
-            splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
-            splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
+            residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
+            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
         {
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
+            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
     }
 }
@@ -1188,7 +1146,7 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize
     intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
 
     bool bCodeDQP = m_slice->m_pps->bUseDQP;
-    m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
+    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
     m_entropyCoder.store(intraMode.contexts);
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
@@ -1198,7 +1156,224 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize
     updateModeCost(intraMode);
 }
 
-uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
+/* Note that this function does not save the best intra prediction, it must
+ * be generated later. It records the best mode in the cu */
+void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+    CUData& cu = intraMode.cu;
+    uint32_t depth = cu.m_cuDepth[0];
+
+    cu.setPartSizeSubParts(SIZE_2Nx2N);
+    cu.setPredModeSubParts(MODE_INTRA);
+
+    const uint32_t initTuDepth = 0;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
+    uint32_t tuSize = 1 << log2TrSize;
+    const uint32_t absPartIdx = 0;
+
+    // Reference sample smoothing
+    initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+
+    const pixel* fenc = intraMode.fencYuv->m_buf[0];
+    uint32_t stride = intraMode.fencYuv->m_size;
+
+    pixel* above = m_refAbove + tuSize - 1;
+    pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+    pixel* left = m_refLeft + tuSize - 1;
+    pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
+    int sad, bsad;
+    uint32_t bits, bbits, mode, bmode;
+    uint64_t cost, bcost;
+
+    // 33 Angle modes once
+    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
+    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
+    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
+    int scaleTuSize = tuSize;
+    int scaleStride = stride;
+    int costShift = 0;
+    int sizeIdx = log2TrSize - 2;
+
+    if (tuSize > 32)
+    {
+        // origin is 64x64, we scale to 32x32 and setup required parameters
+        primitives.scale2D_64to32(bufScale, fenc, stride);
+        fenc = bufScale;
+
+        // reserve space in case primitives need to store data in above
+        // or left buffers
+        pixel _above[4 * 32 + 1];
+        pixel _left[4 * 32 + 1];
+        pixel* aboveScale = _above + 2 * 32;
+        pixel* leftScale = _left + 2 * 32;
+        aboveScale[0] = leftScale[0] = above[0];
+        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
+        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+
+        scaleTuSize = 32;
+        scaleStride = 32;
+        costShift = 2;
+        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
+
+        // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
+        above = aboveScale;
+        left = leftScale;
+        aboveFiltered = aboveScale;
+        leftFiltered = leftScale;
+    }
+
+    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+    int predsize = scaleTuSize * scaleTuSize;
+
+    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+
+    /* there are three cost tiers for intra modes:
+    *  pred[0]          - mode probable, least cost
+    *  pred[1], pred[2] - less probable, slightly more cost
+    *  non-mpm modes    - all cost the same (rbits) */
+    uint64_t mpms;
+    uint32_t preds[3];
+    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+
+    // DC
+    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+    bmode = mode = DC_IDX;
+    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+    pixel* abovePlanar = above;
+    pixel* leftPlanar = left;
+
+    if (tuSize & (8 | 16 | 32))
+    {
+        abovePlanar = aboveFiltered;
+        leftPlanar = leftFiltered;
+    }
+
+    // PLANAR
+    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+    mode = PLANAR_IDX;
+    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+    cost = m_rdCost.calcRdSADCost(sad, bits);
+    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+    // Transpose NxN
+    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
+
+    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+
+    bool modeHor;
+    const pixel* cmp;
+    intptr_t srcStride;
+
+#define TRY_ANGLE(angle) \
+    modeHor = angle < 18; \
+    cmp = modeHor ? bufTrans : fenc; \
+    srcStride = modeHor ? scaleTuSize : scaleStride; \
+    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+    cost = m_rdCost.calcRdSADCost(sad, bits)
+
+    if (m_param->bEnableFastIntra)
+    {
+        int asad = 0;
+        uint32_t lowmode, highmode, amode = 5, abits = 0;
+        uint64_t acost = MAX_INT64;
+
+        /* pick the best angle, sampling at distance of 5 */
+        for (mode = 5; mode < 35; mode += 5)
+        {
+            TRY_ANGLE(mode);
+            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+        }
+
+        /* refine best angle at distance 2, then distance 1 */
+        for (uint32_t dist = 2; dist >= 1; dist--)
+        {
+            lowmode = amode - dist;
+            highmode = amode + dist;
+
+            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
+            TRY_ANGLE(lowmode);
+            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+
+            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
+            TRY_ANGLE(highmode);
+            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+        }
+
+        if (amode == 33)
+        {
+            TRY_ANGLE(34);
+            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+        }
+
+        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
+    }
+    else // calculate and search all intra prediction angles for lowest cost
+    {
+        for (mode = 2; mode < 35; mode++)
+        {
+            TRY_ANGLE(mode);
+            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+        }
+    }
+
+    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
+    intraMode.initCosts();
+    intraMode.totalBits = bbits;
+    intraMode.distortion = bsad;
+    intraMode.sa8dCost = bcost;
+    intraMode.sa8dBits = bbits;
+}
+
+void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+    CUData& cu = intraMode.cu;
+    Yuv* reconYuv = &intraMode.reconYuv;
+    const Yuv* fencYuv = intraMode.fencYuv;
+
+    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
+    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+
+    m_quant.setQPforQuant(cu);
+
+    uint32_t tuDepthRange[2];
+    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
+
+    Cost icosts;
+    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
+    extractIntraResultQT(cu, *reconYuv, 0, 0);
+
+    intraMode.distortion = icosts.distortion;
+    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
+
+    m_entropyCoder.resetBits();
+    if (m_slice->m_pps->bTransquantBypassEnabled)
+        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
+    m_entropyCoder.codeSkipFlag(cu, 0);
+    m_entropyCoder.codePredMode(cu.m_predMode[0]);
+    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
+    m_entropyCoder.codePredInfo(cu, 0);
+    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+
+    bool bCodeDQP = m_slice->m_pps->bUseDQP;
+    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
+
+    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
+    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
+    if (m_rdCost.m_psyRd)
+        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+
+    m_entropyCoder.store(intraMode.contexts);
+    updateModeCost(intraMode);
+}
+
+uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
 {
     CUData& cu = intraMode.cu;
     Yuv* reconYuv = &intraMode.reconYuv;
@@ -1206,37 +1381,37 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
     const Yuv* fencYuv = intraMode.fencYuv;
 
     uint32_t depth        = cu.m_cuDepth[0];
-    uint32_t initTrDepth  = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
-    uint32_t numPU        = 1 << (2 * initTrDepth);
-    uint32_t log2TrSize   = cu.m_log2CUSize[0] - initTrDepth;
+    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
+    uint32_t numPU        = 1 << (2 * initTuDepth);
+    uint32_t log2TrSize   = cu.m_log2CUSize[0] - initTuDepth;
     uint32_t tuSize       = 1 << log2TrSize;
     uint32_t qNumParts    = cuGeom.numPartitions >> 2;
     uint32_t sizeIdx      = log2TrSize - 2;
     uint32_t absPartIdx   = 0;
     uint32_t totalDistortion = 0;
 
-    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
+    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
 
     // loop over partitions
-    for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
+    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
     {
         uint32_t bmode = 0;
 
         if (sharedModes)
-            bmode = sharedModes[pu];
+            bmode = sharedModes[puIdx];
         else
         {
             // Reference sample smoothing
-            initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
+            initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
 
             // determine set of modes to be tested (using prediction signal only)
-            pixel*   fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+            const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
             uint32_t stride = predYuv->m_size;
 
-            pixel *above = m_refAbove + tuSize - 1;
-            pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
-            pixel *left = m_refLeft + tuSize - 1;
-            pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
+            pixel* above = m_refAbove + tuSize - 1;
+            pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+            pixel* left = m_refLeft + tuSize - 1;
+            pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
 
             // 33 Angle modes once
             ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
@@ -1250,8 +1425,8 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
 
             if (tuSize > 32)
             {
-                pixel *aboveScale = _above + 2 * 32;
-                pixel *leftScale = _left + 2 * 32;
+                pixel* aboveScale = _above + 2 * 32;
+                pixel* leftScale = _left + 2 * 32;
 
                 // origin is 64x64, we scale to 32x32 and setup required parameters
                 primitives.scale2D_64to32(bufScale, fenc, stride);
@@ -1296,8 +1471,8 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
             modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
 
             // PLANAR
-            pixel *abovePlanar = above;
-            pixel *leftPlanar = left;
+            pixel* abovePlanar = above;
+            pixel* leftPlanar = left;
             if (tuSize >= 8 && tuSize <= 32)
             {
                 abovePlanar = aboveFiltered;
@@ -1316,7 +1491,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
             for (int mode = 2; mode < 35; mode++)
             {
                 bool modeHor = (mode < 18);
-                pixel *cmp = (modeHor ? buf_trans : fenc);
+                const pixel* cmp = (modeHor ? buf_trans : fenc);
                 intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
                 bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
                 sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
@@ -1330,7 +1505,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
              * levels and at higher depths */
             uint64_t candCostList[MAX_RD_INTRA_MODES];
             uint32_t rdModeList[MAX_RD_INTRA_MODES];
-            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
+            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
             for (int i = 0; i < maxCandCount; i++)
                 candCostList[i] = MAX_INT64;
 
@@ -1346,51 +1521,50 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
                 if (candCostList[i] == MAX_INT64)
                     break;
                 m_entropyCoder.load(m_rqt[depth].cur);
-                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
+                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
 
                 Cost icosts;
                 if (checkTransformSkip)
-                    codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
                 else
-                    codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
+                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
                 COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
             }
         }
 
         /* remeasure best mode, allowing TU splits */
-        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
+        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
         m_entropyCoder.load(m_rqt[depth].cur);
 
         Cost icosts;
         if (checkTransformSkip)
-            codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
         else
-            codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
+            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
         totalDistortion += icosts.distortion;
 
-        extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
+        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
 
         // set reconstruction for next intra prediction blocks
-        if (pu != numPU - 1)
+        if (puIdx != numPU - 1)
         {
             /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
              * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
              * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
              * that the contexts should be tracked through each PU */
-            pixel*   dst         = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
-            uint32_t dststride   = m_frame->m_reconPicYuv->m_stride;
-            pixel*   src         = reconYuv->getLumaAddr(absPartIdx);
+            pixel*   dst         = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+            uint32_t dststride   = m_frame->m_reconPic->m_stride;
+            const pixel*   src   = reconYuv->getLumaAddr(absPartIdx);
             uint32_t srcstride   = reconYuv->m_size;
-            primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+            primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
         }
     }
 
     if (numPU > 1)
     {
         uint32_t combCbfY = 0;
-        uint32_t partIdx  = 0;
-        for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
-            combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
+        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
 
         for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
             cu.m_cbf[0][offs] |= combCbfY;
@@ -1415,17 +1589,19 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
     uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
     uint32_t tuSize = 1 << log2TrSizeC;
     int32_t scaleTuSize = tuSize;
+    uint32_t tuDepth = 0;
     int32_t costShift = 0;
 
     if (tuSize > 32)
     {
         scaleTuSize = 32;
+        tuDepth = 1;
         costShift = 2;
         log2TrSizeC = 5;
     }
 
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
+    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
+    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
     cu.getAllowedChromaDir(0, modeList);
 
     // check chroma modes
@@ -1440,7 +1616,7 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
         uint64_t cost = 0;
         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            pixel* fenc = fencYuv->m_buf[chromaId];
+            const pixel* fenc = fencYuv->m_buf[chromaId];
             pixel* pred = predYuv->m_buf[chromaId];
             pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
 
@@ -1465,19 +1641,18 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
     Yuv& reconYuv = intraMode.reconYuv;
 
     uint32_t depth       = cu.m_cuDepth[0];
-    uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
-    uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTrDepth;
+    uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
+    uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTuDepth;
     uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
     uint32_t totalDistortion = 0;
 
     int part = partitionFromLog2Size(log2TrSize);
 
-    TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
+    TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
 
     do
     {
         uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
-        int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
 
         uint32_t bestMode = 0;
         uint32_t bestDist = 0;
@@ -1496,9 +1671,9 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
             // restore context models
             m_entropyCoder.load(m_rqt[depth].cur);
 
-            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
+            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
             uint32_t psyEnergy = 0;
-            uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
+            uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
 
             if (m_slice->m_pps->bTransformSkipEnabled)
                 m_entropyCoder.load(m_rqt[depth].cur);
@@ -1512,14 +1687,14 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
             }
             else
             {
-                uint32_t qtNumParts = cuGeom.numPartitions >> 2;
-                if (!(absPartIdxC & (qtNumParts - 1)))
+                uint32_t qNumParts = cuGeom.numPartitions >> 2;
+                if (!(absPartIdxC & (qNumParts - 1)))
                     m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
             }
 
-            codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
-            codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
-            codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
+            codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
+            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
+            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
             uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
             uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
 
@@ -1528,7 +1703,7 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
                 bestCost = cost;
                 bestDist = dist;
                 bestMode = modeList[mode];
-                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
+                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
                 memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                 memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                 memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
@@ -1539,14 +1714,15 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
         if (!tuIterator.isLastSection())
         {
             uint32_t zorder    = cuGeom.encodeIdx + absPartIdxC;
-            uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
-            pixel *src, *dst;
+            uint32_t dststride = m_frame->m_reconPic->m_strideC;
+            const pixel* src;
+            pixel* dst;
 
-            dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
+            dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
             src = reconYuv.getCbAddr(absPartIdxC);
             primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
 
-            dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
+            dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
             src = reconYuv.getCrAddr(absPartIdxC);
             primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
         }
@@ -1555,23 +1731,23 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
         memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
         memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
         memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
-        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
+        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
         totalDistortion += bestDist;
     }
     while (tuIterator.isNextSection());
 
-    if (initTrDepth != 0)
+    if (initTuDepth != 0)
     {
         uint32_t combCbfU = 0;
         uint32_t combCbfV = 0;
-        uint32_t partIdx  = 0;
-        for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
+        uint32_t qNumParts = tuIterator.absPartIdxStep;
+        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
-            combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
+            combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
+            combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
         }
 
-        for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
+        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
         {
             cu.m_cbf[1][offs] |= combCbfU;
             cu.m_cbf[2][offs] |= combCbfV;
@@ -1615,13 +1791,17 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me
             continue;
 
         cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
-        cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
+        cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
         cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
-        cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
+        cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
 
         prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(tempYuv, true, false);
+        motionCompensation(tempYuv, true, m_me.bChromaSATD);
+
         uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+        if (m_me.bChromaSATD)
+            costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
+
         uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
         costCand = costCand + m_rdCost.getCost(bitsCand);
         if (costCand < outCost)
@@ -1642,41 +1822,45 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me
 /* this function assumes the caller has configured its MotionEstimation engine with the
  * correct source plane and source PU, and has called prepMotionCompensation() to set
  * m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
 {
     uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
     bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
 
-    MV amvpCand[AMVP_NUM_CANDS];
     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
-    int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
+    int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
 
-    uint32_t bestCost = MAX_INT;
     int mvpIdx = 0;
     int merange = m_param->searchRange;
-    for (int i = 0; i < AMVP_NUM_CANDS; i++)
+    MotionData* bestME = interMode.bestME[part];
+
+    if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
     {
-        MV mvCand = amvpCand[i];
+        uint32_t bestCost = MAX_INT;
+        for (int i = 0; i < AMVP_NUM_CANDS; i++)
+        {
+            MV mvCand = interMode.amvpCand[list][ref][i];
 
-        // NOTE: skip mvCand if Y is > merange and -FN>1
-        if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
-            continue;
+            // NOTE: skip mvCand if Y is > merange and -FN>1
+            if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+                continue;
 
-        cu.clipMv(mvCand);
+            interMode.cu.clipMv(mvCand);
 
-        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
-        predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
-        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+            Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
+            predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
 
-        if (bestCost > cost)
-        {
-            bestCost = cost;
-            mvpIdx = i;
+            if (bestCost > cost)
+            {
+                bestCost = cost;
+                mvpIdx = i;
+            }
         }
     }
 
-    MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
-    setSearchRange(cu, mvp, merange, mvmin, mvmax);
+    MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
+    setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
 
     int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
 
@@ -1685,34 +1869,32 @@ void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGe
     uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
 
     /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
-    checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
+    checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
 
     /* tie goes to the smallest ref ID, just like --no-pme */
-    ScopedLock _lock(master.m_outputLock);
-    if (cost < master.m_bestME[list].cost ||
-       (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
+    ScopedLock _lock(master.m_meLock);
+    if (cost < bestME[list].cost ||
+       (cost == bestME[list].cost && ref < bestME[list].ref))
     {
-        master.m_bestME[list].mv = outmv;
-        master.m_bestME[list].mvp = mvp;
-        master.m_bestME[list].mvpIdx = mvpIdx;
-        master.m_bestME[list].ref = ref;
-        master.m_bestME[list].cost = cost;
-        master.m_bestME[list].bits = bits;
+        bestME[list].mv = outmv;
+        bestME[list].mvp = mvp;
+        bestME[list].mvpIdx = mvpIdx;
+        bestME[list].ref = ref;
+        bestME[list].cost = cost;
+        bestME[list].bits = bits;
     }
 }
 
 /* search of the best candidate for inter prediction
  * returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
 {
     CUData& cu = interMode.cu;
     Yuv* predYuv = &interMode.predYuv;
 
-    MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
 
     const Slice *slice = m_slice;
-    PicYuv* fencPic = m_frame->m_origPicYuv;
     int numPart     = cu.getNumPartInter();
     int numPredDir  = slice->isInterP() ? 1 : 2;
     const int* numRefIdx = slice->m_numRefIdx;
@@ -1727,23 +1909,24 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
 
     for (int puIdx = 0; puIdx < numPart; puIdx++)
     {
+        MotionData* bestME = interMode.bestME[puIdx];
+
         /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
         initMotionCompensation(cu, cuGeom, puIdx);
 
-        pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
-        m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
+        m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
 
         uint32_t mrgCost = MAX_UINT;
 
-        /* find best cost merge candidate */
-        if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
+        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
+        if (cu.m_partSize[0] != SIZE_2Nx2N)
         {
             merge.absPartIdx = m_puAbsPartIdx;
             merge.width      = m_puWidth;
             merge.height     = m_puHeight;
             mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
 
-            if (bMergeOnly && cu.m_log2CUSize[0] > 3)
+            if (bMergeOnly)
             {
                 if (mrgCost == MAX_UINT)
                 {
@@ -1762,33 +1945,88 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
                 totalmebits += merge.bits;
 
                 prepMotionCompensation(cu, cuGeom, puIdx);
-                motionCompensation(*predYuv, true, bChroma);
+                motionCompensation(*predYuv, true, bChromaSA8D);
                 continue;
             }
         }
 
-        MotionData bidir[2];
-        uint32_t bidirCost = MAX_UINT;
-        int bidirBits = 0;
-
-        m_bestME[0].cost = MAX_UINT;
-        m_bestME[1].cost = MAX_UINT;
+        bestME[0].cost = MAX_UINT;
+        bestME[1].cost = MAX_UINT;
 
         getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
 
-        if (bDistributed)
+        /* Uni-directional prediction */
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
         {
-            m_curMECu = &cu;
-            m_curGeom = &cuGeom;
+            for (int l = 0; l < numPredDir; l++)
+            {
+                int ref = bestME[l].ref;
+                uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
+                bits += getTUBits(ref, numRefIdx[l]);
+
+                int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+
+                // Pick the best possible MVP from AMVP candidates based on least residual
+                int mvpIdx = 0;
+                int merange = m_param->searchRange;
+
+                if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
+                {
+                    uint32_t bestCost = MAX_INT;
+                    for (int i = 0; i < AMVP_NUM_CANDS; i++)
+                    {
+                        MV mvCand = interMode.amvpCand[l][ref][i];
+
+                        // NOTE: skip mvCand if Y is > merange and -FN>1
+                        if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+                            continue;
+
+                        cu.clipMv(mvCand);
+                        predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+                        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
 
-            /* this worker might already be enqueued for pmode, so other threads
-             * might be looking at the ME job counts at any time, do these sets
-             * in a safe order */
+                        if (bestCost > cost)
+                        {
+                            bestCost = cost;
+                            mvpIdx = i;
+                        }
+                    }
+                }
+
+                MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
+
+                int satdCost;
+                setSearchRange(cu, mvp, merange, mvmin, mvmax);
+                satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
+
+                /* Get total cost of partition, but only include MV bit cost once */
+                bits += m_me.bitcost(outmv);
+                uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
+
+                /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
+                checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+
+                if (cost < bestME[l].cost)
+                {
+                    bestME[l].mv = outmv;
+                    bestME[l].mvp = mvp;
+                    bestME[l].mvpIdx = mvpIdx;
+                    bestME[l].cost = cost;
+                    bestME[l].bits = bits;
+                }
+            }
+        }
+        else if (bDistributed)
+        {
+            m_meLock.acquire();
+            m_curInterMode = &interMode;
+            m_curGeom = &cuGeom;
             m_curPart = puIdx;
             m_totalNumME = 0;
             m_numAcquiredME = 1;
             m_numCompletedME = 0;
             m_totalNumME = numRefIdx[0] + numRefIdx[1];
+            m_meLock.release();
 
             if (!m_bJobsQueued)
                 JobProvider::enqueue();
@@ -1796,34 +2034,43 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
             for (int i = 1; i < m_totalNumME; i++)
                 m_pool->pokeIdleThread();
 
-            while (m_totalNumME > m_numAcquiredME)
+            do
             {
-                int id = ATOMIC_INC(&m_numAcquiredME);
-                if (m_totalNumME >= id)
+                m_meLock.acquire();
+                if (m_totalNumME > m_numAcquiredME)
                 {
-                    id -= 1;
+                    int id = m_numAcquiredME++;
+                    m_meLock.release();
+
                     if (id < numRefIdx[0])
-                        singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
+                        singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
                     else
-                        singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
+                        singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
 
-                    if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
-                        m_meCompletionEvent.trigger();
+                    m_meLock.acquire();
+                    m_numCompletedME++;
+                    m_meLock.release();
                 }
+                else
+                    m_meLock.release();
             }
+            while (m_totalNumME > m_numAcquiredME);
+
             if (!m_bJobsQueued)
                 JobProvider::dequeue();
 
             /* we saved L0-0 for ourselves */
-            singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
-            if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
+            singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
+
+            m_meLock.acquire();
+            if (++m_numCompletedME == m_totalNumME)
                 m_meCompletionEvent.trigger();
+            m_meLock.release();
 
             m_meCompletionEvent.wait();
         }
         else
         {
-            // Uni-directional prediction
             for (int l = 0; l < numPredDir; l++)
             {
                 for (int ref = 0; ref < numRefIdx[l]; ref++)
@@ -1831,33 +2078,36 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
                     uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
                     bits += getTUBits(ref, numRefIdx[l]);
 
-                    int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
+                    int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
 
                     // Pick the best possible MVP from AMVP candidates based on least residual
-                    uint32_t bestCost = MAX_INT;
                     int mvpIdx = 0;
                     int merange = m_param->searchRange;
 
-                    for (int i = 0; i < AMVP_NUM_CANDS; i++)
+                    if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
                     {
-                        MV mvCand = amvpCand[l][ref][i];
+                        uint32_t bestCost = MAX_INT;
+                        for (int i = 0; i < AMVP_NUM_CANDS; i++)
+                        {
+                            MV mvCand = interMode.amvpCand[l][ref][i];
 
-                        // NOTE: skip mvCand if Y is > merange and -FN>1
-                        if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
-                            continue;
+                            // NOTE: skip mvCand if Y is > merange and -FN>1
+                            if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+                                continue;
 
-                        cu.clipMv(mvCand);
-                        predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
-                        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+                            cu.clipMv(mvCand);
+                            predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+                            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
 
-                        if (bestCost > cost)
-                        {
-                            bestCost = cost;
-                            mvpIdx  = i;
+                            if (bestCost > cost)
+                            {
+                                bestCost = cost;
+                                mvpIdx = i;
+                            }
                         }
                     }
 
-                    MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
+                    MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
 
                     setSearchRange(cu, mvp, merange, mvmin, mvmax);
                     int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
@@ -1867,45 +2117,67 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
                     uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
 
                     /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
-                    checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+                    checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
 
-                    if (cost < m_bestME[l].cost)
+                    if (cost < bestME[l].cost)
                     {
-                        m_bestME[l].mv = outmv;
-                        m_bestME[l].mvp = mvp;
-                        m_bestME[l].mvpIdx = mvpIdx;
-                        m_bestME[l].ref = ref;
-                        m_bestME[l].cost = cost;
-                        m_bestME[l].bits = bits;
+                        bestME[l].mv = outmv;
+                        bestME[l].mvp = mvp;
+                        bestME[l].mvpIdx = mvpIdx;
+                        bestME[l].ref = ref;
+                        bestME[l].cost = cost;
+                        bestME[l].bits = bits;
                     }
                 }
             }
         }
 
         /* Bi-directional prediction */
-        if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
+        MotionData bidir[2];
+        uint32_t bidirCost = MAX_UINT;
+        int bidirBits = 0;
+
+        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
+            cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N &&     /* 2Nx2N biprediction is handled elsewhere */
+            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
         {
-            bidir[0] = m_bestME[0];
-            bidir[1] = m_bestME[1];
+            bidir[0] = bestME[0];
+            bidir[1] = bestME[1];
+
+            int satdCost;
 
-            /* Generate reference subpels */
-            PicYuv* refPic0  = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
-            PicYuv* refPic1  = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
-            Yuv*    bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
-            predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
-            predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
+            if (m_me.bChromaSATD)
+            {
+                cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
+                cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+                cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
+                cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
 
-            pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
-            pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
+                prepMotionCompensation(cu, cuGeom, puIdx);
+                motionCompensation(tmpPredYuv, true, true);
 
-            int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
-            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
-            int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+                           m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+            }
+            else
+            {
+                PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
+                PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
+                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
 
-            bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+                /* Generate reference subpels */
+                predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
+                predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
+
+                primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
+                                                                                              bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+            }
+
+            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
             bidirCost = satdCost + m_rdCost.getCost(bidirBits);
 
-            bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
+            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
             if (bTryZero)
             {
                 /* Do not try zero MV if unidir motion predictors are beyond
@@ -1917,38 +2189,48 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
                 mvmin <<= 2;
                 mvmax <<= 2;
 
-                bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
-                bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
+                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
             }
             if (bTryZero)
             {
-                // coincident blocks of the two reference pictures
-                pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
-                pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
-                intptr_t refStride = slice->m_mref[0][0].lumaStride;
+                /* coincident blocks of the two reference pictures */
+                if (m_me.bChromaSATD)
+                {
+                    cu.m_mv[0][m_puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+                    cu.m_mv[1][m_puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
 
-                primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
-                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+                    prepMotionCompensation(cu, cuGeom, puIdx);
+                    motionCompensation(tmpPredYuv, true, true);
 
-                MV mvp0 = m_bestME[0].mvp;
-                int mvpIdx0 = m_bestME[0].mvpIdx;
-                uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+                               m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+                }
+                else
+                {
+                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
 
-                MV mvp1 = m_bestME[1].mvp;
-                int mvpIdx1 = m_bestME[1].mvpIdx;
-                uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+                    primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+                }
 
-                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
+                MV mvp0 = bestME[0].mvp;
+                int mvpIdx0 = bestME[0].mvpIdx;
+                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
 
-                if (bDistributed)
-                {
-                    cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
-                    cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
-                }
+                MV mvp1 = bestME[1].mvp;
+                int mvpIdx1 = bestME[1].mvpIdx;
+                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+
+                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
 
                 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
-                checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
-                checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
+                checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
+                checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
 
                 if (cost < bidirCost)
                 {
@@ -1965,7 +2247,7 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
         }
 
         /* select best option and store into CU */
-        if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
+        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
         {
             cu.m_mergeFlag[m_puAbsPartIdx] = true;
             cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
@@ -1977,39 +2259,39 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
 
             totalmebits += merge.bits;
         }
-        else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
+        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
         {
             lastMode = 2;
 
             cu.m_mergeFlag[m_puAbsPartIdx] = false;
             cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
             cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
             cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
             cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
 
             cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
             cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
             cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
 
             totalmebits += bidirBits;
         }
-        else if (m_bestME[0].cost <= m_bestME[1].cost)
+        else if (bestME[0].cost <= bestME[1].cost)
         {
             lastMode = 0;
 
             cu.m_mergeFlag[m_puAbsPartIdx] = false;
             cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
+            cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
+            cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+            cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
 
             cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
             cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
 
-            totalmebits += m_bestME[0].bits;
+            totalmebits += bestME[0].bits;
         }
         else
         {
@@ -2017,19 +2299,19 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
 
             cu.m_mergeFlag[m_puAbsPartIdx] = false;
             cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
-            cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
+            cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
+            cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+            cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
 
             cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
             cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
 
-            totalmebits += m_bestME[1].bits;
+            totalmebits += bestME[1].bits;
         }
 
         prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(*predYuv, true, bChroma);
+        motionCompensation(*predYuv, true, bChromaSA8D);
     }
 
     interMode.sa8dBits += totalmebits;
@@ -2147,7 +2429,7 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
 
     // No residual coding : SKIP mode
 
-    cu.setSkipFlagSubParts(true);
+    cu.setPredModeSubParts(MODE_SKIP);
     cu.clearCbf();
     cu.setTUDepthSubParts(0, 0, depth);
 
@@ -2158,8 +2440,8 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
     interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
     // Chroma
     part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
-    interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
-    interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+    interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+    interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
 
     m_entropyCoder.load(m_rqt[depth].cur);
     m_entropyCoder.resetBits();
@@ -2212,8 +2494,8 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
     if (!cu.m_tqBypass[0])
     {
         uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
-        cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
-        cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+        cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+        cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
 
         /* Consider the RD cost of not signaling any residual */
         m_entropyCoder.load(m_rqt[depth].cur);
@@ -2247,7 +2529,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
     uint32_t coeffBits, bits;
     if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
     {
-        cu.setSkipFlagSubParts(true);
+        cu.setPredModeSubParts(MODE_SKIP);
 
         /* Merge/Skip */
         m_entropyCoder.resetBits();
@@ -2270,7 +2552,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
         uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
 
         bool bCodeDQP = m_slice->m_pps->bUseDQP;
-        m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
+        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
         bits = m_entropyCoder.getNumberOfWrittenBits();
 
         coeffBits = bits - mvBits;
@@ -2285,8 +2567,8 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
 
     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
     uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
-    bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
-    bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+    bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+    bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
     if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
 
@@ -2297,41 +2579,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
     updateModeCost(interMode);
 }
 
-void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
-{
-    CUData& cu = mode.cu;
-
-    m_quant.setQPforQuant(mode.cu);
-
-    if (cu.m_predMode[0] == MODE_INTER)
-    {
-        uint32_t tuDepthRange[2];
-        cu.getInterTUQtDepthRange(tuDepthRange, 0);
-
-        residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
-        if (cu.getQtRootCbf(0))
-            mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
-        else
-        {
-            mode.reconYuv.copyFromYuv(mode.predYuv);
-            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
-                cu.setSkipFlagSubParts(true);
-        }
-    }
-    else if (cu.m_predMode[0] == MODE_INTRA)
-    {
-        uint32_t tuDepthRange[2];
-        cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
-        uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
-        residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
-        getBestIntraModeChroma(mode, cuGeom);
-        residualQTIntraChroma(mode, cuGeom, 0, 0);
-        mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO: 
-    }
-}
-
-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
@@ -2340,7 +2588,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
     uint32_t tuDepth = depth - cu.m_cuDepth[0];
 
     bool bCheckFull = log2TrSize <= depthRange[1];
-    if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+    if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
         bCheckFull = false;
 
     if (bCheckFull)
@@ -2349,13 +2597,12 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
         bool bCodeChroma = true;
         uint32_t tuDepthC = tuDepth;
-        if (log2TrSizeC == 1)
+        if (log2TrSizeC < 2)
         {
-            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
-            log2TrSizeC++;
+            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+            log2TrSizeC = 2;
             tuDepthC--;
-            uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-            bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+            bCodeChroma = !(absPartIdx & 3);
         }
 
         uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
@@ -2372,10 +2619,10 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
         ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
         const Yuv* fencYuv = mode.fencYuv;
 
-        int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
+        int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
         uint32_t strideResiY = resiYuv.m_size;
 
-        pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
         uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
 
         if (numSigY)
@@ -2409,7 +2656,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
                 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
 
                 int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
-                pixel*   fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
+                const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
                 uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
                 if (numSigU)
                 {
@@ -2423,7 +2670,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
                 }
 
                 int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
-                pixel*   fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
+                const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
                 uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
                 if (numSigV)
                 {
@@ -2449,16 +2696,16 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
     {
         X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
 
-        const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
-        for (uint32_t i = 0; i < 4; i++)
+        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
-            ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
-            ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
-            vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+            residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
+            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
+        for (uint32_t i = 0; i < 4 * qNumParts; i++)
         {
             cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
             cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
@@ -2467,15 +2714,26 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
     }
 }
 
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
+uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
+{
+    uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
+
+    if (m_rdCost.m_psyRd)
+        return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
+    else
+        return m_rdCost.calcRdCost(dist, nullBits);
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
     uint32_t log2TrSize = g_maxLog2CUSize - depth;
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
+    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
 
-    if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
+    if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
         bCheckFull = false;
 
     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
@@ -2485,12 +2743,12 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
     bool bCodeChroma = true;
     uint32_t tuDepthC = tuDepth;
-    if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+    if (log2TrSizeC < 2)
     {
-        log2TrSizeC++;
+        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        log2TrSizeC = 2;
         tuDepthC--;
-        uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-        bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+        bCodeChroma = !(absPartIdx & 3);
     }
 
     // code full block
@@ -2499,9 +2757,9 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
 
     uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
     uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
-    uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
 
@@ -2532,57 +2790,25 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         if (m_bEnableRDOQ)
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
-        pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
-        int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
+        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
+        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
         numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
 
         m_entropyCoder.resetBits();
-        m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
-        if (cbfFlag[TEXT_LUMA][0])
-            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
-        singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
-
-        uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
-
-        if (bCodeChroma)
-        {
-            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
-            {
-                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
-                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
-
-                do
-                {
-                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
-                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-
-                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
-                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
-                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
-
-                    fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
-                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
-                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
-
-                    m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
-                    if (cbfFlag[chromaId][tuIterator.section])
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-
-                    uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
-                    singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+        if (bSplitPresentFlag && log2TrSize > depthRange[0])
+            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
+        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
 
-                    singleBitsPrev = newBits;
-                }
-                while (tuIterator.isNextSection());
-            }
-        }
+        // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
+        // So it is valid if we encode coefficients and then cbfs at least for analysis.
+//        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
+        if (cbfFlag[TEXT_LUMA][0])
+            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
 
-        const uint32_t numCoeffY = 1 << (log2TrSize * 2);
-        const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
+        uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
+        singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
 
         X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
         uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
@@ -2590,156 +2816,168 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         if (m_rdCost.m_psyRd)
             psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
 
-        int16_t *curResiY    = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
+        int16_t* curResiY    = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
         uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
 
         if (cbfFlag[TEXT_LUMA][0])
         {
             m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
 
+            // non-zero cost calculation for luma - This is an approximation
+            // finally we have to encode correct cbf after comparing with null cost
             const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
-            uint32_t nonZeroPsyEnergyY = 0;
+            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+            uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
             if (m_rdCost.m_psyRd)
+            {
                 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+            }
+            else
+                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
 
             if (cu.m_tqBypass[0])
             {
-                distY = nonZeroDistY;
-                psyEnergyY = nonZeroPsyEnergyY;
+                singleDist[TEXT_LUMA][0] = nonZeroDistY;
+                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
             }
             else
             {
-                uint64_t singleCostY = 0;
-                if (m_rdCost.m_psyRd)
-                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
-                else
-                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
-                m_entropyCoder.resetBits();
-                m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
-                const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
-                uint64_t nullCostY = 0;
-                if (m_rdCost.m_psyRd)
-                    nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
-                else
-                    nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
+                // zero-cost calculation for luma. This is an approximation
+                // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
+                // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
+                uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+
                 if (nullCostY < singleCostY)
                 {
                     cbfFlag[TEXT_LUMA][0] = 0;
+                    singleBits[TEXT_LUMA][0] = 0;
+                    primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
 #if CHECKED_BUILD || _DEBUG
+                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
                     memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
 #endif
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = nullCostY;
+                    singleDist[TEXT_LUMA][0] = distY;
+                    singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
                 }
                 else
                 {
-                    distY = nonZeroDistY;
-                    psyEnergyY = nonZeroPsyEnergyY;
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = singleCostY;
+                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
+                    singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                 }
             }
         }
-        else if (checkTransformSkipY)
+        else
         {
-            m_entropyCoder.resetBits();
-            m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
-            const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
-            if (m_rdCost.m_psyRd)
-                minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
-            else
-                minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
+            if (checkTransformSkipY)
+                minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+            primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
+            singleDist[TEXT_LUMA][0] = distY;
+            singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
         }
 
-        singleDistComp[TEXT_LUMA][0] = distY;
-        singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
-        if (!cbfFlag[TEXT_LUMA][0])
-            primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
         cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
 
         if (bCodeChroma)
         {
-            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
             {
                 uint32_t distC = 0, psyEnergyC = 0;
                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
                 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
 
-            do
-            {
-                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
-                uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+                do
+                {
+                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
 
-                int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
-                distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 
-                if (cbfFlag[chromaId][tuIterator.section])
-                {
-                    m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
-                                            log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
-                    uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-                    const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
-                    uint32_t nonZeroPsyEnergyC = 0;
-                    if (m_rdCost.m_psyRd)
-                        nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-
-                    if (cu.m_tqBypass[0])
-                    {
-                        distC = nonZeroDistC;
-                        psyEnergyC = nonZeroPsyEnergyC;
-                    }
-                    else
+                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
+                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+
+                    //Coding cbf flags has been removed from here
+//                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
+                    if (cbfFlag[chromaId][tuIterator.section])
+                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+                    uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
+                    singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+                    singleBitsPrev = newBits;
+
+                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+                    distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+
+                    if (cbfFlag[chromaId][tuIterator.section])
                     {
-                        uint64_t singleCostC = 0;
+                        m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
+                                                log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
+
+                        // non-zero cost calculation for luma, same as luma - This is an approximation
+                        // finally we have to encode correct cbf after comparing with null cost
+                        uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+                        uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+                        uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
                         if (m_rdCost.m_psyRd)
-                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                        {
+                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                        }
                         else
-                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
-                        m_entropyCoder.resetBits();
-                        m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
-                        const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
-                        uint64_t nullCostC = 0;
-                        if (m_rdCost.m_psyRd)
-                            nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
+                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+
+                        if (cu.m_tqBypass[0])
+                        {
+                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+                            singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+                        }
                         else
-                            nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
-                        if (nullCostC < singleCostC)
                         {
-                            cbfFlag[chromaId][tuIterator.section] = 0;
+                            //zero-cost calculation for chroma. This is an approximation
+                            uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
+
+                            if (nullCostC < singleCostC)
+                            {
+                                cbfFlag[chromaId][tuIterator.section] = 0;
+                                singleBits[chromaId][tuIterator.section] = 0;
+                                primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
 #if CHECKED_BUILD || _DEBUG
+                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                                 memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
 #endif
                                 if (checkTransformSkipC)
                                     minCost[chromaId][tuIterator.section] = nullCostC;
+                                singleDist[chromaId][tuIterator.section] = distC;
+                                singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                             }
                             else
                             {
-                                distC = nonZeroDistC;
-                                psyEnergyC = nonZeroPsyEnergyC;
                                 if (checkTransformSkipC)
                                     minCost[chromaId][tuIterator.section] = singleCostC;
+                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+                                singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                             }
                         }
                     }
-                    else if (checkTransformSkipC)
+                    else
                     {
-                        m_entropyCoder.resetBits();
-                        m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
-                        const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
-                        if (m_rdCost.m_psyRd)
-                            minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
-                        else
-                            minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
-                    }
-
-                    singleDistComp[chromaId][tuIterator.section] = distC;
-                    singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
-
-                    if (!cbfFlag[chromaId][tuIterator.section])
+                        if (checkTransformSkipC)
+                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
                         primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
+                        singleDist[chromaId][tuIterator.section] = distC;
+                        singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
+                    }
 
                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                 }
@@ -2763,14 +3001,14 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
             if (m_bEnableRDOQ)
                 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
-            fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+            fenc = fencYuv->getLumaAddr(absPartIdx);
             resi = resiYuv.getLumaAddr(absPartIdx);
             uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
 
             if (numSigTSkipY)
             {
                 m_entropyCoder.resetBits();
-                m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
+                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
                 m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
                 const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
 
@@ -2791,12 +3029,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
             else
             {
-                singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
-                singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+                singleDist[TEXT_LUMA][0] = nonZeroDistY;
+                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
+                uint32_t numCoeffY = 1 << (log2TrSize << 1);
                 memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
-                primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
+                primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
             }
 
             cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
@@ -2821,7 +3060,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                     uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
                     uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
 
-                    int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
 
                     ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
                     ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
@@ -2831,42 +3070,43 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                     if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
                         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 
-                    fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
+                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
                     resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
                     uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
 
                     m_entropyCoder.resetBits();
-                    singleBitsComp[chromaId][tuIterator.section] = 0;
+                    singleBits[chromaId][tuIterator.section] = 0;
 
                     if (numSigTSkipC)
                     {
-                        m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
+                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
                         m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-                        singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
+                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
 
                         m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
                         uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
-                        nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
+                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
                         if (m_rdCost.m_psyRd)
                         {
                             nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
-                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
                         }
                         else
-                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
+                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
                     }
 
                     if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
                         cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                     else
                     {
-                        singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
-                        singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+                        singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                         cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
                         bestTransformMode[chromaId][tuIterator.section] = 1;
+                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                         memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
-                        primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
+                        primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
                     }
 
                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
@@ -2875,66 +3115,55 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
             }
         }
 
+        // Here we were encoding cbfs and coefficients, after calculating distortion above.
+        // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
+        // bits required for coefficients and added with number of cbf bits. As I tested the order does not
+        // make any difference. But bit confused whether I should load the original context as below.
         m_entropyCoder.load(m_rqt[depth].rqtRoot);
-
         m_entropyCoder.resetBits();
 
-        if (log2TrSize > depthRange[0])
-            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
-
+        //Encode cbf flags
         if (bCodeChroma)
         {
             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
             {
                 if (!splitIntoSubTUs)
-                    m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
+                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
                 else
                 {
                     offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
-                    for (uint32_t subTU = 0; subTU < 2; subTU++)
-                        m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
+                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
+                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
                 }
             }
         }
 
-        m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
-        if (cbfFlag[TEXT_LUMA][0])
-            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
 
-        if (bCodeChroma)
-        {
-            uint32_t subTUSize = 1 << (log2TrSizeC * 2);
-            uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
-            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
 
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
-            {
-                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
-                if (!splitIntoSubTUs)
-                {
-                    if (cbfFlag[chromaId][0])
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
-                }
-                else
-                {
-                    for (uint32_t subTU = 0; subTU < 2; subTU++)
-                    {
-                        if (cbfFlag[chromaId][subTU])
-                            m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
-                    }
-                }
-            }
+        uint32_t coeffBits = 0;
+        coeffBits = singleBits[TEXT_LUMA][0];
+        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
+        {
+            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
+            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
         }
 
-        fullCost.distortion += singleDistComp[TEXT_LUMA][0];
-        fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
+        // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
+        // In case of chroma, if any one of the splitted block's cbf is 1, then we need to encode cbf 1, and then for
+        // four splitted block's individual cbf value. This is not known before analysis of four splitted blocks.
+        // For that reason, I am collecting individual coefficient bits only.
+        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+
+        fullCost.distortion += singleDist[TEXT_LUMA][0];
+        fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
         {
-            fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
-            fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
+            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
+            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
         }
 
-        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
         if (m_rdCost.m_psyRd)
             fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
         else
@@ -2951,31 +3180,40 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         }
 
         Cost splitCost;
-        const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+        {
+            // Subdiv flag can be encoded at the start of anlysis of splitted blocks.
+            m_entropyCoder.resetBits();
+            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+        }
+
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
-        for (uint32_t i = 0; i < 4; ++i)
+        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
-            ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA,     tuDepth + 1);
-            ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
-            vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+            estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
+            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
+            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
+        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
         {
             cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
             cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
             cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
         }
 
+        // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
+        // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
+        // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
+        // at depth 0 (for example).
         m_entropyCoder.load(m_rqt[depth].rqtRoot);
         m_entropyCoder.resetBits();
 
-        encodeResidualQT(cu, absPartIdx, depth, true,  TEXT_LUMA, depthRange);
-        encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
-        encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
-        encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
-
-        splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+        codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
+        uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+        splitCost.bits += splitCbfBits;
 
         if (m_rdCost.m_psyRd)
             splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
@@ -2999,15 +3237,18 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
         if (bCodeChroma)
         {
-            const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
-
-            uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
-            for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
+            if (!splitIntoSubTUs)
             {
-                const uint32_t  subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
-
-                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
-                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
+                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
+                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
+            }
+            else
+            {
+                uint32_t tuNumParts = absPartIdxStep >> 1;
+                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx             , tuNumParts);
+                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx             , tuNumParts);
+                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
             }
         }
         X265_CHECK(bCheckFull, "check-full must be set\n");
@@ -3019,23 +3260,21 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
 
     if (bCodeChroma)
     {
-        uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
-        uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
-
-        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+        if (!splitIntoSubTUs)
         {
-            for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
-            {
-                const uint32_t  subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
+            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
+            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
+        }
+        else
+        {
+            uint32_t tuNumParts = absPartIdxStep >> 1;
 
-                if (splitIntoSubTUs)
-                {
-                    uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
-                    cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
-                }
-                else
-                    cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
-            }
+            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
+            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
+            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx             , tuNumParts);
+            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx             , tuNumParts);
+            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
         }
     }
 
@@ -3045,51 +3284,65 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
     outCosts.energy     += fullCost.energy;
 }
 
-void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
+void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
 {
     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
-    X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
+    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
 
-    const uint32_t curTuDepth  = depth - cu.m_cuDepth[0];
-    const uint32_t tuDepth     = cu.m_tuDepth[absPartIdx];
-    const bool     bSubdiv     = curTuDepth != tuDepth;
+    const uint32_t tuDepth     = depth - cu.m_cuDepth[0];
+    const bool     bSubdiv     = tuDepth != cu.m_tuDepth[absPartIdx];
     const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
 
-    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-    
-    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+    if (!(log2TrSize - m_hChromaShift < 2))
+    {
+        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
+        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
+    }
+    else
+    {
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
+    }
 
-    if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
-        m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
+    if (!bSubdiv)
+    {
+        m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
+    }
+    else
+    {
+        uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+            codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
+    }
+}
 
-    bool mCodeAll = true;
-    uint32_t trWidthC  = 1 << log2TrSizeC;
-    uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
+void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
+{
+    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
+    X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");
 
-    const uint32_t numPels = trWidthC * trHeightC;
-    if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
-        mCodeAll = false;
+    const uint32_t curTuDepth  = depth - cu.m_cuDepth[0];
+    const uint32_t tuDepth     = cu.m_tuDepth[absPartIdx];
+    const bool     bSubdiv     = curTuDepth != tuDepth;
+    const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
 
-    if (bSubdivAndCbf)
+    if (bSubdiv)
     {
-        const bool bFirstCbfOfCU = curTuDepth == 0;
-        if (bFirstCbfOfCU || mCodeAll)
-        {
-            uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +  curTuDepth) << 1);
-            if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
-                m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
-            if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
-                m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
-        }
-        else
+        if (cu.getCbf(absPartIdx, ttype, curTuDepth))
         {
-            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
-            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
+            uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+            for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+                encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
         }
+        return;
     }
-
-    if (!bSubdiv)
+    else
     {
+        const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+
         // Luma
         const uint32_t qtLayer = log2TrSize - 2;
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
@@ -3098,65 +3351,51 @@ void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t de
         // Chroma
         bool bCodeChroma = true;
         uint32_t tuDepthC = tuDepth;
-        if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+        if (log2TrSize == 2 && m_csp != X265_CSP_I444)
         {
+            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
             log2TrSizeC++;
             tuDepthC--;
-            uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-            bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+            bCodeChroma = !(absPartIdx & 3);
         }
 
-        if (bSubdivAndCbf)
-            m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
-        else
+        if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
+            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+
+        if (bCodeChroma)
         {
-            if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
-                m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+            coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
+            coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
 
-            if (bCodeChroma)
+            if (!splitIntoSubTUs)
             {
-                uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-                coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
-                coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
-
-                if (!splitIntoSubTUs)
+                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+                    m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
+                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
+                    m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+            }
+            else
+            {
+                uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+                uint32_t subTUSize = 1 << (log2TrSizeC * 2);
+                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
                 {
-                    if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
                         m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
-                    if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
                 }
-                else
+                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
                 {
-                    uint32_t partIdxesPerSubTU  = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
-                    uint32_t subTUSize = 1 << (log2TrSizeC * 2);
-                    if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
-                    {
-                        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
-                            m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
-                        if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
-                            m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
-                    }
-                    if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
-                    {
-                        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
-                            m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
-                        if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
-                            m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
-                    }
+                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
+                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
                 }
             }
         }
     }
-    else
-    {
-        if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
-        {
-            const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
-            for (uint32_t i = 0; i < 4; ++i)
-                encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
-        }
-    }
 }
 
 void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
@@ -3164,28 +3403,27 @@ void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartI
     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
     const uint32_t curTrMode = depth - cu.m_cuDepth[0];
     const uint32_t tuDepth   = cu.m_tuDepth[absPartIdx];
+    const uint32_t log2TrSize = g_maxLog2CUSize - depth;
 
     if (curTrMode < tuDepth)
     {
-        uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
-        for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
+        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
             saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
         return;
     }
 
-    const uint32_t log2TrSize = g_maxLog2CUSize - depth;
     const uint32_t qtLayer = log2TrSize - 2;
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
     bool bCodeChroma = true;
     uint32_t tuDepthC = tuDepth;
-    if (log2TrSizeC == 1)
+    if (log2TrSizeC < 2)
     {
-        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
-        log2TrSizeC++;
+        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        log2TrSizeC = 2;
         tuDepthC--;
-        uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
-        bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+        bCodeChroma = !(absPartIdx & 3);
     }
 
     m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
diff --git a/source/encoder/search.h b/source/encoder/search.h
index 79ed94a..a59fbf3 100644
--- a/source/encoder/search.h
+++ b/source/encoder/search.h
@@ -35,9 +35,6 @@
 #include "entropy.h"
 #include "motion.h"
 
-#define MVP_IDX_BITS 1
-#define NUM_LAYERS 4
-
 namespace x265 {
 // private namespace
 
@@ -68,6 +65,64 @@ struct RQTData
     Yuv      bidirPredYuv[2];
 };
 
+struct MotionData
+{
+    MV       mv;
+    MV       mvp;
+    int      mvpIdx;
+    int      ref;
+    uint32_t cost;
+    int      bits;
+    bool     costZero;
+};
+
+struct Mode
+{
+    CUData     cu;
+    const Yuv* fencYuv;
+    Yuv        predYuv;
+    Yuv        reconYuv;
+    Entropy    contexts;
+
+    enum { MAX_INTER_PARTS = 2 };
+
+    MotionData bestME[MAX_INTER_PARTS][2];
+    MV         amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
+
+    uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
+    uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
+    uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
+    uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
+    uint32_t   distortion; // sum of partition SSE distortion
+    uint32_t   totalBits;  // sum of partition bits (mv + coeff)
+    uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
+    uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
+
+    void initCosts()
+    {
+        rdCost = 0;
+        sa8dCost = 0;
+        sa8dBits = 0;
+        psyEnergy = 0;
+        distortion = 0;
+        totalBits = 0;
+        mvBits = 0;
+        coeffBits = 0;
+    }
+
+    void addSubCosts(const Mode& subMode)
+    {
+        rdCost += subMode.rdCost;
+        sa8dCost += subMode.sa8dCost;
+        sa8dBits += subMode.sa8dBits;
+        psyEnergy += subMode.psyEnergy;
+        distortion += subMode.distortion;
+        totalBits += subMode.totalBits;
+        mvBits += subMode.mvBits;
+        coeffBits += subMode.coeffBits;
+    }
+};
+
 inline int getTUBits(int idx, int numIdx)
 {
     return idx + (idx < numIdx - 1);
@@ -98,58 +153,6 @@ public:
     uint32_t        m_numLayers;
     uint32_t        m_refLagPixels;
 
-    struct Mode
-    {
-        CUData     cu;
-        const Yuv* fencYuv;
-        Yuv        predYuv;
-        Yuv        reconYuv;
-        Entropy    contexts;
-
-        uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
-        uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
-        uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
-        uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
-        uint32_t   distortion; // sum of partition SSE distortion
-        uint32_t   totalBits;  // sum of partition bits (mv + coeff)
-        uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
-        uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
-
-        void initCosts()
-        {
-            rdCost = 0;
-            sa8dCost = 0;
-            sa8dBits = 0;
-            psyEnergy = 0;
-            distortion = 0;
-            totalBits = 0;
-            mvBits = 0;
-            coeffBits = 0;
-        }
-
-        void addSubCosts(const Mode& subMode)
-        {
-            rdCost += subMode.rdCost;
-            sa8dCost += subMode.sa8dCost;
-            sa8dBits += subMode.sa8dBits;
-            psyEnergy += subMode.psyEnergy;
-            distortion += subMode.distortion;
-            totalBits += subMode.totalBits;
-            mvBits += subMode.mvBits;
-            coeffBits += subMode.coeffBits;
-        }
-    };
-
-    struct MotionData
-    {
-        MV  mv;
-        MV  mvp;
-        int mvpIdx;
-        int ref;
-        uint32_t cost;
-        int bits;
-    };
-
     Search();
     ~Search();
 
@@ -162,6 +165,11 @@ public:
     // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
     void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes);
 
+    // select best intra mode using only sa8d costs, cannot measure NxN intra
+    void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+    // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode
+    void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+
     // estimation inter prediction (non-skip)
     bool     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma);
 
@@ -169,38 +177,41 @@ public:
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
     void     encodeResAndCalcRdSkipCU(Mode& interMode);
 
-    void     generateCoeffRecon(Mode& mode, const CUGeom& cuGeom);
-    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]);
+    // encode residual without rd-cost
+    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]);
+    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]);
+    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx);
 
-    uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
+    // pick be chroma mode from available using just sa8d costs
+    void     getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
 
 protected:
 
     /* motion estimation distribution */
     ThreadLocalData* m_tld;
-    CUData*       m_curMECu;
+    Mode*         m_curInterMode;
     const CUGeom* m_curGeom;
     int           m_curPart;
-    MotionData    m_bestME[2];
     uint32_t      m_listSelBits[3];
     int           m_totalNumME;
     volatile int  m_numAcquiredME;
     volatile int  m_numCompletedME;
     Event         m_meCompletionEvent;
-    Lock          m_outputLock;
+    Lock          m_meLock;
     bool          m_bJobsQueued;
-    void     singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref);
+    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
 
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth);
 
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
-    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes);
+    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
 
     // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
     uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
 
-    void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx,  uint32_t absPartIdxStep, uint32_t width, uint32_t height);
-    void     codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype);
+    void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
+    void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]);
+    void     codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
 
     struct Cost
     {
@@ -211,24 +222,24 @@ protected:
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
-    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]);
+    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
 
-    void     encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]);
+    // estimate bit cost of residual QT
+    void     encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, TextType ttype, const uint32_t depthRange[2]);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
-    void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, uint32_t depthRange[2]);
-    void     codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& costs);
-    void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx);
+    void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
+    void     codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
+    void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
 
     // generate chroma prediction, generate residual and recon
-    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
-    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
-    void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad);
-
-    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]);
-    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx);
+    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
+    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+    void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
-    void     offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx);
+    // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
+    void     offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
 
     struct MergeData
     {
@@ -258,7 +269,9 @@ protected:
     /* intra helper functions */
     enum { MAX_RD_INTRA_MODES = 16 };
     static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
-    void     getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
+
+    // get most probable luma modes for CU part, and bit cost of all non mpm modes
+    uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
 
     void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
 };
diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp
index cc70c20..a792760 100644
--- a/source/encoder/slicetype.cpp
+++ b/source/encoder/slicetype.cpp
@@ -111,7 +111,7 @@ void Lookahead::destroy()
 /* Called by API thread */
 void Lookahead::addPicture(Frame *curFrame, int sliceType)
 {
-    PicYuv *orig = curFrame->m_origPicYuv;
+    PicYuv *orig = curFrame->m_fencPic;
 
     curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
 
@@ -192,7 +192,7 @@ Frame* Lookahead::getDecidedPicture()
 /* Called by pool worker threads */
 bool Lookahead::findJob(int)
 {
-    if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
+    if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0)
     {
         m_inputQueueLock.acquire();
         slicetypeDecide();
@@ -290,6 +290,8 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame)
 /* called by API thread or worker thread with inputQueueLock acquired */
 void Lookahead::slicetypeDecide()
 {
+    ProfileScopeEvent(slicetypeDecideEV);
+
     ScopedLock lock(m_decideLock);
 
     Lowres *frames[X265_LOOKAHEAD_MAX];
@@ -417,7 +419,6 @@ void Lookahead::slicetypeDecide()
         list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
         brefs++;
     }
-
     /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
     if (m_param->rc.rateControlMode != X265_RC_CQP)
     {
@@ -524,14 +525,12 @@ void Lookahead::slicetypeDecide()
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
 {
     int prevNonB = 0, curNonB = 1, idx = 0;
-    bool isNextNonB = false;
-
     while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
         curNonB++;
-
     int nextNonB = keyframe ? prevNonB : curNonB;
-    int nextB = keyframe ? prevNonB + 1 : curNonB + 1;
-
+    int nextB = prevNonB + 1;
+    int nextBRef = 0;
+    int miniGopEnd = keyframe ? prevNonB : curNonB;
     while (curNonB < numFrames + !keyframe)
     {
         /* P/I cost: This shouldn't include the cost of nextNonB */
@@ -540,38 +539,53 @@ void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
             int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
             frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
             frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+            /* Save the nextNonB Cost in each B frame of the current miniGop */
+            if (curNonB > miniGopEnd)
+            {
+                for (int j = nextB; j < miniGopEnd; j++)
+                {
+                    frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
+                    frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
+                
+                }
+            }
             idx++;
         }
         /* Handle the B-frames: coded order */
-        for (int i = prevNonB + 1; i < curNonB; i++, idx++)
-        {
-            frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i);
-            frames[nextNonB]->plannedType[idx] = X265_TYPE_B;
-        }
+        if (m_param->bBPyramid && curNonB - prevNonB > 1)
+            nextBRef = (prevNonB + curNonB + 1) / 2;
 
-        for (int i = nextB; i <= curNonB; i++)
+        for (int i = prevNonB + 1; i < curNonB; i++, idx++)
         {
-            for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++)
+            int64_t satdCost = 0; int type = X265_TYPE_B;
+            if (nextBRef)
             {
-                if (j == curNonB)
+                if (i == nextBRef)
                 {
-                    if (isNextNonB)
-                    {
-                        int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
-                        frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB);
-                        frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType;
-                    }
+                    satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
+                    type = X265_TYPE_BREF;
                 }
+                else if (i < nextBRef)
+                    satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i);
                 else
-                {
-                    frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j);
-                    frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B;
-                }
+                    satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
             }
-            if (i == curNonB && !isNextNonB)
-                isNextNonB = true;
-        }
+            else
+                satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
+            frames[nextNonB]->plannedSatd[idx] = satdCost;
+            frames[nextNonB]->plannedType[idx] = type;
+            /* Save the nextB Cost in each B frame of the current miniGop */
 
+            for (int j = nextB; j < miniGopEnd; j++)
+            {
+                if (nextBRef && i == nextBRef)
+                    break;
+                if (j >= i && j !=nextBRef)
+                    continue;
+                frames[j]->plannedSatd[frames[j]->indB] = satdCost;
+                frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
+            }
+        }
         prevNonB = curNonB;
         curNonB++;
         while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
@@ -1238,7 +1252,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame)
 
     if (m_param->bEnableWeightedPred)
     {
-        PicYuv *orig = curFrame->m_origPicYuv;
+        PicYuv *orig = curFrame->m_fencPic;
         m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
         intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
 
@@ -1249,7 +1263,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame)
             m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
         }
 
-        m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0];
+        m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
         m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
         m_weightedRef.isLowres = true;
         m_weightedRef.isWeighted = false;
@@ -1290,7 +1304,6 @@ int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b,
         for (int i = 0; i < m_heightInCU; i++)
         {
             m_rows[i].init();
-            m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
             if (!fenc->bIntraCalculated)
                 fenc->rowSatds[0][0][i] = 0;
             fenc->rowSatds[b - p0][p1 - b][i] = 0;
@@ -1351,7 +1364,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
 {
     Lowres *fenc = frames[b];
     Lowres *ref  = frames[p0];
-    pixel *src = ref->fpelPlane;
+    pixel *src = ref->fpelPlane[0];
     intptr_t stride = fenc->lumaStride;
 
     if (wp)
@@ -1365,7 +1378,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
 
         primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
                              scale, round << correction, denom + correction, offset);
-        src = m_weightedRef.fpelPlane;
+        src = m_weightedRef.fpelPlane[0];
     }
 
     uint32_t cost = 0;
@@ -1376,7 +1389,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
     {
         for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
         {
-            int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride);
+            int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
             cost += X265_MIN(satd, fenc->intraCost[mb]);
         }
     }
@@ -1469,6 +1482,8 @@ void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
 
 void CostEstimate::processRow(int row, int /*threadId*/)
 {
+    ProfileScopeEvent(costEstimateRow);
+
     int realrow = m_heightInCU - 1 - row;
     Lowres **frames = m_curframes;
     ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
@@ -1531,7 +1546,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
     const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
                                 cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
 
-    m_me.setSourcePU(pelOffset, cuSize, cuSize);
+    m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;
@@ -1592,12 +1607,13 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
         }
         if (bBidir)
         {
-            pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+            ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+            ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
             intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
             pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
             pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
 
-            pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+            ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
             primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
             int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
             COPY2_IF_LT(bcost, bicost, listused, 3);
@@ -1626,9 +1642,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
 
         // Copy Left
         for (int i = 0; i < cuSize + 1; i++)
-        {
             left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
-        }
 
         for (int i = 0; i < cuSize; i++)
         {
@@ -1652,22 +1666,22 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
 
         // generate 35 intra predictions into m_predictions
         pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
-        int icost = m_me.COST_MAX, cost;
+        int icost = m_me.COST_MAX;
         primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
-        cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+        int cost = m_me.bufSATD(m_predictions, cuSize);
         if (cost < icost)
             icost = cost;
         pixel *above = (cuSize >= 8) ? above1 : above0;
         pixel *left  = (cuSize >= 8) ? left1 : left0;
         primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
-        cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+        cost = m_me.bufSATD(m_predictions, cuSize);
         if (cost < icost)
             icost = cost;
         primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
 
         // calculate satd costs, keep least cost
         ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
-        primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
+        primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
 
         int acost = m_me.COST_MAX;
         uint32_t mode, lowmode = 4;
@@ -1676,7 +1690,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
             if (mode < 18)
                 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
             else
-                cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
             COPY2_IF_LT(acost, cost, lowmode, mode);
         }
         for (uint32_t dist = 2; dist >= 1; dist--)
@@ -1685,14 +1699,14 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
             if (mode < 18)
                 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
             else
-                cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
             COPY2_IF_LT(acost, cost, lowmode, mode);
 
             mode = lowmode + dist;
             if (mode < 18)
                 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
             else
-                cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
             COPY2_IF_LT(acost, cost, lowmode, mode);
         }
         if (acost < icost)
@@ -1701,6 +1715,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
         const int intraPenalty = 5 * m_lookAheadLambda;
         icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
         fenc->intraCost[cuXY] = icost;
+        fenc->intraMode[cuXY] = (uint8_t)lowmode;
         int icostAq = icost;
         if (bFrameScoreCU)
         {
diff --git a/source/encoder/slicetype.h b/source/encoder/slicetype.h
index 8805e90..2b6e265 100644
--- a/source/encoder/slicetype.h
+++ b/source/encoder/slicetype.h
@@ -73,8 +73,7 @@ public:
     EstimateRow()
     {
         m_me.setQP(X265_LOOKAHEAD_QP);
-        m_me.setSearchMethod(X265_HEX_SEARCH);
-        m_me.setSubpelRefine(1);
+        m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
         m_predictions = X265_MALLOC(pixel, 35 * 8 * 8);
         m_merange = 16;
         m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp
index 3bf5a45..cd6f4f7 100644
--- a/source/encoder/weightPrediction.cpp
+++ b/source/encoder/weightPrediction.cpp
@@ -219,7 +219,7 @@ namespace x265 {
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
 {
     WeightParam wp[2][MAX_NUM_REF][3];
-    PicYuv *fencPic = frame.m_origPicYuv;
+    PicYuv *fencPic = frame.m_fencPic;
     Lowres& fenc    = frame.m_lowres;
 
     Cache cache;
@@ -329,7 +329,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
                     if (!refFrame->m_bChromaExtended)
                     {
                         refFrame->m_bChromaExtended = true;
-                        PicYuv *refPic = refFrame->m_origPicYuv;
+                        PicYuv *refPic = refFrame->m_fencPic;
                         int width = refPic->m_picWidth >> cache.hshift;
                         int height = refPic->m_picHeight >> cache.vshift;
                         extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
@@ -363,7 +363,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
             case 1:
                 orig = fencPic->m_picOrg[1];
                 stride = fencPic->m_strideC;
-                fref = refFrame->m_origPicYuv->m_picOrg[1];
+                fref = refFrame->m_fencPic->m_picOrg[1];
 
                 /* Clamp the chroma dimensions to the nearest multiple of
                  * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
@@ -381,7 +381,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
                 break;
 
             case 2:
-                fref = refFrame->m_origPicYuv->m_picOrg[2];
+                fref = refFrame->m_fencPic->m_picOrg[2];
                 orig = fencPic->m_picOrg[2];
                 stride = fencPic->m_strideC;
                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt
index ff3312f..ff1e141 100644
--- a/source/test/CMakeLists.txt
+++ b/source/test/CMakeLists.txt
@@ -23,6 +23,3 @@ add_executable(TestBench ${YASM_SRC}
     ipfilterharness.cpp ipfilterharness.h
     intrapredharness.cpp intrapredharness.h)
 target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
-
-add_executable(PoolTest testpool.cpp)
-target_link_libraries(PoolTest x265-static ${PLATFORM_LIBS})
diff --git a/source/test/intrapredharness.cpp b/source/test/intrapredharness.cpp
index 97eff94..38c8cf4 100644
--- a/source/test/intrapredharness.cpp
+++ b/source/test/intrapredharness.cpp
@@ -31,8 +31,6 @@ IntraPredHarness::IntraPredHarness()
 {
     for (int i = 0; i < INPUT_SIZE; i++)
         pixel_buff[i] = rand() % PIXEL_MAX;
-
-    initROM();
 }
 
 bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp
index f23e84b..1bcee04 100644
--- a/source/test/ipfilterharness.cpp
+++ b/source/test/ipfilterharness.cpp
@@ -586,9 +586,9 @@ bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const Encode
 
     for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
     {
-        if (opt.chroma_p2s[csp])
+        if (opt.chroma[csp].p2s)
         {
-            if (!check_IPFilter_primitive(ref.chroma_p2s[csp], opt.chroma_p2s[csp], 1, csp))
+            if (!check_IPFilter_primitive(ref.chroma[csp].p2s, opt.chroma[csp].p2s, 1, csp))
             {
                 printf("chroma_p2s[%s]", x265_source_csp_names[csp]);
                 return false;
@@ -725,10 +725,10 @@ void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPr
     for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
     {
         printf("= Color Space %s =\n", x265_source_csp_names[csp]);
-        if (opt.chroma_p2s[csp])
+        if (opt.chroma[csp].p2s)
         {
             printf("chroma_p2s\t");
-            REPORT_SPEEDUP(opt.chroma_p2s[csp], ref.chroma_p2s[csp],
+            REPORT_SPEEDUP(opt.chroma[csp].p2s, ref.chroma[csp].p2s,
                            pixel_buff, srcStride, IPF_vec_output_s, width, height);
         }
         for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index 88e4676..f9e16d6 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -65,17 +65,17 @@ MBDstHarness::MBDstHarness()
         short_test_buff[0][i]    = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
         int_test_buff[0][i]      = rand() % PIXEL_MAX;
         int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
-        int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX);
+        short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
 
         short_test_buff[1][i]    = -PIXEL_MAX;
         int_test_buff[1][i]      = -PIXEL_MAX;
         int_idct_test_buff[1][i] = SHORT_MIN;
-        int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX;
+        short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
 
         short_test_buff[2][i]    = PIXEL_MAX;
         int_test_buff[2][i]      = PIXEL_MAX;
         int_idct_test_buff[2][i] = SHORT_MAX;
-        int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX;
+        short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
 
         mbuf1[i] = rand() & PIXEL_MAX;
         mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
@@ -96,16 +96,16 @@ MBDstHarness::MBDstHarness()
 bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width)
 {
     int j = 0;
-    intptr_t cmp_size = sizeof(int) * width * width;
+    intptr_t cmp_size = sizeof(short) * width * width;
 
     for (int i = 0; i < ITERS; i++)
     {
         int index = rand() % TEST_CASES;
 
-        ref(short_test_buff[index] + j, mintbuf3, width);
-        checked(opt, short_test_buff[index] + j, mintbuf4, width);
+        ref(short_test_buff[index] + j, mshortbuf2, width);
+        checked(opt, short_test_buff[index] + j, mshortbuf3, width);
 
-        if (memcmp(mintbuf3, mintbuf4, cmp_size))
+        if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
             return false;
 
         reportfail();
@@ -124,8 +124,8 @@ bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, intptr_t width)
     {
         int index = rand() % TEST_CASES;
 
-        ref(int_idct_test_buff[index] + j, mshortbuf2, width);
-        checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width);
+        ref(short_test_buff[index] + j, mshortbuf2, width);
+        checked(opt, short_test_buff[index] + j, mshortbuf3, width);
 
         if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
             return false;
@@ -156,10 +156,10 @@ bool MBDstHarness::check_dequant_primitive(dequant_normal_t ref, dequant_normal_
         int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
         int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
 
-        ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift);
-        checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift);
+        ref(short_test_buff[index] + j, mshortbuf2, width * height, scale, shift);
+        checked(opt, short_test_buff[index] + j, mshortbuf3, width * height, scale, shift);
 
-        if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width))
+        if (memcmp(mshortbuf2, mshortbuf3, sizeof(int16_t) * height * width))
             return false;
 
         reportfail();
@@ -175,6 +175,10 @@ bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scalin
 
     for (int i = 0; i < ITERS; i++)
     {
+
+        memset(mshortbuf2, 0, MAX_TU_SIZE * sizeof(int16_t));
+        memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t));
+
         int log2TrSize = (rand() % 4) + 2;
 
         int width = (1 << log2TrSize);
@@ -185,13 +189,13 @@ bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scalin
         int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
         int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
 
-        int cmp_size = sizeof(int) * height * width;
+        int cmp_size = sizeof(int16_t) * height * width;
         int index1 = rand() % TEST_CASES;
 
-        ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
-        checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
+        ref(short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf2, width * height, per, shift);
+        checked(opt, short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf3, width * height, per, shift);
 
-        if (memcmp(mintbuf1, mintbuf2, cmp_size))
+        if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
             return false;
 
         reportfail();
@@ -222,8 +226,8 @@ bool MBDstHarness::check_quant_primitive(quant_t ref, quant_t opt)
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
 
-        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
-        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
+        refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
+        optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
 
         if (memcmp(mintbuf1, mintbuf3, cmp_size))
             return false;
@@ -261,8 +265,8 @@ bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
 
-        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
-        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
+        refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
+        optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
 
         if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
             return false;
@@ -324,6 +328,7 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op
         int log2TrSize = s + 2;
         int num = 1 << (log2TrSize * 2);
         int cmp_size = sizeof(int) * num;
+        int cmp_short = sizeof(short) * num;
 
         for (int i = 0; i < ITERS; i++)
         {
@@ -336,10 +341,10 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op
 
             int index = rand() % TEST_CASES;
 
-            ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
-            checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
+            ref(short_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
+            checked(opt, short_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
 
-            if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, cmp_size))
+            if (memcmp(short_denoise_test_buff1[index] + j, short_denoise_test_buff2[index] + j, cmp_short))
                 return false;
 
             if (memcmp(mubuf1, mubuf2, cmp_size))
@@ -454,7 +459,7 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
         if (opt.dct[value])
         {
             printf("%s\t", dctInfo[value].name);
-            REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width);
+            REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mshortbuf2, dctInfo[value].width);
         }
     }
 
@@ -463,32 +468,32 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
         if (opt.idct[value])
         {
             printf("%s\t", idctInfo[value].name);
-            REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width);
+            REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mshortbuf3, mshortbuf2, idctInfo[value].width);
         }
     }
 
     if (opt.dequant_normal)
     {
         printf("dequant_normal\t");
-        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1);
+        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
     }
 
     if (opt.dequant_scaling)
     {
         printf("dequant_scaling\t");
-        REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
+        REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mshortbuf2, 32 * 32, 5, 1);
     }
 
     if (opt.quant)
     {
         printf("quant\t\t");
-        REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
+        REPORT_SPEEDUP(opt.quant, ref.quant, short_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
     }
 
     if (opt.nquant)
     {
         printf("nquant\t\t");
-        REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
+        REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
     }
 
     if (opt.count_nonzero)
@@ -503,7 +508,7 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
     if (opt.denoiseDct)
     {
         printf("denoiseDct\t");
-        REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
+        REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, short_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
     }
 
 }
diff --git a/source/test/mbdstharness.h b/source/test/mbdstharness.h
index a8b4de2..284892a 100644
--- a/source/test/mbdstharness.h
+++ b/source/test/mbdstharness.h
@@ -60,8 +60,8 @@ protected:
     uint32_t mubuf2[MAX_TU_SIZE];
     uint16_t mushortbuf1[MAX_TU_SIZE];
 
-    int int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
-    int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
+    int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
+    int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
 
     bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
     bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index bb6e0e6..5c91bdc 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -344,7 +344,7 @@ bool PixelHarness::check_downscale_t(downscale_t ref, downscale_t opt)
     return true;
 }
 
-bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)
+bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -359,8 +359,8 @@ bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t op
         int shift = (rand() % 7 + 1);
 
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
-        ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
             return false;
@@ -372,60 +372,7 @@ bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t op
     return true;
 }
 
-bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt)
-{
-    ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
-    ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
-    int j = 0;
-    intptr_t stride = STRIDE;
-    for (int i = 0; i < ITERS; i++)
-    {
-        int shift = (rand() % 7 + 1);
-
-        int index = i % TEST_CASES;
-        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
-            return false;
-
-        reportfail();
-        j += INCR;
-    }
-
-    return true;
-}
-
-bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
-{
-    ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
-    ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
-
-    int j = 0;
-    intptr_t stride = STRIDE;
-    for (int i = 0; i < ITERS; i++)
-    {
-        int shift = (rand() % 7 + 1);
-
-        int index = i % TEST_CASES;
-        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
-            return false;
-
-        reportfail();
-        j += INCR;
-    }
-
-    return true;
-}
-
-bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
+bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -440,8 +387,8 @@ bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t op
         int shift = (rand() % 7 + 1);
 
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
-        ref(ref_dest, int_test_buff[index] + j, stride, shift);
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
             return false;
@@ -479,7 +426,7 @@ bool PixelHarness::check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt)
     return true;
 }
 
-bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
+bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -494,8 +441,8 @@ bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
         int shift = (rand() % 7 + 1);
 
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
-        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
             return false;
@@ -507,7 +454,7 @@ bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
     return true;
 }
 
-bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
+bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -1308,50 +1255,40 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
             }
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
         {
-            if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
+            if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i]))
             {
-                printf("cvt16to32_shr failed!\n");
+                printf("cpy2Dto1D_shl failed!\n");
                 return false;
             }
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
         {
-            if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
+            if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i]))
             {
-                printf("cvt32to16_shl failed!\n");
+                printf("cpy2Dto1D_shr failed!\n");
                 return false;
             }
         }
 
-        if ((i < BLOCK_64x64) && opt.copy_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
         {
-            if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
+            if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i]))
             {
-                printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
+                printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
                 return false;
             }
         }
 
-    }
-
-    if (opt.cvt32to16_shr)
-    {
-        if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
         {
-            printf("cvt32to16 failed!\n");
-            return false;
-        }
-    }
-
-    if (opt.cvt16to32_shl)
-    {
-        if (!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl))
-        {
-            printf("cvt16to32_shl failed!\n");
-            return false;
+            if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i]))
+            {
+                printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
         }
     }
 
@@ -1373,9 +1310,9 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
         }
     }
 
-    if (opt.frame_init_lowres_core)
+    if (opt.frameInitLowres)
     {
-        if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core))
+        if (!check_downscale_t(ref.frameInitLowres, opt.frameInitLowres))
         {
             printf("downscale failed!\n");
             return false;
@@ -1445,15 +1382,6 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
         }
     }
 
-    if (opt.copy_shr)
-    {
-        if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
-        {
-            printf("copy_shr failed!\n");
-            return false;
-        }
-    }
-
     return true;
 }
 
@@ -1674,42 +1602,35 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
             REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
         {
-            HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
+            HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
         {
-            HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
+            HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3);
         }
 
-        if ((i < BLOCK_64x64) && opt.copy_cnt[i])
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
         {
-            HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
+            HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64);
         }
 
-        if ((i < BLOCK_64x64) && opt.copy_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
         {
-            HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
+            HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64);
         }
 
-    }
-
-    if (opt.cvt32to16_shr)
-    {
-        HEADER0("cvt32to16_shr");
-        REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64);
-    }
-
-    if (opt.cvt16to32_shl)
-    {
-        HEADER0("cvt16to32_shl");
-        REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64);
+        if ((i < BLOCK_64x64) && opt.copy_cnt[i])
+        {
+            HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
+        }
     }
 
     if (opt.weight_pp)
@@ -1724,10 +1645,10 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
         REPORT_SPEEDUP(opt.weight_sp, ref.weight_sp, (int16_t*)sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100);
     }
 
-    if (opt.frame_init_lowres_core)
+    if (opt.frameInitLowres)
     {
         HEADER0("downscale");
-        REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
+        REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
     }
 
     if (opt.scale1D_128to64)
@@ -1771,11 +1692,4 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
         HEADER0("planecopy_cp");
         REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
     }
-
-    if (opt.copy_shr)
-    {
-        HEADER0("copy_shr");
-        REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
-    }
-
 }
diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h
index 1255d99..b35d958 100644
--- a/source/test/pixelharness.h
+++ b/source/test/pixelharness.h
@@ -80,13 +80,11 @@ protected:
     bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
     bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
     bool check_downscale_t(downscale_t ref, downscale_t opt);
-    bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
-    bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt);
-    bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
-    bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
+    bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
+    bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
+    bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+    bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
     bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
-    bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
-    bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
     bool check_pixel_var(var_t ref, var_t opt);
     bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
diff --git a/source/test/testpool.cpp b/source/test/testpool.cpp
deleted file mode 100644
index 01f037b..0000000
--- a/source/test/testpool.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Steve Borho <steve@borho.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com
- *****************************************************************************/
-
-#include "common.h"
-#include "threadpool.h"
-#include "wavefront.h"
-#include "threading.h"
-#include "md5.h"
-#include "PPA/ppa.h"
-
-#include <sstream>
-#include <iostream>
-
-using namespace x265;
-
-struct CUData
-{
-    CUData()
-    {
-        memset(digest, 0, sizeof(digest));
-    }
-
-    unsigned char digest[16];
-};
-
-struct RowData
-{
-    RowData() : active(false), curCol(0) {}
-
-    Lock          lock;
-    volatile bool active;
-    volatile int  curCol;
-};
-
-// Create a fake frame class with manufactured data in each CU block.  We
-// need to create an MD5 hash such that each CU's hash includes the hashes
-// of the blocks that would have HEVC data dependencies (left, top-left,
-// top, top-right).  This will give us one deterministic output hash.  We
-// then generate the same hash using the thread pool and wave-front parallelism
-// to verify the thread-pool behavior and the wave-front schedule data
-// structures.
-class MD5Frame : public WaveFront
-{
-private:
-
-    CUData  *cu;
-    RowData *row;
-    int      numrows;
-    int      numcols;
-    Event    complete;
-
-public:
-
-    MD5Frame(ThreadPool *pool) : WaveFront(pool), cu(0), row(0) {}
-
-    virtual ~MD5Frame()
-    {
-        // ensure no threads are lingering on FindJob() before allowing
-        // this object's vtable to be destroyed
-        JobProvider::flush();
-
-        delete[] this->cu;
-        delete[] this->row;
-    }
-
-    void initialize(int cols, int rows);
-
-    void encode();
-
-    void processRow(int row, int threadid);
-};
-
-void MD5Frame::initialize(int cols, int rows)
-{
-    this->cu = new CUData[rows * cols];
-    this->row = new RowData[rows];
-    this->numrows = rows;
-    this->numcols = cols;
-
-    if (!this->WaveFront::init(rows))
-    {
-        assert(!"Unable to initialize job queue");
-    }
-}
-
-void MD5Frame::encode()
-{
-    this->JobProvider::enqueue();
-
-    this->WaveFront::enqueueRow(0);
-
-    // NOTE: When EnableRow after enqueueRow at first row, we'd better call pokeIdleThread, it will release a thread to do job
-    this->WaveFront::enableRow(0);
-    this->m_pool->pokeIdleThread();
-
-    this->complete.wait();
-
-    this->JobProvider::dequeue();
-
-    unsigned int *outdigest = (unsigned int*)this->cu[this->numrows * this->numcols - 1].digest;
-
-    std::stringstream ss;
-
-    for (int i = 0; i < 4; i++)
-    {
-        ss << std::hex << outdigest[i];
-    }
-
-    if (ss.str().compare("da667b741a7a9d0ee862158da2dd1882"))
-        std::cout << "Bad hash: " << ss.str() << std::endl;
-}
-
-void MD5Frame::processRow(int rownum, int)
-{
-    // Called by worker thread
-    RowData &curRow = this->row[rownum];
-
-    assert(rownum < this->numrows && rownum >= 0);
-    assert(curRow.curCol < this->numcols);
-
-    while (curRow.curCol < this->numcols)
-    {
-        int id = rownum * this->numcols + curRow.curCol;
-        CUData  &curCTU = this->cu[id];
-        MD5 hash;
-
-        // * Fake CTU processing *
-        PPAStartCpuEventFunc(encode_block);
-        memset(curCTU.digest, id, sizeof(curCTU.digest));
-        hash.update(curCTU.digest, sizeof(curCTU.digest));
-        if (curRow.curCol > 0)
-            hash.update(this->cu[id - 1].digest, sizeof(curCTU.digest));
-
-        if (rownum > 0)
-        {
-            if (curRow.curCol > 0)
-                hash.update(this->cu[id - this->numcols - 1].digest, sizeof(curCTU.digest));
-
-            hash.update(this->cu[id - this->numcols].digest, sizeof(curCTU.digest));
-            if (curRow.curCol < this->numcols - 1)
-                hash.update(this->cu[id - this->numcols + 1].digest, sizeof(curCTU.digest));
-        }
-
-        hash.finalize(curCTU.digest);
-        PPAStopCpuEventFunc(encode_block);
-
-        curRow.curCol++;
-
-        if (curRow.curCol >= 2 && rownum < this->numrows - 1)
-        {
-            ScopedLock below(this->row[rownum + 1].lock);
-
-            if (this->row[rownum + 1].active == false &&
-                this->row[rownum + 1].curCol + 2 <= curRow.curCol)
-            {
-                // set active indicator so row is only enqueued once
-                // row stays marked active until blocked or done
-                this->row[rownum + 1].active = true;
-                this->WaveFront::enqueueRow(rownum + 1);
-                this->WaveFront::enableRow(rownum + 1);
-            }
-        }
-
-        ScopedLock self(curRow.lock);
-
-        if (rownum > 0 &&
-            curRow.curCol < this->numcols - 1 &&
-            this->row[rownum - 1].curCol < curRow.curCol + 2)
-        {
-            // row is blocked, quit job
-            curRow.active = false;
-            return;
-        }
-    }
-
-    // * Row completed *
-
-    if (rownum == this->numrows - 1)
-        this->complete.trigger();
-}
-
-int main(int, char **)
-{
-    ThreadPool *pool;
-
-    PPA_INIT();
-
-    pool = ThreadPool::allocThreadPool(1);
-    {
-        MD5Frame frame(pool);
-        frame.initialize(60, 40);
-        frame.encode();
-    }
-    pool->release();
-    pool = ThreadPool::allocThreadPool(2);
-    {
-        MD5Frame frame(pool);
-        frame.initialize(60, 40);
-        frame.encode();
-    }
-    pool->release();
-    pool = ThreadPool::allocThreadPool(4);
-    {
-        MD5Frame frame(pool);
-        frame.initialize(60, 40);
-        frame.encode();
-    }
-    pool->release();
-    pool = ThreadPool::allocThreadPool(8);
-    {
-        MD5Frame frame(pool);
-        frame.initialize(60, 40);
-        frame.encode();
-    }
-    pool->release();
-
-    return 0;
-}
diff --git a/source/x265.cpp b/source/x265.cpp
index 474cea9..ab7c93f 100644
--- a/source/x265.cpp
+++ b/source/x265.cpp
@@ -37,7 +37,6 @@
 /* Visual Leak Detector */
 #include <vld.h>
 #endif
-#include "PPA/ppa.h"
 
 #include <signal.h>
 #include <errno.h>
@@ -155,6 +154,11 @@ static const struct option long_options[] =
     { "aq-strength",    required_argument, NULL, 0 },
     { "ipratio",        required_argument, NULL, 0 },
     { "pbratio",        required_argument, NULL, 0 },
+    { "qcomp",          required_argument, NULL, 0 },
+    { "qpstep",         required_argument, NULL, 0 },
+    { "ratetol",        required_argument, NULL, 0 },
+    { "cplxblur",       required_argument, NULL, 0 },
+    { "qblur",          required_argument, NULL, 0 },
     { "cbqpoffs",       required_argument, NULL, 0 },
     { "crqpoffs",       required_argument, NULL, 0 },
     { "rd",             required_argument, NULL, 0 },
@@ -165,8 +169,10 @@ static const struct option long_options[] =
     { "no-lossless",          no_argument, NULL, 0 },
     { "no-signhide",          no_argument, NULL, 0 },
     { "signhide",             no_argument, NULL, 0 },
-    { "no-lft",               no_argument, NULL, 0 },
-    { "lft",                  no_argument, NULL, 0 },
+    { "no-lft",               no_argument, NULL, 0 }, /* DEPRECATED */
+    { "lft",                  no_argument, NULL, 0 }, /* DEPRECATED */
+    { "no-deblock",           no_argument, NULL, 0 },
+    { "deblock",        required_argument, NULL, 0 },
     { "no-sao",               no_argument, NULL, 0 },
     { "sao",                  no_argument, NULL, 0 },
     { "no-sao-non-deblock",   no_argument, NULL, 0 },
@@ -203,7 +209,8 @@ static const struct option long_options[] =
     { "lambda-file",    required_argument, NULL, 0 },
     { "b-intra",              no_argument, NULL, 0 },
     { "no-b-intra",           no_argument, NULL, 0 },
-    { "nr",             required_argument, NULL, 0 },
+    { "nr-intra",       required_argument, NULL, 0 },
+    { "nr-inter",       required_argument, NULL, 0 },
     { "stats",          required_argument, NULL, 0 },
     { "pass",           required_argument, NULL, 0 },
     { "slow-firstpass",       no_argument, NULL, 0 },
@@ -268,8 +275,6 @@ struct CLIOptions
     void showHelp(x265_param *param);
     bool parse(int argc, char **argv, x265_param* param);
     bool parseQPFile(x265_picture &pic_org);
-    void readAnalysisFile(x265_picture* pic, x265_param*);
-    void writeAnalysisFile(x265_picture* pic, x265_param*);
     bool validateFanout(x265_param*);
 };
 
@@ -291,7 +296,7 @@ void CLIOptions::destroy()
 
 void CLIOptions::writeNALs(const x265_nal* nal, uint32_t nalcount)
 {
-    PPAScopeEvent(bitstream_write);
+    ProfileScopeEvent(bitstreamWrite);
     for (uint32_t i = 0; i < nalcount; i++)
     {
         bitstreamFile.write((const char*)nal->payload, nal->sizeBytes);
@@ -335,11 +340,14 @@ void CLIOptions::printVersion(x265_param *param)
 
 void CLIOptions::showHelp(x265_param *param)
 {
+    int level = param->logLevel;
     x265_param_default(param);
     printVersion(param);
 
-#define H0 printf
 #define OPT(value) (value ? "enabled" : "disabled")
+#define H0 printf
+#define H1 if (level >= X265_LOG_DEBUG) printf
+
     H0("\nSyntax: x265 [options] infile [-o] outfile\n");
     H0("    infile can be YUV or Y4M\n");
     H0("    outfile is raw HEVC bitstream\n");
@@ -351,18 +359,18 @@ void CLIOptions::showHelp(x265_param *param)
     H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", logLevelNames[param->logLevel + 1]);
     H0("   --no-progress                 Disable CLI progress reports\n");
     H0("   --[no-]cu-stats               Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats));
-    H0("   --csv <filename>              Comma separated log file, log level >= 3 frame log, else one line per run\n");
+    H1("   --csv <filename>              Comma separated log file, log level >= 3 frame log, else one line per run\n");
     H0("\nInput Options:\n");
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
-    H0("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
+    H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
     H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
     H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
-    H0("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H0("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
+    H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
+    H1("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
     H0("   --seek <integer>              First frame to encode\n");
-    H0("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
-    H0("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+    H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
+    H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
     H0("\nQuality reporting metrics:\n");
     H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
     H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
@@ -381,33 +389,34 @@ void CLIOptions::showHelp(x265_param *param)
     H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
     H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
     H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
-    H0("                                 psnr, ssim, zerolatency, or fastdecode\n");
+    H0("                                 psnr, ssim, grain, zerolatency, fastdecode or cbr\n");
     H0("\nQuad-Tree size and depth:\n");
-    H0("-s/--ctu <64|32|16>              Maximum CU size (default: 64x64). Default %d\n", param->maxCUSize);
+    H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
-    H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
-    H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
     H0("\nAnalysis:\n");
     H0("   --rd <0..6>                   Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --psy-rd <0..2.0>             Strength of psycho-visual rate distortion optimization, 0 to disable. Default %f\n", param->psyRd);
-    H0("   --psy-rdoq <0..50.0>          Strength of psycho-visual optimization in quantization, 0 to disable. Default %f\n", param->psyRdoq);
-    H0("   --nr <integer>                An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled\n");
-    H0("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+    H0("   --psy-rd <0..2.0>             Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+    H0("   --psy-rdoq <0..50.0>          Strength of psycho-visual optimization in quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
-    H0("   --[no-]fast-cbf               Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
+    H1("   --[no-]fast-cbf               Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
+    H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+    H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
+    H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
     H0("\nCoding tools:\n");
     H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
     H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
     H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
     H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
-    H0("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
+    H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
     H0("\nTemporal / motion search options:\n");
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
     H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
-    H0("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+    H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
+    H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+    H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
     H0("\nSpatial / intra options:\n");
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
     H0("   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
@@ -422,57 +431,59 @@ void CLIOptions::showHelp(x265_param *param)
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
-    H0("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
+    H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
     H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
     H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
-    H0("                                 Format of each line: framenumber frametype QP\n");
-    H0("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
-    H0("                                 QPs are restricted by qpmin/qpmax.\n");
-    H0("\nRate control, Quantization:\n");
+    H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
+    H1("                                 Format of each line: framenumber frametype QP\n");
+    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
+    H1("                                 QPs are restricted by qpmin/qpmax.\n");
+    H0("\nRate control, Adaptive Quantization:\n");
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
-    H0("-q/--qp <integer>                QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
-    H0("   --crf <float>                 Quality-based VBR (0-51). Default %f\n", param->rc.rfConstant);
-    H0("   --[no-]lossless               Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
-    H0("   --crf-max <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
-    H0("                                 May cause VBV underflows!\n");
-    H0("   --crf-min <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
-    H0("                                 this specifies a minimum rate factor value for encode!\n");
+    H1("-q/--qp <integer>                QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
+    H0("   --crf <float>                 Quality-based VBR (0-51). Default %.1f\n", param->rc.rfConstant);
+    H1("   --[no-]lossless               Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
+    H1("   --crf-max <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
+    H1("                                 May cause VBV underflows!\n");
+    H1("   --crf-min <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
+    H1("                                 this specifies a minimum rate factor value for encode!\n");
     H0("   --vbv-maxrate <integer>       Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
     H0("   --vbv-bufsize <integer>       Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
-    H0("   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %f\n", param->rc.vbvBufferInit);
-    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
-    H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default %f\n", param->rc.aqStrength);
-    H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
-    H0("   --ipratio <float>             QP factor between I and P. Default %f\n", param->rc.ipFactor);
-    H0("   --pbratio <float>             QP factor between P and B. Default %f\n", param->rc.pbFactor);
-    H0("   --cbqpoffs <integer>          Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
-    H0("   --crqpoffs <integer>          Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
-    H0("   --stats                       Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
+    H0("   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
     H0("   --pass                        Multi pass rate control.\n"
        "                                   - 1 : First pass, creates stats file\n"
        "                                   - 2 : Last pass, does not overwrite stats file\n"
        "                                   - 3 : Nth pass, overwrites stats file\n");
+    H0("   --stats                       Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
     H0("   --[no-]slow-firstpass         Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
     H0("   --analysis-mode <string|int>  save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
     H0("   --analysis-file <filename>    Specify file name used for either dumping or reading analysis data.\n");
-    H0("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
-    H0("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
-    H0("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
-    H0("                                 Blank lines and lines starting with hash(#) are ignored\n");
-    H0("                                 Comma is considered to be white-space\n");
+    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
+    H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
+    H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
+    H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
+    H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
+    H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
+    H1("   --ratetol <float>             Degree of rate fluctuation that can be tolerated. Default %.2f\n", param->rc.rateTolerance);
+    H1("   --cbqpoffs <integer>          Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
+    H1("   --crqpoffs <integer>          Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
+    H1("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
+    H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
+    H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
+    H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
+    H1("                                 Comma is considered to be white-space\n");
     H0("\nLoop filters (deblock and SAO):\n");
-    H0("   --[no-]lft                    Enable Deblocking Loop Filter. Default %s\n", OPT(param->bEnableLoopFilter));
+    H0("   --[no-]deblock                Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
     H0("   --[no-]sao                    Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
-    H0("   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
+    H1("   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
     H0("\nVUI options:\n");
     H0("   --sar <width:height|int>      Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
     H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
     H0("                                 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
     H0("                                 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
-    H0("   --crop-rect <string>          Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
-    H0("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
+    H1("   --crop-rect <string>          Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
+    H1("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
     H0("   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
     H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited\n");
     H0("   --colorprim <string>          Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n");
@@ -480,21 +491,25 @@ void CLIOptions::showHelp(x265_param *param)
     H0("   --transfer <string>           Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
     H0("                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
     H0("                                 bt2020-10, bt2020-12. Default undef\n");
-    H0("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
-    H0("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
-    H0("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
+    H1("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
+    H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
+    H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
     H0("\nBitstream options:\n");
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
     H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
-    H0("   --[no-]hrd                    Enable HRD parameters signalling. Default %s\n", OPT(param->bEmitHRDSEI));
+    H0("   --[no-]hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
-    H0("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
-    H0("\nReconstructed video options (debugging):\n");
-    H0("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
-    H0("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
+    H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
+    H1("\nReconstructed video options (debugging):\n");
+    H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
+    H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
 #undef OPT
 #undef H0
-    printf("\n\nFull documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
+#undef H1
+
+    if (level < X265_LOG_DEBUG)
+        printf("\nUse --log-level full --help for a full listing\n");
+    printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
     exit(0);
 }
 
@@ -510,7 +525,6 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
     const char *preset = NULL;
     const char *tune = NULL;
     const char *profile = NULL;
-    const char *analysisfn = "x265_analysis.dat";
 
     if (argc <= 1)
     {
@@ -603,7 +617,6 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
             OPT("profile") profile = optarg; /* handled last */
             OPT("preset") /* handled above */;
             OPT("tune")   /* handled above */;
-            OPT("analysis-file") analysisfn = optarg;
             OPT("qpfile")
             {
                 this->qpfile = fopen(optarg, "rb");
@@ -751,163 +764,9 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
         x265_log(NULL, X265_LOG_ERROR, "failed to open bitstream file <%s> for writing\n", bitstreamfn);
         return true;
     }
-
-    if (param->analysisMode)
-    {
-        const char *mode = param->analysisMode == X265_ANALYSIS_SAVE ? "wb" : "rb";
-        this->analysisFile = fopen(analysisfn, mode);
-        if (!this->analysisFile)
-        {
-            x265_log(NULL, X265_LOG_ERROR, "failed to open analysis file %s\n", analysisfn);
-            return true;
-        }
-    }
-
     return false;
 }
 
-bool CLIOptions::validateFanout(x265_param *param)
-{
-#define CMP_OPT_FANOUT(opt, param_val)\
-    {\
-        bErr = 0;\
-        p = strstr(paramBuf, opt "=");\
-        char* q = strstr(paramBuf, "no-"opt);\
-        if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
-            bErr = 1;\
-        else if (!param_val && !q)\
-            bErr = 1;\
-        else if (param_val && (q || !strstr(paramBuf, opt)))\
-            bErr = 1;\
-        if (bErr)\
-        {\
-            x265_log(param, X265_LOG_ERROR, "different " opt " setting than given in analysis file (%d vs %d)\n", param_val, i);\
-            X265_FREE(paramBuf);\
-            return false;\
-        }\
-    }
-
-    char *p = NULL, *paramBuf;
-    int i, j;
-    uint32_t k , l;
-    bool bErr = false;
-
-    paramBuf = X265_MALLOC(char, MAXPARAMSIZE);
-    if (!paramBuf)
-        return false;
-
-    fread(paramBuf, 1, MAXPARAMSIZE, this->analysisFile);
-
-    /* check whether fanout options are compatible */
-    if (strncmp(paramBuf, "#options:", 9))
-    {
-        x265_log(param, X265_LOG_ERROR, "options list in analysis file is not valid\n");
-        X265_FREE(paramBuf);
-        return false;
-    }
-
-    char* buf = strchr(paramBuf, '\n');
-    if (!buf)
-    {
-        x265_log(param, X265_LOG_ERROR, "Malformed analysis file\n");
-        X265_FREE(paramBuf);
-        return false;
-    }
-    *buf = '\0';
-    fseek(this->analysisFile, (int)strlen(paramBuf) + 1, SEEK_SET);
-
-    if (sscanf(paramBuf, "#options: %dx%d", &i, &j) != 2)
-    {
-        x265_log(param, X265_LOG_ERROR, "Resolution specified in analysis file is not valid\n");
-        X265_FREE(paramBuf);
-        return false;
-    }
-    if ((p = strstr(paramBuf, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
-    {
-        x265_log(param, X265_LOG_ERROR, "fps specified in analysis file is not valid\n");
-        X265_FREE(paramBuf);
-        return false;
-    }
-    if (k != param->fpsNum || l != param->fpsDenom)
-    {
-        x265_log(param, X265_LOG_ERROR, "fps mismatch than given in analysis file (%u/%u vs %u/%u)\n",
-            param->fpsNum, param->fpsDenom, k, l);
-        X265_FREE(paramBuf);
-        return false;
-    }
-
-    CMP_OPT_FANOUT("bitdepth", param->internalBitDepth);
-    CMP_OPT_FANOUT("weightp", param->bEnableWeightedPred);
-    CMP_OPT_FANOUT("bframes", param->bframes);
-    CMP_OPT_FANOUT("b-pyramid", param->bBPyramid);
-    CMP_OPT_FANOUT("b-adapt", param->bFrameAdaptive);
-    CMP_OPT_FANOUT("open-gop", param->bOpenGOP);
-    CMP_OPT_FANOUT("keyint", param->keyframeMax);
-    CMP_OPT_FANOUT("min-keyint", param->keyframeMin);
-    CMP_OPT_FANOUT("scenecut", param->scenecutThreshold);
-    CMP_OPT_FANOUT("ctu", (int)param->maxCUSize);
-    CMP_OPT_FANOUT("ref", param->maxNumReferences);
-    CMP_OPT_FANOUT("rc-lookahead", param->lookaheadDepth);
-
-#undef CMP_OPT_FANOUT
-
-    X265_FREE(paramBuf);
-    return true;
-}
-
-void CLIOptions::readAnalysisFile(x265_picture* pic, x265_param* p)
-{
-    int poc, width, height;
-    uint32_t numPart, numCU;
-    fread(&width, sizeof(int), 1, this->analysisFile);
-    fread(&height, sizeof(int), 1, this->analysisFile);
-    fread(&poc, sizeof(int), 1, this->analysisFile);
-    fread(&pic->sliceType, sizeof(int), 1, this->analysisFile);
-    fread(&numCU, sizeof(int), 1, this->analysisFile);
-    fread(&numPart, sizeof(int), 1, this->analysisFile);
-
-    if (poc != pic->poc || width != p->sourceWidth || height != p->sourceHeight)
-    {
-        x265_log(NULL, X265_LOG_WARNING, "Error in reading intra-inter data.\n");
-        x265_free_analysis_data(pic);
-        return;
-    }
-
-    fread(pic->analysisData.intraData->depth,
-        sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
-    fread(pic->analysisData.intraData->modes,
-        sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
-    fread(pic->analysisData.intraData->partSizes,
-        sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
-    fread(pic->analysisData.intraData->poc,
-        sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile);
-    fread(pic->analysisData.intraData->cuAddr,
-        sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile);
-    fread(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile);
-}
-
-void CLIOptions::writeAnalysisFile(x265_picture* pic, x265_param *p)
-{
-    uint64_t seekTo = pic->poc * this->analysisRecordSize + this->analysisHeaderSize;
-    fseeko(this->analysisFile, seekTo, SEEK_SET);
-    fwrite(&p->sourceWidth, sizeof(int), 1, this->analysisFile);
-    fwrite(&p->sourceHeight, sizeof(int), 1, this->analysisFile);
-    fwrite(&pic->poc, sizeof(int), 1, this->analysisFile);
-    fwrite(&pic->sliceType, sizeof(int), 1, this->analysisFile);
-    fwrite(&pic->analysisData.numCUsInFrame, sizeof(int), 1, this->analysisFile);
-    fwrite(&pic->analysisData.numPartitions, sizeof(int), 1, this->analysisFile);
-
-    fwrite(pic->analysisData.intraData->depth,
-        sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
-    fwrite(pic->analysisData.intraData->modes,
-        sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
-    fwrite(pic->analysisData.intraData->partSizes,
-        sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
-    fwrite(pic->analysisData.intraData->poc, sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile);
-    fwrite(pic->analysisData.intraData->cuAddr, sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile);
-    fwrite(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile);
-}
-
 bool CLIOptions::parseQPFile(x265_picture &pic_org)
 {
     int32_t num = -1, qp, ret;
@@ -948,7 +807,7 @@ int main(int argc, char **argv)
     // This uses Microsoft's proprietary WCHAR type, but this only builds on Windows to start with
     VLDSetReportOptions(VLD_OPT_REPORT_TO_DEBUGGER | VLD_OPT_REPORT_TO_FILE, L"x265_leaks.txt");
 #endif
-    PPA_INIT();
+    PROFILE_INIT();
 
     x265_param *param = x265_param_alloc();
     CLIOptions cliopt;
@@ -979,7 +838,8 @@ int main(int argc, char **argv)
 
     x265_picture pic_orig, pic_out;
     x265_picture *pic_in = &pic_orig;
-    x265_picture *pic_recon = cliopt.recon ? &pic_out : NULL;
+    /* Allocate recon picture if analysisMode is enabled */
+    x265_picture *pic_recon = (cliopt.recon || !!param->analysisMode) ? &pic_out : NULL;
     uint32_t inFrameCount = 0;
     uint32_t outFrameCount = 0;
     x265_nal *p_nal;
@@ -1000,35 +860,12 @@ int main(int argc, char **argv)
 
     x265_picture_init(param, pic_in);
 
-    if (param->analysisMode && !pic_recon)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "Must specify recon with analysis-mode option.\n");
-        goto fail;
-    }
     if (param->analysisMode)
     {
-        if (param->analysisMode == X265_ANALYSIS_SAVE)
+        if (param->bDistributeModeAnalysis || param->bDistributeMotionEstimation)
         {
-            char *p = x265_param2string(param);
-            if (!p)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "analysis: buffer allocation failure, aborting");
-                goto fail;
-            }
-            uint32_t numCU = pic_in->analysisData.numCUsInFrame;
-            uint32_t numPart = pic_in->analysisData.numPartitions;
-
-            cliopt.analysisRecordSize = ((sizeof(int) * 4 + sizeof(uint32_t) * 2) + sizeof(x265_inter_data) * numCU * 85 +
-                    sizeof(uint8_t) * 2 * numPart * numCU + sizeof(char) * numPart * numCU + sizeof(int) * numCU + sizeof(uint32_t) * numCU);
-
-            fprintf(cliopt.analysisFile, "#options: %s\n", p);
-            cliopt.analysisHeaderSize = ftell(cliopt.analysisFile);
-            X265_FREE(p);
-        }
-        else
-        {
-            if (!cliopt.validateFanout(param))
-                goto fail;
+            x265_log(NULL, X265_LOG_ERROR, "Analysis load/save options incompatible with pmode/pme");
+            goto fail;
         }
     }
 
@@ -1069,13 +906,6 @@ int main(int argc, char **argv)
                 ditherImage(*pic_in, param->sourceWidth, param->sourceHeight, errorBuf, X265_DEPTH);
                 pic_in->bitDepth = X265_DEPTH;
             }
-            if (param->analysisMode)
-            {
-                x265_alloc_analysis_data(pic_in);
-
-                if (param->analysisMode == X265_ANALYSIS_LOAD)
-                    cliopt.readAnalysisFile(pic_in, param);
-            }
         }
 
         int numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, pic_in, pic_recon);
@@ -1085,15 +915,9 @@ int main(int argc, char **argv)
             break;
         }
         outFrameCount += numEncoded;
-        if (numEncoded && pic_recon)
-        {
-            cliopt.recon->writePicture(pic_out);
-            if (param->analysisMode == X265_ANALYSIS_SAVE)
-                cliopt.writeAnalysisFile(pic_recon, param);
-            if (param->analysisMode)
-                x265_free_analysis_data(pic_recon);
-        }
 
+        if (numEncoded && pic_recon && cliopt.recon)
+            cliopt.recon->writePicture(pic_out);
         if (nal)
             cliopt.writeNALs(p_nal, nal);
 
@@ -1106,15 +930,8 @@ int main(int argc, char **argv)
     {
         uint32_t numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon);
         outFrameCount += numEncoded;
-        if (numEncoded && pic_recon)
-        {
+        if (numEncoded && pic_recon && cliopt.recon)
             cliopt.recon->writePicture(pic_out);
-            if (param->analysisMode == X265_ANALYSIS_SAVE)
-                cliopt.writeAnalysisFile(pic_recon, param);
-            if (param->analysisMode)
-                x265_free_analysis_data(pic_recon);
-        }
-
         if (nal)
             cliopt.writeNALs(p_nal, nal);
 
diff --git a/source/x265.def.in b/source/x265.def.in
index e78bfc1..9e964f6 100644
--- a/source/x265.def.in
+++ b/source/x265.def.in
@@ -9,8 +9,6 @@ x265_param_free
 x265_picture_init
 x265_picture_alloc
 x265_picture_free
-x265_alloc_analysis_data
-x265_free_analysis_data
 x265_param_apply_profile
 x265_max_bit_depth
 x265_version_str
diff --git a/source/x265.h b/source/x265.h
index e5474b7..f7c2360 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -88,36 +88,16 @@ typedef struct x265_nal
     uint8_t* payload;
 } x265_nal;
 
-/* Stores inter (motion estimation) analysis data for a single frame */
-typedef struct x265_inter_data
-{
-    uint32_t zOrder;
-    int      ref[2];
-    int      costZero[2];
-    int16_t  mvx[2];
-    int16_t  mvy[2];
-    uint32_t depth;
-    int      poc;
-    uint32_t cuAddr;
-} x265_inter_data;
-
-/* Stores intra (motion estimation) analysis data for a single frame */
-typedef struct x265_intra_data
-{
-    uint8_t*  depth;
-    uint8_t*  modes;
-    char*     partSizes;
-    int*      poc;
-    uint32_t* cuAddr;
-} x265_intra_data;
-
 /* Stores all analysis data for a single frame */
 typedef struct x265_analysis_data
 {
-    x265_inter_data* interData;
-    x265_intra_data* intraData;
+    uint32_t         frameRecordSize;
+    int32_t          poc;
+    int32_t          sliceType;
     uint32_t         numCUsInFrame;
     uint32_t         numPartitions;
+    void*            interData;
+    void*            intraData;
 } x265_analysis_data;
 
 /* Used to pass pictures into the encoder, and to get picture data back out of
@@ -290,7 +270,6 @@ typedef enum
 #define X265_ANALYSIS_OFF  0
 #define X265_ANALYSIS_SAVE 1
 #define X265_ANALYSIS_LOAD 2
-
 typedef struct
 {
     int planes;
@@ -419,7 +398,7 @@ typedef struct x265_param
      * per-slice statistics to this log file in encode order. Otherwise the
      * encoder will emit per-stream statistics into the log file when
      * x265_encoder_log is called (presumably at the end of the encode) */
-    const char *csvfn;
+    char *csvfn;
 
     /* Enable the generation of SEI messages for each encoded frame containing
      * the hashes of the three reconstructed picture planes. Most decoders will
@@ -504,13 +483,13 @@ typedef struct x265_param
 
     /* The additional depth the residual quadtree is allowed to recurse beyond
      * the coding quadtree, for inter coded blocks. This must be between 1 and
-     * 3. The higher the value the more efficiently the residual can be
+     * 4. The higher the value the more efficiently the residual can be
      * compressed by the DCT transforms, at the expense of much more compute */
     uint32_t  tuQTMaxInterDepth;
 
     /* The additional depth the residual quadtree is allowed to recurse beyond
      * the coding quadtree, for intra coded blocks. This must be between 1 and
-     * 3. The higher the value the more efficiently the residual can be
+     * 4. The higher the value the more efficiently the residual can be
      * compressed by the DCT transforms, at the expense of much more compute */
     uint32_t  tuQTMaxIntraDepth;
 
@@ -664,7 +643,7 @@ typedef struct x265_param
 
     /* Enable the use of `coded block flags` (flags set to true when a residual
      * has been coded for a given block) to avoid intra analysis in likely skip
-     * blocks. Default is disabled */
+     * blocks. Only applicable in RD levels 5 and 6. Default is disabled */
     int       bEnableCbfFastMode;
 
     /* Enable early skip decisions to avoid intra and inter analysis in likely
@@ -712,9 +691,10 @@ typedef struct x265_param
      * buffer and use this analysis information to reduce the amount of work
      * the encoder must perform. Default X265_ANALYSIS_OFF */
     int       analysisMode;
+    /* Filename for analysisMode save/load. Default name is "x265_analysis.dat" */
+    char*     analysisFileName;
 
     /*== Coding tools ==*/
-
     /* Enable the implicit signaling of the sign bit of the last coefficient of
      * each transform unit. This saves one bit per TU at the expense of figuring
      * out which coefficient can be toggled with the least distortion.
@@ -735,9 +715,17 @@ typedef struct x265_param
     /* Enable the deblocking loop filter, which improves visual quality by
      * reducing blocking effects at block edges, particularly at lower bitrates
      * or higher QP. When enabled it adds another CU row of reference lag,
-     * reducing frame parallelism effectiveness.  Default is enabled */
+     * reducing frame parallelism effectiveness. Default is enabled */
     int       bEnableLoopFilter;
 
+    /* deblocking filter tC offset [-6, 6] -6 light filter, 6 strong.
+     * This is the coded div2 value, actual offset is doubled at use */
+    int       deblockingFilterTCOffset;
+
+    /* deblocking filter Beta offset [-6, 6] -6 light filter, 6 strong
+     * This is the coded div2 value, actual offset is doubled at use */
+    int       deblockingFilterBetaOffset;
+
     /* Enable the Sample Adaptive Offset loop filter, which reduces distortion
      * effects by adjusting reconstructed sample values based on histogram
      * analysis to better approximate the original samples. When enabled it adds
@@ -769,9 +757,13 @@ typedef struct x265_param
      * regardless of this setting. */
     int       bIntraInBFrames;
 
-    /* An integer value in range of 100 to 1000, which denotes strength of noise
-     * reduction */
-    int       noiseReduction;
+    /* An integer value in range of 0 to 2000, which denotes strength of noise
+     * reduction in intra CUs. 0 means disabled */
+    int       noiseReductionIntra;
+
+    /* An integer value in range of 0 to 2000, which denotes strength of noise
+     * reduction in inter CUs. 0 means disabled */
+    int       noiseReductionInter;
 
     /* The lossless flag enables true lossless coding, by bypassing scaling,
      * transform, quantization and in-loop filter processes. This is used for
@@ -802,7 +794,7 @@ typedef struct x265_param
         int       bitrate;
 
         /* The degree of rate fluctuation that x265 tolerates. Rate tolerance is used
-         * alongwith overflow (difference between actual and target bitrate), to adjust
+         * along with overflow (difference between actual and target bitrate), to adjust
          * qp. Default is 1.0 */
         double    rateTolerance;
 
@@ -824,12 +816,12 @@ typedef struct x265_param
         double    rfConstant;
 
         /* Enable adaptive quantization. This mode distributes available bits between all
-         * macroblocks of a frame, assigning more bits to low complexity areas. Turning
+         * CTUs of a frame, assigning more bits to low complexity areas. Turning
          * this ON will usually affect PSNR negatively, however SSIM and visual quality
-         * generally improves. Default: X265_AQ_AUTO_VARIANCE */
+         * generally improves. Default: X265_AQ_VARIANCE */
         int       aqMode;
 
-        /* Sets the strength of AQ bias towards low detail macroblocks. Valid only if
+        /* Sets the strength of AQ bias towards low detail CTUs. Valid only if
          * AQ is enabled. Default value: 1.0. Acceptable values between 0.0 and 3.0 */
         double    aqStrength;
 
@@ -856,14 +848,15 @@ typedef struct x265_param
         /* In CRF mode, minimum CRF as caused by VBV */
         double    rfConstantMin;
 
-        /* Two pass (INCOMPLETE) */
+        /* Multi-pass encoding */
         /* Enable writing the stats in a multipass encode to the stat output file */
         int       bStatWrite;
 
         /* Enable loading data from the stat input file in a multi pass encode */
         int       bStatRead;
 
-        /* Filename of the 2pass output/input stats file */
+        /* Filename of the 2pass output/input stats file, if unspecified the
+         * encoder will default to using x265_2pass.log */
         char*     statFileName;
 
         /* temporally blur quants */
@@ -991,7 +984,7 @@ void x265_setup_primitives(x265_param *param, int cpu);
  *  special in any way, but using this method together with x265_param_free()
  *  and x265_param_parse() to set values by name allows the application to treat
  *  x265_param as an opaque data struct for version safety */
-x265_param *x265_param_alloc();
+x265_param *x265_param_alloc(void);
 
 /* x265_param_free:
  *  Use x265_param_free() to release storage for an x265_param instance
@@ -1039,7 +1032,7 @@ static const char * const x265_preset_names[] = { "ultrafast", "superfast", "ver
  *      100 times faster than placebo!
  *
  *      Currently available tunings are: */
-static const char * const x265_tune_names[] = { "psnr", "ssim", "zerolatency", "fastdecode", 0 };
+static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "cbr", 0 };
 
 /*      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
 int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
@@ -1049,22 +1042,12 @@ int x265_param_default_preset(x265_param *, const char *preset, const char *tune
  *  special in any way, but using this method together with x265_picture_free()
  *  and x265_picture_init() allows some version safety. New picture fields will
  *  always be added to the end of x265_picture */
-x265_picture *x265_picture_alloc();
+x265_picture *x265_picture_alloc(void);
 
 /* x265_picture_free:
  *  Use x265_picture_free() to release storage for an x265_picture instance
  *  allocated by x265_picture_alloc() */
 void x265_picture_free(x265_picture *);
-
-/* x265_alloc_analysis_data:
- *  Allocate memory to hold analysis data, returns 0 on success else negative */
-int x265_alloc_analysis_data(x265_picture*);
-
-/* x265_free_analysis_data:
- *  Use x265_free_analysis_data to release storage of members allocated by
- *  x265_alloc_analysis_data */
-void x265_free_analysis_data(x265_picture*);
-
 /***
  * Initialize an x265_picture structure to default values. It sets the pixel
  * depth and color space to the encoder's internal values and sets the slice